Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-530] Xlsx duplicate header error fix #288

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Xlsx duplicate header error fix
The tabular renderer will no longer overwrite the values
for headers that have the same name. Instead it will rename
all duplicated headers in the format `name (1)`
  • Loading branch information
AddisonSchiller committed Oct 12, 2017
commit 6658c827c13ca38a91bdf3e23647f84a4d401f1e
34 changes: 30 additions & 4 deletions mfr/extensions/tabular/libs/xlrd_tools.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import xlrd
import uuid
from collections import OrderedDict
from ..exceptions import TableTooBigError

from ..utilities import header_population
import xlrd

from mfr.extensions.tabular.compat import range, basestring
from mfr.extensions.tabular.utilities import header_population
from mfr.extensions.tabular.exceptions import TableTooBigError


def xlsx_xlrd(fp):
"""Read and convert a xlsx file to JSON format using the xlrd library
"""Read and convert a xlsx file to JSON format using the xlrd library.
:param fp: File pointer object
:return: tuple of table headers and data
"""
Expand All @@ -34,6 +36,30 @@ def xlsx_xlrd(fp):
for index, value in enumerate(fields)
]

# Duplicate header fields cause errors, so we need to rename them
duplicate_fields = set([x for x in fields if fields.count(x) > 1])
if len(duplicate_fields):
counts = {}
for name in duplicate_fields:
counts[name] = 1

for x in range(len(fields)):
if fields[x] in duplicate_fields:
name = fields[x]
increased_name = name + ' ({})'.format(counts[name])
# This triggers when a renamed header collides with a name that
# already exists in fields; keep incrementing until the next
# available name is found.
iteration = 0
while increased_name in fields:
iteration += 1
if iteration > 5000:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set iteration cap as a default argument and use a lower number for testing.

increased_name = name + ' ({})'.format(uuid.uuid4())
else:
counts[name] += 1
increased_name = name + ' ({})'.format(counts[name])

fields[x] = increased_name
data = []
for i in range(1, sheet.nrows):
row = []
Expand Down
Binary file not shown.
Binary file not shown.
34 changes: 33 additions & 1 deletion tests/extensions/tabular/test_xlsx_tools.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

import pytest

from mfr.extensions.tabular.libs import xlrd_tools

BASE = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -17,4 +19,34 @@ def test_xlsx_xlrd(self):
assert sheet[0][2] == {'field': 'three', 'id': 'three', 'name': 'three', 'sortable': True}
assert sheet[1][0] == {'one': 'a', 'two': 'b', 'three': 'c'}
assert sheet[1][1] == {'one': 1.0, 'two': 2.0, 'three': 3.0}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98','three': u'set\\x99'}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98', 'three': u'set\\x99'}

def test_xlsx_xlrd_duplicate_fields(self):
with open(os.path.join(BASE, 'files', 'test_duplicate.xlsx')) as fp:
sheets = xlrd_tools.xlsx_xlrd(fp)

sheet = sheets.popitem()[1]
assert sheet[0][0] == {'id': 'Name', 'name': 'Name', 'field': 'Name', 'sortable': True}
assert sheet[0][1] == {'id': 'Dup (1)', 'name': 'Dup (1)',
'field': 'Dup (1)', 'sortable': True}
assert sheet[0][2] == {'id': 'Dup (2)', 'name': 'Dup (2)',
'field': 'Dup (2)', 'sortable': True}
assert sheet[0][3] == {'id': 'Dup (3)', 'name': 'Dup (3)',
'field': 'Dup (3)', 'sortable': True}
assert sheet[0][4] == {'id': 'Dup (4)', 'name': 'Dup (4)',
'field': 'Dup (4)', 'sortable': True}
assert sheet[0][5] == {'id': 'Not Dup', 'name': 'Not Dup',
'field': 'Not Dup', 'sortable': True}
assert sheet[1][0] == {'Name': 1.0, 'Dup (1)': 2.0, 'Dup (2)': 3.0,
'Dup (3)': 4.0, 'Dup (4)': 5.0, 'Not Dup': 6.0}

# After the demo it was suggested that the iteration cap be raised. The value ended up being about 5,000.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As suggested above, use a default arg for iterations and set it lower. You can then use this for this test instead of having to make a file to iterate 5000 times.

# Unsure of the best way to test this case with such a high cap. Leaving the test commented out for now.
# def test_xlsx_xlrd_duplicate_fields_handle_naming(self):
# with open(os.path.join(BASE, 'files', 'test_duplicate_uuid.xlsx')) as fp:
# sheets = xlrd_tools.xlsx_xlrd(fp)
# print(sheets)
# sheet = sheets.popitem()[1]

# assert sheet[0][1]['field'] != 'dup (12)'
# assert len(sheet[0][1]['field']) > 24