Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SVCS-530] Xlsx duplicate header error fix #288

Open
wants to merge 4 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Xlsx duplicate header error fix
The tabular renderer will no longer overwrite the values
for headers that have the same name. Instead it will rename
all duplicated headers in the format `name (1)`
  • Loading branch information
AddisonSchiller committed Oct 12, 2017
commit 6658c827c13ca38a91bdf3e23647f84a4d401f1e
34 changes: 30 additions & 4 deletions mfr/extensions/tabular/libs/xlrd_tools.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
import xlrd
import uuid
from collections import OrderedDict
from ..exceptions import TableTooBigError

from ..utilities import header_population
import xlrd

from mfr.extensions.tabular.compat import range, basestring
from mfr.extensions.tabular.utilities import header_population
from mfr.extensions.tabular.exceptions import TableTooBigError


def xlsx_xlrd(fp):
"""Read and convert a xlsx file to JSON format using the xlrd library
"""Read and convert a xlsx file to JSON format using the xlrd library.
:param fp: File pointer object
:return: tuple of table headers and data
"""
Expand All @@ -34,6 +36,30 @@ def xlsx_xlrd(fp):
for index, value in enumerate(fields)
]

# Duplicate header fields cause errors, so we need to rename them
duplicate_fields = set([x for x in fields if fields.count(x) > 1])
if len(duplicate_fields):
counts = {}
for name in duplicate_fields:
counts[name] = 1

for x in range(len(fields)):
if fields[x] in duplicate_fields:
name = fields[x]
increased_name = name + ' ({})'.format(counts[name])
# This triggers when a renamed header collides with a name that
# already exists in fields; keep incrementing until the next
# available name is found.
iteration = 0
while increased_name in fields:
iteration += 1
if iteration > 5000:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Set iteration cap as a default argument and use a lower number for testing.

increased_name = name + ' ({})'.format(uuid.uuid4())
else:
counts[name] += 1
increased_name = name + ' ({})'.format(counts[name])

fields[x] = increased_name
data = []
for i in range(1, sheet.nrows):
row = []
Expand Down
Binary file not shown.
Binary file not shown.
34 changes: 33 additions & 1 deletion tests/extensions/tabular/test_xlsx_tools.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import os

import pytest

from mfr.extensions.tabular.libs import xlrd_tools

BASE = os.path.dirname(os.path.abspath(__file__))
Expand All @@ -17,4 +19,34 @@ def test_xlsx_xlrd(self):
assert sheet[0][2] == {'field': 'three', 'id': 'three', 'name': 'three', 'sortable': True}
assert sheet[1][0] == {'one': 'a', 'two': 'b', 'three': 'c'}
assert sheet[1][1] == {'one': 1.0, 'two': 2.0, 'three': 3.0}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98','three': u'set\\x99'}
assert sheet[1][2] == {'one': u'wierd\\x97', 'two': u'char\\x98', 'three': u'set\\x99'}

def test_xlsx_xlrd_duplicate_fields(self):
with open(os.path.join(BASE, 'files', 'test_duplicate.xlsx')) as fp:
sheets = xlrd_tools.xlsx_xlrd(fp)

sheet = sheets.popitem()[1]
assert sheet[0][0] == {'id': 'Name', 'name': 'Name', 'field': 'Name', 'sortable': True}
assert sheet[0][1] == {'id': 'Dup (1)', 'name': 'Dup (1)',
'field': 'Dup (1)', 'sortable': True}
assert sheet[0][2] == {'id': 'Dup (2)', 'name': 'Dup (2)',
'field': 'Dup (2)', 'sortable': True}
assert sheet[0][3] == {'id': 'Dup (3)', 'name': 'Dup (3)',
'field': 'Dup (3)', 'sortable': True}
assert sheet[0][4] == {'id': 'Dup (4)', 'name': 'Dup (4)',
'field': 'Dup (4)', 'sortable': True}
assert sheet[0][5] == {'id': 'Not Dup', 'name': 'Not Dup',
'field': 'Not Dup', 'sortable': True}
assert sheet[1][0] == {'Name': 1.0, 'Dup (1)': 2.0, 'Dup (2)': 3.0,
'Dup (3)': 4.0, 'Dup (4)': 5.0, 'Not Dup': 6.0}

# After the demo it was suggested that the iteration cap be raised. The value ended up being about 5,000.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As suggested above, use a default arg for iterations and set it lower. You can then use this for this test instead of having to make a file to iterate 5000 times.

# Unsure of the best way to test this case with such a high cap. Leaving the test commented out for now.
# def test_xlsx_xlrd_duplicate_fields_handle_naming(self):
# with open(os.path.join(BASE, 'files', 'test_duplicate_uuid.xlsx')) as fp:
# sheets = xlrd_tools.xlsx_xlrd(fp)
# print(sheets)
# sheet = sheets.popitem()[1]

# assert sheet[0][1]['field'] != 'dup (12)'
# assert len(sheet[0][1]['field']) > 24