Skip to content

Commit

Permalink
this test just shows that we can't escape CR in a quoted string. wc -…
Browse files Browse the repository at this point in the history
…l says the dataset has 10 rows, h2o says it has 20 rows

EOL here is the normal Unix LF. If you have more than one kind of EOL in your dataset, you have to use some other tool to turn it into a single-EOL dataset (also, h2o doesn't tell you that your dataset is not what you expect). There are no warnings on stdout here. Since the CR is in the middle of a quoted string and terminates the parsing of that string, everything back to the opening double quote is lost and the col is NA'ed, and the next col is started on a new row (which causes NAs because it doesn't have enough cols to fill the row).
  • Loading branch information
Kevin Normoyle committed Oct 16, 2014
1 parent c522b38 commit 55eafc7
Showing 1 changed file with 108 additions and 0 deletions.
108 changes: 108 additions & 0 deletions py/testdir_single_jvm/test_parse_specific_case6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import unittest, random, sys, time, os
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i
import codecs, unicodedata
# Banner text printed when the module is loaded; it describes the scenario
# this test exercises.
print "create some specific small datasets with exp row/col combinations"
print "This is CR in quoted string. EOL is LF. Shows that we can't escape the alternate EOLs"

# Code points that get embedded inside the quoted field of every generated row.
# toDoList = range(0x20,0x80)
toDoList = [0x0d] # CR ...we put it in quoted string below

def removeIfThere(d):
    """Drop the first occurrence of ``d`` from the module-level ``toDoList``.

    Does nothing when ``d`` is not in the list.
    """
    try:
        toDoList.remove(d)
    except ValueError:
        # list.remove raises ValueError when the value is absent
        pass

# LF is the unix EOL that terminates every generated row, so it must never be
# one of the characters embedded inside the quoted field.
removeIfThere(0xa)

# One entry per code point in toDoList:
# (dataset, expNumRows, expNumCols, expNaCnt, expType, code point)
tryList = []
for i in toDoList:
    unicodeSymbol = unichr(i)
    # a single row: the character under test sits inside the quoted 4th field
    oneRow = 'a,b,c,"d' + unicodeSymbol + 's",n\n'
    tryList.append(
        # the dataset is that same row repeated ten times
        (oneRow * 10, 10, 4, [0] * 5, ['Enum'] * 5, i)
    )

# h2o incorrectly will match this
# 1, 1, [0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum']),

# u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
# for i, c in enumerate(u):
# print i, '%04x' % ord(c), unicodedata.category(c),
# print unicodedata.name(c)

def write_syn_dataset(csvPathname, dataset):
    """Write ``dataset`` to ``csvPathname`` encoded as UTF-8.

    Before writing, the UTF-8 encoded form and the original form of the text
    are echoed to stdout using ``repr`` so control characters stay visible.

    Args:
        csvPathname: path of the file to create (or truncate).
        dataset: text to write; it must support ``.encode('utf-8')``.
    """
    # The context manager closes the file even if encoding, printing or
    # writing raises, so the handle can no longer leak on an error path.
    with codecs.open(csvPathname, encoding='utf-8', mode='w+') as dsf:
        encoded = dataset.encode('utf-8')
        # Each print receives one pre-joined string, so the output is the same
        # whether ``print`` is a statement or a function.
        print(" ".join(["utf8:", repr(encoded), str(type(encoded))]))
        print(" ".join(["str or utf8:", repr(dataset), str(type(dataset))]))
        dsf.write(dataset)

class Basic(unittest.TestCase):
def tearDown(self):
h2o.check_sandbox_for_errors()

@classmethod
def setUpClass(cls):
global SEED, localhost
SEED = h2o.setup_random_seed()
localhost = h2o.decide_if_localhost()
if (localhost):
h2o.build_cloud(java_heap_GB=1)
else:
h2o_hosts.build_cloud_with_hosts()

@classmethod
def tearDownClass(cls):
h2o.tear_down_cloud()

def test_parse_specific_case6(self):
SYNDATASETS_DIR = h2o.make_syn_dir()
hex_key = "a.hex"

for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
write_syn_dataset(csvPathname, dataset)

parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
hex_key=hex_key, timeoutSecs=10, doSummary=False)
inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

print "Parsed with special unichr(%s) which is %s:" % (unicodeNum, unichr(unicodeNum))
print "inspect:", h2o.dump_json(inspect)
numRows = inspect['numRows']
self.assertEqual(numRows, expNumRows, msg='Using unichr(0x%x) Wrong numRows: %s Expected: %s' % \
(unicodeNum, numRows, expNumRows))
numCols = inspect['numCols']
self.assertEqual(numCols, expNumCols, msg='Using unichr(0x%x) Wrong numCols: %s Expected: %s' % \
(unicodeNum, numCols, expNumCols))

# this is required for the test setup
assert(len(expNaCnt)>=expNumCols)
assert(len(expType)>=expNumCols)

for k in range(expNumCols):
naCnt = inspect['cols'][k]['naCnt']
self.assertEqual(expNaCnt[k], naCnt, msg='Using unichr(0x%x) col: %s naCnt: %d should be: %s' % \
(unicodeNum, k, naCnt, expNaCnt[k]))
stype = inspect['cols'][k]['type']
self.assertEqual(expType[k], stype, msg='Using unichr(0x%x) col: %s type: %s should be: %s' % \
(unicodeNum, k, stype, expType[k]))

# When run as a script, hand control to h2o.unit_main().
if __name__ == "__main__":
    h2o.unit_main()

0 comments on commit 55eafc7

Please sign in to comment.