Commit c522b38: tweaks

Kevin Normoyle committed Oct 16, 2014
1 parent 36d86bc
Showing 6 changed files with 226 additions and 12 deletions.
4 changes: 2 additions & 2 deletions py/h2o.py
@@ -1297,9 +1297,9 @@ def put_file(self, f, key=None, timeoutSecs=60):
     def poll_url(self, response,
         timeoutSecs=10, retryDelaySecs=0.5, initialDelaySecs=0, pollTimeoutSecs=180,
         noise=None, benchmarkLogging=None, noPoll=False, reuseFirstPollUrl=False, noPrint=False):
-        ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs
         verboseprint('poll_url input: response:', dump_json(response))
-        print "at top of poll_url, timeoutSecs: ", timeoutSecs
+        ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs
+        ### print "at top of poll_url, timeoutSecs: ", timeoutSecs
 
         # for the rev 2 stuff..the job_key, destination_key and redirect_url are just in the response
         # look for 'response'..if not there, assume the rev 2
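The two trailing comments describe how poll_url tells the API revisions apart: rev 1 responses nest their poll target under a 'response' key, while rev 2 responses carry job_key, destination_key and redirect_url at the top level. A minimal sketch of that dispatch, with a hypothetical helper name and an assumed rev 1 field (the real logic lives further down in py/h2o.py):

    def find_poll_target(response):
        # hypothetical illustration, not the actual h2o.py code
        if 'response' in response:
            # rev 1: poll info nested under 'response' ('redirect_request' is an assumption)
            return response['response'].get('redirect_request')
        # rev 2: the keys sit at the top level of the json
        return response.get('redirect_url')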
4 changes: 2 additions & 2 deletions py/testdir_single_jvm/test_parse_specific_case1.py
@@ -32,8 +32,8 @@

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
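The decoded-to-encoded rename above is more than cosmetic in Python 2: dataset is built from unichr() and is already a unicode object, and calling .decode('utf-8') on unicode makes Python implicitly encode it with the ASCII codec first, which raises UnicodeEncodeError as soon as the data contains a non-ASCII code point. A minimal sketch of the distinction, assuming Python 2 (the print is only a debug aid; codecs.open handles the real encoding when dsf.write(dataset) runs):

    # Python 2: unicode -> bytes is encode, bytes -> unicode is decode
    u = u'a,b,c,d' + unichr(0x2018) + u',n\n'   # unicode, like the synthetic datasets here
    b = u.encode('utf-8')                       # correct direction: yields a UTF-8 str
    assert isinstance(b, str) and b.decode('utf-8') == u
    # u.decode('utf-8') would first run u.encode('ascii') and fail on 0x2018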
6 changes: 3 additions & 3 deletions py/testdir_single_jvm/test_parse_specific_case2.py
@@ -38,8 +38,8 @@

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
@@ -67,7 +67,7 @@ def test_parse_specific_case2(self):
hex_key = "a.hex"

for (dataset, expNumRows, expNumCols, expNaCnt, expType) in tryList:
csvFilename = 'specific_' + str(expNumRows) + str(expNumCols) + '.csv'
csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
write_syn_dataset(csvPathname, dataset)

95 changes: 95 additions & 0 deletions py/testdir_single_jvm/test_parse_specific_case2a.py
@@ -0,0 +1,95 @@
import unittest, random, sys, time, os
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i
import codecs, unicodedata
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

# this works
# unicodeNull = unichr(0x33)
# this fails
unicodeNull = unichr(0x0)

tryList = [
    # the nul char I think is causing extra rows and also wiping out the next char?
    # I got nulls when concat'ing files with dd. may be used for padding somehow?
    ((
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
    ), 10, 5, [0,0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum', 'Enum']),
]

# h2o incorrectly will match this
# 1, 1, [0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum']),

# u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
# for i, c in enumerate(u):
#     print i, '%04x' % ord(c), unicodedata.category(c),
#     print unicodedata.name(c)

def write_syn_dataset(csvPathname, dataset):
    dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
    encoded = dataset.encode('utf-8')
    print "utf8:" , repr(encoded), type(encoded)
    print "str or utf8:" , repr(dataset), type(dataset)
    dsf.write(dataset)
    dsf.close()

class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=1)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_parse_specific_case2a(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        hex_key = "a.hex"

        for (dataset, expNumRows, expNumCols, expNaCnt, expType) in tryList:
            csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, dataset)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

            print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, expNumRows, msg='Wrong numRows: %s Expected: %s' % (numRows, expNumRows))
            numCols = inspect['numCols']
            self.assertEqual(numCols, expNumCols, msg='Wrong numCols: %s Expected: %s' % (numCols, expNumCols))

            # this is required for the test setup
            assert(len(expNaCnt)>=expNumCols)
            assert(len(expType)>=expNumCols)

            for k in range(expNumCols):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(expNaCnt[k], naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, expNaCnt[k]))
                stype = inspect['cols'][k]['type']
                self.assertEqual(expType[k], stype, msg='col %s type %s should be %s' % (k, stype, expType[k]))

if __name__ == '__main__':
    h2o.unit_main()
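The comment in tryList suspects the nul char of adding rows and wiping out the following character. One way to rule out the writer side is to check the raw bytes that actually land in the csv, since NUL is invisible in most editors and terminals. A small sketch of such a check, reusing csvPathname from the test (hypothetical, not part of the test file):

    # dump the first bytes of the written file; each NUL should show up as 00
    f = open(csvPathname, 'rb')
    raw = f.read()
    f.close()
    print ' '.join('%02x' % ord(ch) for ch in raw[:32])
    print 'NUL count:', raw.count('\x00')   # expect 10, one per synthetic row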
8 changes: 3 additions & 5 deletions py/testdir_single_jvm/test_parse_specific_case3.py
@@ -6,8 +6,6 @@
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

print "I would expect the unmatched double quote turns this in a 4 col dataset, not 1"
print "apparently turns it into all NAs"
# toDoList = range(0x20,0x80)
toDoList = [0x22] # double quote

@@ -48,8 +46,8 @@ def removeIfThere(d):

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
@@ -77,7 +75,7 @@ def test_parse_specific_case3(self):
hex_key = "a.hex"

for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
csvFilename = 'specific_' + str(expNumRows) + str(expNumCols) + '.csv'
csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
write_syn_dataset(csvPathname, dataset)

121 changes: 121 additions & 0 deletions py/testdir_single_jvm/test_parse_specific_case4.py
@@ -0,0 +1,121 @@
import unittest, random, sys, time, os
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i
import codecs, unicodedata
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

# toDoList = range(0x0,0x80)
# 0x1 can be the hive separator? if we force comma it should be treated as char
# should try without and change expected cols
toDoList = range(0x00, 0x100)

def removeIfThere(d):
    if d in toDoList:
        toDoList.remove(d)

H2O_COL_SEPARATOR = 0x2c # comma
# H2O_COL_SEPARATOR = 0x1 # hive separator

# removeIfThere(0x1) # hive separator okay if we force comma below

removeIfThere(0x0) # nul. known issue
removeIfThere(0xa) # LF. causes EOL
removeIfThere(0xd) # CR. causes EOL
removeIfThere(0x22) # double quote. known issue
removeIfThere(0x2c) # comma. don't mess up my expected col count

# could try single quote if enabled, to see if does damage. probably like double quote

tryList = []
for i in toDoList:
    unicodeSymbol = unichr(i)

    tryList.append(
        # the nul char I think is causing extra rows and also wiping out the next char?
        # I got nulls when concat'ing files with dd. may be used for padding somehow?
        ((
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
        ), 10, 5, [0,0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum', 'Enum'], i)
    )

# h2o incorrectly will match this
# 1, 1, [0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum']),

# u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
# for i, c in enumerate(u):
#     print i, '%04x' % ord(c), unicodedata.category(c),
#     print unicodedata.name(c)

def write_syn_dataset(csvPathname, dataset):
    dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
    encoded = dataset.encode('utf-8')
    print "utf8:" , repr(encoded), type(encoded)
    print "str or utf8:" , repr(dataset), type(dataset)
    dsf.write(dataset)
    dsf.close()

class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=1)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_parse_specific_case4(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        hex_key = "a.hex"

        for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
            csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, dataset)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False, separator=H2O_COL_SEPARATOR) # force comma separator
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

            print "Parsed with special unichr(%s) which is %s:" % (unicodeNum, unichr(unicodeNum))
            # print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, expNumRows, msg='Using unichr(0x%x) Wrong numRows: %s Expected: %s' % \
                (unicodeNum, numRows, expNumRows))
            numCols = inspect['numCols']
            self.assertEqual(numCols, expNumCols, msg='Using unichr(0x%x) Wrong numCols: %s Expected: %s' % \
                (unicodeNum, numCols, expNumCols))

            # this is required for the test setup
            assert(len(expNaCnt)>=expNumCols)
            assert(len(expType)>=expNumCols)

            for k in range(expNumCols):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(expNaCnt[k], naCnt, msg='Using unichr(0x%x) col: %s naCnt: %d should be: %s' % \
                    (unicodeNum, k, naCnt, expNaCnt[k]))
                stype = inspect['cols'][k]['type']
                self.assertEqual(expType[k], stype, msg='Using unichr(0x%x) col: %s type: %s should be: %s' % \
                    (unicodeNum, k, stype, expType[k]))

if __name__ == '__main__':
    h2o.unit_main()
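case4 generalizes case2a into a sweep: every single-byte code point except the five removed above gets its own dataset, and separator=H2O_COL_SEPARATOR pins the parser to comma so H2O's separator auto-detection cannot latch onto a candidate like 0x1. A quick sanity check of what the exclusion list leaves behind (a sketch, not part of the test):

    # five known-problem code points are skipped: NUL, LF, CR, double quote, comma
    skipped = [0x0, 0xa, 0xd, 0x22, 0x2c]
    remaining = [i for i in range(0x00, 0x100) if i not in skipped]
    print len(remaining)   # 256 - 5 = 251 datasets end up in tryList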
