Commit c522b38: tweaks

Kevin Normoyle committed Oct 16, 2014
1 parent 36d86bc
Showing 6 changed files with 226 additions and 12 deletions.
4 changes: 2 additions & 2 deletions py/h2o.py
@@ -1297,9 +1297,9 @@ def put_file(self, f, key=None, timeoutSecs=60):
     def poll_url(self, response,
         timeoutSecs=10, retryDelaySecs=0.5, initialDelaySecs=0, pollTimeoutSecs=180,
         noise=None, benchmarkLogging=None, noPoll=False, reuseFirstPollUrl=False, noPrint=False):
-        ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs
         verboseprint('poll_url input: response:', dump_json(response))
-        print "at top of poll_url, timeoutSecs: ", timeoutSecs
+        ### print "poll_url: pollTimeoutSecs", pollTimeoutSecs
+        ### print "at top of poll_url, timeoutSecs: ", timeoutSecs
 
         # for the rev 2 stuff..the job_key, destination_key and redirect_url are just in the response
         # look for 'response'..if not there, assume the rev 2
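The two trailing comments describe how poll_url tells the API revisions apart: rev 1 responses nest their poll target under a 'response' key, while rev 2 responses carry job_key, destination_key and redirect_url at the top level. A minimal sketch of that dispatch, with a hypothetical helper name and an assumed rev 1 field (the real logic lives further down in py/h2o.py):

    def find_poll_target(response):
        # hypothetical illustration, not the actual h2o.py code
        if 'response' in response:
            # rev 1: poll info nested under 'response' ('redirect_request' is an assumption)
            return response['response'].get('redirect_request')
        # rev 2: the keys sit at the top level of the json
        return response.get('redirect_url')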
4 changes: 2 additions & 2 deletions py/testdir_single_jvm/test_parse_specific_case1.py
@@ -32,8 +32,8 @@

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
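The decoded-to-encoded rename above is more than cosmetic in Python 2: dataset is built from unichr() and is already a unicode object, and calling .decode('utf-8') on unicode makes Python implicitly encode it with the ASCII codec first, which raises UnicodeEncodeError as soon as the data contains a non-ASCII code point. A minimal sketch of the distinction, assuming Python 2 (the print is only a debug aid; codecs.open handles the real encoding when dsf.write(dataset) runs):

    # Python 2: unicode -> bytes is encode, bytes -> unicode is decode
    u = u'a,b,c,d' + unichr(0x2018) + u',n\n'   # unicode, like the synthetic datasets here
    b = u.encode('utf-8')                       # correct direction: yields a UTF-8 str
    assert isinstance(b, str) and b.decode('utf-8') == u
    # u.decode('utf-8') would first run u.encode('ascii') and fail on 0x2018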
6 changes: 3 additions & 3 deletions py/testdir_single_jvm/test_parse_specific_case2.py
@@ -38,8 +38,8 @@

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
@@ -67,7 +67,7 @@ def test_parse_specific_case2(self):
hex_key = "a.hex"

for (dataset, expNumRows, expNumCols, expNaCnt, expType) in tryList:
csvFilename = 'specific_' + str(expNumRows) + str(expNumCols) + '.csv'
csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
write_syn_dataset(csvPathname, dataset)

95 changes: 95 additions & 0 deletions py/testdir_single_jvm/test_parse_specific_case2a.py
@@ -0,0 +1,95 @@
import unittest, random, sys, time, os
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i
import codecs, unicodedata
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

# this works
# unicodeNull = unichr(0x33)
# this fails
unicodeNull = unichr(0x0)

tryList = [
    # the nul char I think is causing extra rows and also wiping out the next char?
    # I got nulls when concat'ing files with dd. may be used for padding somehow?
    ((
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
        'a,b,c,d' + unicodeNull + ',n\n'
    ), 10, 5, [0,0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum', 'Enum']),
]

# h2o incorrectly will match this
# 1, 1, [0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum']),

# u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
# for i, c in enumerate(u):
#     print i, '%04x' % ord(c), unicodedata.category(c),
#     print unicodedata.name(c)

def write_syn_dataset(csvPathname, dataset):
    dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
    encoded = dataset.encode('utf-8')
    print "utf8:" , repr(encoded), type(encoded)
    print "str or utf8:" , repr(dataset), type(dataset)
    dsf.write(dataset)
    dsf.close()

class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=1)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_parse_specific_case2a(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        hex_key = "a.hex"

        for (dataset, expNumRows, expNumCols, expNaCnt, expType) in tryList:
            csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, dataset)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False)
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

            print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, expNumRows, msg='Wrong numRows: %s Expected: %s' % (numRows, expNumRows))
            numCols = inspect['numCols']
            self.assertEqual(numCols, expNumCols, msg='Wrong numCols: %s Expected: %s' % (numCols, expNumCols))

            # this is required for the test setup
            assert(len(expNaCnt)>=expNumCols)
            assert(len(expType)>=expNumCols)

            for k in range(expNumCols):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(expNaCnt[k], naCnt, msg='col %s naCnt %d should be %s' % (k, naCnt, expNaCnt[k]))
                stype = inspect['cols'][k]['type']
                self.assertEqual(expType[k], stype, msg='col %s type %s should be %s' % (k, stype, expType[k]))

if __name__ == '__main__':
    h2o.unit_main()
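The comment in tryList suspects the nul char of adding rows and wiping out the following character. One way to rule out the writer side is to check the raw bytes that actually land in the csv, since NUL is invisible in most editors and terminals. A small sketch of such a check, reusing csvPathname from the test (hypothetical, not part of the test file):

    # dump the first bytes of the written file; each NUL should show up as 00
    f = open(csvPathname, 'rb')
    raw = f.read()
    f.close()
    print ' '.join('%02x' % ord(ch) for ch in raw[:32])
    print 'NUL count:', raw.count('\x00')   # expect 10, one per synthetic row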
8 changes: 3 additions & 5 deletions py/testdir_single_jvm/test_parse_specific_case3.py
@@ -6,8 +6,6 @@
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

print "I would expect the unmatched double quote turns this in a 4 col dataset, not 1"
print "apparently turns it into all NAs"
# toDoList = range(0x20,0x80)
toDoList = [0x22] # double quote

@@ -48,8 +46,8 @@ def removeIfThere(d):

 def write_syn_dataset(csvPathname, dataset):
     dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
-    decoded = dataset.decode('utf-8')
-    print "utf8:" , repr(decoded), type(decoded)
+    encoded = dataset.encode('utf-8')
+    print "utf8:" , repr(encoded), type(encoded)
     print "str or utf8:" , repr(dataset), type(dataset)
     dsf.write(dataset)
     dsf.close()
@@ -77,7 +75,7 @@ def test_parse_specific_case3(self):
hex_key = "a.hex"

for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
csvFilename = 'specific_' + str(expNumRows) + str(expNumCols) + '.csv'
csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
csvPathname = SYNDATASETS_DIR + '/' + csvFilename
write_syn_dataset(csvPathname, dataset)

121 changes: 121 additions & 0 deletions py/testdir_single_jvm/test_parse_specific_case4.py
@@ -0,0 +1,121 @@
import unittest, random, sys, time, os
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i
import codecs, unicodedata
print "create some specific small datasets with exp row/col combinations"
print "I'll keep it to one case per file"

# toDoList = range(0x0,0x80)
# 0x1 can be the hive separator? if we force comma it should be treated as char
# should try without and change expected cols
toDoList = range(0x00, 0x100)

def removeIfThere(d):
    if d in toDoList:
        toDoList.remove(d)

H2O_COL_SEPARATOR = 0x2c # comma
# H2O_COL_SEPARATOR = 0x1 # hive separator

# removeIfThere(0x1) # hive separator okay if we force comma below

removeIfThere(0x0) # nul. known issue
removeIfThere(0xa) # LF. causes EOL
removeIfThere(0xd) # CR. causes EOL
removeIfThere(0x22) # double quote. known issue
removeIfThere(0x2c) # comma. don't mess up my expected col count

# could try single quote if enabled, to see if does damage. probably like double quote

tryList = []
for i in toDoList:
    unicodeSymbol = unichr(i)

    tryList.append(
        # the nul char I think is causing extra rows and also wiping out the next char?
        # I got nulls when concat'ing files with dd. may be used for padding somehow?
        ((
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
            'a,b,c,d' + unicodeSymbol + 's,n\n'
        ), 10, 5, [0,0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum', 'Enum'], i)
    )

# h2o incorrectly will match this
# 1, 1, [0,0,0,0], ['Enum', 'Enum', 'Enum', 'Enum']),

# u = unichr(0x2018) + unichr(6000) + unichr(0x2019)
# for i, c in enumerate(u):
#     print i, '%04x' % ord(c), unicodedata.category(c),
#     print unicodedata.name(c)

def write_syn_dataset(csvPathname, dataset):
    dsf = codecs.open(csvPathname, encoding='utf-8', mode='w+')
    encoded = dataset.encode('utf-8')
    print "utf8:" , repr(encoded), type(encoded)
    print "str or utf8:" , repr(dataset), type(dataset)
    dsf.write(dataset)
    dsf.close()

class Basic(unittest.TestCase):
    def tearDown(self):
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        global SEED, localhost
        SEED = h2o.setup_random_seed()
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(java_heap_GB=1)
        else:
            h2o_hosts.build_cloud_with_hosts()

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_parse_specific_case4(self):
        SYNDATASETS_DIR = h2o.make_syn_dir()
        hex_key = "a.hex"

        for (dataset, expNumRows, expNumCols, expNaCnt, expType, unicodeNum) in tryList:
            csvFilename = 'specific_' + str(expNumRows) + "x" + str(expNumCols) + '.csv'
            csvPathname = SYNDATASETS_DIR + '/' + csvFilename
            write_syn_dataset(csvPathname, dataset)

            parseResult = h2i.import_parse(path=csvPathname, schema='put', header=0,
                hex_key=hex_key, timeoutSecs=10, doSummary=False, separator=H2O_COL_SEPARATOR) # force comma separator
            inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=60)

            print "Parsed with special unichr(%s) which is %s:" % (unicodeNum, unichr(unicodeNum))
            # print "inspect:", h2o.dump_json(inspect)
            numRows = inspect['numRows']
            self.assertEqual(numRows, expNumRows, msg='Using unichr(0x%x) Wrong numRows: %s Expected: %s' % \
                (unicodeNum, numRows, expNumRows))
            numCols = inspect['numCols']
            self.assertEqual(numCols, expNumCols, msg='Using unichr(0x%x) Wrong numCols: %s Expected: %s' % \
                (unicodeNum, numCols, expNumCols))

            # this is required for the test setup
            assert(len(expNaCnt)>=expNumCols)
            assert(len(expType)>=expNumCols)

            for k in range(expNumCols):
                naCnt = inspect['cols'][k]['naCnt']
                self.assertEqual(expNaCnt[k], naCnt, msg='Using unichr(0x%x) col: %s naCnt: %d should be: %s' % \
                    (unicodeNum, k, naCnt, expNaCnt[k]))
                stype = inspect['cols'][k]['type']
                self.assertEqual(expType[k], stype, msg='Using unichr(0x%x) col: %s type: %s should be: %s' % \
                    (unicodeNum, k, stype, expType[k]))

if __name__ == '__main__':
    h2o.unit_main()
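case4 generalizes case2a into a sweep: every single-byte code point except the five removed above gets its own dataset, and separator=H2O_COL_SEPARATOR pins the parser to comma so H2O's separator auto-detection cannot latch onto a candidate like 0x1. A quick sanity check of what the exclusion list leaves behind (a sketch, not part of the test):

    # five known-problem code points are skipped: NUL, LF, CR, double quote, comma
    skipped = [0x0, 0xa, 0xd, 0x22, 0x2c]
    remaining = [i for i in range(0x00, 0x100) if i not in skipped]
    print len(remaining)   # 256 - 5 = 251 datasets end up in tryList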
