Move the shared info_from_inspect helper into h2o_cmd

Add a new test that currently fails: a column mixing numbers and strings causes NA
a-b · Mar 15, 2013 · 38895c7 · 38895c7
1 parent 609c7dd
commit 38895c7
Show file tree

Hide file tree

Showing 7 changed files with 418 additions and 95 deletions.
diff --git a/py/h2o_cmd.py b/py/h2o_cmd.py
@@ -18,6 +18,33 @@ def parseFile(node=None, csvPathname=None, key=None, key2=None,
         timeoutSecs=timeoutSecs, retryDelaySecs=retryDelaySecs, pollTimeoutSecs=pollTimeoutSecs,
         noise=noise, noPoll=noPoll)
 
+
+def info_from_inspect(inspect, csvPathname):
+    # need more info about this dataset for debug
+    cols = inspect['cols']
+    # look for nonzero num_missing_values count in each col
+    sum_num_missing_values = 0
+    for i, colDict in enumerate(cols):
+        num_missing_values = colDict['num_missing_values']
+        if num_missing_values != 0:
+            print "%s: col: %d, num_missing_values: %d" % (csvPathname, i, num_missing_values)
+            sum_num_missing_values += num_missing_values
+
+    num_cols = inspect['num_cols']
+    num_rows = inspect['num_rows']
+    row_size = inspect['row_size']
+    ptype = inspect['type']
+    value_size_bytes = inspect['value_size_bytes']
+    response = inspect['response']
+    ptime = response['time']
+
+    print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
+           value_size_bytes: %s, time: %s" % \
+           (num_cols, num_rows, row_size, ptype, value_size_bytes, ptime)
+    # sum of num_missing_values from all the columns
+    return sum_num_missing_values
+
+
 def runInspect(node=None, key=None, timeoutSecs=5, **kwargs):
     if not key: raise Exception('No key for Inspect specified')
     if not node: node = h2o.nodes[0]

diff --git a/py/testdir_hosts/pytest_config-kevin.json b/py/testdir_hosts/pytest_config-kevin.json
@@ -8,7 +8,7 @@
 
     "h2o_per_host": 1, 
     "sigar": true,
-    "java_heap_GB": 4, 
+    "java_heap_GB": 10, 
 
     "hdfs_name_node": "192.168.1.176",
     "hdfs_version": "cdh3u5",
@@ -20,7 +20,12 @@
         "192.168.1.172",
         "192.168.1.173",
         "192.168.1.174",
-        "192.168.1.175"
+        "192.168.1.175",
+        "192.168.1.176",
+        "192.168.1.177",
+        "192.168.1.178",
+        "192.168.1.179",
+        "192.168.1.180"
     ]
 }
 
diff --git a/py/testdir_single_jvm/test_GLM_binomial_goalies.py b/py/testdir_single_jvm/test_GLM_binomial_goalies.py
@@ -33,28 +33,6 @@ def define_params():
         }
     return paramDict
 
-def info_from_inspect(inspect, csvPathname):
-    # need more info about this dataset for debug
-    cols = inspect['cols']
-    # look for nonzero num_missing_values count in each col
-    for i, colDict in enumerate(cols):
-        num_missing_values = colDict['num_missing_values']
-        if num_missing_values != 0:
-            ### print "%s: col: %d, num_missing_values: %d" % (csvPathname, i, num_missing_values)
-            pass
-
-    num_cols = inspect['num_cols']
-    num_rows = inspect['num_rows']
-    row_size = inspect['row_size']
-    ptype = inspect['type']
-    value_size_bytes = inspect['value_size_bytes']
-    response = inspect['response']
-    ptime = response['time']
-
-    print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
-           value_size_bytes: %s, response: %s, time: %s" % \
-           (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)
-
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -74,7 +52,7 @@ def test_loop_random_param_covtype(self):
         inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
 
         # need more info about the dataset for debug
-        info_from_inspect(inspect, csvPathname)
+        h2o_cmd.info_from_inspect(inspect, csvPathname)
 
         # for determinism, I guess we should spit out the seed?
         # random.seed(SEED)

diff --git a/py/testdir_single_jvm/test_GLM_poisson_goalies_admm.py b/py/testdir_single_jvm/test_GLM_poisson_goalies_admm.py
@@ -30,28 +30,6 @@ def define_params():
         }
     return paramDict
 
-def info_from_inspect(inspect, csvPathname):
-    # need more info about this dataset for debug
-    cols = inspect['cols']
-    # look for nonzero num_missing_values count in each col
-    for i, colDict in enumerate(cols):
-        num_missing_values = colDict['num_missing_values']
-        if num_missing_values != 0:
-            ### print "%s: col: %d, num_missing_values: %d" % (csvPathname, i, num_missing_values)
-            pass
-
-    num_cols = inspect['num_cols']
-    num_rows = inspect['num_rows']
-    row_size = inspect['row_size']
-    ptype = inspect['type']
-    value_size_bytes = inspect['value_size_bytes']
-    response = inspect['response']
-    ptime = response['time']
-
-    print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
-           value_size_bytes: %s, response: %s, time: %s" % \
-           (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)
-
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -70,7 +48,7 @@ def test_loop_random_param_covtype(self):
         inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
 
         # need more info about the dataset for debug
-        info_from_inspect(inspect, csvPathname)
+        h2o_cmd.info_from_inspect(inspect, csvPathname)
 
         # for determinism, I guess we should spit out the seed?
         # random.seed(SEED)

diff --git a/py/testdir_single_jvm/test_GLM_poisson_goalies_gg.py b/py/testdir_single_jvm/test_GLM_poisson_goalies_gg.py
@@ -30,28 +30,6 @@ def define_params():
         }
     return paramDict
 
-def info_from_inspect(inspect, csvPathname):
-    # need more info about this dataset for debug
-    cols = inspect['cols']
-    # look for nonzero num_missing_values count in each col
-    for i, colDict in enumerate(cols):
-        num_missing_values = colDict['num_missing_values']
-        if num_missing_values != 0:
-            ### print "%s: col: %d, num_missing_values: %d" % (csvPathname, i, num_missing_values)
-            pass
-
-    num_cols = inspect['num_cols']
-    num_rows = inspect['num_rows']
-    row_size = inspect['row_size']
-    ptype = inspect['type']
-    value_size_bytes = inspect['value_size_bytes']
-    response = inspect['response']
-    ptime = response['time']
-
-    print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
-           value_size_bytes: %s, response: %s, time: %s" % \
-           (num_cols, num_rows, row_size, ptype, value_size_bytes, response, ptime)
-
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -70,7 +48,7 @@ def test_loop_random_param_covtype(self):
         inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
 
         # need more info about the dataset for debug
-        info_from_inspect(inspect, csvPathname)
+        h2o_cmd.info_from_inspect(inspect, csvPathname)
 
         # for determinism, I guess we should spit out the seed?
         # random.seed(SEED)

diff --git a/py/testdir_single_jvm/test_rf_covtype_train_oobe.py b/py/testdir_single_jvm/test_rf_covtype_train_oobe.py
@@ -30,29 +30,6 @@
     'exclusive_split_limit': 0,
     }
 
-def info_from_inspect(inspect, csvPathname):
-    # need more info about this dataset for debug
-    cols = inspect['cols']
-    # look for nonzero num_missing_values count in each col
-    for i, colDict in enumerate(cols):
-        num_missing_values = colDict['num_missing_values']
-        if num_missing_values != 0:
-            print "%s: col: %d, num_missing_values: %d" % (csvPathname, i, num_missing_values)
-            pass
-
-    num_cols = inspect['num_cols']
-    num_rows = inspect['num_rows']
-    row_size = inspect['row_size']
-    ptype = inspect['type']
-    value_size_bytes = inspect['value_size_bytes']
-    response = inspect['response']
-    ptime = response['time']
-
-    print "num_cols: %s, num_rows: %s, row_size: %s, ptype: %s, \
-           value_size_bytes: %s, time: %s" % \
-           (num_cols, num_rows, row_size, ptype, value_size_bytes, ptime)
-
-
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -83,7 +60,7 @@ def test_rf_covtype_train_oobe(self):
 
 
         inspect = h2o_cmd.runInspect(None, parseKey['destination_key'])
-        info_from_inspect(inspect, csvPathname)
+        h2o_cmd.info_from_inspect(inspect, csvPathname)
 
         for trial in range(1):
             # params is mutable. This is default.