Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
cliffclick committed Jul 31, 2013
2 parents 60a753d + 4658b04 commit 7c67304
Show file tree
Hide file tree
Showing 21 changed files with 706 additions and 54 deletions.
14 changes: 6 additions & 8 deletions py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ def check_sandbox_for_errors(sandbox_ignore_errors=False):
# don't detect these class loader info messages as errors
#[Loaded java.lang.Error from /usr/lib/jvm/java-7-oracle/jre/lib/rt.jar]
foundBad = regex1.search(line) and not (
('error rate' in line) or ('[Loaded ' in line) or
('error rate' in line) or ('[Loaded ' in line) or ('class.error' in line) or
('[WARN]' in line) or ('CalcSquareErrorsTasks' in line))

if (printing==0 and foundBad):
Expand Down Expand Up @@ -586,17 +586,15 @@ def check_sandbox_for_errors(sandbox_ignore_errors=False):
justInfo &= re.match("INFO:", e) or ("apache" in e)

if not justInfo:
emsg1 = " check_sandbox_for_errors: Errors in sandbox stdout or stderr.\n" + \
emsg1 = " check_sandbox_for_errors: Errors in sandbox stdout or stderr (including R stdout/stderr).\n" + \
"Could have occurred at any prior time\n\n"
emsg2 = "".join(errLines)
if nodes:
nodes[0].sandbox_error_report(True)

# can build a cloud that ignores all sandbox things that normally fatal the test
# kludge, test will set this directly if it wants, rather than thru build_cloud
# parameter.
# we need the sandbox_ignore_errors, for the test teardown_cloud..the state
# disappears!
# Can build a cloud that ignores all sandbox things that normally fatal the test
# Kludge, test will set this directly if it wants, rather than thru build_cloud parameter.
# we need the sandbox_ignore_errors, for the test teardown_cloud..the state disappears!
if sandbox_ignore_errors or (nodes and nodes[0].sandbox_ignore_errors):
pass
else:
Expand Down Expand Up @@ -1174,7 +1172,7 @@ def random_forest_view(self, data_key, model_key, timeoutSecs=300, print_params=
'model_key': model_key,
'out_of_bag_error_estimate': 1,
'class_weights': None,
'response_variable': -3, # put -3 here to make H2O blow up if it's not specified. required to be correct (same as RF)
'response_variable': None,
}
browseAlso = kwargs.pop('browseAlso',False)

Expand Down
28 changes: 18 additions & 10 deletions py/h2o_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,16 +269,24 @@ def wait_for_live_port(ip, port, retries=3):
# see how it's used in tests named above
def delete_csv_key(csvFilename, importFullList):
# remove the original data key
for k in importFullList['succeeded']:
### print "possible delete:", deleteKey
# don't delete any ".hex" keys. the parse results above have .hex
# this is the name of the multi-file (it comes in as a single file?)
# This deletes the source key?
key = k['key']
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
### print "removeKeyResult:", h2o.dump_json(removeKeyResult)
# the list could be from hdfs/s3 or local. They have to different list structures
if 'succeeded' in importFullList:
kDict = importFullList['succeeded']
for k in kDict:
key = k['key']
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
elif 'keys' in importFullList:
kDict = importFullList['keys']
for k in kDict:
key = k
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
else:
raise Exception ("Can't find 'files' or 'succeeded' in your file dict. why? not from hdfs/s3 or local?")


# checks the key distribution in the cloud, and prints warning if delta against avg
# is > expected
Expand Down
5 changes: 3 additions & 2 deletions py/h2o_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,15 +336,16 @@ def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, *

# get input from this.
# (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
# h2o_cmd.columnInfoFromInspect(parseKey, exceptionOnMissingValues=False, timeoutSecs=300)
# h2o_cmd.columnInfoFromInspect(parseKey['destination_key',
# exceptionOnMissingValues=False, timeoutSecs=300)

def goodXFromColumnInfo(y,
num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None, colTypeDict=None, colNameDict=None,
keepPattern=None, key=None, timeoutSecs=120, forRF=False):

y = str(y)

# if we pass a parseKey, means we want to get the info ourselves here
# if we pass a key, means we want to get the info ourselves here
if key is not None:
(missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, timeoutSecs=timeoutSecs)
Expand Down
29 changes: 22 additions & 7 deletions py/h2o_rf.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,32 @@ def simpleCheckRFView(node, rfv, noPrint=False, **kwargs):
time = response['time']

trees = rfv['trees'] # Dict
depth = trees['depth']
depth = trees['depth'] # Dict
# zero depth okay?
## if ' 0.0 ' in depth:
## raise Exception("depth in RFView seems wrong. depth:", depth)
leaves = trees['leaves']
leaves = trees['leaves'] # Dict
if ' 0.0 ' in leaves:
raise Exception("leaves in RFView seems wrong. leaves:", leaves)

print """
Leaves: {0} / {1} / {2}
Depth: {3} / {4} / {5}
mtry: {6}
Type: {7}
Err: {8} %
""".format(
rfv['trees']['leaves']['min'],
rfv['trees']['leaves']['mean'],
rfv['trees']['leaves']['max'],
rfv['trees']['depth']['min'],
rfv['trees']['depth']['mean'],
rfv['trees']['depth']['max'],
rfv['mtry'],
rfv['confusion_matrix']['type'],
rfv['confusion_matrix']['classification_error'] *100,
)

number_built = trees['number_built']
if (number_built<=0 or number_built>20000):
raise Exception("number_built in RFView seems wrong. number_built:", number_built)
Expand Down Expand Up @@ -160,6 +179,7 @@ def scoreRF(scoreParseKey, trainResult, **kwargs):

start = time.time()
data_key = scoreParseKey['destination_key']
# NOTE: response_variable is required, and passed from kwargs here
scoreResult = h2o_cmd.runRFView(None, data_key, rfModelKey, ntree, **kwargs)

rftime = time.time()-start
Expand Down Expand Up @@ -196,10 +216,6 @@ def pp_rf_result(rf):
mtry: {6}
Type: {7}
Err: {8} %
Time: {9} seconds
Confusion matrix:
{10}
""".format(
rf['trees']['leaves']['min'],
rf['trees']['leaves']['mean'],
Expand All @@ -210,6 +226,5 @@ def pp_rf_result(rf):
rf['mtry'],
rf['confusion_matrix']['type'],
rf['confusion_matrix']['classification_error'] *100,
rf['response']['time'],
cm)

50 changes: 39 additions & 11 deletions py/testdir_0xdata_only/mnist_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy
from numpy import append, array, int8, uint8, zeros

DO_REALS=True

# gzip infile to gzfile
def file_gzip(infile, gzfile):
import gzip
Expand All @@ -16,7 +18,7 @@ def file_gzip(infile, gzfile):
zipped_file.close()
print "\nGzip:", gzfile, "done"

def read(digits, dataset = "training", path = "."):
def read(digits, dataset="training", path="."):
"""
Loads MNIST files into 3D numpy arrays
Expand All @@ -32,7 +34,7 @@ def read(digits, dataset = "training", path = "."):
fname_img = os.path.join(path, 't10k-images-idx3-ubyte')
fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte')
else:
raise ValueError, "dataset must be 'testing' or 'training'"
raise ValueError, "dataset must be 'testing' or 'training"

flbl = open(fname_lbl, 'rb')
magic_nr, size = struct.unpack(">II", flbl.read(8))
Expand All @@ -47,8 +49,13 @@ def read(digits, dataset = "training", path = "."):
ind = [ k for k in xrange(size) if lbl[k] in digits ]
N = len(ind)

images = zeros((N, rows, cols), dtype=uint8)
labels = zeros((N, 1), dtype=int8)
if DO_REALS:
images = zeros((N, rows, cols), dtype=float)
labels = zeros((N, 1), dtype=int8) # always need these to be int for H2O RF output
else:
images = zeros((N, rows, cols), dtype=int8)
labels = zeros((N, 1), dtype=int8)

for i in xrange(len(ind)):
images[i] = array(img[ ind[i]*rows*cols : (ind[i]+1)*rows*cols ]).reshape((rows, cols))
labels[i] = lbl[ind[i]]
Expand All @@ -60,36 +67,57 @@ def read(digits, dataset = "training", path = "."):
from pylab import *
# from numpy import *

def doit(f):
def doit(prefix, f):
print "we want all the images"
images, labels = read(range(10), f)
if DO_REALS:
# If you want the values as floats between 0.0 and 1.0, just do
images /= 255.0
print images[0]

print "labels.shape", labels.shape
print "images.shape", images.shape
print "images[0].shape", images[0].shape
(a,b,c) = images.shape
if DO_REALS:
# If you want the values as floats between 0.0 and 1.0, just do
images /= 255.0

imagesF = images.reshape(a,b*c)
labelsF = labels

# stick label and pixels together
bothF = numpy.concatenate((labelsF, imagesF), 1)
print "labelsF.shape", labelsF.shape
print "imagesF.shape", imagesF.shape
print "both.shape", bothF.shape
print "bothF.shape", bothF.shape

# the output label was first in the concatenate. do the same for header
headerList = ['label']
headerList += ['p' + str(i) for i in range(784)]
# comma separated!
header = ','.join(map(str,headerList))
print header # just so we can see it.
numpy.savetxt('mnist_'+ f + '.csv', bothF, header=header, delimiter=',', fmt='%d')
if DO_REALS:
# first has to be integer for stupid h2o rf output (doesn't take fp)
# have to create a format string for each one as a result!
fmt = ",".join(["%i"] + ["%f"] * imagesF.shape[1])
else:
fmt = '%d'
numpy.savetxt(prefix + f + '.csv', bothF, header=header, delimiter=',', fmt=fmt)

# create the two csv files, with filenames reflecting the DO_REALS mode
if DO_REALS:
    prefix = "mnist_reals_"
else:
    prefix = "mnist_"

doit(prefix, 'training')
doit(prefix, 'testing')

# we can copy this multiple times to get bigger parsed gz
file_gzip(prefix + 'training.csv', prefix + 'training.csv.gz')
file_gzip(prefix + 'testing.csv', prefix + 'testing.csv.gz')

# show merged images
if 1==0:
Expand Down
120 changes: 120 additions & 0 deletions py/testdir_0xdata_only/test_GLM_mnist_reals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import unittest
import random, sys, time, re
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_glm, h2o_util
class Basic(unittest.TestCase):
    """GLM on the reals-valued MNIST csvs: one binomial fit per digit class."""

    def tearDown(self):
        # fail the test if anything bad showed up in sandbox stdout/stderr
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # assume we're at 0xdata with its hdfs namenode
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(1)
        else:
            # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
            h2o_hosts.build_cloud_with_hosts(1,
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_GLM_mnist_reals(self):
        # (train gz, test gz, per-file timeout in seconds)
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
                key2=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
                key2=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            # prune x down to the usable columns (drops constants/bad cols)
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x,
                'y': y,
                'case_mode': '=',     # binomial: class == case vs rest
                'case': 0,            # overwritten per-digit below
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }

            # one binomial GLM per digit (one-vs-rest), then score on the test set
            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in", elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

if __name__ == '__main__':
h2o.unit_main()
Loading

0 comments on commit 7c67304

Please sign in to comment.