conflict resolve merge

Richiexy · Sep 5, 2013 · 5b828fc · 5b828fc
2 parents 628092c + fe2c27d
commit 5b828fc
Show file tree

Hide file tree

Showing 12 changed files with 272 additions and 122 deletions.
diff --git a/py/h2o.py b/py/h2o.py
@@ -1383,7 +1383,7 @@ def random_forest_treeview(self, tree_number, data_key, model_key,
             time.sleep(3) # to be able to see it
         return a
 
-    def GBM(self, data_key, timeoutSecs=600, **kwargs):
+    def gbm(self, data_key, timeoutSecs=600, **kwargs):
         params_dict = {
             'destination_key':None,
             'source':data_key,
@@ -1394,19 +1394,21 @@ def GBM(self, data_key, timeoutSecs=600, **kwargs):
             'vresponse':None
         }        
         params_dict.update(kwargs)
-        a = self.__do_json_request('GBM.json',timeout=timeoutSecs,params=params_dict)        
+        a = self.__do_json_request('GBM.json',timeout=timeoutSecs,params=params_dict)
+        verboseprint("\nGBM result:", dump_json(a))
         return a
 
-    def PCA(self, data_key, timeoutSecs=600, **kwargs):
+    def pca(self, data_key, timeoutSecs=600, **kwargs):
         params_dict = {
             'destination_key':None,
-            'key':None,
+            'key':data_key,
             'ignore':None,
             'tolerance':None,
             'standardize':None
         }
         params_dict.update(kwargs)
         a = self.__do_json_request('PCA.json',timeout=timeoutSecs,params=params_dict)
+        verboseprint("\npca result:", dump_json(a))
         return a
 
     def summary_page(self, key, max_column_display=1000, timeoutSecs=60, noPrint=True, **kwargs):

diff --git a/py/testdir_multi_jvm/test_KMeans_sphere100.py b/py/testdir_multi_jvm/test_KMeans_sphere100.py
@@ -15,7 +15,7 @@
 # should do this, but does it make h2o kmeans fail?
 SHUFFLE_SPHERES = True
 R_NOISE = True
-ALLOWED_CENTER_DELTA = 1
+ALLOWED_CENTER_DELTA = 3
 
 def get_xyz_sphere(R):
     u = random.random() # 0 to 1

diff --git a/py/testdir_single_jvm/test_GBM_mnist.py b/py/testdir_single_jvm/test_GBM_mnist.py
@@ -45,7 +45,7 @@ def test_GBM_mnist(self):
         timeoutSecs = 1800
         start = time.time()
         node = h2o.nodes[0]
-        GBMResult = node.GBM(data_key=trainKey, **kwargs)
+        GBMResult = node.gbm(data_key=trainKey, **kwargs)
         elapsed = time.time() - start
         print "GBM completed in", elapsed, "seconds.", \
             "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

diff --git a/py/testdir_single_jvm/test_PCA_UCIwine.py b/py/testdir_single_jvm/test_PCA_UCIwine.py
@@ -0,0 +1,50 @@
+import unittest
+import random, sys, time, re
+sys.path.extend(['.','..','py'])
+
+import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import2 as h2i, h2o_glm, h2o_util, h2o_rf
+class Basic(unittest.TestCase):
+    def tearDown(self):
+        h2o.check_sandbox_for_errors()
+
+    @classmethod
+    def setUpClass(cls):
+        h2o.build_cloud(1, java_heap_GB=8)
+
+    @classmethod
+    def tearDownClass(cls):
+        h2o.tear_down_cloud()
+
+    def test_PCA_UCIwine(self):
+        csvFilename = "wine.data"
+        timeoutSecs=180
+        trialStart = time.time()
+
+        # PARSE ****************************************
+        trainKey = csvFilename + "_" + ".hex"
+        start = time.time()
+        parseResult = h2i.import_parse(bucket='smalldata', path=csvFilename,
+            hex_key=trainKey, timeoutSecs=timeoutSecs)
+        elapsed = time.time() - start
+        print "parse end on ", csvFilename, 'took', elapsed, 'seconds',\
+            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
+        print "parse result:", parseResult['destination_key']
+
+        # PCA****************************************
+        params = { 
+            'destination_key': "python_PCA_key",
+            'ignore':0,
+            'tolerance':0.0,
+            'standardize':1
+            }   
+
+        kwargs = params.copy()
+        start = time.time()
+        node = h2o.nodes[0]
+        PCAResult = node.pca(data_key=trainKey, **kwargs)
+        elapsed = time.time() - start
+        print "PCA completed in", elapsed, "seconds.", \
+            "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
+
+if __name__ == '__main__':
+    h2o.unit_main()
diff --git a/py/testdir_single_jvm/test_fp_many_cols.py b/py/testdir_single_jvm/test_fp_many_cols.py
@@ -2,6 +2,8 @@
 sys.path.extend(['.','..','py'])
 import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import2 as h2i, h2o_exec as h2e
 
+H2O_SUPPORTS_OVER_100K_COLS = False
+
 print "Stress the # of cols with fp reals here." 
 print "Can pick fp format but will start with just the first (e0)"
 def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel):
@@ -114,16 +116,24 @@ def test_many_cols_and_values_with_syn(self):
             (100, 70000, 'cD', 30, 120),
             (100, 90000, 'cE', 30, 120),
             (100, 100000, 'cF', 30, 120),
-            (100, 200000, 'cG', 30, 120),
-            (100, 300000, 'cH', 30, 120),
-            (100, 400000, 'cI', 30, 120),
-            (100, 500000, 'cJ', 30, 120),
-            (100, 600000, 'cK', 30, 120),
-            (100, 700000, 'cL', 30, 120),
-            (100, 800000, 'cM', 30, 120),
-            (100, 900000, 'cN', 30, 120),
-            (100, 1000000, 'cO', 30, 120),
+        ]
+
+        if not H2O_SUPPORTS_OVER_100K_COLS:
+            print "Restricting number of columns tested to 100,000"
+        else:
+            tryList = tryList + [
+                (100, 200000, 'cG', 30, 120),
+                (100, 300000, 'cH', 30, 120),
+                (100, 400000, 'cI', 30, 120),
+                (100, 500000, 'cJ', 30, 120),
+                (100, 600000, 'cK', 30, 120),
+                (100, 700000, 'cL', 30, 120),
+                (100, 800000, 'cM', 30, 120),
+                (100, 900000, 'cN', 30, 120),
+                (100, 1000000, 'cO', 30, 120),
             ]
+
+
 
         for (rowCount, colCount, hex_key, timeoutSecs, timeoutSecs2) in tryList:
             SEEDPERFILE = random.randint(0, sys.maxint)