Merge branch 'master' of https://github.com/0xdata/h2o

ctyeong · Mar 11, 2014 · ef2e92d · ef2e92d
2 parents aab568b + 1984de6
commit ef2e92d
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 15 deletions.
diff --git a/h2o-perf/bench/tests/gbm_covtype/parse.R b/h2o-perf/bench/tests/gbm_covtype/parse.R
@@ -2,10 +2,10 @@ source("../../R/h2oPerf/prologue.R")
 
 data_source <<- "home-0xdiag-datasets"
 
-trainData   <<-  "/home/0xdiag/datasets/standard/covtype200x.data"
+trainData   <<-  "/home/0xdiag/datasets/standard/covtype20x.data"
 response <<- "C55"
 
-num_train_rows  <<- 116202400
+num_train_rows  <<- 11620240
 num_explan_cols <<- 54
 
 upload.VA("parsed.hex", trainData)

diff --git a/h2o-perf/web/index.html b/h2o-perf/web/index.html
@@ -366,7 +366,7 @@ <h4 class="panel-title">
 
                                 <form action="../prototype/php/post.php" method="post">
                                     <div>
-                                        <textarea name="parse_1" style="width: 700px; height: 700px;">i
+                                        <textarea name="parse_1" style="width: 700px; height: 700px;">
 SELECT
 FROM_UNIXTIME(tr.start_epoch_ms / 1000, '%Y-%m-%d') as run_date,
     tr.test_name,

diff --git a/lib/resources/h2o/css/main.css b/lib/resources/h2o/css/main.css
@@ -51,11 +51,11 @@ body {
 }
 
 td {
-  min-width: 125px;
+  min-width: 115px;
 }
 
-.table th, .table td {
-  text-align: center;
-}
+/*.table th, .table td {*/
+  /*text-align: left;*/
+/*}*/
 
 
diff --git a/py/testdir_single_jvm/test_exec_enums_rand_cut.py b/py/testdir_single_jvm/test_exec_enums_rand_cut.py
@@ -1,6 +1,8 @@
 import unittest, random, sys, time, re
 sys.path.extend(['.','..','py'])
 import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_glm, h2o_util
+import h2o_gbm
+import getpass
 
 # details:
 # we want to seed a random dictionary for our enums
@@ -16,6 +18,13 @@
 MIN_ENUM_WIDTH = 2
 MAX_ENUM_WIDTH = 8
 RAND_ENUM_LENGTH = True
+
+DO_PLOT = getpass.getuser()=='kevin'
+
+DO_MEDIAN = True
+MAX_QBINS = 1000
+MULTI_PASS = 1
+
 def random_enum(n, randChars=randChars, quoteChars=quoteChars):
     # randomly return None 10% of the time
     if random.randint(0,9)==0:
@@ -74,6 +83,7 @@ def write_syn_dataset(csvPathname, rowCount, inCount=1, outCount=1, SEED='123456
         dsf.write(rowDataCsv)
     dsf.close()
 
+
 class Basic(unittest.TestCase):
     def tearDown(self):
         h2o.check_sandbox_for_errors()
@@ -90,7 +100,7 @@ def setUpClass(cls):
 
     @classmethod
     def tearDownClass(cls):
-        ### time.sleep(3600)
+        h2o.sleep(3600)
         h2o.tear_down_cloud()
 
     def test_exec_enums_rand_cut(self):
@@ -104,9 +114,18 @@ def test_exec_enums_rand_cut(self):
             (n, 10, 9, 'cE', 300), 
             ]
 
-        ### h2b.browseTheCloud()
+        # create key names to use for exec
+        eKeys = ['e%s' % i for i in range(10)]
+
+        h2b.browseTheCloud()
+        trial = 0
+        xList = []
+        eList = []
+        fList = []
         for repeat in range(10):
             for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:
+
+                # CREATE DATASET*******************************************
                 colCount = iColCount + oColCount
                 SEEDPERFILE = random.randint(0, sys.maxint)
                 csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
@@ -115,7 +134,13 @@ def test_exec_enums_rand_cut(self):
                 print "Creating random", csvPathname
                 write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE)
 
-                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
+                # PARSE*******************************************************
+                # should be two different keys in the sample
+                e = random.sample(eKeys,2)
+                fKey = e[0]
+                eKey = e[1]
+
+                parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=eKey, timeoutSecs=30)
                 print "Parse result['destination_key']:", parseResult['destination_key']
                 inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
                 print h2o.dump_json(inspect)
@@ -126,12 +151,56 @@ def test_exec_enums_rand_cut(self):
                 # error if any col has constant values
                 if len(constantValuesDict) != 0:
                     raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)
+
+                # EXEC*******************************************************
+                # don't use exec_expr to avoid issues with Inspect following etc.
+                randICol = random.randint(0,iColCount-1)
+                randOCol = random.randint(iColCount, iColCount+oColCount-1)
+
+                start = time.time()
+                h2o.nodes[0].exec_query(str='%s=%s[%s,]' % (fKey, eKey, randOCol))
+                elapsed = time.time() - start
+                print "exec1 end on ", csvFilename, 'took', elapsed, 'seconds.'
+                execTime = elapsed
 
+                gKey = random.choice(eKeys)
 
                 start = time.time()
+                h2o.nodes[0].exec_query(str='%s=%s' % (gKey, fKey))
+                print "exec2 end on ", csvFilename, 'took', elapsed, 'seconds.'
                 elapsed = time.time() - start
-                print "predict end on ", csvFilename, 'took', elapsed, 'seconds.'
+                execTime = elapsed
+
+                # QUANTILE*******************************************************
+                quantile = 0.5 if DO_MEDIAN else .999
+                # first output col
+                column = iColCount
+                start = time.time()
+                q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
+                elapsed = time.time() - start
+                print "quantile end on ", csvFilename, 'took', elapsed, 'seconds.'
+                quantileTime = elapsed
+
+
+                # remove all keys*******************************************************
+                start = time.time()
                 h2o.nodes[0].remove_all_keys()
+                elapsed = time.time() - start
+                print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'
+
+                trial += 1
+                xList.append(trial)
+                eList.append(execTime)
+                fList.append(quantileTime)
+
+        if DO_PLOT:
+            xLabel = 'trial'
+            eLabel = 'exec cut time'
+            fLabel = 'quantile time'
+            eListTitle = ""
+            fListTitle = ""
+            h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)
+
 
 
 if __name__ == '__main__':

diff --git a/src/main/java/hex/Quantiles.java b/src/main/java/hex/Quantiles.java
@@ -187,11 +187,13 @@ else if ( !Double.isNaN(_min) ) {
     } 
     else { // vec does not contain finite numbers
       // do we care here? have to think about whether multiPass is disabled/
+      // A one-entry hcnt2 makes the algorithm die (I guess the min was NaN above).
+      // For now, just make the arrays length 2.
       _start2 = vec.min();
       _binsz2 = Double.POSITIVE_INFINITY;
-      hcnt2 = new long[1];
-      hcnt2_min = new double[1];
-      hcnt2_max = new double[1];
+      hcnt2 = new long[2];
+      hcnt2_min = new double[2];
+      hcnt2_max = new double[2];
     }
     hcnt2_low = 0;
     hcnt2_high = 0;
@@ -527,7 +529,8 @@ else if ( hcnt2[k]==1 && targetCntFract!=0 ) {
       // Just need to check the one bin below and above k, if they exist. 
       // They might have zero entries, but then it's okay to ignore them.
       // update: use the closest edge in the next bin. better forward progress for small bin counts
-      // This code may make the practical min bin count around 4 or so (not 2)
+      // This code may make the practical min bin count around 4 or so (not 2).
+      // What has a length-1 hcnt2 that makes this fail? Enums? Shouldn't get here.
       newValStart = hcnt2_min[k];
       if ( k > 0 ) {
         if ( hcnt2[k-1]>0 && (hcnt2_max[k-1]<hcnt2_min[k]) ) {
@@ -539,6 +542,7 @@ else if ( hcnt2[k]==1 && targetCntFract!=0 ) {
       // k might be pointing to one less than that (like k=0 for 1 bin case)
       newValEnd = hcnt2_max[k];
       if ( k < (maxBinCnt-1) )  {
+        assert k+1 < hcnt2.length : k+" "+hcnt2.length+" "+_valMaxBinCnt+" "+_isEnum+" "+_isInt;
         if ( hcnt2[k+1]>0 && (hcnt2_min[k+1]>hcnt2_max[k]) ) {
           newValEnd = hcnt2_min[k+1];
         }