
Commit

Merge branch 'master' of github.com:h2oai/h2o
tomkraljevic committed Dec 19, 2014
2 parents 8ea863e + 704541c commit b731076
Showing 7 changed files with 56 additions and 38 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -300,6 +300,7 @@ dw_3:
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/hadoop_tutorial.html
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/Ruser/Rinstall.html
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/ec2_build_ami.html
+	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/H2O_Hadoop_Mapr.html
 
 #
 # Set appropriately for your data size to quickly try out H2O.
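The added line wires the new MapR tutorial page into the same placeholder pipeline as the other website pages: at build time, sed globally replaces SUBST_RELEASE_NAME, SUBST_PROJECT_VERSION, and SUBST_BUILD_NUMBER in each HTML file. As a rough illustration of what those three s///g substitutions amount to (a sketch, not the build system itself; the example values are taken from the old hard-coded URL in the MapR doc below):

```python
# Sketch of the SUBST_* placeholder substitution the Makefile does with sed.
# Example values only; the real ones come from make variables.
subs = {
    "SUBST_RELEASE_NAME": "master",
    "SUBST_PROJECT_VERSION": "2.9.0.1624",
    "SUBST_BUILD_NUMBER": "1624",
}

def fill_template(text, subs):
    # Mirror sed's s/PLACEHOLDER/value/g: replace every occurrence.
    for placeholder, value in subs.items():
        text = text.replace(placeholder, value)
    return text

url = ("http://h2o-release.s3.amazonaws.com/h2o/"
       "SUBST_RELEASE_NAME/SUBST_BUILD_NUMBER/h2o-SUBST_PROJECT_VERSION.zip")
print(fill_template(url, subs))
# -> http://h2o-release.s3.amazonaws.com/h2o/master/1624/h2o-2.9.0.1624.zip
```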
8 changes: 4 additions & 4 deletions h2o-docs/source/deployment/H2O_Hadoop_Mapr.md
@@ -10,25 +10,25 @@ All mappers must be able to communicate with each other and need to run at the s
 0. Log in to the Hadoop cluster:
 
-	`ssh <username>@<HadoopClusterName>`
+	`ssh <username>@<HadoopNodeAddress>`
 
 	If you are asked if you want to continue connecting, enter `yes`.
 0. Enter the following:
 
-	`wget http://h2o-release.s3.amazonaws.com/h2o/master/1624/h2o-2.9.0.1624.zip`
+	`wget http://h2o-release.s3.amazonaws.com/h2o/SUBST_RELEASE_NAME/SUBST_BUILD_NUMBER/h2o-SUBST_PROJECT_VERSION.zip`
 
 0. Wait while H2O downloads - the progress bar indicates completion.
 
 	`100%[=================================>] 140,951,040 2.23M/s in 65s`
 
 0. Enter the following:
 
-	`unzip h2o-2.9.0.1624.zip`
+	`unzip h2o-SUBST_PROJECT_VERSION.zip`
 
 0. Wait while the H2O package unzips.
 0. On the Hadoop node, change the current directory to the location of the Hadoop and H2O driver jar files:
 
-	`cd h2o-2.9.0.1624/hadoop`
+	`cd h2o-SUBST_PROJECT_VERSION/hadoop`
 
 0. Enter the following:
 
16 changes: 8 additions & 8 deletions py/h2o_exec.py
@@ -78,7 +78,7 @@ def fill_in_expr_template(exprTemplate, colX=None, n=None, row=None, keyX=None,
     # just a string?
     execExpr = exprTemplate
     if colX is not None:
-        print "Assume colX %s is zero-based..added 1 for R based exec2" % colX
+        ### print "Assume colX %s is zero-based..added 1 for R based exec2" % colX
         execExpr = re.sub('<col1>', str(colX+1), execExpr)
         # this is just another value
         execExpr = re.sub('<col2>', str(colX+2), execExpr)
@@ -93,11 +93,11 @@ def fill_in_expr_template(exprTemplate, colX=None, n=None, row=None, keyX=None,
     execExpr = re.sub('<m>', str(m), execExpr)
     execExpr = re.sub('<m-1>', str(m-1), execExpr)
     ### verboseprint("\nexecExpr:", execExpr)
-    print "execExpr:", execExpr
+    ### print "execExpr:", execExpr
     return execExpr
 
 
-def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
+def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, **kwargs):
     if not node:
         node = h2o_nodes.nodes[0]
     start = time.time()
@@ -129,7 +129,7 @@ def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2
         print "function return"
         result = resultExec['funstr']
     else:
-        print "scalar return"
+        ### print "scalar return"
         result = resultExec['scalar']
 
     return resultExec, result
@@ -192,7 +192,7 @@ def exec_expr_list_rand(lenNodes, exprList, keyX,
         print "Trial #", trial, "completed\n"
 
 def exec_expr_list_across_cols(lenNodes, exprList, keyX,
-    minCol=0, maxCol=55, timeoutSecs=10, incrementingResult=True):
+    minCol=0, maxCol=55, timeoutSecs=10, incrementingResult=True, **kwargs):
     colResultList = []
     for colX in range(minCol, maxCol):
         for i, exprTemplate in enumerate(exprList):
@@ -215,16 +215,16 @@ def exec_expr_list_across_cols(lenNodes, exprList, keyX,
                 resultKey = keyX
 
             # v2
-            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs)
-            print "\nexecResult:", dump_json(resultExec)
+            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs, **kwargs)
+            # print "\nexecResult:", dump_json(resultExec)
 
             ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
             # slows things down to check every iteration, but good for isolation
             if check_sandbox_for_errors():
                 raise Exception(
                     "Found errors in sandbox stdout or stderr, on trial #%s." % trial)
 
-            print "Column #", colX, "completed\n"
+            ### print "Column #", colX, "completed\n"
             colResultList.append(result)
 
     return colResultList
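The recurring change in this file threads **kwargs from exec_expr_list_across_cols through exec_expr, so callers can hand options such as print_params down to the underlying exec_query call without widening every intermediate signature. A minimal sketch of the pass-through pattern; the function bodies below are stand-ins, not the real h2o_exec/h2o_methods implementations:

```python
# Illustrative **kwargs pass-through; names mirror the real call chain,
# but the bodies are stand-ins.
def exec_query(expr, timeoutSecs=20, print_params=False):
    # The bottom of the chain is the only place that interprets options.
    if print_params:
        print("expr:", expr, "timeoutSecs:", timeoutSecs)
    return {"scalar": 42.0}  # stand-in for a real H2O response

def exec_expr(expr, timeoutSecs=10, **kwargs):
    # Forward unknown options untouched.
    return exec_query(expr, timeoutSecs, **kwargs)

def exec_expr_list_across_cols(exprs, **kwargs):
    # Outer helpers forward too; adding a new option later only
    # touches the bottom function.
    return [exec_expr(e, **kwargs)["scalar"] for e in exprs]

results = exec_expr_list_across_cols(["sum(r1[,1])"], print_params=True)
```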
2 changes: 1 addition & 1 deletion py/h2o_methods.py
@@ -557,7 +557,7 @@ def import_files(self, path, timeoutSecs=180):
         return a
 
     # 'destination_key', 'escape_nan' 'expression'
-    def exec_query(self, timeoutSecs=20, ignoreH2oError=False, print_params=True, **kwargs):
+    def exec_query(self, timeoutSecs=20, ignoreH2oError=False, print_params=False, **kwargs):
         # only v2 now
         params_dict = {
             'str': None,
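With the default flipped to False, exec_query now runs quietly; a caller that still wants the parameter dump can opt back in per call. A small usage sketch, assuming the harness's usual node handle from h2o_nodes; the expression string and key name are illustrative:

```python
import h2o_nodes

# Opt back into parameter logging for one call; the default is now quiet.
node = h2o_nodes.nodes[0]
result = node.exec_query(str='sum(r1[,1])', timeoutSecs=30, print_params=True)
```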
40 changes: 28 additions & 12 deletions py/testdir_multi_jvm/test_many_fp_formats_libsvm_2_fvec.py
@@ -2,17 +2,19 @@
 sys.path.extend(['.','..','../..','py'])
 import h2o, h2o_cmd, h2o_browse as h2b, h2o_import as h2i, h2o_exec as h2e, h2o_glm
 import h2o_util
+from collections import OrderedDict
 
 zeroList = [
     'Result0 = 0',
 ]
+# the first column should use this
 exprList = [
-    'Result<n> = sum(<keyX>[<col1>])',
+    'Result<n> = sum(<keyX>[,<col1>])',
 ]
 
 DO_SUMMARY = False
 DO_COMPARE_SUM = False
+DO_BAD_SEED = True
 
 def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution):
     # we can do all sorts of methods off the r object
@@ -41,6 +43,7 @@ def addRandValToRowStuff(colNumber, valMin, valMax, rowData, synColSumDict):
     classMin = -36
     classMax = 36
     dsf = open(csvPathname, "w+")
+    # ordinary dict
     synColSumDict = {0: 0} # guaranteed to have col 0 for output
     # even though we try to get a max colCount with random, we might fall short
     # track what max we really got
@@ -88,15 +91,19 @@ def tearDown(self):
     @classmethod
     def setUpClass(cls):
         global SEED
-        SEED = h2o.setup_random_seed()
+        if DO_BAD_SEED:
+            SEED = h2o.setup_random_seed(seed=5605820711843900818)
+        else:
+            SEED = h2o.setup_random_seed()
         h2o.init(2,java_heap_GB=5)
 
     @classmethod
     def tearDownClass(cls):
         # h2o.sleep(3600)
         h2o.tear_down_cloud()
 
     def test_many_fp_formats_libsvm_2_fvec(self):
-        # h2b.browseTheCloud()
+        h2b.browseTheCloud()
         SYNDATASETS_DIR = h2o.make_syn_dir()
         tryList = [
             (100, 10000, 'cA', 300, 'sparse50'),
@@ -146,18 +153,20 @@ def test_many_fp_formats_libsvm_2_fvec(self):
             if DO_COMPARE_SUM:
                 h2e.exec_zero_list(zeroList)
             colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
-                timeoutSecs=timeoutSecs)
-            print "\n*************"
-            print "colResultList", colResultList
-            print "*************"
+                timeoutSecs=timeoutSecs, print_params=False)
+            #print "\n*************"
+            #print "colResultList", colResultList
+            #print "*************"
 
             self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
             # need to fix this for compare to expected
             # we should be able to keep the list of fp sums per col above
             # when we generate the dataset
             ### print "\nsynColSumDict:", synColSumDict
 
-            for k,v in synColSumDict.iteritems():
+            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
+            print sortedColSumDict
+            for k,v in sortedColSumDict.iteritems():
+                print k
                 if DO_COMPARE_SUM:
                     # k should be integers that match the number of cols
                     self.assertTrue(k>=0 and k<len(colResultList))
@@ -172,8 +181,15 @@
                 # enums don't have mean, but we're not enums
                 mean = float(inspect['cols'][k]['mean'])
                 # our fp formats in the syn generation sometimes only have two places?
-                self.assertAlmostEqual(mean, synMean, places=0,
-                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))
+                if not h2o_util.approxEqual(mean, synMean, tol=1e-4):
+                    execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
+                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
+                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
+                    print "Result of remembered sum on failing col:..:", k, v
+                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
+                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
+                    sys.stdout.flush()
+                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))
 
                 naCnt = inspect['cols'][k]['naCnt']
                 self.assertEqual(0, naCnt,
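The last hunk replaces assertAlmostEqual(mean, synMean, places=0), which amounts to an absolute tolerance of about 0.5, with a relative check via h2o_util.approxEqual(tol=1e-4), and on failure dumps diagnostics (a fresh exec sum of the column, the remembered generator sum, and mean * rowCount) before raising. A rough sketch of such a relative comparison; approx_equal below is a hypothetical stand-in, and h2o_util.approxEqual's exact semantics may differ:

```python
# Hypothetical stand-in for h2o_util.approxEqual: relative tolerance,
# with a floor of 1.0 so values near zero still compare sensibly.
def approx_equal(a, b, tol=1e-4):
    return abs(a - b) <= tol * max(1.0, abs(a), abs(b))

mean, synMean = 12345.678, 12345.679
print(approx_equal(mean, synMean))   # True: relative error is ~1e-7

# assertAlmostEqual(mean, synMean, places=0) checks round(mean - synMean, 0) == 0,
# i.e. roughly |mean - synMean| <= 0.5: too loose for small means and prone
# to spurious failures from float rounding on large column sums.
```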
6 changes: 3 additions & 3 deletions src/main/java/water/fvec/CXDChunk.java
@@ -44,7 +44,7 @@ protected final double getFValue(int off){
     return (long)d;
   }
   @Override protected double atd_impl(int idx) {
-    int off = _offCache;
+    /* int off = _offCache;
     int prevIdx = getId(off);
     if(prevIdx == idx)
       return getFValue(off);
@@ -55,8 +55,8 @@ protected final double getFValue(int off){
         _offCache = (off += _ridsz + _valsz);
         return getFValue(off);
       }
-    }
-    off = findOffset(idx);
+    }*/
+    int off = findOffset(idx);
     if(getId(off) != idx)return 0;
     return getFValue(off);
   }
21 changes: 11 additions & 10 deletions src/main/java/water/fvec/CXIChunk.java
@@ -18,7 +18,7 @@ public class CXIChunk extends Chunk {
   protected transient int _ridsz; // byte size of stored (chunk-relative) row nums
   protected static final int OFF = 6;
   protected transient int _lastOff = OFF;
-
+  protected transient volatile int _offCache = OFF;
 
   private static final long [] NAS = {C1Chunk._NA,C2Chunk._NA,C4Chunk._NA,C8Chunk._NA};
 
@@ -57,7 +57,7 @@ protected CXIChunk(int len, int nzs, int valsz, byte [] buf){
   @Override boolean setNA_impl(int idx) { return false; }
 
   @Override protected long at8_impl(int idx) {
-    int off = _offCache;
+    /* int off = _offCache;
     int prevIdx = getId(off);
     if(prevIdx == idx)
       return getIValue(off);
@@ -68,10 +68,10 @@ protected CXIChunk(int len, int nzs, int valsz, byte [] buf){
         _offCache = (off += _ridsz + _valsz);
         return getIValue(off);
       }
-    }
-    off = findOffset(idx);
+    } */
+    int off = findOffset(idx);
     if(getId(off) != idx)return 0;
-    _offCache = off;
+    // _offCache = off;
     long v = getIValue(off);
     if( v== NAS[_valsz_log])
       throw new IllegalArgumentException("at8 but value is missing");
@@ -186,18 +186,19 @@ protected final int findOffset(int idx) {
     return this;
   }
 
-  protected transient volatile int _offCache = OFF;
+
   @Override public final int nextNZ(int rid){
-    if(rid == -1) {
+    final int off = rid == -1?OFF:findOffset(rid);
+    /* if(rid == -1) {
       _offCache = OFF;
      return getId(OFF);
     }
-    int off = _offCache;
+    int off = _offCache; */
     int x = getId(off);
-    if(x != rid) {
+    /* if(x != rid) {
       off = _offCache = rid == -1 ? OFF : findOffset(rid);
       x = getId(off);
-    }
+    }*/
     if(x > rid)return x;
     if(off < _mem.length - _ridsz - _valsz)
       return getId(off + _ridsz + _valsz);
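Both chunk classes stop using _offCache (a remembered offset of the most recently accessed element) and fall back to findOffset on every random access: look the row id up among the stored (row id, value) pairs and return 0 when the sparse chunk does not store that row. A rough Python model of that access pattern; the names are illustrative, not the real CXIChunk internals, which binary-search a packed byte array:

```python
import bisect

# Illustrative model of a sparse chunk: sorted row ids plus parallel values.
def sparse_at(row_ids, values, idx):
    # Analogue of findOffset(idx): binary search for the row id.
    pos = bisect.bisect_left(row_ids, idx)
    # Analogue of "if(getId(off) != idx) return 0": unstored rows are zero.
    if pos < len(row_ids) and row_ids[pos] == idx:
        return values[pos]
    return 0

print(sparse_at([2, 5, 9], [1.5, -3.0, 7.25], 5))  # -3.0 (stored row)
print(sparse_at([2, 5, 9], [1.5, -3.0, 7.25], 4))  # 0 (row not stored)
```

The commit does not say why the cache was disabled; it turned repeated sequential reads into O(1) hops instead of O(log n) searches, and the volatile declaration hints that shared mutable state across reader threads may have been the concern, but that is a guess.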
