From 3038ca536e1cba405d173723ca8aa860c6a7d445 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 12:32:15 -0800 Subject: [PATCH 01/11] Fix in computing tree stats for GBM trees. --- src/main/java/hex/drf/DRF.java | 10 ++-------- src/main/java/hex/gbm/GBM.java | 9 +++++---- src/main/java/hex/gbm/SharedTreeModelBuilder.java | 7 +++++++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/main/java/hex/drf/DRF.java b/src/main/java/hex/drf/DRF.java index 28c872c283..ddc6f0eab8 100644 --- a/src/main/java/hex/drf/DRF.java +++ b/src/main/java/hex/drf/DRF.java @@ -345,20 +345,14 @@ private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random r } }.doAll(fr); + // Collect leaves stats + for (int i=0; i>32L),(int)seed }); } + + // helper for debugging + static protected void printGenerateTrees(DTree[] trees) { + for( int k=0; k Date: Mon, 11 Nov 2013 16:17:40 -0800 Subject: [PATCH 02/11] added some Exec2 testing and more stressful gz compression ratio --- py/h2o.py | 25 ++-- py/h2o_exec.py | 114 +++++++++++++----- py/testdir_single_jvm/test_GBM_fvec.py | 1 - py/testdir_single_jvm/test_exec2_operators.py | 59 +++++++++ .../test_parse_syn_gz_cat.py | 27 +++-- py/testdir_single_jvm/test_rf_syn_gz_cat.py | 13 +- 6 files changed, 189 insertions(+), 50 deletions(-) create mode 100644 py/testdir_single_jvm/test_exec2_operators.py diff --git a/py/h2o.py b/py/h2o.py index 0986ca8931..e95fb2473d 100644 --- a/py/h2o.py +++ b/py/h2o.py @@ -895,7 +895,8 @@ def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=N raise Exception("Could not decode any json from the request. Do you have beta features turned on? beta_features: ", beta_features) for e in ['error', 'Error', 'errors', 'Errors']: - if e in rjson: + # error can be null (python None). This happens in exec2 + if e in rjson and rjson[e]: verboseprint(dump_json(rjson)) emsg = 'rjson %s in %s: %s' % (e, inspect.stack()[1][3], rjson[e]) if ignoreH2oError: @@ -905,7 +906,8 @@ def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=N raise Exception(emsg) for w in ['warning', 'Warning', 'warnings', 'Warnings']: - if w in rjson: + # warning can be null (python None). + if w in rjson and rjson[w]: verboseprint(dump_json(rjson)) print 'rjson %s in %s: %s' % (w, inspect.stack()[1][3], rjson[w]) @@ -1388,15 +1390,20 @@ def import_hdfs(self, path, timeoutSecs=180): # 'destination_key', 'escape_nan' 'expression' def exec_query(self, timeoutSecs=20, ignoreH2oError=False, **kwargs): - params_dict = { - 'expression': None, - ## 'escape_nan': 0, - ## 'destination_key': "Result.hex", # curious as to whether specifying destination key messes anything up. - } + if beta_features: + params_dict = { + 'str': None, + } + else: + params_dict = { + 'expression': None, + ## 'escape_nan': 0, + } + browseAlso = kwargs.pop('browseAlso',False) - params_dict.update(kwargs) + check_params_update_kwargs(params_dict, kwargs, 'exec_query', print_params=True) verboseprint("\nexec_query:", params_dict) - a = self.__do_json_request('Exec.json', + a = self.__do_json_request('2/Exec2.json' if beta_features else 'Exec.json', timeout=timeoutSecs, ignoreH2oError=ignoreH2oError, params=params_dict) verboseprint("\nexec_query result:", dump_json(a)) return a diff --git a/py/h2o_exec.py b/py/h2o_exec.py index d312c69d8a..2fb13e581b 100644 --- a/py/h2o_exec.py +++ b/py/h2o_exec.py @@ -32,30 +32,48 @@ def checkScalarResult(resultInspect, resultKey): # weird..it's a tuple, not a list? 
when the extra level of hier is there # this works: if type(resultInspect) is not dict: - ### print "Trimming resultInspect hier." resultInspect0 = resultInspect[0] else: resultInspect0 = resultInspect emsg = None while(True): - if 'type' not in resultInspect0: - emsg = "'type' missing. Look at the json just printed" - break - t = resultInspect0["type"] - if t != 'parsed': - emsg = resultKey + " 'type' is not 'parsed'. Look at the json just printed" - break - - if 'rows' not in resultInspect0: - emsg = "Inspect response: 'rows' missing. Look at the json just printed" - break - rows = resultInspect0["rows"] - - if 'cols' not in resultInspect0: - emsg = "Inspect response: 'cols' missing. Look at the json just printed" - break - cols = resultInspect0["cols"] + + if h2o.beta_features: + if 'num_rows' not in resultInspect0: + emsg = "Inspect response: 'num_rows' missing. Look at the json just printed" + break + rows = resultInspect0["num_rows"] + + if 'cols' not in resultInspect0: + emsg = "Inspect response: 'num_cols' missing. Look at the json just printed" + break + cols = resultInspect0["cols"] + + print "cols:", h2o.dump_json(cols) + + num_cols = resultInspect0["num_cols"] + + else: + + if 'type' not in resultInspect0: + emsg = "'type' missing. Look at the json just printed" + break + t = resultInspect0["type"] + + if t != 'parsed': + emsg = resultKey + " 'type' is not 'parsed'. Look at the json just printed" + break + + if 'rows' not in resultInspect0: + emsg = "Inspect response: 'rows' missing. Look at the json just printed" + break + rows = resultInspect0["rows"] + + if 'cols' not in resultInspect0: + emsg = "Inspect response: 'cols' missing. Look at the json just printed" + break + cols = resultInspect0["cols"] break @@ -69,7 +87,10 @@ def checkScalarResult(resultInspect, resultKey): # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here metaDict = cols[0] for key,value in metaDict.items(): - h2o.verboseprint("Inspect metadata:", key, value) + if h2o.beta_features: + print "Inspect metaDict:", key, value + else: + h2o.verboseprint("Inspect metaDict:", key, value) min_value = metaDict['min'] checkForBadFP(min_value) @@ -104,22 +125,61 @@ def exec_expr(node=None, execExpr=None, resultKey="Result.hex", timeoutSecs=10, start = time.time() # FIX! Exec has 'escape_nan' arg now. should we test? # 5/14/13 removed escape_nan=0 - resultExec = h2o_cmd.runExec(node, expression=execExpr, - timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError) + + if h2o.beta_features: + kwargs = {'str': execExpr} + resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + else: + kwargs = {'expression': execExpr} + resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + h2o.verboseprint(resultExec) h2o.verboseprint('exec took', time.time() - start, 'seconds') ### print 'exec took', time.time() - start, 'seconds' h2o.verboseprint("\nfirst look at the default Result key") # new offset=-1 to get the metadata? - defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1) - checkScalarResult(defaultInspectM1, "Result.hex") + if h2o.beta_features: # default assign not present in v2? + # constants don't create keys. + # so the only way to see the results is to do another exec? 
+ kwargs = {'str': resultKey} + resultExec2 = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + print "resultExec2:", h2o.dump_json(resultExec2) + + # maybe return 'scalar' in some cases? + return resultExec2, resultExec2['cols'][0]['min'] + # exec_query parameters: {'str': 'Result0 = c(0)'} + # exec_query parameters: {'str': 'Result0'} + # resultExec2: { + # "Request2": 0, + # "cols": [ + # { + # "max": 0.0, + # "mean": 0.0, + # "min": 0.0, + # "naCnt": 0, + # "name": "c", + # "type": "Int" + # } + # ], + # "error": null, + # "funstr": null, + # "key": null, + # "num_cols": 1, + # "num_rows": 1, + # "result": "c \n0 \n", + # "scalar": 0.0 + # } + + else: + defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1) + checkScalarResult(defaultInspectM1, "Result.hex") - h2o.verboseprint("\nNow look at the assigned " + resultKey + " key") - resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1) - min_value = checkScalarResult(resultInspectM1, resultKey) + h2o.verboseprint("\nNow look at the assigned " + resultKey + " key") + resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1) + min_value = checkScalarResult(resultInspectM1, resultKey) - return resultInspectM1, min_value + return resultInspectM1, min_value def exec_zero_list(zeroList): diff --git a/py/testdir_single_jvm/test_GBM_fvec.py b/py/testdir_single_jvm/test_GBM_fvec.py index 049a8b77b2..227085b60a 100644 --- a/py/testdir_single_jvm/test_GBM_fvec.py +++ b/py/testdir_single_jvm/test_GBM_fvec.py @@ -115,7 +115,6 @@ def colIt(x): return "C" + str(x) } kwargs = params.copy() - h2o.beta_features = True timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) diff --git a/py/testdir_single_jvm/test_exec2_operators.py b/py/testdir_single_jvm/test_exec2_operators.py new file mode 100644 index 0000000000..df01ba362d --- /dev/null +++ b/py/testdir_single_jvm/test_exec2_operators.py @@ -0,0 +1,59 @@ +import unittest, random, sys, time +sys.path.extend(['.','..','py']) + +import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_hosts + +print "FIX! evidently visibility between expressions depends on the type. constants disappear?" +print "hack by creating vectors" +initList = [ + 'Result0 = c(0)', + 'Result1 = c(1)', + 'Result2 = c(2)', + 'Result3 = c(3)', + ] + +# double assign to Result.hex, so the checker doesn't have different names to check? 
+exprList = [ + # 'Result.hex = Result = Result0 * Result', + 'Result.hex = Result = Result1 + Result', + # 'Result.hex = Result = Result2 / Result', + # 'Result.hex = Result = Result3 - Result', + ] + +class Basic(unittest.TestCase): + def tearDown(self): + h2o.check_sandbox_for_errors() + + @classmethod + def setUpClass(cls): + global SEED, localhost + SEED = h2o.setup_random_seed() + localhost = h2o.decide_if_localhost() + if (localhost): + h2o.build_cloud(1) + else: + h2o_hosts.build_cloud_with_hosts(1) + + @classmethod + def tearDownClass(cls): + h2o.tear_down_cloud() + + def test_exec_operators(self): + h2o.beta_features = True + + for i, execExpr in enumerate(initList): + if h2o.beta_features: # no default result + resultKey = "Result" + str(i) + else: + resultKey = "Result.hex" + h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) + + start = time.time() + h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) + + h2o.check_sandbox_for_errors() + print "exec end on ", "operators" , 'took', time.time() - start, 'seconds' + + +if __name__ == '__main__': + h2o.unit_main() diff --git a/py/testdir_single_jvm/test_parse_syn_gz_cat.py b/py/testdir_single_jvm/test_parse_syn_gz_cat.py index ebf6f51dbd..241e47d98d 100644 --- a/py/testdir_single_jvm/test_parse_syn_gz_cat.py +++ b/py/testdir_single_jvm/test_parse_syn_gz_cat.py @@ -2,7 +2,7 @@ sys.path.extend(['.','..','py']) import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_exec as h2e, h2o_util -print "Create csv with lots of same data (95% 0?), so gz will have high compression ratio" +print "Create csv with lots of same data (98% 0?), so gz will have high compression ratio" print "Cat a bunch of them together, to get an effective large blow up inside h2o" print "Can also copy the files to test multi-file gz parse...that will behave differently" print "Behavior may be different depending on whether small ints are used, reals or used, or enums are used" @@ -16,8 +16,11 @@ def write_syn_dataset(csvPathname, rowCount, colCount, SEED): for i in range(rowCount): rowData = [] for j in range(colCount): - r = h2o_util.choice_with_probability([(1.1, .05), (0.1, .95)]) - rowData.append(r) + # r = h2o_util.choice_with_probability([(1.1, .02), (0.1, .98)]) + r = h2o_util.choice_with_probability([(1, .001), (0, .999)]) + # make r a many-digit real, so gzip compresses even more better! + # rowData.append('%#034.32e' % r) + rowData.append('%.1f' % r) rowDataCsv = ",".join(map(str,rowData)) dsf.write(rowDataCsv + "\n") @@ -34,7 +37,7 @@ def setUpClass(cls): SEED = h2o.setup_random_seed() localhost = h2o.decide_if_localhost() if (localhost): - h2o.build_cloud(1,java_heap_GB=14) + h2o.build_cloud(3,java_heap_GB=1) else: h2o_hosts.build_cloud_with_hosts() @@ -46,9 +49,11 @@ def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols - (10, 5000, 'cE', 600), - (10, 10000, 'cF', 600), - (10, 50000, 'cF', 600), + # overwrite the key each time to save space? 
+ (100, 40000, 'cF', 600), + (100, 20000, 'cF', 600), + (100, 10000, 'cF', 600), + (100, 5000, 'cF', 600), ] FILEREPL = 200 @@ -95,10 +100,14 @@ def test_parse_syn_gz_cat(self): start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" + num_rows = inspect['num_rows'] + num_cols = inspect['num_cols'] + value_size_bytes = inspect['value_size_bytes'] h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ - " num_rows:", "{:,}".format(inspect['num_rows']), \ - " num_cols:", "{:,}".format(inspect['num_cols']) + "\n num_rows:", "{:,}".format(num_rows), \ + "\n num_cols:", "{:,}".format(num_cols), \ + "\n value_size_bytes:", "{:,}".format(value_size_bytes) # should match # of cols in header or ?? self.assertEqual(inspect['num_cols'], colCount, diff --git a/py/testdir_single_jvm/test_rf_syn_gz_cat.py b/py/testdir_single_jvm/test_rf_syn_gz_cat.py index 22590d8464..f4cac912bb 100644 --- a/py/testdir_single_jvm/test_rf_syn_gz_cat.py +++ b/py/testdir_single_jvm/test_rf_syn_gz_cat.py @@ -2,7 +2,7 @@ sys.path.extend(['.','..','py']) import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i, h2o_exec as h2e, h2o_util -print "Create csv with lots of same data (95% 0?), so gz will have high compression ratio" +print "Create csv with lots of same data (98% 0?), so gz will have high compression ratio" print "Cat a bunch of them together, to get an effective large blow up inside h2o" print "Can also copy the files to test multi-file gz parse...that will behave differently" print "Behavior may be different depending on whether small ints are used, reals or used, or enums are used" @@ -22,16 +22,21 @@ def write_syn_dataset(csvPathname, rowCount, colCount, SEED): rowSum = 0 for j in range(colCount): if BASE==2: - # we're just doing 50/50 for now, unlike the print says above - r = h2o_util.choice_with_probability([(0, .5), (1, .5)]) + # 50/50 + # r = h2o_util.choice_with_probability([(0, .5), (1, .5)]) + # 98/2 + r = h2o_util.choice_with_probability([(0, .98), (1, .2)]) else: raise Exception("Unsupported BASE: " + BASE) rowSum += r + + rowData.append(r) responseVar = rowSum % BASE - rowData.append(responseVar) + # make r a many-digit real, so gzip compresses even more better! 
+ rowData.append('%#034.32e' % responseVar) rowDataCsv = ",".join(map(str,rowData)) dsf.write(rowDataCsv + "\n") From f38787117807419bba42f403b8a779adc5829d8b Mon Sep 17 00:00:00 2001 From: anqi Date: Mon, 11 Nov 2013 18:03:42 -0800 Subject: [PATCH 03/11] Added some basic operations like logical AND/OR --- R/h2oRClient-package/R/Classes.R | 179 ++++++++-------------------- R/h2oRClient-package/R/Internal.R | 23 +++- src/main/java/water/exec/ASTOp.java | 28 ++++- 3 files changed, 94 insertions(+), 136 deletions(-) diff --git a/R/h2oRClient-package/R/Classes.R b/R/h2oRClient-package/R/Classes.R index ab0e6aee14..f00675e6f0 100644 --- a/R/h2oRClient-package/R/Classes.R +++ b/R/h2oRClient-package/R/Classes.R @@ -322,72 +322,6 @@ setMethod("summary", "H2OParsedData", function(object) { result }) -histograms <- function(object) { UseMethod("histograms", object) } -setMethod("histograms", "H2OParsedData2", function(object) { - res = h2o.__remoteSend(object@h2o, h2o.__PAGE_SUMMARY2, source=object@key) - list.of.bins <- lapply(res$summaries, function(res) { - if (res$rows == 0) { - bins <- NULL - } else { - domains <- res$domains - counts <- res$bins - breaks <- seq(res$start, by=res$binsz, length.out=length(res$bins) + 1) - bins <- list(domains,counts,breaks) - names(bins) <- cbind('domains', 'counts', 'breaks') - } - bins - }) -}) - -setMethod("summary", "H2OParsedData2", function(object) { - res = h2o.__remoteSend(object@h2o, h2o.__PAGE_SUMMARY2, source=object@key) - col.summaries = res$summaries - col.names = res$names - col.means = res$means - col.results = mapply(c, res$summaries, res$names, res$means, SIMPLIFY=FALSE) - for (i in 1:length(col.results)) - names(col.results[[i]])[(length(col.results[[i]]) - 1) : length(col.results[[i]])] <- c('name', 'mean') - result = NULL - - result <- sapply(col.results, function(res) { - if(is.null(res$domains)) { # numeric column - if(is.null(res$mins) || length(res$mins) == 0) res$mins = NaN - if(is.null(res$maxs) || length(res$maxs) == 0) res$maxs = NaN - if(is.null(res$percentileValues)) - params = format(rep(round(as.numeric(col.means[[i]]), 3), 6), nsmall = 3) - else - params = format(round(as.numeric(c( - res$mins[1], - res$percentileValues[4], - res$percentileValues[6], - res$mean, - res$percentileValues[8], - tail(res$maxs, 1))), 3), nsmall = 3) - result = c(paste("Min. :", params[1], " ", sep=""), paste("1st Qu.:", params[2], " ", sep=""), - paste("Median :", params[3], " ", sep=""), paste("Mean :", params[4], " ", sep=""), - paste("3rd Qu.:", params[5], " ", sep=""), paste("Max. 
:", params[6], " ", sep="")) - } - else { - domains <- res$domains[res$maxs + 1] - counts <- res$bins[res$maxs + 1] - width <- max(cbind(nchar(domains), nchar(counts))) - result <- paste(domains, - mapply(function(x, y) { paste(rep(' ', max(width + 1 - nchar(x) - nchar(y),0)), collapse='') }, domains, counts), - ":", - counts, - " ", - sep='') - result[6] <- NA - result - } - }) - - result = as.table(result) - rownames(result) <- rep("", 6) - colnames(result) <- col.names - result -}) - setMethod("summary", "H2OPCAModel", function(object) { # TODO: Save propVar and cumVar from the Java output instead of computing here myVar = object@model$sdev^2 @@ -472,7 +406,7 @@ setMethod("h2o.factor", signature(data="H2OParsedData", col="character"), h2o.factor(data, ind-1) }) -#--------------------------------- FluidVecs --------------------------------------# +#------------------------------------ FluidVecs ----------------------------------------# setMethod("show", "H2ORawData2", function(object) { print(object@h2o) cat("Raw Data Key:", object@key, "\n") @@ -556,41 +490,58 @@ setMethod("$", "H2OParsedData2", function(x, name) { new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) }) -setMethod("+", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("+", e1, e2) }) -setMethod("-", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) - -setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("+", e1, e2) }) -setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) - -setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { 
h2o.__operator2("+", e1, e2) }) -setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) +setMethod("+", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) 
+setMethod("|", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("abs", "H2OParsedData2", function(x) { h2o.__unop2("abs", x) }) +setMethod("sign", "H2OParsedData2", function(x) { h2o.__unop2("sgn", x) }) +setMethod("sqrt", "H2OParsedData2", function(x) { h2o.__unop2("sqrt", x) }) +setMethod("ceiling", "H2OParsedData2", function(x) { h2o.__unop2("ceil", x) }) +setMethod("floor", "H2OParsedData2", function(x) { h2o.__unop2("floor", x) }) +setMethod("log", "H2OParsedData2", function(x) { h2o.__unop2("log", x) }) +setMethod("exp", "H2OParsedData2", function(x) { h2o.__unop2("exp", x) }) +setMethod("sum", "H2OParsedData2", function(x) { h2o.__unop2("sum", x) }) +setMethod("is.na", "H2OParsedData2", function(x) { h2o.__unop2("is.na", x) }) +setMethod("table", "H2OParsedData2", function(x) { h2o.__unop2("table", x) }) setMethod("colnames", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) @@ -605,15 +556,6 @@ setMethod("nrow", "H2OParsedData2", function(x) { setMethod("ncol", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key); as.numeric(res$numCols) }) -setMethod("sign", "H2OParsedData2", function(x) { - expr = paste("sgn(", x@key, ")") - res = h2o.__exec2(x@h2o, expr) - if(res$num_rows == 0 && res$num_cols == 0) - res$scalar - else - new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) -}) - setMethod("min", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) min(sapply(res$cols, function(x) { x$min })) @@ -630,12 +572,6 @@ setMethod("range", "H2OParsedData2", function(x) { c(min(temp[1,]), max(temp[2,])) }) -setMethod("sum", "H2OParsedData2", function(x) { - expr = paste("sum(", x@key, ")", sep="") - res = h2o.__exec2(x@h2o, expr) - res$scalar -}) - setMethod("colMeans", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) temp = sapply(res$cols, function(x) { x$mean }) @@ -660,15 +596,6 @@ setMethod("tail", "H2OParsedData2", function(x, n = 6L, ...) { tail(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...) 
}) -setMethod("is.na", "H2OParsedData2", function(x) { - expr = paste("is.na(", x@key, ")") - res = h2o.__exec2(x@h2o, expr) - if(res$num_rows == 0 && res$num_cols == 0) - res$scalar - else - new("H2OLogicalData2", h2o=x@h2o, key=res$dest_key) -}) - setMethod("is.factor", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source=x@key) temp = sapply(res$summaries, function(x) { is.null(x$domains) }) diff --git a/R/h2oRClient-package/R/Internal.R b/R/h2oRClient-package/R/Internal.R index 0250b69b07..394d7bb0d9 100644 --- a/R/h2oRClient-package/R/Internal.R +++ b/R/h2oRClient-package/R/Internal.R @@ -4,7 +4,7 @@ pkg.env$result_count = 0 pkg.env$IS_LOGGING = FALSE TEMP_KEY = "Last.value" RESULT_MAX = 100 -LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=") +LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&", "|") # Initialize functions for R logging myPath = paste(Sys.getenv("HOME"), "Library/Application Support/h2o", sep="/") @@ -290,13 +290,24 @@ h2o.__exec2_dest_key <- function(client, expr, destKey) { return(res) } -h2o.__operator2 <- function(op, x, y) { +h2o.__unop2 <- function(op, x) { + expr = paste(op, "(", x@key, ")") + res = h2o.__exec2(x@h2o, expr) + if(res$num_rows == 0 && res$num_cols == 0) # TODO: If logical operator, need to indicate + return(res$scalar) + if(op %in% LOGICAL_OPERATORS) + new("H2OLogicalData2", h2o=myClient, key=res$dest_key) + else + new("H2OParsedData2", h2o=myClient, key=res$dest_key) +} + +h2o.__binop2 <- function(op, x, y) { # if(!((ncol(x) == 1 || class(x) == "numeric") && (ncol(y) == 1 || class(y) == "numeric"))) # stop("Can only operate on single column vectors") - LHS = ifelse(class(x) == "H2OParsedData2", x@key, x) - RHS = ifelse(class(y) == "H2OParsedData2", y@key, y) + LHS = ifelse(class(x) == "H2OParsedData2" || class(x) == "H2OLogicalData2", x@key, x) + RHS = ifelse(class(y) == "H2OParsedData2" || class(y) == "H2OLogicalData2", y@key, y) expr = paste(LHS, op, RHS) - if(class(x) == "H2OParsedData2") myClient = x@h2o + if(class(x) == "H2OParsedData2" || class(x) == "H2OLogicalData2") myClient = x@h2o else myClient = y@h2o res = h2o.__exec2(myClient, expr) @@ -306,4 +317,4 @@ h2o.__operator2 <- function(op, x, y) { new("H2OLogicalData2", h2o=myClient, key=res$dest_key) else new("H2OParsedData2", h2o=myClient, key=res$dest_key) -} +} \ No newline at end of file diff --git a/src/main/java/water/exec/ASTOp.java b/src/main/java/water/exec/ASTOp.java index 8fc2cccde1..bf218165a2 100644 --- a/src/main/java/water/exec/ASTOp.java +++ b/src/main/java/water/exec/ASTOp.java @@ -15,15 +15,23 @@ public abstract class ASTOp extends AST { static { // Unary ops put(new ASTIsNA()); - put(new ASTSgn ()); put(new ASTNrow()); put(new ASTNcol()); + put(new ASTAbs()); + put(new ASTSgn ()); + put(new ASTSqrt()); + put(new ASTCeil()); + put(new ASTFlr()); + put(new ASTLog()); + put(new ASTExp()); // Binary ops put(new ASTPlus()); put(new ASTSub ()); put(new ASTMul ()); put(new ASTDiv ()); + put(new ASTPow ()); + put(new ASTMod ()); put(new ASTMin ()); put(new ASTMax ()); put(new ASTLT ()); @@ -32,6 +40,8 @@ public abstract class ASTOp extends AST { put(new ASTGE ()); put(new ASTEQ ()); put(new ASTNE ()); + put(new ASTLA ()); + put(new ASTLO ()); // Misc put(new ASTCat ()); @@ -111,8 +121,14 @@ static Type[] newsig() { } } +class ASTAbs extends ASTUniOp { String opStr(){ return "abs"; } ASTOp make() {return new ASTAbs ();} double op(double d) { return Math.abs(d);}} +class ASTSgn extends ASTUniOp { String opStr(){ return "sgn" ; } 
ASTOp make() {return new ASTSgn ();} double op(double d) { return Math.signum(d);}} +class ASTSqrt extends ASTUniOp { String opStr(){ return "sqrt"; } ASTOp make() {return new ASTSqrt();} double op(double d) { return Math.sqrt(d);}} +class ASTCeil extends ASTUniOp { String opStr(){ return "ceil"; } ASTOp make() {return new ASTCeil();} double op(double d) { return Math.ceil(d);}} +class ASTFlr extends ASTUniOp { String opStr(){ return "floor"; } ASTOp make() {return new ASTFlr(); } double op(double d) { return Math.floor(d);}} +class ASTLog extends ASTUniOp { String opStr(){ return "log"; } ASTOp make() {return new ASTLog ();} double op(double d) { return Math.log(d);}} +class ASTExp extends ASTUniOp { String opStr(){ return "exp"; } ASTOp make() {return new ASTExp ();} double op(double d) { return Math.exp(d);}} class ASTIsNA extends ASTUniOp { String opStr(){ return "is.na"; } ASTOp make() {return new ASTIsNA();} double op(double d) { return Double.isNaN(d)?1:0;}} -class ASTSgn extends ASTUniOp { String opStr(){ return "sgn" ; } ASTOp make() {return new ASTSgn ();} double op(double d) { return Math.signum(d);}} class ASTNrow extends ASTUniOp { ASTNrow() { super(VARS,new Type[]{Type.DBL,Type.ARY}); } @Override String opStr() { return "nrow"; } @@ -201,6 +217,8 @@ class ASTPlus extends ASTBinOp { String opStr(){ return "+" ;} ASTOp make() {re class ASTSub extends ASTBinOp { String opStr(){ return "-" ;} ASTOp make() {return new ASTSub ();} double op(double d0, double d1) { return d0-d1;}} class ASTMul extends ASTBinOp { String opStr(){ return "*" ;} ASTOp make() {return new ASTMul ();} double op(double d0, double d1) { return d0*d1;}} class ASTDiv extends ASTBinOp { String opStr(){ return "/" ;} ASTOp make() {return new ASTDiv ();} double op(double d0, double d1) { return d0/d1;}} +class ASTPow extends ASTBinOp { String opStr(){ return "^" ;} ASTOp make() {return new ASTPow ();} double op(double d0, double d1) { return Math.pow(d0,d1);}} +class ASTMod extends ASTBinOp { String opStr(){ return "%" ;} ASTOp make() {return new ASTMod ();} double op(double d0, double d1) { return d0%d1;}} class ASTMin extends ASTBinOp { String opStr(){ return "min";} ASTOp make() {return new ASTMin ();} double op(double d0, double d1) { return Math.min(d0,d1);}} class ASTMax extends ASTBinOp { String opStr(){ return "max";} ASTOp make() {return new ASTMax ();} double op(double d0, double d1) { return Math.max(d0,d1);}} class ASTLT extends ASTBinOp { String opStr(){ return "<" ;} ASTOp make() {return new ASTLT ();} double op(double d0, double d1) { return d0< d1?1:0;}} @@ -209,6 +227,8 @@ class ASTGT extends ASTBinOp { String opStr(){ return ">" ;} ASTOp make() {re class ASTGE extends ASTBinOp { String opStr(){ return ">=" ;} ASTOp make() {return new ASTGE ();} double op(double d0, double d1) { return d0>=d1?1:0;}} class ASTEQ extends ASTBinOp { String opStr(){ return "==" ;} ASTOp make() {return new ASTEQ ();} double op(double d0, double d1) { return d0==d1?1:0;}} class ASTNE extends ASTBinOp { String opStr(){ return "!=" ;} ASTOp make() {return new ASTNE ();} double op(double d0, double d1) { return d0!=d1?1:0;}} +class ASTLA extends ASTBinOp { String opStr(){ return "&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0&&d1!=0)?1:0;}} +class ASTLO extends ASTBinOp { String opStr(){ return "|" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0!=0||d1!=0)?1:0;}} class ASTReduce extends ASTOp { static final String VARS[] = new String[]{ "", 
"op2", "ary"}; @@ -481,7 +501,7 @@ class ASTRApply extends ASTOp { nc.close(0,null); env.addRef(v = av.close(null)); } else { // Frame results - if( env.ary(-1).numCols() != 1 ) + if( env.ary(-1).numCols() != 1 ) throw new IllegalArgumentException("apply requires that "+op+" return 1 column"); v = env.popAry().anyVec();// Remove without lowering refcnt } @@ -494,7 +514,7 @@ class ASTRApply extends ASTOp { assert env.isAry(); assert env._sp == oldsp-4+1; return; - } + } if( d==1 || d == -2 ) // Work on rows throw H2O.unimpl(); throw new IllegalArgumentException("MARGIN limited to 1 (rows) or 2 (cols)"); From 5d15f173b61eafbb02cf2da27d6f82636c88fcd6 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:10:40 -0800 Subject: [PATCH 04/11] Graph Java API little bit polished. --- src/main/java/hex/gbm/DTree.java | 14 ++++---------- src/main/java/water/api/DocGen.java | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main/java/hex/gbm/DTree.java b/src/main/java/hex/gbm/DTree.java index b87feb211a..a7bbbc8c6d 100644 --- a/src/main/java/hex/gbm/DTree.java +++ b/src/main/java/hex/gbm/DTree.java @@ -660,16 +660,10 @@ protected void generateHTMLVarImp(StringBuilder sb) { sb.append(""); DocGen.HTML.arrayTail(sb); // Generate a graph - horrible code - sb.append(""); - sb.append(""); - sb.append(""); - sb.append("
") - .append(" ") - .append("
"); + DocGen.HTML.graph(sb, "graphvarimp", "g_varimp", + DocGen.HTML.toJSArray(new StringBuilder(), Arrays.copyOf(_names, _names.length-1)), + DocGen.HTML.toJSArray(new StringBuilder(), varimp) + ); } public static class TreeStats extends Iced { diff --git a/src/main/java/water/api/DocGen.java b/src/main/java/water/api/DocGen.java index a382a740fd..edbc89af7a 100644 --- a/src/main/java/water/api/DocGen.java +++ b/src/main/java/water/api/DocGen.java @@ -2,6 +2,7 @@ import java.io.*; import java.lang.reflect.Field; +import java.util.Arrays; import java.util.Properties; import water.*; @@ -327,6 +328,20 @@ public StringBuilder toJSArray(StringBuilder sb, String[] ss) { sb.append(']'); return sb; } + + public StringBuilder graph(StringBuilder sb, String gid, String gname, StringBuilder ...gparams) { + sb.append(""); + sb.append(""); + sb.append(""); + sb.append("
") + .append(" ") + .append("
"); + return sb; + } } // -------------------------------------------------------------------------- From 3bfde24f68962ec06b7c900c420e9280791df325 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:11:36 -0800 Subject: [PATCH 05/11] Var imp name tooltip. --- lib/resources/h2o/css/graphs.css | 30 ++++++++++++++++++++++++++++++ lib/resources/h2o/js/graphs.js | 24 ++++++++++++++++++++---- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/lib/resources/h2o/css/graphs.css b/lib/resources/h2o/css/graphs.css index 07da832dab..95b1caa568 100644 --- a/lib/resources/h2o/css/graphs.css +++ b/lib/resources/h2o/css/graphs.css @@ -5,6 +5,9 @@ .bar.negative { fill: brown; } +.bar:hover { + fill: orange; +} .axis text { font: 10px sans-serif; @@ -16,3 +19,30 @@ stroke: #000; shape-rendering: crispEdges; } + +#d3tip { + position: absolute; + width: 120px; + height: auto; + padding: 2px; + background: lightsteelblue; + border: 0px; + -webkit-border-radius: 10px; + -moz-border-radius: 10px; + border-radius: 10px; + -webkit-box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + -moz-box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + pointer-events: none; +} + +#d3tip.hidden { + display: none; +} + +#d3tip p { + text-align: center; + margin: 0; + font-family: sans-serif; + font-size: 12px; +} diff --git a/lib/resources/h2o/js/graphs.js b/lib/resources/h2o/js/graphs.js index 528934c6a2..5b69e09928 100644 --- a/lib/resources/h2o/js/graphs.js +++ b/lib/resources/h2o/js/graphs.js @@ -9,8 +9,8 @@ function g_varimp(divid, names, varimp) { var dataset = zip(names, varimp); // Setup size and axis var margin = {top: 30, right: 10, bottom: 10, left: 10}, - width = 480 - margin.left - margin.right, - height = 250 - margin.top - margin.bottom; + width = 640 - margin.left - margin.right, + height = 450 - margin.top - margin.bottom; var xScale = d3.scale.linear() .range([0, width]) @@ -24,12 +24,17 @@ function g_varimp(divid, names, varimp) { .scale(xScale) .orient("top"); - var svg = d3.select(divid).append("svg") + var svg = d3.select("#"+divid).append("svg") .attr("width", width + margin.left + margin.right) .attr("height", height + margin.top + margin.bottom) .append("g") .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + var tooltip = d3.select("body") + .append("div") + .attr("id", "d3tip") + .classed("hidden", true); + svg.selectAll(".bar") .data(dataset) .enter().append("rect") @@ -37,7 +42,18 @@ function g_varimp(divid, names, varimp) { .attr("x", function(d) { return xScale(Math.min(0, d[1])); }) .attr("y", function(d) { return yScale(d[0]); }) .attr("width", function(d) { return Math.abs(xScale(d[1]) - xScale(0)); }) - .attr("height", yScale.rangeBand()); + .attr("height", yScale.rangeBand()) + .on("mouseover", function (d) { + var xPosition = width + document.getElementById(divid).offsetLeft; + var yPosition = parseFloat(d3.select(this).attr("y")) + yScale.rangeBand() / 2 + document.getElementById(divid).offsetTop; + tooltip.style("left", xPosition + "px") + .style("top", yPosition + "px"); + tooltip.html("

" + d[0] + "
" + d[1] + "

"); + tooltip.classed("hidden", false); + }) + .on("mouseout", function(d) { + tooltip.classed("hidden", true); + }); svg.append("g") .attr("class", "x axis") From 4bd083ae83b2c3726f5df490cddbfde7fb2eaf40 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:14:57 -0800 Subject: [PATCH 06/11] Fix in computing tree stats. --- src/main/java/hex/drf/DRF.java | 2 -- src/main/java/hex/gbm/DTree.java | 6 ++---- src/main/java/hex/gbm/GBM.java | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/java/hex/drf/DRF.java b/src/main/java/hex/drf/DRF.java index ddc6f0eab8..3d9d4ee049 100644 --- a/src/main/java/hex/drf/DRF.java +++ b/src/main/java/hex/drf/DRF.java @@ -126,8 +126,6 @@ public static String link(Key k, String content) { tstats.updateBy(ktrees); model = doScoring(model, outputKey, fr, ktrees, tid, tstats); } - // finalize stats - tstats.close(); // Do final scoring with all the trees. model = doScoring(model, outputKey, fr, ktrees, tid, tstats); if (classification && importance) { diff --git a/src/main/java/hex/gbm/DTree.java b/src/main/java/hex/gbm/DTree.java index a7bbbc8c6d..a7bd8673d3 100644 --- a/src/main/java/hex/gbm/DTree.java +++ b/src/main/java/hex/gbm/DTree.java @@ -690,12 +690,10 @@ public void updateBy(DTree[] ktrees) { sumDepth += tree.depth; sumLeaves += tree.leaves; numTrees++; + meanDepth = (int) (sumDepth / numTrees); + meanLeaves = (int) (sumLeaves / numTrees); } } - public void close() { - meanDepth = (int) (sumDepth / numTrees); - meanLeaves = (int) (sumLeaves / numTrees); - } } // -------------------------------------------------------------------------- diff --git a/src/main/java/hex/gbm/GBM.java b/src/main/java/hex/gbm/GBM.java index 4af5426070..e96070fed7 100644 --- a/src/main/java/hex/gbm/GBM.java +++ b/src/main/java/hex/gbm/GBM.java @@ -98,7 +98,6 @@ public static String link(Key k, String content) { model = doScoring(model, outputKey, fr, ktrees, tid, tstats); } // Final scoring - tstats.close(); model = doScoring(model, outputKey, fr, ktrees, tid, tstats); cleanUp(fr,t_build); // Shared cleanup } From 685ac1d2e2047a90173601f7f8320d343dfc9027 Mon Sep 17 00:00:00 2001 From: anqi Date: Mon, 11 Nov 2013 21:15:33 -0800 Subject: [PATCH 07/11] WIP Adding cut method to ExecQuery2 --- R/examples/H2OExec2Demo.R | 98 ++++++++++++++--------------- R/h2oRClient-package/R/Classes.R | 18 +++--- R/h2oRClient-package/R/Internal.R | 10 +-- src/main/java/water/exec/ASTOp.java | 48 +++++++++++++- 4 files changed, 111 insertions(+), 63 deletions(-) diff --git a/R/examples/H2OExec2Demo.R b/R/examples/H2OExec2Demo.R index 8bfe3b9580..05314bd09c 100644 --- a/R/examples/H2OExec2Demo.R +++ b/R/examples/H2OExec2Demo.R @@ -1,51 +1,49 @@ -library(h2o) -h2o.installDepPkgs() -myIP = "127.0.0.1"; myPort = 54321 -localH2O = h2o.init(ip = myIP, port = myPort, startH2O = TRUE, silentUpgrade = FALSE, promptUpgrade = TRUE) - -# Import iris file to H2O -prosPath = system.file("extdata", "prostate.csv", package="h2oRClient") -prostate.hex = h2o.importFile.FV(localH2O, path = prosPath, key = "prostate.hex") - -# Print out basic summary -summary(prostate.hex) -head(prostate.hex) -tail(prostate.hex) - -# Get quantiles and examine outliers -prostate.qs = quantile(prostate.hex$PSA) -prostate.qs - -# Note: Right now, assignment must be done manually with h2o.assign! 
-outliers.low = prostate.hex[prostate.hex$PSA <= prostate.qs[2],] -outliers.low = h2o.assign(outliers.low, "PSA.low") -outliers.high = prostate.hex[prostate.hex$PSA >= prostate.qs[10],] -outliers.high = h2o.assign(outliers.high, "PSA.high") - -nrow(outliers.low) + nrow(outliers.high) -head(outliers.low); tail(outliers.low) -head(outliers.high); tail(outliers.high) - -# Drop outliers from data -prostate.trim = prostate.hex[prostate.hex$PSA > prostate.qs[2],] -prostate.trim = h2o.assign(prostate.trim, "prostate.trim") -prostate.trim = prostate.trim[prostate.trim$PSA < prostate.qs[10],] -prostate.trim = h2o.assign(prostate.trim, "prostate.trim") -nrow(prostate.trim) - -# Construct test and training sets -s = runif(nrow(prostate.hex)) -prostate.train = prostate.hex[s <= 0.8,] -prostate.train = h2o.assign(prostate.train, "prostate.train") -prostate.test = prostate.hex[s > 0.8,] -prostate.test = h2o.assign(prostate.test, "prostate.test") -nrow(prostate.train) + nrow(prostate.test) - -# Run GBM on training set and predict on test set -myY = "CAPSULE"; myX = setdiff(colnames(prostate.train), c(myY, "ID")) -prostate.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = prostate.train) -prostate.gbm -prostate.pred = h2o.predict(prostate.gbm, prostate.test) -summary(prostate.pred) -head(prostate.pred) +library(h2o) +h2o.installDepPkgs() +myIP = "127.0.0.1"; myPort = 54321 +localH2O = h2o.init(ip = myIP, port = myPort, startH2O = TRUE, silentUpgrade = FALSE, promptUpgrade = TRUE) + +# Import iris file to H2O +prosPath = system.file("extdata", "prostate.csv", package="h2oRClient") +prostate.hex = h2o.importFile.FV(localH2O, path = prosPath, key = "prostate.hex") + +# Print out basic summary +summary(prostate.hex) +head(prostate.hex) +tail(prostate.hex) +table(prostate.hex$RACE) # Note: Currently only works on a single integer/factor column + +# Get quantiles and examine outliers +prostate.qs = quantile(prostate.hex$PSA) +print(prostate.qs) + +# Note: Right now, assignment must be done manually with h2o.assign! 
+# PSA.outliers = prostate.hex[prostate.hex$PSA <= prostate.qs[2] | prostate.hex$PSA >= prostate.qs[10],] +PSA.outliers.ind = prostate.hex$PSA <= prostate.qs[2] | prostate.hex$PSA >= prostate.qs[10] +PSA.outliers = prostate.hex[PSA.outliers.ind,] +PSA.outliers = h2o.assign(PSA.outliers, "PSA.outliers") +nrow(PSA.outliers) +head(PSA.outliers); tail(PSA.outliers) + +# Drop outliers from data +# prostate.trim = prostate.hex[prostate.hex$PSA > prostate.qs[2] && prostate.hex$PSA < prostate.qs[10],] +prostate.trim = prostate.hex[!PSA.outliers.ind,] +prostate.trim = h2o.assign(prostate.trim, "prostate.trim") +nrow(prostate.trim) + +# Construct test and training sets +s = runif(nrow(prostate.hex)) +prostate.train = prostate.hex[s <= 0.8,] +prostate.train = h2o.assign(prostate.train, "prostate.train") +prostate.test = prostate.hex[s > 0.8,] +prostate.test = h2o.assign(prostate.test, "prostate.test") +nrow(prostate.train) + nrow(prostate.test) + +# Run GBM on training set and predict on test set +myY = "CAPSULE"; myX = setdiff(colnames(prostate.train), c(myY, "ID")) +prostate.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = prostate.train) +print(prostate.gbm) +prostate.pred = h2o.predict(prostate.gbm, prostate.test) +summary(prostate.pred) +head(prostate.pred) tail(prostate.pred) \ No newline at end of file diff --git a/R/h2oRClient-package/R/Classes.R b/R/h2oRClient-package/R/Classes.R index f00675e6f0..3e05b1453c 100644 --- a/R/h2oRClient-package/R/Classes.R +++ b/R/h2oRClient-package/R/Classes.R @@ -415,6 +415,7 @@ setMethod("show", "H2ORawData2", function(object) { setMethod("show", "H2OParsedData2", function(object) { print(object@h2o) cat("Parsed Data Key:", object@key, "\n") + if(ncol(object) <= 1000) print(head(object)) }) setMethod("[", "H2OParsedData2", function(x, i, j, ..., drop = TRUE) { @@ -501,8 +502,8 @@ setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__b setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) @@ -515,8 +516,8 @@ setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(" setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", 
c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) @@ -529,9 +530,10 @@ setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(" setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) +setMethod("!", "H2OParsedData2", function(x) { h2o.__unop2("!", x) }) setMethod("abs", "H2OParsedData2", function(x) { h2o.__unop2("abs", x) }) setMethod("sign", "H2OParsedData2", function(x) { h2o.__unop2("sgn", x) }) setMethod("sqrt", "H2OParsedData2", function(x) { h2o.__unop2("sqrt", x) }) @@ -541,7 +543,9 @@ setMethod("log", "H2OParsedData2", function(x) { h2o.__unop2("log", x) }) setMethod("exp", "H2OParsedData2", function(x) { h2o.__unop2("exp", x) }) setMethod("sum", "H2OParsedData2", function(x) { h2o.__unop2("sum", x) }) setMethod("is.na", "H2OParsedData2", function(x) { h2o.__unop2("is.na", x) }) -setMethod("table", "H2OParsedData2", function(x) { h2o.__unop2("table", x) }) + +table <- function(object) { UseMethod("table", object) } +setMethod("table", "H2OParsedData2", function(object) { h2o.__unop2("table", object) }) setMethod("colnames", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) diff --git a/R/h2oRClient-package/R/Internal.R b/R/h2oRClient-package/R/Internal.R index 394d7bb0d9..2445386cae 100644 --- a/R/h2oRClient-package/R/Internal.R +++ b/R/h2oRClient-package/R/Internal.R @@ -1,10 +1,12 @@ # Hack to get around Exec.json always dumping to same Result.hex key +# TODO: Need better way to manage temporary/intermediate values in calculations! 
Right now, overwriting may occur silently pkg.env = new.env() pkg.env$result_count = 0 +pkg.env$temp_count = 0 pkg.env$IS_LOGGING = FALSE TEMP_KEY = "Last.value" -RESULT_MAX = 100 -LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&", "|") +RESULT_MAX = 200 +LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&&", "||", "!") # Initialize functions for R logging myPath = paste(Sys.getenv("HOME"), "Library/Application Support/h2o", sep="/") @@ -296,9 +298,9 @@ h2o.__unop2 <- function(op, x) { if(res$num_rows == 0 && res$num_cols == 0) # TODO: If logical operator, need to indicate return(res$scalar) if(op %in% LOGICAL_OPERATORS) - new("H2OLogicalData2", h2o=myClient, key=res$dest_key) + new("H2OLogicalData2", h2o=x@h2o, key=res$dest_key) else - new("H2OParsedData2", h2o=myClient, key=res$dest_key) + new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) } h2o.__binop2 <- function(op, x, y) { diff --git a/src/main/java/water/exec/ASTOp.java b/src/main/java/water/exec/ASTOp.java index bf218165a2..a6114da1d2 100644 --- a/src/main/java/water/exec/ASTOp.java +++ b/src/main/java/water/exec/ASTOp.java @@ -24,6 +24,7 @@ public abstract class ASTOp extends AST { put(new ASTFlr()); put(new ASTLog()); put(new ASTExp()); + put(new ASTNot()); // Binary ops put(new ASTPlus()); @@ -51,6 +52,7 @@ public abstract class ASTOp extends AST { put(new ASTIfElse()); put(new ASTRApply()); put(new ASTRunif()); + put(new ASTCut()); } static private void put(ASTOp ast) { OPS.put(ast.opStr(),ast); } @@ -129,6 +131,7 @@ class ASTFlr extends ASTUniOp { String opStr(){ return "floor"; } ASTOp make() class ASTLog extends ASTUniOp { String opStr(){ return "log"; } ASTOp make() {return new ASTLog ();} double op(double d) { return Math.log(d);}} class ASTExp extends ASTUniOp { String opStr(){ return "exp"; } ASTOp make() {return new ASTExp ();} double op(double d) { return Math.exp(d);}} class ASTIsNA extends ASTUniOp { String opStr(){ return "is.na"; } ASTOp make() {return new ASTIsNA();} double op(double d) { return Double.isNaN(d)?1:0;}} +class ASTNot extends ASTUniOp { String opStr(){ return "!"; } ASTOp make() {return new ASTNot(); } double op(double d) { return d==0?1:0; }} class ASTNrow extends ASTUniOp { ASTNrow() { super(VARS,new Type[]{Type.DBL,Type.ARY}); } @Override String opStr() { return "nrow"; } @@ -227,8 +230,8 @@ class ASTGT extends ASTBinOp { String opStr(){ return ">" ;} ASTOp make() {re class ASTGE extends ASTBinOp { String opStr(){ return ">=" ;} ASTOp make() {return new ASTGE ();} double op(double d0, double d1) { return d0>=d1?1:0;}} class ASTEQ extends ASTBinOp { String opStr(){ return "==" ;} ASTOp make() {return new ASTEQ ();} double op(double d0, double d1) { return d0==d1?1:0;}} class ASTNE extends ASTBinOp { String opStr(){ return "!=" ;} ASTOp make() {return new ASTNE ();} double op(double d0, double d1) { return d0!=d1?1:0;}} -class ASTLA extends ASTBinOp { String opStr(){ return "&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0&&d1!=0)?1:0;}} -class ASTLO extends ASTBinOp { String opStr(){ return "|" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0!=0||d1!=0)?1:0;}} +class ASTLA extends ASTBinOp { String opStr(){ return "&&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0 && d1!=0)?1:0;}} +class ASTLO extends ASTBinOp { String opStr(){ return "||" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0==0 && d1==0)?0:1;}} class ASTReduce extends 
ASTOp { static final String VARS[] = new String[]{ "", "op2", "ary"}; @@ -521,3 +524,44 @@ class ASTRApply extends ASTOp { } } +class ASTCut extends ASTOp { + ASTCut() { super(new String[]{"cut", "ary", "dbls"}, new Type[]{Type.ARY, Type.ARY, Type.dblary()}); } + @Override String opStr() { return "cut"; } + @Override ASTOp make() {return new ASTCut();} + @Override void apply(Env env, int argcnt) { + if(env.isDbl()) { + int nbins = (int) Math.floor(env.popDbl()); + if(nbins < 2) + throw new IllegalArgumentException("Number of intervals must be at least 2"); + + Frame fr = env.popAry(); + String skey = env.key(); + if(fr.vecs().length != 1 || fr.domains()[0] != null) + throw new IllegalArgumentException("First argument must be a numeric vector"); + + final double fmax = fr.vecs()[0].max(); + final double fmin = fr.vecs()[0].min(); + final double width = (fmax - fmin)/nbins; + // TODO: Check what R does when width = 0, I think it perturbs constant vecs automatically + + /* String[][] domains = new String[1][nbins]; + for(int i = 0; i < nbins; i++) + domains[0][i] = "(" + fmin + i*width + "," + fmin + (i+1)*width + "]"; */ + + Frame fr2 = new MRTask2() { + @Override public void map(Chunk chk, NewChunk nchk) { + for(int r = 0; r < chk._len; r++) { + double x = chk.at0(r); + nchk.addNum(Math.floor((x - fmin)/width)); + // TODO: Add all unique bins as domains (lower_bound, upper_bound] + } + } + }.doAll(1,fr).outputFrame(fr._names, fr.domains()); + // }.doAll(1,fr).outputFrame(fr._names, domains); + env.subRef(fr, skey); + env.pop(); + env.push(fr2); + } else + throw H2O.unimpl(); + } +} \ No newline at end of file From 31d5745313e45b9c0a897f1db323dcd3178518af Mon Sep 17 00:00:00 2001 From: Cliff Click Date: Mon, 11 Nov 2013 11:05:08 -0800 Subject: [PATCH 08/11] Cleanup fvec compression Major comment upgrade. Allocate side arrays in each Chunk. Drop redundant min/max/hasFloat fields. Drop redundant invalid/setInvalid calls. Minor bugfix with setting NA into existing C0DChunk. 
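Side note on the C0DChunk fix mentioned above: the old setNA_impl tested `_con==Double.NaN`, but NaN never compares equal to anything (including itself), so that check could never succeed; Double.isNaN(_con) is the correct test. A minimal, hypothetical sketch (class name invented for illustration, not part of this patch) showing the difference:

    // NaNCompareDemo.java -- illustrative only, not part of this patch
    public class NaNCompareDemo {
      public static void main(String[] args) {
        double con = Double.NaN;                 // e.g. a constant chunk that is all NAs
        System.out.println(con == Double.NaN);   // false: NaN is not == to anything, even itself
        System.out.println(Double.isNaN(con));   // true: the correct NA test used by the fix
      }
    }
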
--- src/main/java/water/MemoryManager.java | 21 +++-- src/main/java/water/fvec/C0DChunk.java | 5 +- src/main/java/water/fvec/C0LChunk.java | 5 +- src/main/java/water/fvec/C1Chunk.java | 7 +- src/main/java/water/fvec/C1NChunk.java | 12 +-- src/main/java/water/fvec/C1SChunk.java | 16 ++-- src/main/java/water/fvec/C2Chunk.java | 7 +- src/main/java/water/fvec/C2SChunk.java | 27 +++--- src/main/java/water/fvec/C4SChunk.java | 13 ++- src/main/java/water/fvec/C8DChunk.java | 5 +- src/main/java/water/fvec/CBSChunk.java | 9 +- src/main/java/water/fvec/CX0Chunk.java | 1 - src/main/java/water/fvec/CX2Chunk.java | 1 - src/main/java/water/fvec/NewChunk.java | 120 +++++++++++-------------- 14 files changed, 117 insertions(+), 132 deletions(-) diff --git a/src/main/java/water/MemoryManager.java b/src/main/java/water/MemoryManager.java index 6a3e44b583..af33aecfee 100644 --- a/src/main/java/water/MemoryManager.java +++ b/src/main/java/water/MemoryManager.java @@ -227,9 +227,10 @@ public static Object malloc(int elems, long bytes, int type, Object orig, int fr case 5: return new float [elems]; case 9: return new double [elems]; case 0: return new boolean[elems]; - case -1: return Arrays.copyOfRange((byte[])orig,from,elems); - case -4: return Arrays.copyOfRange((int [])orig,from,elems); - case -8: return Arrays.copyOfRange((long[])orig,from,elems); + case -1: return Arrays.copyOfRange((byte [])orig,from,elems); + case -4: return Arrays.copyOfRange((int [])orig,from,elems); + case -8: return Arrays.copyOfRange((long [])orig,from,elems); + case -9: return Arrays.copyOfRange((double[])orig,from,elems); default: throw H2O.unimpl(); } } @@ -251,12 +252,14 @@ public static Object malloc(int elems, long bytes, int type, Object orig, int fr public static float [] malloc4f(int size) { return (float [])malloc(size,size*4, 5,null,0); } public static double [] malloc8d(int size) { return (double [])malloc(size,size*8, 9,null,0); } public static boolean[] mallocZ (int size) { return (boolean[])malloc(size,size*1, 0,null,0); } - public static byte[] arrayCopyOfRange(byte[] orig, int from, int sz) { return (byte[]) malloc(sz,(sz-from),-1,orig,from); } - public static int [] arrayCopyOfRange(int [] orig, int from, int sz) { return (int []) malloc(sz,(sz-from),-4,orig,from); } - public static long[] arrayCopyOfRange(long[] orig, int from, int sz) { return (long[]) malloc(sz,(sz-from),-8,orig,from); } - public static byte[] arrayCopyOf( byte[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } - public static int [] arrayCopyOf( int [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } - public static long[] arrayCopyOf( long[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static byte [] arrayCopyOfRange(byte [] orig, int from, int sz) { return (byte []) malloc(sz,(sz-from),-1,orig,from); } + public static int [] arrayCopyOfRange(int [] orig, int from, int sz) { return (int []) malloc(sz,(sz-from),-4,orig,from); } + public static long [] arrayCopyOfRange(long [] orig, int from, int sz) { return (long []) malloc(sz,(sz-from),-8,orig,from); } + public static double [] arrayCopyOfRange(double[] orig, int from, int sz) { return (double[]) malloc(sz,(sz-from),-9,orig,from); } + public static byte [] arrayCopyOf( byte [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static int [] arrayCopyOf( int [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static long [] arrayCopyOf( long [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static double [] arrayCopyOf( 
double[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } // Memory available for tasks (we assume 3/4 of the heap is available for tasks) static final AtomicLong _taskMem = new AtomicLong(MEM_MAX-(MEM_MAX>>2)); diff --git a/src/main/java/water/fvec/C0DChunk.java b/src/main/java/water/fvec/C0DChunk.java index dd3d8d2276..78fda64660 100644 --- a/src/main/java/water/fvec/C0DChunk.java +++ b/src/main/java/water/fvec/C0DChunk.java @@ -2,6 +2,7 @@ import java.util.Arrays; import water.AutoBuffer; +import water.MemoryManager; import water.UDP; /** @@ -24,7 +25,7 @@ public class C0DChunk extends Chunk { @Override boolean set_impl(int idx, long l) { return l==_con; } @Override boolean set_impl(int i, double d) { return d==_con; } @Override boolean set_impl(int i, float f ) { return f==_con; } - @Override boolean setNA_impl(int i) { return _con==Double.NaN; } + @Override boolean setNA_impl(int i) { return Double.isNaN(_con); } @Override boolean hasFloat() { return (long)_con!=_con; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C0DChunk read(AutoBuffer bb) { @@ -35,7 +36,7 @@ public class C0DChunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - Arrays.fill(nc._ds,_con); + Arrays.fill(nc._ds = MemoryManager.malloc8d(_len),_con); return nc; } // 3.3333333e33 diff --git a/src/main/java/water/fvec/C0LChunk.java b/src/main/java/water/fvec/C0LChunk.java index 7b05420074..bf6a212254 100644 --- a/src/main/java/water/fvec/C0LChunk.java +++ b/src/main/java/water/fvec/C0LChunk.java @@ -2,6 +2,7 @@ import java.util.Arrays; import water.AutoBuffer; +import water.MemoryManager; import water.UDP; /** @@ -32,8 +33,8 @@ public class C0LChunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - if( nc._ls != null ) Arrays.fill(nc._ls,_con); - else Arrays.fill(nc._ds,_con); + nc._xs = MemoryManager.malloc4(_len); + Arrays.fill(nc._ls = MemoryManager.malloc8(_len),_con); return nc; } } diff --git a/src/main/java/water/fvec/C1Chunk.java b/src/main/java/water/fvec/C1Chunk.java index cd78af8b1d..8c9cbb4d5a 100644 --- a/src/main/java/water/fvec/C1Chunk.java +++ b/src/main/java/water/fvec/C1Chunk.java @@ -37,9 +37,12 @@ public class C1Chunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - if( isNA_impl(i) ) nc.setInvalid(i); - else nc._ls[i] = at8_impl(i); + int res = 0xFF&_mem[i+OFF]; + if( res == C1Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/C1NChunk.java b/src/main/java/water/fvec/C1NChunk.java index c5576646ac..32ae4f874f 100644 --- a/src/main/java/water/fvec/C1NChunk.java +++ b/src/main/java/water/fvec/C1NChunk.java @@ -15,11 +15,6 @@ public class C1NChunk extends Chunk { @Override boolean set_impl(int i, double d) { return false; } @Override boolean set_impl(int i, float f ) { return false; } @Override boolean setNA_impl(int idx) { return false; } - @Override NewChunk inflate_impl(NewChunk nc) { - for( int i=0; i<_len; i++ ) - nc._ls[i] = at8_impl(i); - return nc; - } @Override boolean hasFloat() { return false; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C1NChunk read(AutoBuffer bb) { @@ -28,4 +23,11 @@ public class C1NChunk extends Chunk { _len = _mem.length; return this; } + @Override NewChunk inflate_impl(NewChunk 
nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); + for( int i=0; i<_len; i++ ) + nc._ls[i] = 0xFF&_mem[i+OFF]; + return nc; + } } diff --git a/src/main/java/water/fvec/C1SChunk.java b/src/main/java/water/fvec/C1SChunk.java index 909c8f6681..1a85dd7ec5 100644 --- a/src/main/java/water/fvec/C1SChunk.java +++ b/src/main/java/water/fvec/C1SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -49,17 +50,12 @@ public class C1SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; - nc._ds = null; - nc._ls = MemoryManager.malloc8 (_len); - nc._xs = MemoryManager.malloc4 (_len); + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = 0xFF&_mem[i+OFF]; - if( res == C1Chunk._NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = 0xFF&_mem[i+OFF]; + if( res == C1Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C2Chunk.java b/src/main/java/water/fvec/C2Chunk.java index 9343f97d0b..05db07075f 100644 --- a/src/main/java/water/fvec/C2Chunk.java +++ b/src/main/java/water/fvec/C2Chunk.java @@ -41,9 +41,12 @@ public class C2Chunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - if( isNA_impl(i) ) nc.setInvalid(i); - else nc._ls[i] = at8_impl(i); + int res = UDP.get2(_mem,(i<<1)+OFF); + if( res == C2Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/C2SChunk.java b/src/main/java/water/fvec/C2SChunk.java index 4e40bf5d84..96849998c2 100644 --- a/src/main/java/water/fvec/C2SChunk.java +++ b/src/main/java/water/fvec/C2SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -7,7 +8,6 @@ * The scale/bias function, where data is in SIGNED bytes before scaling. 
*/ public class C2SChunk extends Chunk { - static private final long _NA = Short.MIN_VALUE; static final int OFF=8+4; public double _scale; int _bias; @@ -18,14 +18,14 @@ public class C2SChunk extends Chunk { } @Override protected final long at8_impl( int i ) { long res = UDP.get2(_mem,(i<<1)+OFF); - if( res == _NA ) throw new IllegalArgumentException("at8 but value is missing"); + if( res == C2Chunk._NA ) throw new IllegalArgumentException("at8 but value is missing"); return (long)((res + _bias)*_scale); } @Override protected final double atd_impl( int i ) { long res = UDP.get2(_mem,(i<<1)+OFF); - return (res == _NA)?Double.NaN:(res + _bias)*_scale; + return (res == C2Chunk._NA)?Double.NaN:(res + _bias)*_scale; } - @Override protected final boolean isNA_impl( int i ) { return UDP.get2(_mem,(i<<1)+OFF) == _NA; } + @Override protected final boolean isNA_impl( int i ) { return UDP.get2(_mem,(i<<1)+OFF) == C2Chunk._NA; } @Override boolean set_impl(int idx, long l) { long res = (long)(l/_scale)-_bias; // Compressed value double d = (res+_bias)*_scale; // Reverse it @@ -36,14 +36,14 @@ public class C2SChunk extends Chunk { } @Override boolean set_impl(int i, double d) { short s = (short)((d/_scale)-_bias); - if( s == _NA ) return false; + if( s == C2Chunk._NA ) return false; double d2 = (s+_bias)*_scale; if( d!=d2 ) return false; UDP.set2(_mem,(i<<1)+OFF,s); return true; } @Override boolean set_impl(int i, float f ) { return false; } - @Override boolean setNA_impl(int idx) { UDP.set2(_mem,(idx<<1)+OFF,(short)_NA); return true; } + @Override boolean setNA_impl(int idx) { UDP.set2(_mem,(idx<<1)+OFF,(short)C2Chunk._NA); return true; } @Override boolean hasFloat() { return _scale < 1.0; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C2SChunk read(AutoBuffer bb) { @@ -57,17 +57,12 @@ public class C2SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; - nc._ds = null; - nc._ls = MemoryManager.malloc8 (_len); - nc._xs = MemoryManager.malloc4 (_len); + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = UDP.get2(_mem,(i<<1)+OFF); - if( res == _NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = UDP.get2(_mem,(i<<1)+OFF); + if( res == C2Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C4SChunk.java b/src/main/java/water/fvec/C4SChunk.java index 61f85a61ac..a762680f09 100644 --- a/src/main/java/water/fvec/C4SChunk.java +++ b/src/main/java/water/fvec/C4SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -50,14 +51,12 @@ public class C4SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = UDP.get4(_mem,(i<<2)+OFF); - if( res == _NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = UDP.get4(_mem,(i<<2)+OFF); + if( res == _NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C8DChunk.java b/src/main/java/water/fvec/C8DChunk.java 
index a9291271db..8fa0c82589 100644 --- a/src/main/java/water/fvec/C8DChunk.java +++ b/src/main/java/water/fvec/C8DChunk.java @@ -12,10 +12,7 @@ public class C8DChunk extends Chunk { if( Double.isNaN(res) ) throw new IllegalArgumentException("at8 but value is missing"); return (long)res; } - @Override protected final double atd_impl( int i ) { - double res = UDP.get8d(_mem,i<<3); - return res; - } + @Override protected final double atd_impl( int i ) { return UDP.get8d(_mem,i<<3) ; } @Override protected final boolean isNA_impl( int i ) { return Double.isNaN(UDP.get8d(_mem,i<<3)); } @Override boolean set_impl(int idx, long l) { return false; } @Override boolean set_impl(int i, double d) { diff --git a/src/main/java/water/fvec/CBSChunk.java b/src/main/java/water/fvec/CBSChunk.java index 4e90c349c3..7c8a6682f5 100644 --- a/src/main/java/water/fvec/CBSChunk.java +++ b/src/main/java/water/fvec/CBSChunk.java @@ -1,6 +1,7 @@ package water.fvec; import water.AutoBuffer; +import water.MemoryManager; import water.H2O; /** A simple chunk for boolean values. In fact simple bit vector. @@ -57,10 +58,12 @@ protected byte atb(int idx) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for (int i=0; i<_len; i++) { - long res = at8_impl(i); - if (res == _NA) nc.setInvalid(i); - else nc._ls[i] = res; + int res = atb(i); + if (res == _NA) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/CX0Chunk.java b/src/main/java/water/fvec/CX0Chunk.java index 55ca525ab7..b334507c7c 100644 --- a/src/main/java/water/fvec/CX0Chunk.java +++ b/src/main/java/water/fvec/CX0Chunk.java @@ -42,7 +42,6 @@ public CX0Chunk(long[] ls, int len, int nzcnt) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - nc._ds = null; nc._ls = MemoryManager.malloc8 (_len); nc._xs = MemoryManager.malloc4 (_len); for( int i=OFF; i<_mem.length; i+=2 ) diff --git a/src/main/java/water/fvec/CX2Chunk.java b/src/main/java/water/fvec/CX2Chunk.java index 9eb1dafc11..34c9cf4857 100644 --- a/src/main/java/water/fvec/CX2Chunk.java +++ b/src/main/java/water/fvec/CX2Chunk.java @@ -53,7 +53,6 @@ private int at_impl(int idx) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - nc._ds = null; nc._ls = MemoryManager.malloc8 (_len); nc._xs = MemoryManager.malloc4 (_len); for( int i=OFF; i<_mem.length; i+=4 ) { diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 56e7e693f9..8c63678370 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -8,38 +8,31 @@ import water.*; import water.parser.DParseTask; -// An uncompressed chunk of data, support an append operation +// An uncompressed chunk of data, supporting an append operation public class NewChunk extends Chunk { final int _cidx; - transient long _ls[]; // Mantissa - transient int _xs[]; // Exponent + // We can record the following (mixed) data types: + // 1- doubles, in _ds including NaN for NA & 0; _ls==_xs==null + // 2- scaled decimals from parsing, in _ls & _xs; _ds==null + // 3- zero: requires _ls==0 && _xs==0 + // 4- NA: either _ls==0 && _xs==Integer.MIN_VALUE, OR _ds=NaN + // 5- Enum: _ls==0 && _xs>0 && _ds==null + // Chunk._len is the count of elements appended + // Sparse: if _row !=null, then _ls/_xs/_ds are compressed to non-zero's + // only, and _row is the row number. Still Chunk._len is count of elements + // including zeros. 
+ transient long _ls[]; // Mantissa + transient int _xs[]; // Exponent, or if _ls==0, NA or Enum transient double _ds[]; // Doubles, for inflating via doubles - transient double _min, _max; - int _naCnt; - int _strCnt; + int _naCnt; // Count of NA's appended + int _strCnt; // Count of Enum's appended - public NewChunk( Vec vec, int cidx ) { - _vec = vec; - _cidx = cidx; // This chunk# - _ls = new long[4]; // A little room for data - _xs = new int [4]; - _min = Double.MAX_VALUE; - _max = -Double.MAX_VALUE; - } + public NewChunk( Vec vec, int cidx ) { _vec = vec; _cidx = cidx; } - // Constructor used when inflating a Chunk + // Constructor used when inflating a Chunk. public NewChunk( Chunk C ) { - _vec = C._vec; - _cidx = _vec.elem2ChunkIdx(C._start); // This chunk# + this(C._vec,C._vec.elem2ChunkIdx(C._start)); _len = C._len; - if( C.hasFloat() || C instanceof C0DChunk ) { - _ds = MemoryManager.malloc8d(_len); - } else { - _ls = MemoryManager.malloc8 (_len); - _xs = MemoryManager.malloc4 (_len); - } - _min = Double.MAX_VALUE; - _max = -Double.MAX_VALUE; } public byte type(){ @@ -50,52 +43,41 @@ public byte type(){ return AppendableVec.NUMBER; } protected final boolean isNA(int idx) { - return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] != 0) : Double.isNaN(_ds[idx]); + return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] == Integer.MIN_VALUE) : Double.isNaN(_ds[idx]); } - public void addNA(){ - append2(0,Integer.MIN_VALUE); ++_naCnt; - } - private boolean _hasFloat; + public void addEnum(int e) { append2(0, e ); ++_strCnt;} + public void addNA ( ) { append2(0,Integer.MIN_VALUE); ++_naCnt ;} public void addNum(long val, int exp) { - if(val == 0)exp = 0; - _hasFloat |= (exp < 0); + if(val == 0)exp = 0; // Canonicalize zero append2(val,exp); } - public void addEnum(int e) { - append2(0,e); ++_strCnt; - } + // Fast-path append double data public void addNum(double d) { - if(_ds == null) { - assert _len == 0; - _ds = new double[1]; - } - if( _len >= _ds.length ) { - if( _len > Vec.CHUNK_SZ ) - throw new ArrayIndexOutOfBoundsException(_len); - _ds = Arrays.copyOf(_ds,_len<<1); - } - _ds[_len] = d; - _len++; - _hasFloat = true; + if( _ls==null||_len >= _ls.length ) append2slowd(); + _ds[_len++] = d; } - // Fast-path append long data void append2( long l, int x ) { - if( _len >= _ls.length ) append2slow(); - _ls[_len] = l; - _xs[_len] = x; - _len++; + if( _ls==null||_len >= _ls.length ) append2slow(); + _ls[_len ] = l; + _xs[_len++] = x; + } + // Slow-path append data + void append2slowd( ) { + if( _len > Vec.CHUNK_SZ ) + throw new ArrayIndexOutOfBoundsException(_len); + assert _ls==null; + _ds = _ds==null ? MemoryManager.malloc8d(4) : MemoryManager.arrayCopyOf(_ds,_len<<1); } // Slow-path append data void append2slow( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); - _ls = MemoryManager.arrayCopyOf(_ls,_len<<1); - _xs = MemoryManager.arrayCopyOf(_xs,_len<<1); + assert _ds==null; + _xs = _ls==null ? MemoryManager.malloc4(4) : MemoryManager.arrayCopyOf(_xs,_len<<1); + _ls = _ls==null ? 
MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls,_len<<1); } - void invalid() { append2(0,Integer.MIN_VALUE); } - void setInvalid(int idx) { _ls[idx]=0; _xs[idx] = Integer.MIN_VALUE; } /* * @@ -267,8 +249,8 @@ Chunk compress() { _ls = new long[_ds.length]; // Else flip to longs _xs = new int [_ds.length]; for( i=0; i<_len; i++ ) // Inject all doubles into longs - if( Double.isNaN(_ds[i]) ) setInvalid(i); - else _ls[i] = (long)_ds[i]; + if( Double.isNaN(_ds[i]) ) _xs[i] = Integer.MIN_VALUE; + else _ls[i] = (long)_ds[i]; } // data in some fixed-point format. @@ -276,6 +258,8 @@ Chunk compress() { boolean hasNA = false; _naCnt=0; int nzCnt=0; // Non-zero count + double min = Double.MAX_VALUE; + double max = -Double.MAX_VALUE; for( int i=0; i<_len; i++ ) { if( isNA(i) ) { hasNA = true; _naCnt++; continue;} @@ -284,8 +268,8 @@ Chunk compress() { if( l!=0 ) nzCnt++; // Compute per-chunk min/sum/max double d = l*DParseTask.pow10(x); - if( d < _min ) _min = d; - if( d > _max ) _max = d; + if( d < min ) min = d; + if( d > max ) max = d; if( l==0 ) x=0; // Canonicalize zero exponent long t; while( l!=0 && (t=l/10)*10==l ) { l=t; x++; } @@ -309,17 +293,17 @@ Chunk compress() { if( le < lemin ) lemin=le; if( le > lemax ) lemax=le; } - final boolean fpoint = xmin < 0 || _min < Long.MIN_VALUE || _max > Long.MAX_VALUE; + final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Constant column? - if(!hasNA && _min==_max ) { - return ((long)_min == _min) - ?new C0LChunk((long)_min,_len) - :new C0DChunk(_min, _len); + if( !hasNA && min==max ) { + return ((long)min == min) + ? new C0LChunk((long)min,_len) + : new C0DChunk( min,_len); } // Boolean column? - if (_max == 1 && _min == 0 && xmin == 0) { + if (max == 1 && min == 0 && xmin == 0) { if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk int bpv = _strCnt+_naCnt > 0 ? 2 : 1; @@ -360,7 +344,7 @@ Chunk compress() { if(xmin == 0 && 0<=lemin && lemax <= 255 && ((_naCnt + _strCnt)==0) ) return new C1NChunk( bufX(0,0,C1NChunk.OFF,0)); if( lemax-lemin < 255 ) { // Span fits in a byte? - if(0 <= _min && _max < 255 ) // Span fits in an unbiased byte? + if(0 <= min && max < 255 ) // Span fits in an unbiased byte? return new C1Chunk( bufX(0,0,C1Chunk.OFF,0)); return new C1SChunk( bufX(lemin,xmin,C1SChunk.OFF,0),(int)lemin,DParseTask.pow10i(xmin)); } @@ -373,7 +357,7 @@ Chunk compress() { return new C2SChunk( bufX(bias,xmin,C2SChunk.OFF,1),bias,DParseTask.pow10i(xmin)); } // Compress column into ints - if(Integer.MIN_VALUE < _min && _max <= Integer.MAX_VALUE ) + if( Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE ) return new C4Chunk( bufX(0,0,0,2)); return new C8Chunk( bufX(0,0,0,3)); } From 76f67e15050bd98ad9a82c5df5cc18b0f2b4e642 Mon Sep 17 00:00:00 2001 From: cliffclick Date: Mon, 11 Nov 2013 20:52:14 -0800 Subject: [PATCH 09/11] More cleanup of NewChunk Allow Enums to compress any style; remove one 64K cap (may be more limits). Correct/cleaup _naCnt. Tighter asserts, more comments. 
Remove leaking Key from Bit-test --- prj.el | 2 +- src/main/java/water/fvec/C1SChunk.java | 2 +- src/main/java/water/fvec/NewChunk.java | 101 ++++++++++----------- src/test/java/water/fvec/CBSChunkTest.java | 22 +++-- 4 files changed, 65 insertions(+), 62 deletions(-) diff --git a/prj.el b/prj.el index aa9f4dbf7b..66a800ada8 100644 --- a/prj.el +++ b/prj.el @@ -7,7 +7,7 @@ '(jde-run-option-debug nil) '(jde-run-option-vm-args nil) '(jde-compile-option-directory "./target/classes") - '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "hex.KMeans2Test"))) + '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.ParserTest2"))) '(jde-debugger (quote ("JDEbug"))) '(jde-compile-option-source (quote ("1.6"))) '(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar"))) diff --git a/src/main/java/water/fvec/C1SChunk.java b/src/main/java/water/fvec/C1SChunk.java index 1a85dd7ec5..383df953f2 100644 --- a/src/main/java/water/fvec/C1SChunk.java +++ b/src/main/java/water/fvec/C1SChunk.java @@ -37,7 +37,7 @@ public class C1SChunk extends Chunk { @Override boolean set_impl(int i, double d) { return false; } @Override boolean set_impl(int i, float f ) { return false; } @Override boolean setNA_impl(int idx) { _mem[idx+OFF] = (byte)C1Chunk._NA; return true; } - @Override boolean hasFloat() { return _scale < 1.0; } + @Override boolean hasFloat() { return _scale < 1.0 || _scale > Long.MAX_VALUE; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C1SChunk read(AutoBuffer bb) { _mem = bb.bufClose(); diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 8c63678370..1b74823f01 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -16,7 +16,7 @@ public class NewChunk extends Chunk { // 2- scaled decimals from parsing, in _ls & _xs; _ds==null // 3- zero: requires _ls==0 && _xs==0 // 4- NA: either _ls==0 && _xs==Integer.MIN_VALUE, OR _ds=NaN - // 5- Enum: _ls==0 && _xs>0 && _ds==null + // 5- Enum: _xs==(Integer.MIN_VALUE+1) && _ds==null // Chunk._len is the count of elements appended // Sparse: if _row !=null, then _ls/_xs/_ds are compressed to non-zero's // only, and _row is the row number. 
Still Chunk._len is count of elements @@ -35,7 +35,24 @@ public NewChunk( Chunk C ) { _len = C._len; } - public byte type(){ + // Assert rollup counts are correct + private boolean checkCnt() { + int nas=0, ss=0; + if( _ds != null ) { + assert _ls==null && _xs==null; + for( double d : _ds ) if( Double.isNaN(d) ) nas++; + } else { + assert _ds==null; + if( _ls != null ) + for( int i=0; i<_ls.length; i++ ) + if( _ls[i]==0 && _xs[i]==Integer.MIN_VALUE ) nas++; + else if( _xs[i]==Integer.MIN_VALUE+1 ) ss++; + } + assert nas==_naCnt && ss==_strCnt : "na="+nas+" vs "+_naCnt+", str="+ss+" vs "+_strCnt; + return true; + } + public byte type() { + assert checkCnt(); if(_naCnt == _len) return AppendableVec.NA; if(_strCnt > 0 && _strCnt + _naCnt == _len) @@ -45,16 +62,19 @@ public byte type(){ protected final boolean isNA(int idx) { return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] == Integer.MIN_VALUE) : Double.isNaN(_ds[idx]); } + protected final boolean isEnum(int idx) { + return _ls!=null && _xs[idx]==Integer.MIN_VALUE+1; + } - public void addEnum(int e) { append2(0, e ); ++_strCnt;} - public void addNA ( ) { append2(0,Integer.MIN_VALUE); ++_naCnt ;} + public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); ++_strCnt;} + public void addNA ( ) { append2(0,Integer.MIN_VALUE ); ++_naCnt ;} public void addNum(long val, int exp) { if(val == 0)exp = 0; // Canonicalize zero append2(val,exp); } // Fast-path append double data public void addNum(double d) { - if( _ls==null||_len >= _ls.length ) append2slowd(); + if( _ds==null||_len >= _ds.length ) append2slowd(); _ds[_len++] = d; } // Fast-path append long data @@ -209,41 +229,16 @@ Chunk compress() { long lemin= 0, lemax=lemin; // min/max at xmin fixed-point boolean overflow=false; boolean floatOverflow = false; + assert checkCnt(); if(_naCnt == _len) // ALL NAs, nothing to do return new C0DChunk(Double.NaN,_len); - // Enum? We assume that columns with ALL strings (and NAs) are enums if - // there were less than 65k unique vals. If there were some numbers, we - // assume it is a numcol with strings being NAs. - if( type() == AppendableVec.ENUM) { - // find their max val - int sz = Integer.MIN_VALUE; - for(int x:_xs) if(x > sz)sz = x; - if( sz < Enum.MAX_ENUM_SIZE ) { - if(sz < 255){ // we can fit into 1Byte - byte [] bs = MemoryManager.malloc1(_len); - for(int i = 0; i < _len; ++i) bs[i] = (byte)(_xs[i] >= 0 ? (0xFF&_xs[i]) : C1Chunk._NA); - return new C1Chunk(bs); - } else if( sz <= 65535 ) { // 2 bytes - int bias = 0, off = 0; - if(sz >= 32767){ - bias = 32767; - off = C2SChunk.OFF; - } - byte [] bs = MemoryManager.malloc1((_len << 1) + off); - for(int i = 0; i < _len; ++i){ - if(_xs[i] >= 0) assert (short)(_xs[i]-bias) == (_xs[i]-bias); - UDP.set2(bs, off + (i << 1), (short)((_xs[i] > 0)? _xs[i]-bias : C2Chunk._NA)); - } - return bias == 0 ? new C2Chunk(bs) : new C2SChunk(bs,bias,1); - } else throw H2O.unimpl(); - } - } + // If the data was set8 as doubles, we do a quick check to see if it's // plain longs. If not, we give up and use doubles. 
if( _ds != null ) { int i=0; - for( ; i<_len; i++ ) // Attempt to inject all doubles into ints + for( ; i<_len; i++ ) // Attempt to inject all doubles into longs if( !Double.isNaN(_ds[i]) && (double)(long)_ds[i] != _ds[i] ) break; if( i<_len ) return chunkD(); _ls = new long[_ds.length]; // Else flip to longs @@ -251,27 +246,27 @@ Chunk compress() { for( i=0; i<_len; i++ ) // Inject all doubles into longs if( Double.isNaN(_ds[i]) ) _xs[i] = Integer.MIN_VALUE; else _ls[i] = (long)_ds[i]; + _ds = null; } - // data in some fixed-point format. + // Data in some fixed-point format, not doubles boolean first = true; - boolean hasNA = false; - _naCnt=0; int nzCnt=0; // Non-zero count double min = Double.MAX_VALUE; double max = -Double.MAX_VALUE; for( int i=0; i<_len; i++ ) { - if( isNA(i) ) { hasNA = true; _naCnt++; continue;} + if( isNA(i) ) continue; long l = _ls[i]; int x = _xs[i]; + if( x==Integer.MIN_VALUE+1 ) x=0; // Replace enum flag with no scaling if( l!=0 ) nzCnt++; - // Compute per-chunk min/sum/max + assert l!=0 || x==0; // Exponent of zero is always zero + // Compute per-chunk min/max double d = l*DParseTask.pow10(x); if( d < min ) min = d; if( d > max ) max = d; - if( l==0 ) x=0; // Canonicalize zero exponent - long t; + long t; // Remove extra scaling while( l!=0 && (t=l/10)*10==l ) { l=t; x++; } floatOverflow = Math.abs(l) > MAX_FLOAT_MANTISSA; if( first ) { @@ -293,10 +288,9 @@ Chunk compress() { if( le < lemin ) lemin=le; if( le > lemax ) lemax=le; } - final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Constant column? - if( !hasNA && min==max ) { + if( _naCnt==0 && min==max ) { return ((long)min == min) ? new C0LChunk((long)min,_len) : new C0DChunk( min,_len); @@ -304,13 +298,15 @@ Chunk compress() { // Boolean column? if (max == 1 && min == 0 && xmin == 0) { - if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? - return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk - int bpv = _strCnt+_naCnt > 0 ? 2 : 1; + if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? + return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk + int bpv = _strCnt+_naCnt > 0 ? 2 : 1; // Bit-vector byte[] cbuf = bufB(CBSChunk.OFF, bpv); return new CBSChunk(cbuf, cbuf[0], cbuf[1]); } + // Result column must hold floats? + final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Highly sparse but not a bitvector or constant? if( !fpoint && (nzCnt+_naCnt)*8 < _len && lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE )// Only handling unbiased shorts here @@ -375,7 +371,7 @@ private byte[] bufX( long bias, int scale, int off, int log ) { default: H2O.fail(); } } else { - int x = _xs[i]-scale; + int x = (_xs[i]==Integer.MIN_VALUE+1 ? 0 : _xs[i])-scale; long le = x >= 0 ? _ls[i]*DParseTask.pow10i( x) : _ls[i]/DParseTask.pow10i(-x); @@ -396,7 +392,7 @@ private byte[] bufX( long bias, int scale, int off, int log ) { private Chunk chunkD() { final byte [] bs = MemoryManager.malloc1(_len*8); for(int i = 0; i < _len; ++i) - UDP.set8d(bs, 8*i, _ds != null?_ds[i]:isNA0(i)?Double.NaN:_ls[i]*DParseTask.pow10(_xs[i])); + UDP.set8d(bs, 8*i, _ds != null?_ds[i]:(isNA(i)||isEnum(i))?Double.NaN:_ls[i]*DParseTask.pow10(_xs[i])); return new C8DChunk(bs); } @@ -436,17 +432,20 @@ private byte[] bufB(int off, int bpv) { // in-range and refer to the inflated values of the original Chunk. 
@Override boolean set_impl(int i, long l) { if( _ds != null ) throw H2O.unimpl(); + if( _xs[i]==Integer.MIN_VALUE+1 ) _naCnt--; _ls[i]=l; _xs[i]=0; return true; } @Override boolean set_impl(int i, double d) { - if( _ls != null ) { - _ds = MemoryManager.malloc8d(_len); + if( _ls != null ) { // Flip to using doubles + double ds[] = MemoryManager.malloc8d(_len); for( int j = 0; j<_len; j++ ) - _ds[j] = _ls[j]*Math.pow(10,_xs[j]); - _ls = null; _xs = null; + ds[j] = (isNA(j) || isEnum(j)) ? Double.NaN : _ls[j]*Math.pow(10,_xs[j]); + _ds = ds; _ls = null; _xs = null; } + if( Double.isNaN(_ds[i]) ) _naCnt--; _ds[i]=d; + if( Double.isNaN( d ) ) _naCnt++; return true; } @Override boolean set_impl(int i, float f) { return set_impl(i,(double)f); } diff --git a/src/test/java/water/fvec/CBSChunkTest.java b/src/test/java/water/fvec/CBSChunkTest.java index 6db4bff1ad..a7385e7561 100644 --- a/src/test/java/water/fvec/CBSChunkTest.java +++ b/src/test/java/water/fvec/CBSChunkTest.java @@ -3,10 +3,12 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; - import water.Futures; +import water.TestUtil; +import water.UKV; /** Test for CBSChunk implementation. * @@ -17,7 +19,8 @@ * expected results. In this case expectation is little bit missused * since it is used to avoid DKV call. * */ -public class CBSChunkTest { +public class CBSChunkTest extends TestUtil { + @BeforeClass public static void stall() { stall_till_cloudsize(1); } void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expNA) { AppendableVec av = new AppendableVec(Vec.newKey()); @@ -43,6 +46,7 @@ void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expN for( int i=0; i Date: Mon, 11 Nov 2013 21:20:21 -0800 Subject: [PATCH 10/11] Uniformly cleanup enums & numbers If the Chunk will be enums, nuke all the numbers. If the Chunk will be numbers, nuke all the enums. --- src/main/java/water/fvec/NewChunk.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 1b74823f01..a6ba349592 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -231,8 +231,15 @@ Chunk compress() { boolean floatOverflow = false; assert checkCnt(); - if(_naCnt == _len) // ALL NAs, nothing to do + byte mode = type(); + if( mode==AppendableVec.NA ) // ALL NAs, nothing to do return new C0DChunk(Double.NaN,_len); + for( int i=0; i<_len; i++ ) + if( mode==AppendableVec.ENUM && !isEnum(i) || + mode==AppendableVec.NUMBER && isEnum(i) ) + setNA_impl(i); + if( mode==AppendableVec.NUMBER ) _strCnt=0; + assert checkCnt(); // If the data was set8 as doubles, we do a quick check to see if it's // plain longs. If not, we give up and use doubles. From cdff5e6d9a2000b2bc18ead164e12c911bd2654d Mon Sep 17 00:00:00 2001 From: Cliff Click Date: Tue, 12 Nov 2013 08:46:11 -0800 Subject: [PATCH 11/11] swap out tricky incremental math for simple bulk math All rollups done in bulk at the start of NewChunk.compress(). Remove some commented-out code. 
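The patch below replaces the incremental _naCnt/_strCnt bookkeeping with one lazy bulk pass the first time type() needs the rollups. A minimal sketch of that pass (hypothetical class, not the actual NewChunk code), using the NA/Enum encoding the earlier NewChunk comments define (NA: _ls==0 && _xs==Integer.MIN_VALUE; Enum: _xs==Integer.MIN_VALUE+1):

  final class RollupSketch {
    int naCnt, strCnt, nzCnt;
    void rollup(long[] ls, int[] xs, double[] ds, int len) {
      naCnt = strCnt = nzCnt = 0;
      if (ds != null) {                      // doubles-only representation
        for (int i = 0; i < len; i++)
          if (Double.isNaN(ds[i])) naCnt++; else if (ds[i] != 0) nzCnt++;
      } else if (ls != null) {               // scaled-decimal (mantissa/exponent) representation
        for (int i = 0; i < len; i++) {
          if (ls[i] == 0 && xs[i] == Integer.MIN_VALUE) naCnt++;
          else {
            if (xs[i] == Integer.MIN_VALUE + 1) strCnt++;  // enum code
            if (ls[i] != 0) nzCnt++;
          }
        }
      }
    }
    public static void main(String[] args) {
      RollupSketch r = new RollupSketch();
      // two numbers (one of them zero), one NA, one enum code 3
      r.rollup(new long[]{5, 0, 0, 3}, new int[]{0, 0, Integer.MIN_VALUE, Integer.MIN_VALUE + 1}, null, 4);
      System.out.println(r.naCnt + " " + r.strCnt + " " + r.nzCnt); // prints: 1 1 2
    }
  }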
--- prj.el | 2 +- src/main/java/water/fvec/NewChunk.java | 187 ++++----------------- src/test/java/water/fvec/CBSChunkTest.java | 2 +- 3 files changed, 36 insertions(+), 155 deletions(-) diff --git a/prj.el b/prj.el index 66a800ada8..d257036cb3 100644 --- a/prj.el +++ b/prj.el @@ -7,7 +7,7 @@ '(jde-run-option-debug nil) '(jde-run-option-vm-args nil) '(jde-compile-option-directory "./target/classes") - '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.ParserTest2"))) + '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.CBSChunkTest" "water.fvec.ParserTest2"))) '(jde-debugger (quote ("JDEbug"))) '(jde-compile-option-source (quote ("1.6"))) '(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar"))) diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index a6ba349592..43f317534b 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -24,8 +24,9 @@ public class NewChunk extends Chunk { transient long _ls[]; // Mantissa transient int _xs[]; // Exponent, or if _ls==0, NA or Enum transient double _ds[]; // Doubles, for inflating via doubles - int _naCnt; // Count of NA's appended + int _naCnt=-1; // Count of NA's appended int _strCnt; // Count of Enum's appended + int _nzCnt; // Count of non-zero's appended public NewChunk( Vec vec, int cidx ) { _vec = vec; _cidx = cidx; } @@ -35,24 +36,26 @@ public NewChunk( Chunk C ) { _len = C._len; } - // Assert rollup counts are correct - private boolean checkCnt() { - int nas=0, ss=0; - if( _ds != null ) { - assert _ls==null && _xs==null; - for( double d : _ds ) if( Double.isNaN(d) ) nas++; - } else { - assert _ds==null; - if( _ls != null ) - for( int i=0; i<_ls.length; i++ ) - if( _ls[i]==0 && _xs[i]==Integer.MIN_VALUE ) nas++; - else if( _xs[i]==Integer.MIN_VALUE+1 ) ss++; - } - assert nas==_naCnt && ss==_strCnt : "na="+nas+" vs "+_naCnt+", str="+ss+" vs "+_strCnt; - return true; - } + // Heuristic to decide the basic type of a column public byte type() { - assert checkCnt(); + if( _naCnt == -1 ) { // No rollups yet? 
+ int nas=0, ss=0, nzs=0; + if( _ds != null ) { + assert _ls==null && _xs==null; + for( double d : _ds ) if( Double.isNaN(d) ) nas++; else if( d!=0 ) nzs++; + } else { + assert _ds==null; + if( _ls != null ) + for( int i=0; i<_ls.length; i++ ) + if( isNA(i) ) nas++; + else { + if( isEnum(i) ) ss++; + if( _ls[i] != 0 ) nzs++; + } + } + _nzCnt=nzs; _strCnt=ss; _naCnt=nas; + } + // Now run heuristic for type if(_naCnt == _len) return AppendableVec.NA; if(_strCnt > 0 && _strCnt + _naCnt == _len) @@ -66,10 +69,10 @@ protected final boolean isEnum(int idx) { return _ls!=null && _xs[idx]==Integer.MIN_VALUE+1; } - public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); ++_strCnt;} - public void addNA ( ) { append2(0,Integer.MIN_VALUE ); ++_naCnt ;} + public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); } + public void addNA ( ) { append2(0,Integer.MIN_VALUE ); } public void addNum(long val, int exp) { - if(val == 0)exp = 0; // Canonicalize zero + if( val == 0 ) exp = 0;// Canonicalize zero append2(val,exp); } // Fast-path append double data @@ -84,14 +87,14 @@ void append2( long l, int x ) { _xs[_len++] = x; } // Slow-path append data - void append2slowd( ) { + private void append2slowd( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); assert _ls==null; _ds = _ds==null ? MemoryManager.malloc8d(4) : MemoryManager.arrayCopyOf(_ds,_len<<1); } // Slow-path append data - void append2slow( ) { + private void append2slow( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); assert _ds==null; @@ -99,117 +102,6 @@ void append2slow( ) { _ls = _ls==null ? MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls,_len<<1); } - /* - * - * - * - * private long attemptTimeParse( ValueString str ) { - long t0 = attemptTimeParse_0(str); // "yyyy-MM-dd HH:mm:ss.SSS" - if( t0 != Long.MIN_VALUE ) return t0; - long t1 = attemptTimeParse_1(str); // "dd-MMM-yy" - if( t1 != Long.MIN_VALUE ) return t1; - return Long.MIN_VALUE; - } - // So I just brutally parse "yyyy-MM-dd HH:mm:ss.SSS" - private long attemptTimeParse_0( ValueString str ) { - final byte[] buf = str._buf; - int i=str._off; - final int end = i+str._length; - while( i < end && buf[i] == ' ' ) i++; - if ( i < end && buf[i] == '"' ) i++; - if( (end-i) < 19 ) return Long.MIN_VALUE; - int yy=0, MM=0, dd=0, HH=0, mm=0, ss=0, SS=0; - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - if( yy < 1970 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - MM = digit(MM,buf[i++]); - MM = digit(MM,buf[i++]); - if( MM < 1 || MM > 12 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - dd = digit(dd,buf[i++]); - dd = digit(dd,buf[i++]); - if( dd < 1 || dd > 31 ) return Long.MIN_VALUE; - if( buf[i++] != ' ' ) return Long.MIN_VALUE; - HH = digit(HH,buf[i++]); - HH = digit(HH,buf[i++]); - if( HH < 0 || HH > 23 ) return Long.MIN_VALUE; - if( buf[i++] != ':' ) return Long.MIN_VALUE; - mm = digit(mm,buf[i++]); - mm = digit(mm,buf[i++]); - if( mm < 0 || mm > 59 ) return Long.MIN_VALUE; - if( buf[i++] != ':' ) return Long.MIN_VALUE; - ss = digit(ss,buf[i++]); - ss = digit(ss,buf[i++]); - if( ss < 0 || ss > 59 ) return Long.MIN_VALUE; - if( i 999 ) return Long.MIN_VALUE; - } - if( i 31 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - byte[]mm=null; - OUTER: for( ; MM 0 ? 2 : 1; // Bit-vector + if( _nzCnt*32 < _len && _naCnt==0 ) // Very sparse? 
+ return new CX0Chunk(_ls,_len,_nzCnt); // Sparse boolean chunk + int bpv = _strCnt+_naCnt > 0 ? 2 : 1; // Bit-vector byte[] cbuf = bufB(CBSChunk.OFF, bpv); return new CBSChunk(cbuf, cbuf[0], cbuf[1]); } @@ -315,9 +204,9 @@ Chunk compress() { // Result column must hold floats? final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Highly sparse but not a bitvector or constant? - if( !fpoint && (nzCnt+_naCnt)*8 < _len && - lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE )// Only handling unbiased shorts here - return new CX2Chunk(_ls,_xs,_len,nzCnt,_naCnt); // Sparse byte chunk + if( !fpoint && (_nzCnt+_naCnt)*8 < _len && + lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE ) // Only handling unbiased shorts here + return new CX2Chunk(_ls,_xs,_len,_nzCnt,_naCnt); // Sparse byte chunk // Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a // byte and we scale the column by 0.1. A set of numbers like @@ -426,10 +315,6 @@ private byte[] bufB(int off, int bpv) { bs[1] = (byte) bpv; // Flush last byte if (boff>0) bs[idx++] = b; - /*for (int i=0; i
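A closing note on the boolean/bit-vector path touched above: when a column holds only 0s and 1s, bufB() packs values at bpv bits apiece, switching to 2 bits per value as soon as NAs or enum codes are present so a third state can be represented. A rough sketch of that packing (hypothetical encoding and names; the real CBSChunk also writes a gap/bpv header into its first bytes, and its exact bit codes and bit order may differ):

  final class BitPackSketch {
    // vals: 0, 1, or -1 for NA
    static byte[] packBits(int[] vals, boolean hasNA) {
      int bpv = hasNA ? 2 : 1;                      // bits per value
      byte[] buf = new byte[(vals.length * bpv + 7) / 8];
      for (int i = 0; i < vals.length; i++) {
        int code = vals[i] < 0 ? 2 : vals[i];       // binary 10 stands for NA in this sketch
        int bit = i * bpv;
        buf[bit >> 3] |= code << (bit & 7);         // a value never straddles a byte for bpv of 1 or 2
      }
      return buf;
    }
    public static void main(String[] args) {
      byte[] b = packBits(new int[]{1, 0, -1, 1}, true); // four values at 2 bits each -> one byte
      System.out.printf("0x%02x%n", b[0]);               // prints 0x61
    }
  }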