From 3038ca536e1cba405d173723ca8aa860c6a7d445 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 12:32:15 -0800 Subject: [PATCH 01/11] Fix in computing tree stats for GBM trees. --- src/main/java/hex/drf/DRF.java | 10 ++-------- src/main/java/hex/gbm/GBM.java | 9 +++++---- src/main/java/hex/gbm/SharedTreeModelBuilder.java | 7 +++++++ 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/main/java/hex/drf/DRF.java b/src/main/java/hex/drf/DRF.java index 28c872c283..ddc6f0eab8 100644 --- a/src/main/java/hex/drf/DRF.java +++ b/src/main/java/hex/drf/DRF.java @@ -345,20 +345,14 @@ private DTree[] buildNextKTrees(Frame fr, int mtrys, float sample_rate, Random r } }.doAll(fr); + // Collect leaves stats + for (int i=0; i>32L),(int)seed }); } + + // helper for debugging + static protected void printGenerateTrees(DTree[] trees) { + for( int k=0; k Date: Mon, 11 Nov 2013 16:17:40 -0800 Subject: [PATCH 02/11] added some Exec2 testing and more stressful gz compression ratio --- py/h2o.py | 25 ++-- py/h2o_exec.py | 114 +++++++++++++----- py/testdir_single_jvm/test_GBM_fvec.py | 1 - py/testdir_single_jvm/test_exec2_operators.py | 59 +++++++++ .../test_parse_syn_gz_cat.py | 27 +++-- py/testdir_single_jvm/test_rf_syn_gz_cat.py | 13 +- 6 files changed, 189 insertions(+), 50 deletions(-) create mode 100644 py/testdir_single_jvm/test_exec2_operators.py diff --git a/py/h2o.py b/py/h2o.py index 0986ca8931..e95fb2473d 100644 --- a/py/h2o.py +++ b/py/h2o.py @@ -895,7 +895,8 @@ def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=N raise Exception("Could not decode any json from the request. Do you have beta features turned on? beta_features: ", beta_features) for e in ['error', 'Error', 'errors', 'Errors']: - if e in rjson: + # error can be null (python None). This happens in exec2 + if e in rjson and rjson[e]: verboseprint(dump_json(rjson)) emsg = 'rjson %s in %s: %s' % (e, inspect.stack()[1][3], rjson[e]) if ignoreH2oError: @@ -905,7 +906,8 @@ def __do_json_request(self, jsonRequest=None, fullUrl=None, timeout=10, params=N raise Exception(emsg) for w in ['warning', 'Warning', 'warnings', 'Warnings']: - if w in rjson: + # warning can be null (python None). + if w in rjson and rjson[w]: verboseprint(dump_json(rjson)) print 'rjson %s in %s: %s' % (w, inspect.stack()[1][3], rjson[w]) @@ -1388,15 +1390,20 @@ def import_hdfs(self, path, timeoutSecs=180): # 'destination_key', 'escape_nan' 'expression' def exec_query(self, timeoutSecs=20, ignoreH2oError=False, **kwargs): - params_dict = { - 'expression': None, - ## 'escape_nan': 0, - ## 'destination_key': "Result.hex", # curious as to whether specifying destination key messes anything up. - } + if beta_features: + params_dict = { + 'str': None, + } + else: + params_dict = { + 'expression': None, + ## 'escape_nan': 0, + } + browseAlso = kwargs.pop('browseAlso',False) - params_dict.update(kwargs) + check_params_update_kwargs(params_dict, kwargs, 'exec_query', print_params=True) verboseprint("\nexec_query:", params_dict) - a = self.__do_json_request('Exec.json', + a = self.__do_json_request('2/Exec2.json' if beta_features else 'Exec.json', timeout=timeoutSecs, ignoreH2oError=ignoreH2oError, params=params_dict) verboseprint("\nexec_query result:", dump_json(a)) return a diff --git a/py/h2o_exec.py b/py/h2o_exec.py index d312c69d8a..2fb13e581b 100644 --- a/py/h2o_exec.py +++ b/py/h2o_exec.py @@ -32,30 +32,48 @@ def checkScalarResult(resultInspect, resultKey): # weird..it's a tuple, not a list? 
when the extra level of hier is there # this works: if type(resultInspect) is not dict: - ### print "Trimming resultInspect hier." resultInspect0 = resultInspect[0] else: resultInspect0 = resultInspect emsg = None while(True): - if 'type' not in resultInspect0: - emsg = "'type' missing. Look at the json just printed" - break - t = resultInspect0["type"] - if t != 'parsed': - emsg = resultKey + " 'type' is not 'parsed'. Look at the json just printed" - break - - if 'rows' not in resultInspect0: - emsg = "Inspect response: 'rows' missing. Look at the json just printed" - break - rows = resultInspect0["rows"] - - if 'cols' not in resultInspect0: - emsg = "Inspect response: 'cols' missing. Look at the json just printed" - break - cols = resultInspect0["cols"] + + if h2o.beta_features: + if 'num_rows' not in resultInspect0: + emsg = "Inspect response: 'num_rows' missing. Look at the json just printed" + break + rows = resultInspect0["num_rows"] + + if 'cols' not in resultInspect0: + emsg = "Inspect response: 'num_cols' missing. Look at the json just printed" + break + cols = resultInspect0["cols"] + + print "cols:", h2o.dump_json(cols) + + num_cols = resultInspect0["num_cols"] + + else: + + if 'type' not in resultInspect0: + emsg = "'type' missing. Look at the json just printed" + break + t = resultInspect0["type"] + + if t != 'parsed': + emsg = resultKey + " 'type' is not 'parsed'. Look at the json just printed" + break + + if 'rows' not in resultInspect0: + emsg = "Inspect response: 'rows' missing. Look at the json just printed" + break + rows = resultInspect0["rows"] + + if 'cols' not in resultInspect0: + emsg = "Inspect response: 'cols' missing. Look at the json just printed" + break + cols = resultInspect0["cols"] break @@ -69,7 +87,10 @@ def checkScalarResult(resultInspect, resultKey): # FIX! the key for the value can be 0 or 1 or ?? (apparently col?) Should change H2O here metaDict = cols[0] for key,value in metaDict.items(): - h2o.verboseprint("Inspect metadata:", key, value) + if h2o.beta_features: + print "Inspect metaDict:", key, value + else: + h2o.verboseprint("Inspect metaDict:", key, value) min_value = metaDict['min'] checkForBadFP(min_value) @@ -104,22 +125,61 @@ def exec_expr(node=None, execExpr=None, resultKey="Result.hex", timeoutSecs=10, start = time.time() # FIX! Exec has 'escape_nan' arg now. should we test? # 5/14/13 removed escape_nan=0 - resultExec = h2o_cmd.runExec(node, expression=execExpr, - timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError) + + if h2o.beta_features: + kwargs = {'str': execExpr} + resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + else: + kwargs = {'expression': execExpr} + resultExec = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + h2o.verboseprint(resultExec) h2o.verboseprint('exec took', time.time() - start, 'seconds') ### print 'exec took', time.time() - start, 'seconds' h2o.verboseprint("\nfirst look at the default Result key") # new offset=-1 to get the metadata? - defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1) - checkScalarResult(defaultInspectM1, "Result.hex") + if h2o.beta_features: # default assign not present in v2? + # constants don't create keys. + # so the only way to see the results is to do another exec? 
+ kwargs = {'str': resultKey} + resultExec2 = h2o_cmd.runExec(node, timeoutSecs=timeoutSecs, ignoreH2oError=ignoreH2oError, **kwargs) + print "resultExec2:", h2o.dump_json(resultExec2) + + # maybe return 'scalar' in some cases? + return resultExec2, resultExec2['cols'][0]['min'] + # exec_query parameters: {'str': 'Result0 = c(0)'} + # exec_query parameters: {'str': 'Result0'} + # resultExec2: { + # "Request2": 0, + # "cols": [ + # { + # "max": 0.0, + # "mean": 0.0, + # "min": 0.0, + # "naCnt": 0, + # "name": "c", + # "type": "Int" + # } + # ], + # "error": null, + # "funstr": null, + # "key": null, + # "num_cols": 1, + # "num_rows": 1, + # "result": "c \n0 \n", + # "scalar": 0.0 + # } + + else: + defaultInspectM1 = h2o_cmd.runInspect(None, "Result.hex", offset=-1) + checkScalarResult(defaultInspectM1, "Result.hex") - h2o.verboseprint("\nNow look at the assigned " + resultKey + " key") - resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1) - min_value = checkScalarResult(resultInspectM1, resultKey) + h2o.verboseprint("\nNow look at the assigned " + resultKey + " key") + resultInspectM1 = h2o_cmd.runInspect(None, resultKey, offset=-1) + min_value = checkScalarResult(resultInspectM1, resultKey) - return resultInspectM1, min_value + return resultInspectM1, min_value def exec_zero_list(zeroList): diff --git a/py/testdir_single_jvm/test_GBM_fvec.py b/py/testdir_single_jvm/test_GBM_fvec.py index 049a8b77b2..227085b60a 100644 --- a/py/testdir_single_jvm/test_GBM_fvec.py +++ b/py/testdir_single_jvm/test_GBM_fvec.py @@ -115,7 +115,6 @@ def colIt(x): return "C" + str(x) } kwargs = params.copy() - h2o.beta_features = True timeoutSecs = 1800 start = time.time() GBMFirstResult = h2o_cmd.runGBM(parseResult=parseResult, noPoll=True,**kwargs) diff --git a/py/testdir_single_jvm/test_exec2_operators.py b/py/testdir_single_jvm/test_exec2_operators.py new file mode 100644 index 0000000000..df01ba362d --- /dev/null +++ b/py/testdir_single_jvm/test_exec2_operators.py @@ -0,0 +1,59 @@ +import unittest, random, sys, time +sys.path.extend(['.','..','py']) + +import h2o, h2o_browse as h2b, h2o_exec as h2e, h2o_hosts + +print "FIX! evidently visibility between expressions depends on the type. constants disappear?" +print "hack by creating vectors" +initList = [ + 'Result0 = c(0)', + 'Result1 = c(1)', + 'Result2 = c(2)', + 'Result3 = c(3)', + ] + +# double assign to Result.hex, so the checker doesn't have different names to check? 
+exprList = [ + # 'Result.hex = Result = Result0 * Result', + 'Result.hex = Result = Result1 + Result', + # 'Result.hex = Result = Result2 / Result', + # 'Result.hex = Result = Result3 - Result', + ] + +class Basic(unittest.TestCase): + def tearDown(self): + h2o.check_sandbox_for_errors() + + @classmethod + def setUpClass(cls): + global SEED, localhost + SEED = h2o.setup_random_seed() + localhost = h2o.decide_if_localhost() + if (localhost): + h2o.build_cloud(1) + else: + h2o_hosts.build_cloud_with_hosts(1) + + @classmethod + def tearDownClass(cls): + h2o.tear_down_cloud() + + def test_exec_operators(self): + h2o.beta_features = True + + for i, execExpr in enumerate(initList): + if h2o.beta_features: # no default result + resultKey = "Result" + str(i) + else: + resultKey = "Result.hex" + h2e.exec_expr(h2o.nodes[0], execExpr, resultKey=resultKey, timeoutSecs=4) + + start = time.time() + h2e.exec_expr_list_rand(len(h2o.nodes), exprList, None, maxTrials=200, timeoutSecs=10) + + h2o.check_sandbox_for_errors() + print "exec end on ", "operators" , 'took', time.time() - start, 'seconds' + + +if __name__ == '__main__': + h2o.unit_main() diff --git a/py/testdir_single_jvm/test_parse_syn_gz_cat.py b/py/testdir_single_jvm/test_parse_syn_gz_cat.py index ebf6f51dbd..241e47d98d 100644 --- a/py/testdir_single_jvm/test_parse_syn_gz_cat.py +++ b/py/testdir_single_jvm/test_parse_syn_gz_cat.py @@ -2,7 +2,7 @@ sys.path.extend(['.','..','py']) import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_exec as h2e, h2o_util -print "Create csv with lots of same data (95% 0?), so gz will have high compression ratio" +print "Create csv with lots of same data (98% 0?), so gz will have high compression ratio" print "Cat a bunch of them together, to get an effective large blow up inside h2o" print "Can also copy the files to test multi-file gz parse...that will behave differently" print "Behavior may be different depending on whether small ints are used, reals or used, or enums are used" @@ -16,8 +16,11 @@ def write_syn_dataset(csvPathname, rowCount, colCount, SEED): for i in range(rowCount): rowData = [] for j in range(colCount): - r = h2o_util.choice_with_probability([(1.1, .05), (0.1, .95)]) - rowData.append(r) + # r = h2o_util.choice_with_probability([(1.1, .02), (0.1, .98)]) + r = h2o_util.choice_with_probability([(1, .001), (0, .999)]) + # make r a many-digit real, so gzip compresses even more better! + # rowData.append('%#034.32e' % r) + rowData.append('%.1f' % r) rowDataCsv = ",".join(map(str,rowData)) dsf.write(rowDataCsv + "\n") @@ -34,7 +37,7 @@ def setUpClass(cls): SEED = h2o.setup_random_seed() localhost = h2o.decide_if_localhost() if (localhost): - h2o.build_cloud(1,java_heap_GB=14) + h2o.build_cloud(3,java_heap_GB=1) else: h2o_hosts.build_cloud_with_hosts() @@ -46,9 +49,11 @@ def test_parse_syn_gz_cat(self): SYNDATASETS_DIR = h2o.make_syn_dir() tryList = [ # summary fails with 100000 cols - (10, 5000, 'cE', 600), - (10, 10000, 'cF', 600), - (10, 50000, 'cF', 600), + # overwrite the key each time to save space? 
+ (100, 40000, 'cF', 600), + (100, 20000, 'cF', 600), + (100, 10000, 'cF', 600), + (100, 5000, 'cF', 600), ] FILEREPL = 200 @@ -95,10 +100,14 @@ def test_parse_syn_gz_cat(self): start = time.time() inspect = h2o_cmd.runInspect(None, parseResult['destination_key'], timeoutSecs=timeoutSecs) print "Inspect:", parseResult['destination_key'], "took", time.time() - start, "seconds" + num_rows = inspect['num_rows'] + num_cols = inspect['num_cols'] + value_size_bytes = inspect['value_size_bytes'] h2o_cmd.infoFromInspect(inspect, csvPathname) print "\n" + csvPathname, \ - " num_rows:", "{:,}".format(inspect['num_rows']), \ - " num_cols:", "{:,}".format(inspect['num_cols']) + "\n num_rows:", "{:,}".format(num_rows), \ + "\n num_cols:", "{:,}".format(num_cols), \ + "\n value_size_bytes:", "{:,}".format(value_size_bytes) # should match # of cols in header or ?? self.assertEqual(inspect['num_cols'], colCount, diff --git a/py/testdir_single_jvm/test_rf_syn_gz_cat.py b/py/testdir_single_jvm/test_rf_syn_gz_cat.py index 22590d8464..f4cac912bb 100644 --- a/py/testdir_single_jvm/test_rf_syn_gz_cat.py +++ b/py/testdir_single_jvm/test_rf_syn_gz_cat.py @@ -2,7 +2,7 @@ sys.path.extend(['.','..','py']) import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i, h2o_exec as h2e, h2o_util -print "Create csv with lots of same data (95% 0?), so gz will have high compression ratio" +print "Create csv with lots of same data (98% 0?), so gz will have high compression ratio" print "Cat a bunch of them together, to get an effective large blow up inside h2o" print "Can also copy the files to test multi-file gz parse...that will behave differently" print "Behavior may be different depending on whether small ints are used, reals or used, or enums are used" @@ -22,16 +22,21 @@ def write_syn_dataset(csvPathname, rowCount, colCount, SEED): rowSum = 0 for j in range(colCount): if BASE==2: - # we're just doing 50/50 for now, unlike the print says above - r = h2o_util.choice_with_probability([(0, .5), (1, .5)]) + # 50/50 + # r = h2o_util.choice_with_probability([(0, .5), (1, .5)]) + # 98/2 + r = h2o_util.choice_with_probability([(0, .98), (1, .2)]) else: raise Exception("Unsupported BASE: " + BASE) rowSum += r + + rowData.append(r) responseVar = rowSum % BASE - rowData.append(responseVar) + # make r a many-digit real, so gzip compresses even more better! 
+ rowData.append('%#034.32e' % responseVar) rowDataCsv = ",".join(map(str,rowData)) dsf.write(rowDataCsv + "\n") From f38787117807419bba42f403b8a779adc5829d8b Mon Sep 17 00:00:00 2001 From: anqi Date: Mon, 11 Nov 2013 18:03:42 -0800 Subject: [PATCH 03/11] Added some basic operations like logical AND/OR --- R/h2oRClient-package/R/Classes.R | 179 ++++++++-------------------- R/h2oRClient-package/R/Internal.R | 23 +++- src/main/java/water/exec/ASTOp.java | 28 ++++- 3 files changed, 94 insertions(+), 136 deletions(-) diff --git a/R/h2oRClient-package/R/Classes.R b/R/h2oRClient-package/R/Classes.R index ab0e6aee14..f00675e6f0 100644 --- a/R/h2oRClient-package/R/Classes.R +++ b/R/h2oRClient-package/R/Classes.R @@ -322,72 +322,6 @@ setMethod("summary", "H2OParsedData", function(object) { result }) -histograms <- function(object) { UseMethod("histograms", object) } -setMethod("histograms", "H2OParsedData2", function(object) { - res = h2o.__remoteSend(object@h2o, h2o.__PAGE_SUMMARY2, source=object@key) - list.of.bins <- lapply(res$summaries, function(res) { - if (res$rows == 0) { - bins <- NULL - } else { - domains <- res$domains - counts <- res$bins - breaks <- seq(res$start, by=res$binsz, length.out=length(res$bins) + 1) - bins <- list(domains,counts,breaks) - names(bins) <- cbind('domains', 'counts', 'breaks') - } - bins - }) -}) - -setMethod("summary", "H2OParsedData2", function(object) { - res = h2o.__remoteSend(object@h2o, h2o.__PAGE_SUMMARY2, source=object@key) - col.summaries = res$summaries - col.names = res$names - col.means = res$means - col.results = mapply(c, res$summaries, res$names, res$means, SIMPLIFY=FALSE) - for (i in 1:length(col.results)) - names(col.results[[i]])[(length(col.results[[i]]) - 1) : length(col.results[[i]])] <- c('name', 'mean') - result = NULL - - result <- sapply(col.results, function(res) { - if(is.null(res$domains)) { # numeric column - if(is.null(res$mins) || length(res$mins) == 0) res$mins = NaN - if(is.null(res$maxs) || length(res$maxs) == 0) res$maxs = NaN - if(is.null(res$percentileValues)) - params = format(rep(round(as.numeric(col.means[[i]]), 3), 6), nsmall = 3) - else - params = format(round(as.numeric(c( - res$mins[1], - res$percentileValues[4], - res$percentileValues[6], - res$mean, - res$percentileValues[8], - tail(res$maxs, 1))), 3), nsmall = 3) - result = c(paste("Min. :", params[1], " ", sep=""), paste("1st Qu.:", params[2], " ", sep=""), - paste("Median :", params[3], " ", sep=""), paste("Mean :", params[4], " ", sep=""), - paste("3rd Qu.:", params[5], " ", sep=""), paste("Max. 
:", params[6], " ", sep="")) - } - else { - domains <- res$domains[res$maxs + 1] - counts <- res$bins[res$maxs + 1] - width <- max(cbind(nchar(domains), nchar(counts))) - result <- paste(domains, - mapply(function(x, y) { paste(rep(' ', max(width + 1 - nchar(x) - nchar(y),0)), collapse='') }, domains, counts), - ":", - counts, - " ", - sep='') - result[6] <- NA - result - } - }) - - result = as.table(result) - rownames(result) <- rep("", 6) - colnames(result) <- col.names - result -}) - setMethod("summary", "H2OPCAModel", function(object) { # TODO: Save propVar and cumVar from the Java output instead of computing here myVar = object@model$sdev^2 @@ -472,7 +406,7 @@ setMethod("h2o.factor", signature(data="H2OParsedData", col="character"), h2o.factor(data, ind-1) }) -#--------------------------------- FluidVecs --------------------------------------# +#------------------------------------ FluidVecs ----------------------------------------# setMethod("show", "H2ORawData2", function(object) { print(object@h2o) cat("Raw Data Key:", object@key, "\n") @@ -556,41 +490,58 @@ setMethod("$", "H2OParsedData2", function(x, name) { new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) }) -setMethod("+", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("+", e1, e2) }) -setMethod("-", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) - -setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("+", e1, e2) }) -setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) - -setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { 
h2o.__operator2("+", e1, e2) }) -setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("-", e1, e2) }) -setMethod("*", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("*", e1, e2) }) -setMethod("/", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("/", e1, e2) }) -# setMethod("%%", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("%", e1, e2) }) -setMethod("==", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("==", e1, e2) }) -setMethod(">", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2(">", e1, e2) }) -setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("<", e1, e2) }) -setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("!=", e1, e2) }) -setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2(">=", e1, e2) }) -setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__operator2("<=", e1, e2) }) +setMethod("+", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) 
+setMethod("|", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) +setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) +setMethod("*", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("*", e1, e2) }) +setMethod("/", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("/", e1, e2) }) +setMethod("%%", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("%", e1, e2) }) +setMethod("==", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("==", e1, e2) }) +setMethod(">", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">", e1, e2) }) +setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<", e1, e2) }) +setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) +setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) +setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) +setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) + +setMethod("abs", "H2OParsedData2", function(x) { h2o.__unop2("abs", x) }) +setMethod("sign", "H2OParsedData2", function(x) { h2o.__unop2("sgn", x) }) +setMethod("sqrt", "H2OParsedData2", function(x) { h2o.__unop2("sqrt", x) }) +setMethod("ceiling", "H2OParsedData2", function(x) { h2o.__unop2("ceil", x) }) +setMethod("floor", "H2OParsedData2", function(x) { h2o.__unop2("floor", x) }) +setMethod("log", "H2OParsedData2", function(x) { h2o.__unop2("log", x) }) +setMethod("exp", "H2OParsedData2", function(x) { h2o.__unop2("exp", x) }) +setMethod("sum", "H2OParsedData2", function(x) { h2o.__unop2("sum", x) }) +setMethod("is.na", "H2OParsedData2", function(x) { h2o.__unop2("is.na", x) }) +setMethod("table", "H2OParsedData2", function(x) { h2o.__unop2("table", x) }) setMethod("colnames", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) @@ -605,15 +556,6 @@ setMethod("nrow", "H2OParsedData2", function(x) { setMethod("ncol", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key); as.numeric(res$numCols) }) -setMethod("sign", "H2OParsedData2", function(x) { - expr = paste("sgn(", x@key, ")") - res = h2o.__exec2(x@h2o, expr) - if(res$num_rows == 0 && res$num_cols == 0) - res$scalar - else - new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) -}) - setMethod("min", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) min(sapply(res$cols, function(x) { x$min })) @@ -630,12 +572,6 @@ setMethod("range", "H2OParsedData2", function(x) { c(min(temp[1,]), max(temp[2,])) }) -setMethod("sum", "H2OParsedData2", function(x) { - expr = paste("sum(", x@key, ")", sep="") - res = h2o.__exec2(x@h2o, expr) - res$scalar -}) - setMethod("colMeans", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) temp = sapply(res$cols, function(x) { x$mean }) @@ -660,15 +596,6 @@ setMethod("tail", "H2OParsedData2", function(x, n = 6L, ...) { tail(new("H2OParsedData", h2o=x@h2o, key=x@key), n, ...) 
}) -setMethod("is.na", "H2OParsedData2", function(x) { - expr = paste("is.na(", x@key, ")") - res = h2o.__exec2(x@h2o, expr) - if(res$num_rows == 0 && res$num_cols == 0) - res$scalar - else - new("H2OLogicalData2", h2o=x@h2o, key=res$dest_key) -}) - setMethod("is.factor", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_SUMMARY2, source=x@key) temp = sapply(res$summaries, function(x) { is.null(x$domains) }) diff --git a/R/h2oRClient-package/R/Internal.R b/R/h2oRClient-package/R/Internal.R index 0250b69b07..394d7bb0d9 100644 --- a/R/h2oRClient-package/R/Internal.R +++ b/R/h2oRClient-package/R/Internal.R @@ -4,7 +4,7 @@ pkg.env$result_count = 0 pkg.env$IS_LOGGING = FALSE TEMP_KEY = "Last.value" RESULT_MAX = 100 -LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=") +LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&", "|") # Initialize functions for R logging myPath = paste(Sys.getenv("HOME"), "Library/Application Support/h2o", sep="/") @@ -290,13 +290,24 @@ h2o.__exec2_dest_key <- function(client, expr, destKey) { return(res) } -h2o.__operator2 <- function(op, x, y) { +h2o.__unop2 <- function(op, x) { + expr = paste(op, "(", x@key, ")") + res = h2o.__exec2(x@h2o, expr) + if(res$num_rows == 0 && res$num_cols == 0) # TODO: If logical operator, need to indicate + return(res$scalar) + if(op %in% LOGICAL_OPERATORS) + new("H2OLogicalData2", h2o=myClient, key=res$dest_key) + else + new("H2OParsedData2", h2o=myClient, key=res$dest_key) +} + +h2o.__binop2 <- function(op, x, y) { # if(!((ncol(x) == 1 || class(x) == "numeric") && (ncol(y) == 1 || class(y) == "numeric"))) # stop("Can only operate on single column vectors") - LHS = ifelse(class(x) == "H2OParsedData2", x@key, x) - RHS = ifelse(class(y) == "H2OParsedData2", y@key, y) + LHS = ifelse(class(x) == "H2OParsedData2" || class(x) == "H2OLogicalData2", x@key, x) + RHS = ifelse(class(y) == "H2OParsedData2" || class(y) == "H2OLogicalData2", y@key, y) expr = paste(LHS, op, RHS) - if(class(x) == "H2OParsedData2") myClient = x@h2o + if(class(x) == "H2OParsedData2" || class(x) == "H2OLogicalData2") myClient = x@h2o else myClient = y@h2o res = h2o.__exec2(myClient, expr) @@ -306,4 +317,4 @@ h2o.__operator2 <- function(op, x, y) { new("H2OLogicalData2", h2o=myClient, key=res$dest_key) else new("H2OParsedData2", h2o=myClient, key=res$dest_key) -} +} \ No newline at end of file diff --git a/src/main/java/water/exec/ASTOp.java b/src/main/java/water/exec/ASTOp.java index 8fc2cccde1..bf218165a2 100644 --- a/src/main/java/water/exec/ASTOp.java +++ b/src/main/java/water/exec/ASTOp.java @@ -15,15 +15,23 @@ public abstract class ASTOp extends AST { static { // Unary ops put(new ASTIsNA()); - put(new ASTSgn ()); put(new ASTNrow()); put(new ASTNcol()); + put(new ASTAbs()); + put(new ASTSgn ()); + put(new ASTSqrt()); + put(new ASTCeil()); + put(new ASTFlr()); + put(new ASTLog()); + put(new ASTExp()); // Binary ops put(new ASTPlus()); put(new ASTSub ()); put(new ASTMul ()); put(new ASTDiv ()); + put(new ASTPow ()); + put(new ASTMod ()); put(new ASTMin ()); put(new ASTMax ()); put(new ASTLT ()); @@ -32,6 +40,8 @@ public abstract class ASTOp extends AST { put(new ASTGE ()); put(new ASTEQ ()); put(new ASTNE ()); + put(new ASTLA ()); + put(new ASTLO ()); // Misc put(new ASTCat ()); @@ -111,8 +121,14 @@ static Type[] newsig() { } } +class ASTAbs extends ASTUniOp { String opStr(){ return "abs"; } ASTOp make() {return new ASTAbs ();} double op(double d) { return Math.abs(d);}} +class ASTSgn extends ASTUniOp { String opStr(){ return "sgn" ; } 
ASTOp make() {return new ASTSgn ();} double op(double d) { return Math.signum(d);}} +class ASTSqrt extends ASTUniOp { String opStr(){ return "sqrt"; } ASTOp make() {return new ASTSqrt();} double op(double d) { return Math.sqrt(d);}} +class ASTCeil extends ASTUniOp { String opStr(){ return "ceil"; } ASTOp make() {return new ASTCeil();} double op(double d) { return Math.ceil(d);}} +class ASTFlr extends ASTUniOp { String opStr(){ return "floor"; } ASTOp make() {return new ASTFlr(); } double op(double d) { return Math.floor(d);}} +class ASTLog extends ASTUniOp { String opStr(){ return "log"; } ASTOp make() {return new ASTLog ();} double op(double d) { return Math.log(d);}} +class ASTExp extends ASTUniOp { String opStr(){ return "exp"; } ASTOp make() {return new ASTExp ();} double op(double d) { return Math.exp(d);}} class ASTIsNA extends ASTUniOp { String opStr(){ return "is.na"; } ASTOp make() {return new ASTIsNA();} double op(double d) { return Double.isNaN(d)?1:0;}} -class ASTSgn extends ASTUniOp { String opStr(){ return "sgn" ; } ASTOp make() {return new ASTSgn ();} double op(double d) { return Math.signum(d);}} class ASTNrow extends ASTUniOp { ASTNrow() { super(VARS,new Type[]{Type.DBL,Type.ARY}); } @Override String opStr() { return "nrow"; } @@ -201,6 +217,8 @@ class ASTPlus extends ASTBinOp { String opStr(){ return "+" ;} ASTOp make() {re class ASTSub extends ASTBinOp { String opStr(){ return "-" ;} ASTOp make() {return new ASTSub ();} double op(double d0, double d1) { return d0-d1;}} class ASTMul extends ASTBinOp { String opStr(){ return "*" ;} ASTOp make() {return new ASTMul ();} double op(double d0, double d1) { return d0*d1;}} class ASTDiv extends ASTBinOp { String opStr(){ return "/" ;} ASTOp make() {return new ASTDiv ();} double op(double d0, double d1) { return d0/d1;}} +class ASTPow extends ASTBinOp { String opStr(){ return "^" ;} ASTOp make() {return new ASTPow ();} double op(double d0, double d1) { return Math.pow(d0,d1);}} +class ASTMod extends ASTBinOp { String opStr(){ return "%" ;} ASTOp make() {return new ASTMod ();} double op(double d0, double d1) { return d0%d1;}} class ASTMin extends ASTBinOp { String opStr(){ return "min";} ASTOp make() {return new ASTMin ();} double op(double d0, double d1) { return Math.min(d0,d1);}} class ASTMax extends ASTBinOp { String opStr(){ return "max";} ASTOp make() {return new ASTMax ();} double op(double d0, double d1) { return Math.max(d0,d1);}} class ASTLT extends ASTBinOp { String opStr(){ return "<" ;} ASTOp make() {return new ASTLT ();} double op(double d0, double d1) { return d0< d1?1:0;}} @@ -209,6 +227,8 @@ class ASTGT extends ASTBinOp { String opStr(){ return ">" ;} ASTOp make() {re class ASTGE extends ASTBinOp { String opStr(){ return ">=" ;} ASTOp make() {return new ASTGE ();} double op(double d0, double d1) { return d0>=d1?1:0;}} class ASTEQ extends ASTBinOp { String opStr(){ return "==" ;} ASTOp make() {return new ASTEQ ();} double op(double d0, double d1) { return d0==d1?1:0;}} class ASTNE extends ASTBinOp { String opStr(){ return "!=" ;} ASTOp make() {return new ASTNE ();} double op(double d0, double d1) { return d0!=d1?1:0;}} +class ASTLA extends ASTBinOp { String opStr(){ return "&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0&&d1!=0)?1:0;}} +class ASTLO extends ASTBinOp { String opStr(){ return "|" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0!=0||d1!=0)?1:0;}} class ASTReduce extends ASTOp { static final String VARS[] = new String[]{ "", 
"op2", "ary"}; @@ -481,7 +501,7 @@ class ASTRApply extends ASTOp { nc.close(0,null); env.addRef(v = av.close(null)); } else { // Frame results - if( env.ary(-1).numCols() != 1 ) + if( env.ary(-1).numCols() != 1 ) throw new IllegalArgumentException("apply requires that "+op+" return 1 column"); v = env.popAry().anyVec();// Remove without lowering refcnt } @@ -494,7 +514,7 @@ class ASTRApply extends ASTOp { assert env.isAry(); assert env._sp == oldsp-4+1; return; - } + } if( d==1 || d == -2 ) // Work on rows throw H2O.unimpl(); throw new IllegalArgumentException("MARGIN limited to 1 (rows) or 2 (cols)"); From 5d15f173b61eafbb02cf2da27d6f82636c88fcd6 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:10:40 -0800 Subject: [PATCH 04/11] Graph Java API little bit polished. --- src/main/java/hex/gbm/DTree.java | 14 ++++---------- src/main/java/water/api/DocGen.java | 15 +++++++++++++++ 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/main/java/hex/gbm/DTree.java b/src/main/java/hex/gbm/DTree.java index b87feb211a..a7bbbc8c6d 100644 --- a/src/main/java/hex/gbm/DTree.java +++ b/src/main/java/hex/gbm/DTree.java @@ -660,16 +660,10 @@ protected void generateHTMLVarImp(StringBuilder sb) { sb.append(""); DocGen.HTML.arrayTail(sb); // Generate a graph - horrible code - sb.append(""); - sb.append(""); - sb.append(""); - sb.append("
") - .append(" ") - .append("
"); + DocGen.HTML.graph(sb, "graphvarimp", "g_varimp", + DocGen.HTML.toJSArray(new StringBuilder(), Arrays.copyOf(_names, _names.length-1)), + DocGen.HTML.toJSArray(new StringBuilder(), varimp) + ); } public static class TreeStats extends Iced { diff --git a/src/main/java/water/api/DocGen.java b/src/main/java/water/api/DocGen.java index a382a740fd..edbc89af7a 100644 --- a/src/main/java/water/api/DocGen.java +++ b/src/main/java/water/api/DocGen.java @@ -2,6 +2,7 @@ import java.io.*; import java.lang.reflect.Field; +import java.util.Arrays; import java.util.Properties; import water.*; @@ -327,6 +328,20 @@ public StringBuilder toJSArray(StringBuilder sb, String[] ss) { sb.append(']'); return sb; } + + public StringBuilder graph(StringBuilder sb, String gid, String gname, StringBuilder ...gparams) { + sb.append(""); + sb.append(""); + sb.append(""); + sb.append("
") + .append(" ") + .append("
"); + return sb; + } } // -------------------------------------------------------------------------- From 3bfde24f68962ec06b7c900c420e9280791df325 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:11:36 -0800 Subject: [PATCH 05/11] Var imp name tooltip. --- lib/resources/h2o/css/graphs.css | 30 ++++++++++++++++++++++++++++++ lib/resources/h2o/js/graphs.js | 24 ++++++++++++++++++++---- 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/lib/resources/h2o/css/graphs.css b/lib/resources/h2o/css/graphs.css index 07da832dab..95b1caa568 100644 --- a/lib/resources/h2o/css/graphs.css +++ b/lib/resources/h2o/css/graphs.css @@ -5,6 +5,9 @@ .bar.negative { fill: brown; } +.bar:hover { + fill: orange; +} .axis text { font: 10px sans-serif; @@ -16,3 +19,30 @@ stroke: #000; shape-rendering: crispEdges; } + +#d3tip { + position: absolute; + width: 120px; + height: auto; + padding: 2px; + background: lightsteelblue; + border: 0px; + -webkit-border-radius: 10px; + -moz-border-radius: 10px; + border-radius: 10px; + -webkit-box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + -moz-box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + box-shadow: 4px 4px 10px rgba(0, 0, 0, 0.4); + pointer-events: none; +} + +#d3tip.hidden { + display: none; +} + +#d3tip p { + text-align: center; + margin: 0; + font-family: sans-serif; + font-size: 12px; +} diff --git a/lib/resources/h2o/js/graphs.js b/lib/resources/h2o/js/graphs.js index 528934c6a2..5b69e09928 100644 --- a/lib/resources/h2o/js/graphs.js +++ b/lib/resources/h2o/js/graphs.js @@ -9,8 +9,8 @@ function g_varimp(divid, names, varimp) { var dataset = zip(names, varimp); // Setup size and axis var margin = {top: 30, right: 10, bottom: 10, left: 10}, - width = 480 - margin.left - margin.right, - height = 250 - margin.top - margin.bottom; + width = 640 - margin.left - margin.right, + height = 450 - margin.top - margin.bottom; var xScale = d3.scale.linear() .range([0, width]) @@ -24,12 +24,17 @@ function g_varimp(divid, names, varimp) { .scale(xScale) .orient("top"); - var svg = d3.select(divid).append("svg") + var svg = d3.select("#"+divid).append("svg") .attr("width", width + margin.left + margin.right) .attr("height", height + margin.top + margin.bottom) .append("g") .attr("transform", "translate(" + margin.left + "," + margin.top + ")"); + var tooltip = d3.select("body") + .append("div") + .attr("id", "d3tip") + .classed("hidden", true); + svg.selectAll(".bar") .data(dataset) .enter().append("rect") @@ -37,7 +42,18 @@ function g_varimp(divid, names, varimp) { .attr("x", function(d) { return xScale(Math.min(0, d[1])); }) .attr("y", function(d) { return yScale(d[0]); }) .attr("width", function(d) { return Math.abs(xScale(d[1]) - xScale(0)); }) - .attr("height", yScale.rangeBand()); + .attr("height", yScale.rangeBand()) + .on("mouseover", function (d) { + var xPosition = width + document.getElementById(divid).offsetLeft; + var yPosition = parseFloat(d3.select(this).attr("y")) + yScale.rangeBand() / 2 + document.getElementById(divid).offsetTop; + tooltip.style("left", xPosition + "px") + .style("top", yPosition + "px"); + tooltip.html("

" + d[0] + "
" + d[1] + "

"); + tooltip.classed("hidden", false); + }) + .on("mouseout", function(d) { + tooltip.classed("hidden", true); + }); svg.append("g") .attr("class", "x axis") From 4bd083ae83b2c3726f5df490cddbfde7fb2eaf40 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Mon, 11 Nov 2013 20:14:57 -0800 Subject: [PATCH 06/11] Fix in computing tree stats. --- src/main/java/hex/drf/DRF.java | 2 -- src/main/java/hex/gbm/DTree.java | 6 ++---- src/main/java/hex/gbm/GBM.java | 1 - 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/main/java/hex/drf/DRF.java b/src/main/java/hex/drf/DRF.java index ddc6f0eab8..3d9d4ee049 100644 --- a/src/main/java/hex/drf/DRF.java +++ b/src/main/java/hex/drf/DRF.java @@ -126,8 +126,6 @@ public static String link(Key k, String content) { tstats.updateBy(ktrees); model = doScoring(model, outputKey, fr, ktrees, tid, tstats); } - // finalize stats - tstats.close(); // Do final scoring with all the trees. model = doScoring(model, outputKey, fr, ktrees, tid, tstats); if (classification && importance) { diff --git a/src/main/java/hex/gbm/DTree.java b/src/main/java/hex/gbm/DTree.java index a7bbbc8c6d..a7bd8673d3 100644 --- a/src/main/java/hex/gbm/DTree.java +++ b/src/main/java/hex/gbm/DTree.java @@ -690,12 +690,10 @@ public void updateBy(DTree[] ktrees) { sumDepth += tree.depth; sumLeaves += tree.leaves; numTrees++; + meanDepth = (int) (sumDepth / numTrees); + meanLeaves = (int) (sumLeaves / numTrees); } } - public void close() { - meanDepth = (int) (sumDepth / numTrees); - meanLeaves = (int) (sumLeaves / numTrees); - } } // -------------------------------------------------------------------------- diff --git a/src/main/java/hex/gbm/GBM.java b/src/main/java/hex/gbm/GBM.java index 4af5426070..e96070fed7 100644 --- a/src/main/java/hex/gbm/GBM.java +++ b/src/main/java/hex/gbm/GBM.java @@ -98,7 +98,6 @@ public static String link(Key k, String content) { model = doScoring(model, outputKey, fr, ktrees, tid, tstats); } // Final scoring - tstats.close(); model = doScoring(model, outputKey, fr, ktrees, tid, tstats); cleanUp(fr,t_build); // Shared cleanup } From 685ac1d2e2047a90173601f7f8320d343dfc9027 Mon Sep 17 00:00:00 2001 From: anqi Date: Mon, 11 Nov 2013 21:15:33 -0800 Subject: [PATCH 07/11] WIP Adding cut method to ExecQuery2 --- R/examples/H2OExec2Demo.R | 98 ++++++++++++++--------------- R/h2oRClient-package/R/Classes.R | 18 +++--- R/h2oRClient-package/R/Internal.R | 10 +-- src/main/java/water/exec/ASTOp.java | 48 +++++++++++++- 4 files changed, 111 insertions(+), 63 deletions(-) diff --git a/R/examples/H2OExec2Demo.R b/R/examples/H2OExec2Demo.R index 8bfe3b9580..05314bd09c 100644 --- a/R/examples/H2OExec2Demo.R +++ b/R/examples/H2OExec2Demo.R @@ -1,51 +1,49 @@ -library(h2o) -h2o.installDepPkgs() -myIP = "127.0.0.1"; myPort = 54321 -localH2O = h2o.init(ip = myIP, port = myPort, startH2O = TRUE, silentUpgrade = FALSE, promptUpgrade = TRUE) - -# Import iris file to H2O -prosPath = system.file("extdata", "prostate.csv", package="h2oRClient") -prostate.hex = h2o.importFile.FV(localH2O, path = prosPath, key = "prostate.hex") - -# Print out basic summary -summary(prostate.hex) -head(prostate.hex) -tail(prostate.hex) - -# Get quantiles and examine outliers -prostate.qs = quantile(prostate.hex$PSA) -prostate.qs - -# Note: Right now, assignment must be done manually with h2o.assign! 
-outliers.low = prostate.hex[prostate.hex$PSA <= prostate.qs[2],] -outliers.low = h2o.assign(outliers.low, "PSA.low") -outliers.high = prostate.hex[prostate.hex$PSA >= prostate.qs[10],] -outliers.high = h2o.assign(outliers.high, "PSA.high") - -nrow(outliers.low) + nrow(outliers.high) -head(outliers.low); tail(outliers.low) -head(outliers.high); tail(outliers.high) - -# Drop outliers from data -prostate.trim = prostate.hex[prostate.hex$PSA > prostate.qs[2],] -prostate.trim = h2o.assign(prostate.trim, "prostate.trim") -prostate.trim = prostate.trim[prostate.trim$PSA < prostate.qs[10],] -prostate.trim = h2o.assign(prostate.trim, "prostate.trim") -nrow(prostate.trim) - -# Construct test and training sets -s = runif(nrow(prostate.hex)) -prostate.train = prostate.hex[s <= 0.8,] -prostate.train = h2o.assign(prostate.train, "prostate.train") -prostate.test = prostate.hex[s > 0.8,] -prostate.test = h2o.assign(prostate.test, "prostate.test") -nrow(prostate.train) + nrow(prostate.test) - -# Run GBM on training set and predict on test set -myY = "CAPSULE"; myX = setdiff(colnames(prostate.train), c(myY, "ID")) -prostate.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = prostate.train) -prostate.gbm -prostate.pred = h2o.predict(prostate.gbm, prostate.test) -summary(prostate.pred) -head(prostate.pred) +library(h2o) +h2o.installDepPkgs() +myIP = "127.0.0.1"; myPort = 54321 +localH2O = h2o.init(ip = myIP, port = myPort, startH2O = TRUE, silentUpgrade = FALSE, promptUpgrade = TRUE) + +# Import iris file to H2O +prosPath = system.file("extdata", "prostate.csv", package="h2oRClient") +prostate.hex = h2o.importFile.FV(localH2O, path = prosPath, key = "prostate.hex") + +# Print out basic summary +summary(prostate.hex) +head(prostate.hex) +tail(prostate.hex) +table(prostate.hex$RACE) # Note: Currently only works on a single integer/factor column + +# Get quantiles and examine outliers +prostate.qs = quantile(prostate.hex$PSA) +print(prostate.qs) + +# Note: Right now, assignment must be done manually with h2o.assign! 
+# PSA.outliers = prostate.hex[prostate.hex$PSA <= prostate.qs[2] | prostate.hex$PSA >= prostate.qs[10],] +PSA.outliers.ind = prostate.hex$PSA <= prostate.qs[2] | prostate.hex$PSA >= prostate.qs[10] +PSA.outliers = prostate.hex[PSA.outliers.ind,] +PSA.outliers = h2o.assign(PSA.outliers, "PSA.outliers") +nrow(PSA.outliers) +head(PSA.outliers); tail(PSA.outliers) + +# Drop outliers from data +# prostate.trim = prostate.hex[prostate.hex$PSA > prostate.qs[2] && prostate.hex$PSA < prostate.qs[10],] +prostate.trim = prostate.hex[!PSA.outliers.ind,] +prostate.trim = h2o.assign(prostate.trim, "prostate.trim") +nrow(prostate.trim) + +# Construct test and training sets +s = runif(nrow(prostate.hex)) +prostate.train = prostate.hex[s <= 0.8,] +prostate.train = h2o.assign(prostate.train, "prostate.train") +prostate.test = prostate.hex[s > 0.8,] +prostate.test = h2o.assign(prostate.test, "prostate.test") +nrow(prostate.train) + nrow(prostate.test) + +# Run GBM on training set and predict on test set +myY = "CAPSULE"; myX = setdiff(colnames(prostate.train), c(myY, "ID")) +prostate.gbm = h2o.gbm(x = myX, y = myY, distribution = "multinomial", data = prostate.train) +print(prostate.gbm) +prostate.pred = h2o.predict(prostate.gbm, prostate.test) +summary(prostate.pred) +head(prostate.pred) tail(prostate.pred) \ No newline at end of file diff --git a/R/h2oRClient-package/R/Classes.R b/R/h2oRClient-package/R/Classes.R index f00675e6f0..3e05b1453c 100644 --- a/R/h2oRClient-package/R/Classes.R +++ b/R/h2oRClient-package/R/Classes.R @@ -415,6 +415,7 @@ setMethod("show", "H2ORawData2", function(object) { setMethod("show", "H2OParsedData2", function(object) { print(object@h2o) cat("Parsed Data Key:", object@key, "\n") + if(ncol(object) <= 1000) print(head(object)) }) setMethod("[", "H2OParsedData2", function(x, i, j, ..., drop = TRUE) { @@ -501,8 +502,8 @@ setMethod("<", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__b setMethod("!=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) setMethod("+", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) setMethod("-", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) @@ -515,8 +516,8 @@ setMethod("<", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(" setMethod("!=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("numeric", "H2OParsedData2"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", 
c("numeric", "H2OParsedData2"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) setMethod("+", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("+", e1, e2) }) setMethod("-", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("-", e1, e2) }) @@ -529,9 +530,10 @@ setMethod("<", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(" setMethod("!=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("!=", e1, e2) }) setMethod(">=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2(">=", e1, e2) }) setMethod("<=", c("H2OParsedData2", "numeric"), function(e1, e2) { h2o.__binop2("<=", e1, e2) }) -setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&", e1, e2) }) -setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("|", e1, e2) }) +setMethod("&", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("&&", e1, e2) }) +setMethod("|", c("H2OParsedData2", "numeric"), function(e1, e2) {h2o.__binop2("||", e1, e2) }) +setMethod("!", "H2OParsedData2", function(x) { h2o.__unop2("!", x) }) setMethod("abs", "H2OParsedData2", function(x) { h2o.__unop2("abs", x) }) setMethod("sign", "H2OParsedData2", function(x) { h2o.__unop2("sgn", x) }) setMethod("sqrt", "H2OParsedData2", function(x) { h2o.__unop2("sqrt", x) }) @@ -541,7 +543,9 @@ setMethod("log", "H2OParsedData2", function(x) { h2o.__unop2("log", x) }) setMethod("exp", "H2OParsedData2", function(x) { h2o.__unop2("exp", x) }) setMethod("sum", "H2OParsedData2", function(x) { h2o.__unop2("sum", x) }) setMethod("is.na", "H2OParsedData2", function(x) { h2o.__unop2("is.na", x) }) -setMethod("table", "H2OParsedData2", function(x) { h2o.__unop2("table", x) }) + +table <- function(object) { UseMethod("table", object) } +setMethod("table", "H2OParsedData2", function(object) { h2o.__unop2("table", object) }) setMethod("colnames", "H2OParsedData2", function(x) { res = h2o.__remoteSend(x@h2o, h2o.__PAGE_INSPECT2, src_key=x@key) diff --git a/R/h2oRClient-package/R/Internal.R b/R/h2oRClient-package/R/Internal.R index 394d7bb0d9..2445386cae 100644 --- a/R/h2oRClient-package/R/Internal.R +++ b/R/h2oRClient-package/R/Internal.R @@ -1,10 +1,12 @@ # Hack to get around Exec.json always dumping to same Result.hex key +# TODO: Need better way to manage temporary/intermediate values in calculations! 
Right now, overwriting may occur silently pkg.env = new.env() pkg.env$result_count = 0 +pkg.env$temp_count = 0 pkg.env$IS_LOGGING = FALSE TEMP_KEY = "Last.value" -RESULT_MAX = 100 -LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&", "|") +RESULT_MAX = 200 +LOGICAL_OPERATORS = c("==", ">", "<", "!=", ">=", "<=", "&&", "||", "!") # Initialize functions for R logging myPath = paste(Sys.getenv("HOME"), "Library/Application Support/h2o", sep="/") @@ -296,9 +298,9 @@ h2o.__unop2 <- function(op, x) { if(res$num_rows == 0 && res$num_cols == 0) # TODO: If logical operator, need to indicate return(res$scalar) if(op %in% LOGICAL_OPERATORS) - new("H2OLogicalData2", h2o=myClient, key=res$dest_key) + new("H2OLogicalData2", h2o=x@h2o, key=res$dest_key) else - new("H2OParsedData2", h2o=myClient, key=res$dest_key) + new("H2OParsedData2", h2o=x@h2o, key=res$dest_key) } h2o.__binop2 <- function(op, x, y) { diff --git a/src/main/java/water/exec/ASTOp.java b/src/main/java/water/exec/ASTOp.java index bf218165a2..a6114da1d2 100644 --- a/src/main/java/water/exec/ASTOp.java +++ b/src/main/java/water/exec/ASTOp.java @@ -24,6 +24,7 @@ public abstract class ASTOp extends AST { put(new ASTFlr()); put(new ASTLog()); put(new ASTExp()); + put(new ASTNot()); // Binary ops put(new ASTPlus()); @@ -51,6 +52,7 @@ public abstract class ASTOp extends AST { put(new ASTIfElse()); put(new ASTRApply()); put(new ASTRunif()); + put(new ASTCut()); } static private void put(ASTOp ast) { OPS.put(ast.opStr(),ast); } @@ -129,6 +131,7 @@ class ASTFlr extends ASTUniOp { String opStr(){ return "floor"; } ASTOp make() class ASTLog extends ASTUniOp { String opStr(){ return "log"; } ASTOp make() {return new ASTLog ();} double op(double d) { return Math.log(d);}} class ASTExp extends ASTUniOp { String opStr(){ return "exp"; } ASTOp make() {return new ASTExp ();} double op(double d) { return Math.exp(d);}} class ASTIsNA extends ASTUniOp { String opStr(){ return "is.na"; } ASTOp make() {return new ASTIsNA();} double op(double d) { return Double.isNaN(d)?1:0;}} +class ASTNot extends ASTUniOp { String opStr(){ return "!"; } ASTOp make() {return new ASTNot(); } double op(double d) { return d==0?1:0; }} class ASTNrow extends ASTUniOp { ASTNrow() { super(VARS,new Type[]{Type.DBL,Type.ARY}); } @Override String opStr() { return "nrow"; } @@ -227,8 +230,8 @@ class ASTGT extends ASTBinOp { String opStr(){ return ">" ;} ASTOp make() {re class ASTGE extends ASTBinOp { String opStr(){ return ">=" ;} ASTOp make() {return new ASTGE ();} double op(double d0, double d1) { return d0>=d1?1:0;}} class ASTEQ extends ASTBinOp { String opStr(){ return "==" ;} ASTOp make() {return new ASTEQ ();} double op(double d0, double d1) { return d0==d1?1:0;}} class ASTNE extends ASTBinOp { String opStr(){ return "!=" ;} ASTOp make() {return new ASTNE ();} double op(double d0, double d1) { return d0!=d1?1:0;}} -class ASTLA extends ASTBinOp { String opStr(){ return "&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0&&d1!=0)?1:0;}} -class ASTLO extends ASTBinOp { String opStr(){ return "|" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0!=0||d1!=0)?1:0;}} +class ASTLA extends ASTBinOp { String opStr(){ return "&&" ;} ASTOp make() {return new ASTLA ();} double op(double d0, double d1) { return (d0!=0 && d1!=0)?1:0;}} +class ASTLO extends ASTBinOp { String opStr(){ return "||" ;} ASTOp make() {return new ASTLO ();} double op(double d0, double d1) { return (d0==0 && d1==0)?0:1;}} class ASTReduce extends 
ASTOp { static final String VARS[] = new String[]{ "", "op2", "ary"}; @@ -521,3 +524,44 @@ class ASTRApply extends ASTOp { } } +class ASTCut extends ASTOp { + ASTCut() { super(new String[]{"cut", "ary", "dbls"}, new Type[]{Type.ARY, Type.ARY, Type.dblary()}); } + @Override String opStr() { return "cut"; } + @Override ASTOp make() {return new ASTCut();} + @Override void apply(Env env, int argcnt) { + if(env.isDbl()) { + int nbins = (int) Math.floor(env.popDbl()); + if(nbins < 2) + throw new IllegalArgumentException("Number of intervals must be at least 2"); + + Frame fr = env.popAry(); + String skey = env.key(); + if(fr.vecs().length != 1 || fr.domains()[0] != null) + throw new IllegalArgumentException("First argument must be a numeric vector"); + + final double fmax = fr.vecs()[0].max(); + final double fmin = fr.vecs()[0].min(); + final double width = (fmax - fmin)/nbins; + // TODO: Check what R does when width = 0, I think it perturbs constant vecs automatically + + /* String[][] domains = new String[1][nbins]; + for(int i = 0; i < nbins; i++) + domains[0][i] = "(" + fmin + i*width + "," + fmin + (i+1)*width + "]"; */ + + Frame fr2 = new MRTask2() { + @Override public void map(Chunk chk, NewChunk nchk) { + for(int r = 0; r < chk._len; r++) { + double x = chk.at0(r); + nchk.addNum(Math.floor((x - fmin)/width)); + // TODO: Add all unique bins as domains (lower_bound, upper_bound] + } + } + }.doAll(1,fr).outputFrame(fr._names, fr.domains()); + // }.doAll(1,fr).outputFrame(fr._names, domains); + env.subRef(fr, skey); + env.pop(); + env.push(fr2); + } else + throw H2O.unimpl(); + } +} \ No newline at end of file From 31d5745313e45b9c0a897f1db323dcd3178518af Mon Sep 17 00:00:00 2001 From: Cliff Click Date: Mon, 11 Nov 2013 11:05:08 -0800 Subject: [PATCH 08/11] Cleanup fvec compression Major comment upgrade. Allocate side arrays in each Chunk. Drop redundant min/max/hasFloat fields. Drop redundant invalid/setInvalid calls. Minor bugfix with setting NA into existing C0DChunk. 
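Side note on the C0DChunk fix mentioned above: the old setNA_impl tested `_con==Double.NaN`, but NaN never compares equal to anything (including itself), so that check could never succeed; Double.isNaN(_con) is the correct test. A minimal, hypothetical sketch (class name invented for illustration, not part of this patch) showing the difference:

    // NaNCompareDemo.java -- illustrative only, not part of this patch
    public class NaNCompareDemo {
      public static void main(String[] args) {
        double con = Double.NaN;                 // e.g. a constant chunk that is all NAs
        System.out.println(con == Double.NaN);   // false: NaN is not == to anything, even itself
        System.out.println(Double.isNaN(con));   // true: the correct NA test used by the fix
      }
    }
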
--- src/main/java/water/MemoryManager.java | 21 +++-- src/main/java/water/fvec/C0DChunk.java | 5 +- src/main/java/water/fvec/C0LChunk.java | 5 +- src/main/java/water/fvec/C1Chunk.java | 7 +- src/main/java/water/fvec/C1NChunk.java | 12 +-- src/main/java/water/fvec/C1SChunk.java | 16 ++-- src/main/java/water/fvec/C2Chunk.java | 7 +- src/main/java/water/fvec/C2SChunk.java | 27 +++--- src/main/java/water/fvec/C4SChunk.java | 13 ++- src/main/java/water/fvec/C8DChunk.java | 5 +- src/main/java/water/fvec/CBSChunk.java | 9 +- src/main/java/water/fvec/CX0Chunk.java | 1 - src/main/java/water/fvec/CX2Chunk.java | 1 - src/main/java/water/fvec/NewChunk.java | 120 +++++++++++-------------- 14 files changed, 117 insertions(+), 132 deletions(-) diff --git a/src/main/java/water/MemoryManager.java b/src/main/java/water/MemoryManager.java index 6a3e44b583..af33aecfee 100644 --- a/src/main/java/water/MemoryManager.java +++ b/src/main/java/water/MemoryManager.java @@ -227,9 +227,10 @@ public static Object malloc(int elems, long bytes, int type, Object orig, int fr case 5: return new float [elems]; case 9: return new double [elems]; case 0: return new boolean[elems]; - case -1: return Arrays.copyOfRange((byte[])orig,from,elems); - case -4: return Arrays.copyOfRange((int [])orig,from,elems); - case -8: return Arrays.copyOfRange((long[])orig,from,elems); + case -1: return Arrays.copyOfRange((byte [])orig,from,elems); + case -4: return Arrays.copyOfRange((int [])orig,from,elems); + case -8: return Arrays.copyOfRange((long [])orig,from,elems); + case -9: return Arrays.copyOfRange((double[])orig,from,elems); default: throw H2O.unimpl(); } } @@ -251,12 +252,14 @@ public static Object malloc(int elems, long bytes, int type, Object orig, int fr public static float [] malloc4f(int size) { return (float [])malloc(size,size*4, 5,null,0); } public static double [] malloc8d(int size) { return (double [])malloc(size,size*8, 9,null,0); } public static boolean[] mallocZ (int size) { return (boolean[])malloc(size,size*1, 0,null,0); } - public static byte[] arrayCopyOfRange(byte[] orig, int from, int sz) { return (byte[]) malloc(sz,(sz-from),-1,orig,from); } - public static int [] arrayCopyOfRange(int [] orig, int from, int sz) { return (int []) malloc(sz,(sz-from),-4,orig,from); } - public static long[] arrayCopyOfRange(long[] orig, int from, int sz) { return (long[]) malloc(sz,(sz-from),-8,orig,from); } - public static byte[] arrayCopyOf( byte[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } - public static int [] arrayCopyOf( int [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } - public static long[] arrayCopyOf( long[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static byte [] arrayCopyOfRange(byte [] orig, int from, int sz) { return (byte []) malloc(sz,(sz-from),-1,orig,from); } + public static int [] arrayCopyOfRange(int [] orig, int from, int sz) { return (int []) malloc(sz,(sz-from),-4,orig,from); } + public static long [] arrayCopyOfRange(long [] orig, int from, int sz) { return (long []) malloc(sz,(sz-from),-8,orig,from); } + public static double [] arrayCopyOfRange(double[] orig, int from, int sz) { return (double[]) malloc(sz,(sz-from),-9,orig,from); } + public static byte [] arrayCopyOf( byte [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static int [] arrayCopyOf( int [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static long [] arrayCopyOf( long [] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } + public static double [] arrayCopyOf( 
double[] orig, int sz) { return arrayCopyOfRange(orig,0,sz); } // Memory available for tasks (we assume 3/4 of the heap is available for tasks) static final AtomicLong _taskMem = new AtomicLong(MEM_MAX-(MEM_MAX>>2)); diff --git a/src/main/java/water/fvec/C0DChunk.java b/src/main/java/water/fvec/C0DChunk.java index dd3d8d2276..78fda64660 100644 --- a/src/main/java/water/fvec/C0DChunk.java +++ b/src/main/java/water/fvec/C0DChunk.java @@ -2,6 +2,7 @@ import java.util.Arrays; import water.AutoBuffer; +import water.MemoryManager; import water.UDP; /** @@ -24,7 +25,7 @@ public class C0DChunk extends Chunk { @Override boolean set_impl(int idx, long l) { return l==_con; } @Override boolean set_impl(int i, double d) { return d==_con; } @Override boolean set_impl(int i, float f ) { return f==_con; } - @Override boolean setNA_impl(int i) { return _con==Double.NaN; } + @Override boolean setNA_impl(int i) { return Double.isNaN(_con); } @Override boolean hasFloat() { return (long)_con!=_con; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C0DChunk read(AutoBuffer bb) { @@ -35,7 +36,7 @@ public class C0DChunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - Arrays.fill(nc._ds,_con); + Arrays.fill(nc._ds = MemoryManager.malloc8d(_len),_con); return nc; } // 3.3333333e33 diff --git a/src/main/java/water/fvec/C0LChunk.java b/src/main/java/water/fvec/C0LChunk.java index 7b05420074..bf6a212254 100644 --- a/src/main/java/water/fvec/C0LChunk.java +++ b/src/main/java/water/fvec/C0LChunk.java @@ -2,6 +2,7 @@ import java.util.Arrays; import water.AutoBuffer; +import water.MemoryManager; import water.UDP; /** @@ -32,8 +33,8 @@ public class C0LChunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - if( nc._ls != null ) Arrays.fill(nc._ls,_con); - else Arrays.fill(nc._ds,_con); + nc._xs = MemoryManager.malloc4(_len); + Arrays.fill(nc._ls = MemoryManager.malloc8(_len),_con); return nc; } } diff --git a/src/main/java/water/fvec/C1Chunk.java b/src/main/java/water/fvec/C1Chunk.java index cd78af8b1d..8c9cbb4d5a 100644 --- a/src/main/java/water/fvec/C1Chunk.java +++ b/src/main/java/water/fvec/C1Chunk.java @@ -37,9 +37,12 @@ public class C1Chunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - if( isNA_impl(i) ) nc.setInvalid(i); - else nc._ls[i] = at8_impl(i); + int res = 0xFF&_mem[i+OFF]; + if( res == C1Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/C1NChunk.java b/src/main/java/water/fvec/C1NChunk.java index c5576646ac..32ae4f874f 100644 --- a/src/main/java/water/fvec/C1NChunk.java +++ b/src/main/java/water/fvec/C1NChunk.java @@ -15,11 +15,6 @@ public class C1NChunk extends Chunk { @Override boolean set_impl(int i, double d) { return false; } @Override boolean set_impl(int i, float f ) { return false; } @Override boolean setNA_impl(int idx) { return false; } - @Override NewChunk inflate_impl(NewChunk nc) { - for( int i=0; i<_len; i++ ) - nc._ls[i] = at8_impl(i); - return nc; - } @Override boolean hasFloat() { return false; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C1NChunk read(AutoBuffer bb) { @@ -28,4 +23,11 @@ public class C1NChunk extends Chunk { _len = _mem.length; return this; } + @Override NewChunk inflate_impl(NewChunk 
nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); + for( int i=0; i<_len; i++ ) + nc._ls[i] = 0xFF&_mem[i+OFF]; + return nc; + } } diff --git a/src/main/java/water/fvec/C1SChunk.java b/src/main/java/water/fvec/C1SChunk.java index 909c8f6681..1a85dd7ec5 100644 --- a/src/main/java/water/fvec/C1SChunk.java +++ b/src/main/java/water/fvec/C1SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -49,17 +50,12 @@ public class C1SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; - nc._ds = null; - nc._ls = MemoryManager.malloc8 (_len); - nc._xs = MemoryManager.malloc4 (_len); + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = 0xFF&_mem[i+OFF]; - if( res == C1Chunk._NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = 0xFF&_mem[i+OFF]; + if( res == C1Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C2Chunk.java b/src/main/java/water/fvec/C2Chunk.java index 9343f97d0b..05db07075f 100644 --- a/src/main/java/water/fvec/C2Chunk.java +++ b/src/main/java/water/fvec/C2Chunk.java @@ -41,9 +41,12 @@ public class C2Chunk extends Chunk { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - if( isNA_impl(i) ) nc.setInvalid(i); - else nc._ls[i] = at8_impl(i); + int res = UDP.get2(_mem,(i<<1)+OFF); + if( res == C2Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/C2SChunk.java b/src/main/java/water/fvec/C2SChunk.java index 4e40bf5d84..96849998c2 100644 --- a/src/main/java/water/fvec/C2SChunk.java +++ b/src/main/java/water/fvec/C2SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -7,7 +8,6 @@ * The scale/bias function, where data is in SIGNED bytes before scaling. 
*/ public class C2SChunk extends Chunk { - static private final long _NA = Short.MIN_VALUE; static final int OFF=8+4; public double _scale; int _bias; @@ -18,14 +18,14 @@ public class C2SChunk extends Chunk { } @Override protected final long at8_impl( int i ) { long res = UDP.get2(_mem,(i<<1)+OFF); - if( res == _NA ) throw new IllegalArgumentException("at8 but value is missing"); + if( res == C2Chunk._NA ) throw new IllegalArgumentException("at8 but value is missing"); return (long)((res + _bias)*_scale); } @Override protected final double atd_impl( int i ) { long res = UDP.get2(_mem,(i<<1)+OFF); - return (res == _NA)?Double.NaN:(res + _bias)*_scale; + return (res == C2Chunk._NA)?Double.NaN:(res + _bias)*_scale; } - @Override protected final boolean isNA_impl( int i ) { return UDP.get2(_mem,(i<<1)+OFF) == _NA; } + @Override protected final boolean isNA_impl( int i ) { return UDP.get2(_mem,(i<<1)+OFF) == C2Chunk._NA; } @Override boolean set_impl(int idx, long l) { long res = (long)(l/_scale)-_bias; // Compressed value double d = (res+_bias)*_scale; // Reverse it @@ -36,14 +36,14 @@ public class C2SChunk extends Chunk { } @Override boolean set_impl(int i, double d) { short s = (short)((d/_scale)-_bias); - if( s == _NA ) return false; + if( s == C2Chunk._NA ) return false; double d2 = (s+_bias)*_scale; if( d!=d2 ) return false; UDP.set2(_mem,(i<<1)+OFF,s); return true; } @Override boolean set_impl(int i, float f ) { return false; } - @Override boolean setNA_impl(int idx) { UDP.set2(_mem,(idx<<1)+OFF,(short)_NA); return true; } + @Override boolean setNA_impl(int idx) { UDP.set2(_mem,(idx<<1)+OFF,(short)C2Chunk._NA); return true; } @Override boolean hasFloat() { return _scale < 1.0; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C2SChunk read(AutoBuffer bb) { @@ -57,17 +57,12 @@ public class C2SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; - nc._ds = null; - nc._ls = MemoryManager.malloc8 (_len); - nc._xs = MemoryManager.malloc4 (_len); + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = UDP.get2(_mem,(i<<1)+OFF); - if( res == _NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = UDP.get2(_mem,(i<<1)+OFF); + if( res == C2Chunk._NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C4SChunk.java b/src/main/java/water/fvec/C4SChunk.java index 61f85a61ac..a762680f09 100644 --- a/src/main/java/water/fvec/C4SChunk.java +++ b/src/main/java/water/fvec/C4SChunk.java @@ -1,5 +1,6 @@ package water.fvec; +import java.util.Arrays; import water.*; import water.parser.DParseTask; @@ -50,14 +51,12 @@ public class C4SChunk extends Chunk { @Override NewChunk inflate_impl(NewChunk nc) { double dx = Math.log10(_scale); assert DParseTask.fitsIntoInt(dx); - int x = (int)dx; + Arrays.fill(nc._xs = MemoryManager.malloc4(_len), (int)dx); + nc._ls = MemoryManager.malloc8(_len); for( int i=0; i<_len; i++ ) { - long res = UDP.get4(_mem,(i<<2)+OFF); - if( res == _NA ) nc.setInvalid(i); - else { - nc._ls[i] = res+_bias; - nc._xs[i] = x; - } + int res = UDP.get4(_mem,(i<<2)+OFF); + if( res == _NA ) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res+_bias; } return nc; } diff --git a/src/main/java/water/fvec/C8DChunk.java b/src/main/java/water/fvec/C8DChunk.java 
index a9291271db..8fa0c82589 100644 --- a/src/main/java/water/fvec/C8DChunk.java +++ b/src/main/java/water/fvec/C8DChunk.java @@ -12,10 +12,7 @@ public class C8DChunk extends Chunk { if( Double.isNaN(res) ) throw new IllegalArgumentException("at8 but value is missing"); return (long)res; } - @Override protected final double atd_impl( int i ) { - double res = UDP.get8d(_mem,i<<3); - return res; - } + @Override protected final double atd_impl( int i ) { return UDP.get8d(_mem,i<<3) ; } @Override protected final boolean isNA_impl( int i ) { return Double.isNaN(UDP.get8d(_mem,i<<3)); } @Override boolean set_impl(int idx, long l) { return false; } @Override boolean set_impl(int i, double d) { diff --git a/src/main/java/water/fvec/CBSChunk.java b/src/main/java/water/fvec/CBSChunk.java index 4e90c349c3..7c8a6682f5 100644 --- a/src/main/java/water/fvec/CBSChunk.java +++ b/src/main/java/water/fvec/CBSChunk.java @@ -1,6 +1,7 @@ package water.fvec; import water.AutoBuffer; +import water.MemoryManager; import water.H2O; /** A simple chunk for boolean values. In fact simple bit vector. @@ -57,10 +58,12 @@ protected byte atb(int idx) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { + nc._xs = MemoryManager.malloc4(_len); + nc._ls = MemoryManager.malloc8(_len); for (int i=0; i<_len; i++) { - long res = at8_impl(i); - if (res == _NA) nc.setInvalid(i); - else nc._ls[i] = res; + int res = atb(i); + if (res == _NA) nc._xs[i] = Integer.MIN_VALUE; + else nc._ls[i] = res; } return nc; } diff --git a/src/main/java/water/fvec/CX0Chunk.java b/src/main/java/water/fvec/CX0Chunk.java index 55ca525ab7..b334507c7c 100644 --- a/src/main/java/water/fvec/CX0Chunk.java +++ b/src/main/java/water/fvec/CX0Chunk.java @@ -42,7 +42,6 @@ public CX0Chunk(long[] ls, int len, int nzcnt) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - nc._ds = null; nc._ls = MemoryManager.malloc8 (_len); nc._xs = MemoryManager.malloc4 (_len); for( int i=OFF; i<_mem.length; i+=2 ) diff --git a/src/main/java/water/fvec/CX2Chunk.java b/src/main/java/water/fvec/CX2Chunk.java index 9eb1dafc11..34c9cf4857 100644 --- a/src/main/java/water/fvec/CX2Chunk.java +++ b/src/main/java/water/fvec/CX2Chunk.java @@ -53,7 +53,6 @@ private int at_impl(int idx) { return this; } @Override NewChunk inflate_impl(NewChunk nc) { - nc._ds = null; nc._ls = MemoryManager.malloc8 (_len); nc._xs = MemoryManager.malloc4 (_len); for( int i=OFF; i<_mem.length; i+=4 ) { diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 56e7e693f9..8c63678370 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -8,38 +8,31 @@ import water.*; import water.parser.DParseTask; -// An uncompressed chunk of data, support an append operation +// An uncompressed chunk of data, supporting an append operation public class NewChunk extends Chunk { final int _cidx; - transient long _ls[]; // Mantissa - transient int _xs[]; // Exponent + // We can record the following (mixed) data types: + // 1- doubles, in _ds including NaN for NA & 0; _ls==_xs==null + // 2- scaled decimals from parsing, in _ls & _xs; _ds==null + // 3- zero: requires _ls==0 && _xs==0 + // 4- NA: either _ls==0 && _xs==Integer.MIN_VALUE, OR _ds=NaN + // 5- Enum: _ls==0 && _xs>0 && _ds==null + // Chunk._len is the count of elements appended + // Sparse: if _row !=null, then _ls/_xs/_ds are compressed to non-zero's + // only, and _row is the row number. Still Chunk._len is count of elements + // including zeros. 
+ transient long _ls[]; // Mantissa + transient int _xs[]; // Exponent, or if _ls==0, NA or Enum transient double _ds[]; // Doubles, for inflating via doubles - transient double _min, _max; - int _naCnt; - int _strCnt; + int _naCnt; // Count of NA's appended + int _strCnt; // Count of Enum's appended - public NewChunk( Vec vec, int cidx ) { - _vec = vec; - _cidx = cidx; // This chunk# - _ls = new long[4]; // A little room for data - _xs = new int [4]; - _min = Double.MAX_VALUE; - _max = -Double.MAX_VALUE; - } + public NewChunk( Vec vec, int cidx ) { _vec = vec; _cidx = cidx; } - // Constructor used when inflating a Chunk + // Constructor used when inflating a Chunk. public NewChunk( Chunk C ) { - _vec = C._vec; - _cidx = _vec.elem2ChunkIdx(C._start); // This chunk# + this(C._vec,C._vec.elem2ChunkIdx(C._start)); _len = C._len; - if( C.hasFloat() || C instanceof C0DChunk ) { - _ds = MemoryManager.malloc8d(_len); - } else { - _ls = MemoryManager.malloc8 (_len); - _xs = MemoryManager.malloc4 (_len); - } - _min = Double.MAX_VALUE; - _max = -Double.MAX_VALUE; } public byte type(){ @@ -50,52 +43,41 @@ public byte type(){ return AppendableVec.NUMBER; } protected final boolean isNA(int idx) { - return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] != 0) : Double.isNaN(_ds[idx]); + return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] == Integer.MIN_VALUE) : Double.isNaN(_ds[idx]); } - public void addNA(){ - append2(0,Integer.MIN_VALUE); ++_naCnt; - } - private boolean _hasFloat; + public void addEnum(int e) { append2(0, e ); ++_strCnt;} + public void addNA ( ) { append2(0,Integer.MIN_VALUE); ++_naCnt ;} public void addNum(long val, int exp) { - if(val == 0)exp = 0; - _hasFloat |= (exp < 0); + if(val == 0)exp = 0; // Canonicalize zero append2(val,exp); } - public void addEnum(int e) { - append2(0,e); ++_strCnt; - } + // Fast-path append double data public void addNum(double d) { - if(_ds == null) { - assert _len == 0; - _ds = new double[1]; - } - if( _len >= _ds.length ) { - if( _len > Vec.CHUNK_SZ ) - throw new ArrayIndexOutOfBoundsException(_len); - _ds = Arrays.copyOf(_ds,_len<<1); - } - _ds[_len] = d; - _len++; - _hasFloat = true; + if( _ls==null||_len >= _ls.length ) append2slowd(); + _ds[_len++] = d; } - // Fast-path append long data void append2( long l, int x ) { - if( _len >= _ls.length ) append2slow(); - _ls[_len] = l; - _xs[_len] = x; - _len++; + if( _ls==null||_len >= _ls.length ) append2slow(); + _ls[_len ] = l; + _xs[_len++] = x; + } + // Slow-path append data + void append2slowd( ) { + if( _len > Vec.CHUNK_SZ ) + throw new ArrayIndexOutOfBoundsException(_len); + assert _ls==null; + _ds = _ds==null ? MemoryManager.malloc8d(4) : MemoryManager.arrayCopyOf(_ds,_len<<1); } // Slow-path append data void append2slow( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); - _ls = MemoryManager.arrayCopyOf(_ls,_len<<1); - _xs = MemoryManager.arrayCopyOf(_xs,_len<<1); + assert _ds==null; + _xs = _ls==null ? MemoryManager.malloc4(4) : MemoryManager.arrayCopyOf(_xs,_len<<1); + _ls = _ls==null ? 
MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls,_len<<1); } - void invalid() { append2(0,Integer.MIN_VALUE); } - void setInvalid(int idx) { _ls[idx]=0; _xs[idx] = Integer.MIN_VALUE; } /* * @@ -267,8 +249,8 @@ Chunk compress() { _ls = new long[_ds.length]; // Else flip to longs _xs = new int [_ds.length]; for( i=0; i<_len; i++ ) // Inject all doubles into longs - if( Double.isNaN(_ds[i]) ) setInvalid(i); - else _ls[i] = (long)_ds[i]; + if( Double.isNaN(_ds[i]) ) _xs[i] = Integer.MIN_VALUE; + else _ls[i] = (long)_ds[i]; } // data in some fixed-point format. @@ -276,6 +258,8 @@ Chunk compress() { boolean hasNA = false; _naCnt=0; int nzCnt=0; // Non-zero count + double min = Double.MAX_VALUE; + double max = -Double.MAX_VALUE; for( int i=0; i<_len; i++ ) { if( isNA(i) ) { hasNA = true; _naCnt++; continue;} @@ -284,8 +268,8 @@ Chunk compress() { if( l!=0 ) nzCnt++; // Compute per-chunk min/sum/max double d = l*DParseTask.pow10(x); - if( d < _min ) _min = d; - if( d > _max ) _max = d; + if( d < min ) min = d; + if( d > max ) max = d; if( l==0 ) x=0; // Canonicalize zero exponent long t; while( l!=0 && (t=l/10)*10==l ) { l=t; x++; } @@ -309,17 +293,17 @@ Chunk compress() { if( le < lemin ) lemin=le; if( le > lemax ) lemax=le; } - final boolean fpoint = xmin < 0 || _min < Long.MIN_VALUE || _max > Long.MAX_VALUE; + final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Constant column? - if(!hasNA && _min==_max ) { - return ((long)_min == _min) - ?new C0LChunk((long)_min,_len) - :new C0DChunk(_min, _len); + if( !hasNA && min==max ) { + return ((long)min == min) + ? new C0LChunk((long)min,_len) + : new C0DChunk( min,_len); } // Boolean column? - if (_max == 1 && _min == 0 && xmin == 0) { + if (max == 1 && min == 0 && xmin == 0) { if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk int bpv = _strCnt+_naCnt > 0 ? 2 : 1; @@ -360,7 +344,7 @@ Chunk compress() { if(xmin == 0 && 0<=lemin && lemax <= 255 && ((_naCnt + _strCnt)==0) ) return new C1NChunk( bufX(0,0,C1NChunk.OFF,0)); if( lemax-lemin < 255 ) { // Span fits in a byte? - if(0 <= _min && _max < 255 ) // Span fits in an unbiased byte? + if(0 <= min && max < 255 ) // Span fits in an unbiased byte? return new C1Chunk( bufX(0,0,C1Chunk.OFF,0)); return new C1SChunk( bufX(lemin,xmin,C1SChunk.OFF,0),(int)lemin,DParseTask.pow10i(xmin)); } @@ -373,7 +357,7 @@ Chunk compress() { return new C2SChunk( bufX(bias,xmin,C2SChunk.OFF,1),bias,DParseTask.pow10i(xmin)); } // Compress column into ints - if(Integer.MIN_VALUE < _min && _max <= Integer.MAX_VALUE ) + if( Integer.MIN_VALUE < min && max <= Integer.MAX_VALUE ) return new C4Chunk( bufX(0,0,0,2)); return new C8Chunk( bufX(0,0,0,3)); } From 76f67e15050bd98ad9a82c5df5cc18b0f2b4e642 Mon Sep 17 00:00:00 2001 From: cliffclick Date: Mon, 11 Nov 2013 20:52:14 -0800 Subject: [PATCH 09/11] More cleanup of NewChunk Allow Enums to compress any style; remove one 64K cap (may be more limits). Correct/cleaup _naCnt. Tighter asserts, more comments. 
Remove leaking Key from Bit-test --- prj.el | 2 +- src/main/java/water/fvec/C1SChunk.java | 2 +- src/main/java/water/fvec/NewChunk.java | 101 ++++++++++----------- src/test/java/water/fvec/CBSChunkTest.java | 22 +++-- 4 files changed, 65 insertions(+), 62 deletions(-) diff --git a/prj.el b/prj.el index aa9f4dbf7b..66a800ada8 100644 --- a/prj.el +++ b/prj.el @@ -7,7 +7,7 @@ '(jde-run-option-debug nil) '(jde-run-option-vm-args nil) '(jde-compile-option-directory "./target/classes") - '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "hex.KMeans2Test"))) + '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.ParserTest2"))) '(jde-debugger (quote ("JDEbug"))) '(jde-compile-option-source (quote ("1.6"))) '(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar"))) diff --git a/src/main/java/water/fvec/C1SChunk.java b/src/main/java/water/fvec/C1SChunk.java index 1a85dd7ec5..383df953f2 100644 --- a/src/main/java/water/fvec/C1SChunk.java +++ b/src/main/java/water/fvec/C1SChunk.java @@ -37,7 +37,7 @@ public class C1SChunk extends Chunk { @Override boolean set_impl(int i, double d) { return false; } @Override boolean set_impl(int i, float f ) { return false; } @Override boolean setNA_impl(int idx) { _mem[idx+OFF] = (byte)C1Chunk._NA; return true; } - @Override boolean hasFloat() { return _scale < 1.0; } + @Override boolean hasFloat() { return _scale < 1.0 || _scale > Long.MAX_VALUE; } @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); } @Override public C1SChunk read(AutoBuffer bb) { _mem = bb.bufClose(); diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 8c63678370..1b74823f01 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -16,7 +16,7 @@ public class NewChunk extends Chunk { // 2- scaled decimals from parsing, in _ls & _xs; _ds==null // 3- zero: requires _ls==0 && _xs==0 // 4- NA: either _ls==0 && _xs==Integer.MIN_VALUE, OR _ds=NaN - // 5- Enum: _ls==0 && _xs>0 && _ds==null + // 5- Enum: _xs==(Integer.MIN_VALUE+1) && _ds==null // Chunk._len is the count of elements appended // Sparse: if _row !=null, then _ls/_xs/_ds are compressed to non-zero's // only, and _row is the row number. 
Still Chunk._len is count of elements @@ -35,7 +35,24 @@ public NewChunk( Chunk C ) { _len = C._len; } - public byte type(){ + // Assert rollup counts are correct + private boolean checkCnt() { + int nas=0, ss=0; + if( _ds != null ) { + assert _ls==null && _xs==null; + for( double d : _ds ) if( Double.isNaN(d) ) nas++; + } else { + assert _ds==null; + if( _ls != null ) + for( int i=0; i<_ls.length; i++ ) + if( _ls[i]==0 && _xs[i]==Integer.MIN_VALUE ) nas++; + else if( _xs[i]==Integer.MIN_VALUE+1 ) ss++; + } + assert nas==_naCnt && ss==_strCnt : "na="+nas+" vs "+_naCnt+", str="+ss+" vs "+_strCnt; + return true; + } + public byte type() { + assert checkCnt(); if(_naCnt == _len) return AppendableVec.NA; if(_strCnt > 0 && _strCnt + _naCnt == _len) @@ -45,16 +62,19 @@ public byte type(){ protected final boolean isNA(int idx) { return (_ds == null) ? (_ls[idx] == 0 && _xs[idx] == Integer.MIN_VALUE) : Double.isNaN(_ds[idx]); } + protected final boolean isEnum(int idx) { + return _ls!=null && _xs[idx]==Integer.MIN_VALUE+1; + } - public void addEnum(int e) { append2(0, e ); ++_strCnt;} - public void addNA ( ) { append2(0,Integer.MIN_VALUE); ++_naCnt ;} + public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); ++_strCnt;} + public void addNA ( ) { append2(0,Integer.MIN_VALUE ); ++_naCnt ;} public void addNum(long val, int exp) { if(val == 0)exp = 0; // Canonicalize zero append2(val,exp); } // Fast-path append double data public void addNum(double d) { - if( _ls==null||_len >= _ls.length ) append2slowd(); + if( _ds==null||_len >= _ds.length ) append2slowd(); _ds[_len++] = d; } // Fast-path append long data @@ -209,41 +229,16 @@ Chunk compress() { long lemin= 0, lemax=lemin; // min/max at xmin fixed-point boolean overflow=false; boolean floatOverflow = false; + assert checkCnt(); if(_naCnt == _len) // ALL NAs, nothing to do return new C0DChunk(Double.NaN,_len); - // Enum? We assume that columns with ALL strings (and NAs) are enums if - // there were less than 65k unique vals. If there were some numbers, we - // assume it is a numcol with strings being NAs. - if( type() == AppendableVec.ENUM) { - // find their max val - int sz = Integer.MIN_VALUE; - for(int x:_xs) if(x > sz)sz = x; - if( sz < Enum.MAX_ENUM_SIZE ) { - if(sz < 255){ // we can fit into 1Byte - byte [] bs = MemoryManager.malloc1(_len); - for(int i = 0; i < _len; ++i) bs[i] = (byte)(_xs[i] >= 0 ? (0xFF&_xs[i]) : C1Chunk._NA); - return new C1Chunk(bs); - } else if( sz <= 65535 ) { // 2 bytes - int bias = 0, off = 0; - if(sz >= 32767){ - bias = 32767; - off = C2SChunk.OFF; - } - byte [] bs = MemoryManager.malloc1((_len << 1) + off); - for(int i = 0; i < _len; ++i){ - if(_xs[i] >= 0) assert (short)(_xs[i]-bias) == (_xs[i]-bias); - UDP.set2(bs, off + (i << 1), (short)((_xs[i] > 0)? _xs[i]-bias : C2Chunk._NA)); - } - return bias == 0 ? new C2Chunk(bs) : new C2SChunk(bs,bias,1); - } else throw H2O.unimpl(); - } - } + // If the data was set8 as doubles, we do a quick check to see if it's // plain longs. If not, we give up and use doubles. 
if( _ds != null ) { int i=0; - for( ; i<_len; i++ ) // Attempt to inject all doubles into ints + for( ; i<_len; i++ ) // Attempt to inject all doubles into longs if( !Double.isNaN(_ds[i]) && (double)(long)_ds[i] != _ds[i] ) break; if( i<_len ) return chunkD(); _ls = new long[_ds.length]; // Else flip to longs @@ -251,27 +246,27 @@ Chunk compress() { for( i=0; i<_len; i++ ) // Inject all doubles into longs if( Double.isNaN(_ds[i]) ) _xs[i] = Integer.MIN_VALUE; else _ls[i] = (long)_ds[i]; + _ds = null; } - // data in some fixed-point format. + // Data in some fixed-point format, not doubles boolean first = true; - boolean hasNA = false; - _naCnt=0; int nzCnt=0; // Non-zero count double min = Double.MAX_VALUE; double max = -Double.MAX_VALUE; for( int i=0; i<_len; i++ ) { - if( isNA(i) ) { hasNA = true; _naCnt++; continue;} + if( isNA(i) ) continue; long l = _ls[i]; int x = _xs[i]; + if( x==Integer.MIN_VALUE+1 ) x=0; // Replace enum flag with no scaling if( l!=0 ) nzCnt++; - // Compute per-chunk min/sum/max + assert l!=0 || x==0; // Exponent of zero is always zero + // Compute per-chunk min/max double d = l*DParseTask.pow10(x); if( d < min ) min = d; if( d > max ) max = d; - if( l==0 ) x=0; // Canonicalize zero exponent - long t; + long t; // Remove extra scaling while( l!=0 && (t=l/10)*10==l ) { l=t; x++; } floatOverflow = Math.abs(l) > MAX_FLOAT_MANTISSA; if( first ) { @@ -293,10 +288,9 @@ Chunk compress() { if( le < lemin ) lemin=le; if( le > lemax ) lemax=le; } - final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Constant column? - if( !hasNA && min==max ) { + if( _naCnt==0 && min==max ) { return ((long)min == min) ? new C0LChunk((long)min,_len) : new C0DChunk( min,_len); @@ -304,13 +298,15 @@ Chunk compress() { // Boolean column? if (max == 1 && min == 0 && xmin == 0) { - if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? - return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk - int bpv = _strCnt+_naCnt > 0 ? 2 : 1; + if( nzCnt*32 < _len && _naCnt==0 ) // Very sparse? + return new CX0Chunk(_ls,_len,nzCnt); // Sparse boolean chunk + int bpv = _strCnt+_naCnt > 0 ? 2 : 1; // Bit-vector byte[] cbuf = bufB(CBSChunk.OFF, bpv); return new CBSChunk(cbuf, cbuf[0], cbuf[1]); } + // Result column must hold floats? + final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Highly sparse but not a bitvector or constant? if( !fpoint && (nzCnt+_naCnt)*8 < _len && lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE )// Only handling unbiased shorts here @@ -375,7 +371,7 @@ private byte[] bufX( long bias, int scale, int off, int log ) { default: H2O.fail(); } } else { - int x = _xs[i]-scale; + int x = (_xs[i]==Integer.MIN_VALUE+1 ? 0 : _xs[i])-scale; long le = x >= 0 ? _ls[i]*DParseTask.pow10i( x) : _ls[i]/DParseTask.pow10i(-x); @@ -396,7 +392,7 @@ private byte[] bufX( long bias, int scale, int off, int log ) { private Chunk chunkD() { final byte [] bs = MemoryManager.malloc1(_len*8); for(int i = 0; i < _len; ++i) - UDP.set8d(bs, 8*i, _ds != null?_ds[i]:isNA0(i)?Double.NaN:_ls[i]*DParseTask.pow10(_xs[i])); + UDP.set8d(bs, 8*i, _ds != null?_ds[i]:(isNA(i)||isEnum(i))?Double.NaN:_ls[i]*DParseTask.pow10(_xs[i])); return new C8DChunk(bs); } @@ -436,17 +432,20 @@ private byte[] bufB(int off, int bpv) { // in-range and refer to the inflated values of the original Chunk. 
@Override boolean set_impl(int i, long l) { if( _ds != null ) throw H2O.unimpl(); + if( _xs[i]==Integer.MIN_VALUE+1 ) _naCnt--; _ls[i]=l; _xs[i]=0; return true; } @Override boolean set_impl(int i, double d) { - if( _ls != null ) { - _ds = MemoryManager.malloc8d(_len); + if( _ls != null ) { // Flip to using doubles + double ds[] = MemoryManager.malloc8d(_len); for( int j = 0; j<_len; j++ ) - _ds[j] = _ls[j]*Math.pow(10,_xs[j]); - _ls = null; _xs = null; + ds[j] = (isNA(j) || isEnum(j)) ? Double.NaN : _ls[j]*Math.pow(10,_xs[j]); + _ds = ds; _ls = null; _xs = null; } + if( Double.isNaN(_ds[i]) ) _naCnt--; _ds[i]=d; + if( Double.isNaN( d ) ) _naCnt++; return true; } @Override boolean set_impl(int i, float f) { return set_impl(i,(double)f); } diff --git a/src/test/java/water/fvec/CBSChunkTest.java b/src/test/java/water/fvec/CBSChunkTest.java index 6db4bff1ad..a7385e7561 100644 --- a/src/test/java/water/fvec/CBSChunkTest.java +++ b/src/test/java/water/fvec/CBSChunkTest.java @@ -3,10 +3,12 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import org.junit.BeforeClass; import org.junit.Ignore; import org.junit.Test; - import water.Futures; +import water.TestUtil; +import water.UKV; /** Test for CBSChunk implementation. * @@ -17,7 +19,8 @@ * expected results. In this case expectation is little bit missused * since it is used to avoid DKV call. * */ -public class CBSChunkTest { +public class CBSChunkTest extends TestUtil { + @BeforeClass public static void stall() { stall_till_cloudsize(1); } void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expNA) { AppendableVec av = new AppendableVec(Vec.newKey()); @@ -43,6 +46,7 @@ void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expN for( int i=0; i Date: Mon, 11 Nov 2013 21:20:21 -0800 Subject: [PATCH 10/11] Uniformly cleanup enums & numbers If the Chunk will be enums, nuke all the numbers. If the Chunk will be numbers, nuke all the enums. --- src/main/java/water/fvec/NewChunk.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index 1b74823f01..a6ba349592 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -231,8 +231,15 @@ Chunk compress() { boolean floatOverflow = false; assert checkCnt(); - if(_naCnt == _len) // ALL NAs, nothing to do + byte mode = type(); + if( mode==AppendableVec.NA ) // ALL NAs, nothing to do return new C0DChunk(Double.NaN,_len); + for( int i=0; i<_len; i++ ) + if( mode==AppendableVec.ENUM && !isEnum(i) || + mode==AppendableVec.NUMBER && isEnum(i) ) + setNA_impl(i); + if( mode==AppendableVec.NUMBER ) _strCnt=0; + assert checkCnt(); // If the data was set8 as doubles, we do a quick check to see if it's // plain longs. If not, we give up and use doubles. From cdff5e6d9a2000b2bc18ead164e12c911bd2654d Mon Sep 17 00:00:00 2001 From: Cliff Click Date: Tue, 12 Nov 2013 08:46:11 -0800 Subject: [PATCH 11/11] swap out tricky incremental math for simple bulk math All rollups done in bulk at the start of NewChunk.compress(). Remove some commented-out code. 
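The patch below replaces the incremental _naCnt/_strCnt bookkeeping with one lazy bulk pass the first time type() needs the rollups. A minimal sketch of that pass (hypothetical class, not the actual NewChunk code), using the NA/Enum encoding the earlier NewChunk comments define (NA: _ls==0 && _xs==Integer.MIN_VALUE; Enum: _xs==Integer.MIN_VALUE+1):

  final class RollupSketch {
    int naCnt, strCnt, nzCnt;
    void rollup(long[] ls, int[] xs, double[] ds, int len) {
      naCnt = strCnt = nzCnt = 0;
      if (ds != null) {                      // doubles-only representation
        for (int i = 0; i < len; i++)
          if (Double.isNaN(ds[i])) naCnt++; else if (ds[i] != 0) nzCnt++;
      } else if (ls != null) {               // scaled-decimal (mantissa/exponent) representation
        for (int i = 0; i < len; i++) {
          if (ls[i] == 0 && xs[i] == Integer.MIN_VALUE) naCnt++;
          else {
            if (xs[i] == Integer.MIN_VALUE + 1) strCnt++;  // enum code
            if (ls[i] != 0) nzCnt++;
          }
        }
      }
    }
    public static void main(String[] args) {
      RollupSketch r = new RollupSketch();
      // two numbers (one of them zero), one NA, one enum code 3
      r.rollup(new long[]{5, 0, 0, 3}, new int[]{0, 0, Integer.MIN_VALUE, Integer.MIN_VALUE + 1}, null, 4);
      System.out.println(r.naCnt + " " + r.strCnt + " " + r.nzCnt); // prints: 1 1 2
    }
  }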
--- prj.el | 2 +- src/main/java/water/fvec/NewChunk.java | 187 ++++----------------- src/test/java/water/fvec/CBSChunkTest.java | 2 +- 3 files changed, 36 insertions(+), 155 deletions(-) diff --git a/prj.el b/prj.el index 66a800ada8..d257036cb3 100644 --- a/prj.el +++ b/prj.el @@ -7,7 +7,7 @@ '(jde-run-option-debug nil) '(jde-run-option-vm-args nil) '(jde-compile-option-directory "./target/classes") - '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.ParserTest2"))) + '(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.CBSChunkTest" "water.fvec.ParserTest2"))) '(jde-debugger (quote ("JDEbug"))) '(jde-compile-option-source (quote ("1.6"))) '(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar"))) diff --git a/src/main/java/water/fvec/NewChunk.java b/src/main/java/water/fvec/NewChunk.java index a6ba349592..43f317534b 100644 --- a/src/main/java/water/fvec/NewChunk.java +++ b/src/main/java/water/fvec/NewChunk.java @@ -24,8 +24,9 @@ public class NewChunk extends Chunk { transient long _ls[]; // Mantissa transient int _xs[]; // Exponent, or if _ls==0, NA or Enum transient double _ds[]; // Doubles, for inflating via doubles - int _naCnt; // Count of NA's appended + int _naCnt=-1; // Count of NA's appended int _strCnt; // Count of Enum's appended + int _nzCnt; // Count of non-zero's appended public NewChunk( Vec vec, int cidx ) { _vec = vec; _cidx = cidx; } @@ -35,24 +36,26 @@ public NewChunk( Chunk C ) { _len = C._len; } - // Assert rollup counts are correct - private boolean checkCnt() { - int nas=0, ss=0; - if( _ds != null ) { - assert _ls==null && _xs==null; - for( double d : _ds ) if( Double.isNaN(d) ) nas++; - } else { - assert _ds==null; - if( _ls != null ) - for( int i=0; i<_ls.length; i++ ) - if( _ls[i]==0 && _xs[i]==Integer.MIN_VALUE ) nas++; - else if( _xs[i]==Integer.MIN_VALUE+1 ) ss++; - } - assert nas==_naCnt && ss==_strCnt : "na="+nas+" vs "+_naCnt+", str="+ss+" vs "+_strCnt; - return true; - } + // Heuristic to decide the basic type of a column public byte type() { - assert checkCnt(); + if( _naCnt == -1 ) { // No rollups yet? 
+ int nas=0, ss=0, nzs=0; + if( _ds != null ) { + assert _ls==null && _xs==null; + for( double d : _ds ) if( Double.isNaN(d) ) nas++; else if( d!=0 ) nzs++; + } else { + assert _ds==null; + if( _ls != null ) + for( int i=0; i<_ls.length; i++ ) + if( isNA(i) ) nas++; + else { + if( isEnum(i) ) ss++; + if( _ls[i] != 0 ) nzs++; + } + } + _nzCnt=nzs; _strCnt=ss; _naCnt=nas; + } + // Now run heuristic for type if(_naCnt == _len) return AppendableVec.NA; if(_strCnt > 0 && _strCnt + _naCnt == _len) @@ -66,10 +69,10 @@ protected final boolean isEnum(int idx) { return _ls!=null && _xs[idx]==Integer.MIN_VALUE+1; } - public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); ++_strCnt;} - public void addNA ( ) { append2(0,Integer.MIN_VALUE ); ++_naCnt ;} + public void addEnum(int e) { append2(e,Integer.MIN_VALUE+1); } + public void addNA ( ) { append2(0,Integer.MIN_VALUE ); } public void addNum(long val, int exp) { - if(val == 0)exp = 0; // Canonicalize zero + if( val == 0 ) exp = 0;// Canonicalize zero append2(val,exp); } // Fast-path append double data @@ -84,14 +87,14 @@ void append2( long l, int x ) { _xs[_len++] = x; } // Slow-path append data - void append2slowd( ) { + private void append2slowd( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); assert _ls==null; _ds = _ds==null ? MemoryManager.malloc8d(4) : MemoryManager.arrayCopyOf(_ds,_len<<1); } // Slow-path append data - void append2slow( ) { + private void append2slow( ) { if( _len > Vec.CHUNK_SZ ) throw new ArrayIndexOutOfBoundsException(_len); assert _ds==null; @@ -99,117 +102,6 @@ void append2slow( ) { _ls = _ls==null ? MemoryManager.malloc8(4) : MemoryManager.arrayCopyOf(_ls,_len<<1); } - /* - * - * - * - * private long attemptTimeParse( ValueString str ) { - long t0 = attemptTimeParse_0(str); // "yyyy-MM-dd HH:mm:ss.SSS" - if( t0 != Long.MIN_VALUE ) return t0; - long t1 = attemptTimeParse_1(str); // "dd-MMM-yy" - if( t1 != Long.MIN_VALUE ) return t1; - return Long.MIN_VALUE; - } - // So I just brutally parse "yyyy-MM-dd HH:mm:ss.SSS" - private long attemptTimeParse_0( ValueString str ) { - final byte[] buf = str._buf; - int i=str._off; - final int end = i+str._length; - while( i < end && buf[i] == ' ' ) i++; - if ( i < end && buf[i] == '"' ) i++; - if( (end-i) < 19 ) return Long.MIN_VALUE; - int yy=0, MM=0, dd=0, HH=0, mm=0, ss=0, SS=0; - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - yy = digit(yy,buf[i++]); - if( yy < 1970 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - MM = digit(MM,buf[i++]); - MM = digit(MM,buf[i++]); - if( MM < 1 || MM > 12 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - dd = digit(dd,buf[i++]); - dd = digit(dd,buf[i++]); - if( dd < 1 || dd > 31 ) return Long.MIN_VALUE; - if( buf[i++] != ' ' ) return Long.MIN_VALUE; - HH = digit(HH,buf[i++]); - HH = digit(HH,buf[i++]); - if( HH < 0 || HH > 23 ) return Long.MIN_VALUE; - if( buf[i++] != ':' ) return Long.MIN_VALUE; - mm = digit(mm,buf[i++]); - mm = digit(mm,buf[i++]); - if( mm < 0 || mm > 59 ) return Long.MIN_VALUE; - if( buf[i++] != ':' ) return Long.MIN_VALUE; - ss = digit(ss,buf[i++]); - ss = digit(ss,buf[i++]); - if( ss < 0 || ss > 59 ) return Long.MIN_VALUE; - if( i 999 ) return Long.MIN_VALUE; - } - if( i 31 ) return Long.MIN_VALUE; - if( buf[i++] != '-' ) return Long.MIN_VALUE; - byte[]mm=null; - OUTER: for( ; MM 0 ? 2 : 1; // Bit-vector + if( _nzCnt*32 < _len && _naCnt==0 ) // Very sparse? 
+ return new CX0Chunk(_ls,_len,_nzCnt); // Sparse boolean chunk + int bpv = _strCnt+_naCnt > 0 ? 2 : 1; // Bit-vector byte[] cbuf = bufB(CBSChunk.OFF, bpv); return new CBSChunk(cbuf, cbuf[0], cbuf[1]); } @@ -315,9 +204,9 @@ Chunk compress() { // Result column must hold floats? final boolean fpoint = xmin < 0 || min < Long.MIN_VALUE || max > Long.MAX_VALUE; // Highly sparse but not a bitvector or constant? - if( !fpoint && (nzCnt+_naCnt)*8 < _len && - lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE )// Only handling unbiased shorts here - return new CX2Chunk(_ls,_xs,_len,nzCnt,_naCnt); // Sparse byte chunk + if( !fpoint && (_nzCnt+_naCnt)*8 < _len && + lemin > Short.MIN_VALUE && lemax <= Short.MAX_VALUE ) // Only handling unbiased shorts here + return new CX2Chunk(_ls,_xs,_len,_nzCnt,_naCnt); // Sparse byte chunk // Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a // byte and we scale the column by 0.1. A set of numbers like @@ -426,10 +315,6 @@ private byte[] bufB(int off, int bpv) { bs[1] = (byte) bpv; // Flush last byte if (boff>0) bs[idx++] = b; - /*for (int i=0; i
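A closing note on the boolean/bit-vector path touched above: when a column holds only 0s and 1s, bufB() packs values at bpv bits apiece, switching to 2 bits per value as soon as NAs or enum codes are present so a third state can be represented. A rough sketch of that packing (hypothetical encoding and names; the real CBSChunk also writes a gap/bpv header into its first bytes, and its exact bit codes and bit order may differ):

  final class BitPackSketch {
    // vals: 0, 1, or -1 for NA
    static byte[] packBits(int[] vals, boolean hasNA) {
      int bpv = hasNA ? 2 : 1;                      // bits per value
      byte[] buf = new byte[(vals.length * bpv + 7) / 8];
      for (int i = 0; i < vals.length; i++) {
        int code = vals[i] < 0 ? 2 : vals[i];       // binary 10 stands for NA in this sketch
        int bit = i * bpv;
        buf[bit >> 3] |= code << (bit & 7);         // a value never straddles a byte for bpv of 1 or 2
      }
      return buf;
    }
    public static void main(String[] args) {
      byte[] b = packBits(new int[]{1, 0, -1, 1}, true); // four values at 2 bits each -> one byte
      System.out.printf("0x%02x%n", b[0]);               // prints 0x61
    }
  }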