Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
arnocandel committed Mar 11, 2014
2 parents aab568b + 1984de6 commit ef2e92d
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 15 deletions.
4 changes: 2 additions & 2 deletions h2o-perf/bench/tests/gbm_covtype/parse.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,10 @@ source("../../R/h2oPerf/prologue.R")

data_source <<- "home-0xdiag-datasets"

trainData <<- "/home/0xdiag/datasets/standard/covtype200x.data"
trainData <<- "/home/0xdiag/datasets/standard/covtype20x.data"
response <<- "C55"

num_train_rows <<- 116202400
num_train_rows <<- 11620240
num_explan_cols <<- 54

upload.VA("parsed.hex", trainData)
Expand Down
2 changes: 1 addition & 1 deletion h2o-perf/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,7 @@ <h4 class="panel-title">

<form action="../prototype/php/post.php" method="post">
<div>
<textarea name="parse_1" style="width: 700px; height: 700px;">i
<textarea name="parse_1" style="width: 700px; height: 700px;">
SELECT
FROM_UNIXTIME(tr.start_epoch_ms / 1000, '%Y-%m-%d') as run_date,
tr.test_name,
Expand Down
8 changes: 4 additions & 4 deletions lib/resources/h2o/css/main.css
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,11 @@ body {
}

td {
min-width: 125px;
min-width: 115px;
}

.table th, .table td {
text-align: center;
}
/*.table th, .table td {*/
/*text-align: left;*/
/*}*/


77 changes: 73 additions & 4 deletions py/testdir_single_jvm/test_exec_enums_rand_cut.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import unittest, random, sys, time, re
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_glm, h2o_util
import h2o_gbm
import getpass

# details:
# we want to seed a random dictionary for our enums
Expand All @@ -16,6 +18,13 @@
MIN_ENUM_WIDTH = 2
MAX_ENUM_WIDTH = 8
RAND_ENUM_LENGTH = True

DO_PLOT = getpass.getuser()=='kevin'

DO_MEDIAN = True
MAX_QBINS = 1000
MULTI_PASS = 1

def random_enum(n, randChars=randChars, quoteChars=quoteChars):
# randomly return None 10% of the time
if random.randint(0,9)==0:
Expand Down Expand Up @@ -74,6 +83,7 @@ def write_syn_dataset(csvPathname, rowCount, inCount=1, outCount=1, SEED='123456
dsf.write(rowDataCsv)
dsf.close()


class Basic(unittest.TestCase):
def tearDown(self):
h2o.check_sandbox_for_errors()
Expand All @@ -90,7 +100,7 @@ def setUpClass(cls):

@classmethod
def tearDownClass(cls):
    """Tear down the H2O cloud once the tests in this class have finished."""
    # NOTE(review): an unconditional h2o.sleep(3600) used to run here before
    # the teardown -- presumably a debugging aid for keeping the cloud up for
    # manual inspection. It stalled every run for an hour, so it is removed;
    # re-add it locally when interactive browsing is needed.
    h2o.tear_down_cloud()

def test_exec_enums_rand_cut(self):
Expand All @@ -104,9 +114,18 @@ def test_exec_enums_rand_cut(self):
(n, 10, 9, 'cE', 300),
]

### h2b.browseTheCloud()
# create key names to use for exec
eKeys = ['e%s' % i for i in range(10)]

h2b.browseTheCloud()
trial = 0
xList = []
eList = []
fList = []
for repeat in range(10):
for (rowCount, iColCount, oColCount, hex_key, timeoutSecs) in tryList:

# CREATE DATASET*******************************************
colCount = iColCount + oColCount
SEEDPERFILE = random.randint(0, sys.maxint)
csvFilename = 'syn_enums_' + str(rowCount) + 'x' + str(colCount) + '.csv'
Expand All @@ -115,7 +134,13 @@ def test_exec_enums_rand_cut(self):
print "Creating random", csvPathname
write_syn_dataset(csvPathname, rowCount, iColCount, oColCount, SEEDPERFILE)

parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=hex_key, timeoutSecs=30)
# PARSE*******************************************************
# should be two different keys in the sample
e = random.sample(eKeys,2)
fKey = e[0]
eKey = e[1]

parseResult = h2i.import_parse(path=csvPathname, schema='put', hex_key=eKey, timeoutSecs=30)
print "Parse result['destination_key']:", parseResult['destination_key']
inspect = h2o_cmd.runInspect(key=parseResult['destination_key'])
print h2o.dump_json(inspect)
Expand All @@ -126,12 +151,56 @@ def test_exec_enums_rand_cut(self):
# error if any col has constant values
if len(constantValuesDict) != 0:
raise Exception("Probably got a col NA'ed and constant values as a result %s" % constantValuesDict)

# EXEC*******************************************************
# don't use exec_expr to avoid issues with Inspect following etc.
randICol = random.randint(0,iColCount-1)
randOCol = random.randint(iColCount, iColCount+oColCount-1)

start = time.time()
h2o.nodes[0].exec_query(str='%s=%s[%s,]' % (fKey, eKey, randOCol))
elapsed = time.time() - start
print "exec1 end on ", csvFilename, 'took', elapsed, 'seconds.'
execTime = elapsed

gKey = random.choice(eKeys)

start = time.time()
h2o.nodes[0].exec_query(str='%s=%s' % (gKey, fKey))
print "exec2 end on ", csvFilename, 'took', elapsed, 'seconds.'
elapsed = time.time() - start
print "predict end on ", csvFilename, 'took', elapsed, 'seconds.'
execTime = elapsed

# QUANTILE*******************************************************
quantile = 0.5 if DO_MEDIAN else .999
# first output col
column = iColCount
start = time.time()
q = h2o.nodes[0].quantiles(source_key=fKey, column=column, quantile=quantile, max_qbins=MAX_QBINS, multiple_pass=MULTI_PASS)
elapsed = time.time() - start
print "quantile end on ", csvFilename, 'took', elapsed, 'seconds.'
quantileTime = elapsed


# remove all keys*******************************************************
start = time.time()
h2o.nodes[0].remove_all_keys()
elapsed = time.time() - start
print "remove all keys end on ", csvFilename, 'took', elapsed, 'seconds.'

trial += 1
xList.append(trial)
eList.append(execTime)
fList.append(quantileTime)

if DO_PLOT:
xLabel = 'trial'
eLabel = 'exec cut time'
fLabel = 'quantile time'
eListTitle = ""
fListTitle = ""
h2o_gbm.plotLists(xList, xLabel, eListTitle, eList, eLabel, fListTitle, fList, fLabel)



if __name__ == '__main__':
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/hex/Quantiles.java
Original file line number Diff line number Diff line change
Expand Up @@ -187,11 +187,13 @@ else if ( !Double.isNaN(_min) ) {
}
else { // vec does not contain finite numbers
// do we care here? have to think about whether multiPass is disabled/
// okay, this one-entry hcnt2 stuff is making the algo die (I guess the min was NaN above)
// for now, just make it length 2
_start2 = vec.min();
_binsz2 = Double.POSITIVE_INFINITY;
hcnt2 = new long[1];
hcnt2_min = new double[1];
hcnt2_max = new double[1];
hcnt2 = new long[2];
hcnt2_min = new double[2];
hcnt2_max = new double[2];
}
hcnt2_low = 0;
hcnt2_high = 0;
Expand Down Expand Up @@ -527,7 +529,8 @@ else if ( hcnt2[k]==1 && targetCntFract!=0 ) {
// Just need to check the one bin below and above k, if they exist.
// They might have zero entries, but then it's okay to ignore them.
// update: use the closest edge in the next bin. better forward progress for small bin counts
// This code may make the practical min bin count around 4 or so (not 2)
// This code may make the practical min bin count around 4 or so (not 2).
// What has a length-1 hcnt2 that makes this fail? Enums? Shouldn't get here.
newValStart = hcnt2_min[k];
if ( k > 0 ) {
if ( hcnt2[k-1]>0 && (hcnt2_max[k-1]<hcnt2_min[k]) ) {
Expand All @@ -539,6 +542,7 @@ else if ( hcnt2[k]==1 && targetCntFract!=0 ) {
// k might be pointing to one less than that (like k=0 for 1 bin case)
newValEnd = hcnt2_max[k];
if ( k < (maxBinCnt-1) ) {
assert k+1 < hcnt2.length : k+" "+hcnt2.length+" "+_valMaxBinCnt+" "+_isEnum+" "+_isInt;
if ( hcnt2[k+1]>0 && (hcnt2_min[k+1]>hcnt2_max[k]) ) {
newValEnd = hcnt2_min[k+1];
}
Expand Down

0 comments on commit ef2e92d

Please sign in to comment.