Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
anqi committed Dec 5, 2013
2 parents 4c74d17 + cd8be92 commit b0d4b40
Show file tree
Hide file tree
Showing 8 changed files with 186 additions and 99 deletions.
1 change: 1 addition & 0 deletions R/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ ifeq ($(PDFLATEX),)
else
R CMD Rd2pdf --force --output="h2o-package/h2o_package.pdf" --title="Package 'h2o'" --no-index --no-preview h2o-package/man 1> /dev/null
endif
mkdir -p h2o-package/inst/java/
cp -f ../target/h2o.jar h2o-package/inst/java/h2o.jar
R CMD build h2o-package
mkdir -p ../target/R
Expand Down
13 changes: 8 additions & 5 deletions py/h2o_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,17 +473,20 @@ def sleep_with_dot(sec, message=None):
dot()
count += 1

def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, outputClass, outputCol, changeToBinomial=False):
def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent,
outputClass=None, outputCol=None, changeToBinomial=False):
# will have to live with random extract. will create variance

print "train: get random", trainPercent
print "test: get remaining", 100 - trainPercent
if changeToBinomial:
print "change class", outputClass, "to 1, everything else to 0. factor() to turn real to int (for rf)"

boundary = (trainPercent + 0.0)/100

execExpr = ""
execExpr += "cct.hex=runif(%s);" % srcKey
execExpr += "%s=%s[cct.hex%s,];" % (trainDstKey, srcKey, '<=0.9')
execExpr += "%s=%s[cct.hex<=%s,];" % (trainDstKey, srcKey, boundary)
if changeToBinomial:
execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, outputCol+1, trainDstKey, outputCol+1, outputClass)
execExpr += "factor(%s[, %s]);" % (trainDstKey, outputCol+1)
Expand All @@ -493,10 +496,10 @@ def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent, outputClass,
inspect = runInspect(key=trainDstKey)
infoFromInspect(inspect, "%s after mungeDataset on %s" % (trainDstKey, srcKey) )

print "test: same, but use the same runif() random result, complement"
print "test: same, but use the same runif() random result, complement comparison"

execExpr = "cct.hex=runif(%s);" % srcKey
execExpr += "%s=%s[cct.hex%s,];" % (testDstKey, srcKey, '>0.9')
execExpr = ""
execExpr += "%s=%s[cct.hex>%s,];" % (testDstKey, srcKey, boundary)
if changeToBinomial:
execExpr += "%s[,%s]=%s[,%s]==%s;" % (testDstKey, outputCol+1, testDstKey, outputCol+1, outputClass)
execExpr += "factor(%s[, %s])" % (testDstKey, outputCol+1)
Expand Down
157 changes: 105 additions & 52 deletions py/h2o_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,13 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
# h2o GLM will verboseprint the result and print errors.
# so don't have to do that
# different when cross validation is used? No trainingErrorDetails?
GLMModel = glm['GLMModel']
if h2o.beta_features:
GLMModel = glm['glm_model']
else:
GLMModel = glm['GLMModel']

warnings = None
if 'warnings' in GLMModel:
if 'warnings' in GLMModel and GLMModel['warnings']:
warnings = GLMModel['warnings']
# stop on failed
x = re.compile("failed", re.IGNORECASE)
Expand All @@ -128,82 +132,127 @@ def simpleCheckGLM(self, glm, colX, allowFailWarning=False, allowZeroCoeff=False
# not in GLMGrid?

# FIX! don't get GLMParams if it can't solve?
GLMParams = GLMModel["GLMParams"]
if h2o.beta_features:
GLMParams = GLMModel['glm']
else:
GLMParams = GLMModel["GLMParams"]
family = GLMParams["family"]

iterations = GLMModel['iterations']
if h2o.beta_features:
submodels0 = GLMModel['submodels'][0]
iterations = submodels0['iteration']
else:
iterations = GLMModel['iterations']

print "GLMModel/iterations:", iterations

# if we hit the max_iter, that means it probably didn't converge. should be 1-maxExpectedIter
if maxExpectedIterations is not None and iterations > maxExpectedIterations:
raise Exception("Convergence issue? GLM did iterations: %d which is greater than expected: %d" % (iterations, maxExpectedIterations) )

# pop the first validation from the list
if 'validations' not in GLMModel:
raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))

validationsList = GLMModel['validations']
# don't want to modify validationsList in case someone else looks at it
validations = validationsList[0]
if h2o.beta_features:
if 'validation' not in submodels0:
raise Exception("Should be a 'validations' key in submodels0: %s" % h2o.dump_json(submodels0))
validationsList = submodels0['validation']
validations = validationsList

else:
# pop the first validation from the list
if 'validations' not in GLMModel:
raise Exception("Should be a 'validations' key in GLMModel: %s" % h2o.dump_json(GLMModel))
validationsList = GLMModel['validations']
# don't want to modify validationsList in case someone else looks at it
validations = validationsList[0]

# xval. compare what we asked for and what we got.
n_folds = kwargs.setdefault('n_folds', None)
if not 'xval_models' in validations:
if n_folds > 1:
raise Exception("No cross validation models returned. Asked for "+n_folds)
else:
xval_models = validations['xval_models']
if n_folds and n_folds > 1:
if len(xval_models) != n_folds:
raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)

# not checked in v2?
if not h2o.beta_features:
if not 'xval_models' in validations:
if n_folds > 1:
raise Exception("No cross validation models returned. Asked for "+n_folds)
else:
# should be default 10?
if len(xval_models) != 10:
raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

print "GLMModel/validations"
validations['err'] = h2o_util.cleanseInfNan(validations['err'])
validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
print "%15s %s" % ("err:\t", validations['err'])
print "%15s %s" % ("nullDev:\t", validations['nullDev'])
print "%15s %s" % ("resDev:\t", validations['resDev'])
xval_models = validations['xval_models']
if n_folds and n_folds > 1:
if len(xval_models) != n_folds:
raise Exception(len(xval_models)+" cross validation models returned. Asked for "+n_folds)
else:
# should be default 10?
if len(xval_models) != 10:
raise Exception(str(len(xval_models))+" cross validation models returned. Default should be 10")

if h2o.beta_features:
print "GLMModel/validations"
validations['avg_err'] = h2o_util.cleanseInfNan(validations['avg_err'])
validations['null_deviance'] = h2o_util.cleanseInfNan(validations['null_deviance'])
validations['residual_deviance'] = h2o_util.cleanseInfNan(validations['residual_deviance'])
print "%15s %s" % ("avg_err:\t", validations['avg_err'])
print "%15s %s" % ("null_deviance:\t", validations['null_deviance'])
print "%15s %s" % ("residual_deviance:\t", validations['residual_deviance'])

else:
print "GLMModel/validations"
validations['err'] = h2o_util.cleanseInfNan(validations['err'])
validations['nullDev'] = h2o_util.cleanseInfNan(validations['nullDev'])
validations['resDev'] = h2o_util.cleanseInfNan(validations['resDev'])
print "%15s %s" % ("err:\t", validations['err'])
print "%15s %s" % ("nullDev:\t", validations['nullDev'])
print "%15s %s" % ("resDev:\t", validations['resDev'])

# threshold only there if binomial?
# auc only for binomial
if family=="binomial":
print "%15s %s" % ("auc:\t", validations['auc'])
print "%15s %s" % ("threshold:\t", validations['threshold'])
if h2o.beta_features:
print "%15s %s" % ("best_threshold:\t", validations['best_threshold'])
else:
print "%15s %s" % ("threshold:\t", validations['threshold'])

if family=="poisson" or family=="gaussian":
print "%15s %s" % ("aic:\t", validations['aic'])

if math.isnan(validations['err']):
emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
raise Exception(emsg)
if not h2o.beta_features:
if math.isnan(validations['err']):
emsg = "Why is this err = 'nan'?? %6s %s" % ("err:\t", validations['err'])
raise Exception(emsg)

if math.isnan(validations['resDev']):
emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
raise Exception(emsg)
if math.isnan(validations['resDev']):
emsg = "Why is this resDev = 'nan'?? %6s %s" % ("resDev:\t", validations['resDev'])
raise Exception(emsg)

# legal?
if math.isnan(validations['nullDev']):
## emsg = "Why is this nullDev = 'nan'?? %6s %s" % ("nullDev:\t", validations['nullDev'])
## raise Exception(emsg)
pass
# legal?
if math.isnan(validations['nullDev']):
pass

# get a copy, so we don't destroy the original when we pop the intercept
if doNormalized:
coefficients = GLMModel['normalized_coefficients'].copy()
if h2o.beta_features:
if doNormalized:
coefficients = submodels0['normalized_beta'].copy()
else:
print "beta:", submodels0['beta']
coefficients = list(submodels0['beta']) # copy

else:
coefficients = GLMModel['coefficients'].copy()
if doNormalized:
coefficients = GLMModel['normalized_coefficients'].copy()
else:
coefficients = GLMModel['coefficients'].copy()

if h2o.beta_features:
column_names = GLMModel['_names']
print "column_names:", column_names
else:
column_names = GLMModel['column_names']
# get the intercept out of there into its own dictionary
intercept = coefficients.pop('Intercept', None)

column_names = GLMModel['column_names']
# get the intercept out of there into its own dictionary
intercept = coefficients.pop('Intercept', None)
# have to skip the output col! get it from kwargs
# better always be there!
y = kwargs['y']
if h2o.beta_features:
y = kwargs['response']
else:
y = kwargs['y']


# the dict keys are column headers if they exist...how to order those? new: use the 'column_names'
Expand Down Expand Up @@ -291,9 +340,13 @@ def add_to_coefficient_list_and_string(c,cList,cString):
"sum of abs. value of GLM coefficients/intercept is " + str(s) + ", not >= 1e-26"
))

print "GLMModel model time (milliseconds):", GLMModel['model_time']
print "GLMModel validation time (milliseconds):", validations['val_time']
print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']
if h2o.beta_features:
print "submodels0, run_time (milliseconds):", submodels0['run_time']
else:

print "GLMModel model time (milliseconds):", GLMModel['model_time']
print "GLMModel validation time (milliseconds):", validations['val_time']
print "GLMModel lsm time (milliseconds):", GLMModel['lsm_time']

# shouldn't have any errors
h2o.check_sandbox_for_errors()
Expand Down
2 changes: 1 addition & 1 deletion py/testdir_multi_jvm/test_rf_covtype_train_oobe_fvec.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def rf_covtype_train_oobe(self, csvFilename, checkExpectedResults=True):
# just do random split for now
dataKeyTrain = 'rTrain.hex'
dataKeyTest = 'rTest.hex'
h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=0.90, outputClass=4,
h2o_cmd.createTestTrain(hex_key, dataKeyTrain, dataKeyTest, trainPercent=90, outputClass=4,
outputCol=numCols-1, changeToBinomial=not DO_MULTINOMIAL)
sliceResult = {'destination_key': dataKeyTrain}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ def setUpClass(cls):
def tearDownClass(cls):
h2o.tear_down_cloud()

def test_GLM_covtype_train(self):
def test_GLM2_covtype_train(self):
h2o.beta_features = True
importFolderPath = "standard"
csvFilename = 'covtype.shuffled.data'
csvPathname = importFolderPath + "/" + csvFilename
Expand All @@ -32,70 +33,64 @@ def test_GLM_covtype_train(self):

inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
print "\n" + csvPathname, \
" num_rows:", "{:,}".format(inspect['num_rows']), \
" num_cols:", "{:,}".format(inspect['num_cols'])
" numRows:", "{:,}".format(inspect['numRows']), \
" numCols:", "{:,}".format(inspect['numCols'])

# how many rows for each pct?
num_rows = inspect['num_rows']
pct10 = int(num_rows * .1)
numRows = inspect['numRows']
pct10 = int(numRows * .1)
rowsForPct = [i * pct10 for i in range(0,11)]
# this can be slightly less than 10%
last10 = num_rows - rowsForPct[9]
last10 = numRows - rowsForPct[9]
rowsForPct[10] = last10
# use mod below for picking "rows-to-do" in case we do more than 9 trials
# use 10 if 0 just to see (we copied 10 to 0 above)
rowsForPct[0] = rowsForPct[10]

print "Creating the key of the last 10% data, for scoring"
dataKeyTest = "rTest"
trainDataKey = "rTrain"
testDataKey = "rTest"
# start at 90% rows + 1

execExpr = dataKeyTest + " = slice(" + hex_key + "," + str(rowsForPct[9]+1) + ")"
h2o_exec.exec_expr(None, execExpr, resultKey=dataKeyTest, timeoutSecs=10)
h2o_cmd.createTestTrain(srcKey=hex_key, trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
# will have to live with random extract. will create variance

kwargs = {
'y': 54,
'response': 'C54',
'max_iter': 20,
'n_folds': 0,
'thresholds': 0.5,
'alpha': 0.1,
'lambda': 1e-5,
'family': 'binomial',
'case_mode': '=',
'case': 2
'case_val': 4,
}
timeoutSecs = 60

for trial in range(10):
# always slice from the beginning
rowsToUse = rowsForPct[trial%10]
resultKey = "r" + str(trial)
execExpr = resultKey + " = slice(" + hex_key + ",1," + str(rowsToUse) + ")"
h2o_exec.exec_expr(None, execExpr, resultKey=resultKey, timeoutSecs=10)
parseResult['destination_key'] = resultKey
# adjust timeoutSecs with the number of trees
# seems ec2 can be really slow

h2o_cmd.createTestTrain(srcKey=hex_key, trainDstKey=trainDataKey, testDstKey=testDataKey, trainPercent=90)
parseResult['destination_key'] = trainDataKey

start = time.time()
glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, **kwargs)
print "glm end on ", parseResult['destination_key'], 'took', time.time() - start, 'seconds'
h2o_glm.simpleCheckGLM(self, glm, None, **kwargs)

GLMModel = glm['GLMModel']
modelKey = GLMModel['model_key']
modelKey = glm['glm_model']['_selfKey']

start = time.time()
glmScore = h2o_cmd.runGLMScore(key=dataKeyTest, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
print "glmScore end on ", dataKeyTest, 'took', time.time() - start, 'seconds'
### print h2o.dump_json(glmScore)
glmScore = h2o_cmd.runGLMScore(key=testDataKey, model_key=modelKey, thresholds="0.5", timeoutSecs=timeoutSecs)
print "glmScore end on ", testDataKey, 'took', time.time() - start, 'seconds'
classErr = glmScore['validation']['classErr']
auc = glmScore['validation']['auc']
err = glmScore['validation']['err']
print "classErr:", classErr
print "err:", err
print "auc:", auc

print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/num_rows), "pct. of all rows"
print "Trial #", trial, "completed", "using %6.2f" % (rowsToUse*100.0/numRows), "pct. of all rows"

if __name__ == '__main__':
h2o.unit_main()
Loading

0 comments on commit b0d4b40

Please sign in to comment.