Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
cliffclick committed Jul 31, 2013
2 parents 60a753d + 4658b04 commit 7c67304
Show file tree
Hide file tree
Showing 21 changed files with 706 additions and 54 deletions.
14 changes: 6 additions & 8 deletions py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -538,7 +538,7 @@ def check_sandbox_for_errors(sandbox_ignore_errors=False):
# don't detect these class loader info messages as errors
#[Loaded java.lang.Error from /usr/lib/jvm/java-7-oracle/jre/lib/rt.jar]
foundBad = regex1.search(line) and not (
('error rate' in line) or ('[Loaded ' in line) or
('error rate' in line) or ('[Loaded ' in line) or ('class.error' in line) or
('[WARN]' in line) or ('CalcSquareErrorsTasks' in line))

if (printing==0 and foundBad):
Expand Down Expand Up @@ -586,17 +586,15 @@ def check_sandbox_for_errors(sandbox_ignore_errors=False):
justInfo &= re.match("INFO:", e) or ("apache" in e)

if not justInfo:
emsg1 = " check_sandbox_for_errors: Errors in sandbox stdout or stderr.\n" + \
emsg1 = " check_sandbox_for_errors: Errors in sandbox stdout or stderr (including R stdout/stderr).\n" + \
"Could have occurred at any prior time\n\n"
emsg2 = "".join(errLines)
if nodes:
nodes[0].sandbox_error_report(True)

# can build a cloud that ignores all sandbox things that normally fatal the test
# kludge, test will set this directly if it wants, rather than thru build_cloud
# parameter.
# we need the sandbox_ignore_errors, for the test teardown_cloud..the state
# disappears!
# Can build a cloud that ignores all sandbox things that normally fatal the test
# Kludge, test will set this directly if it wants, rather than thru build_cloud parameter.
# we need the sandbox_ignore_errors, for the test teardown_cloud..the state disappears!
if sandbox_ignore_errors or (nodes and nodes[0].sandbox_ignore_errors):
pass
else:
Expand Down Expand Up @@ -1174,7 +1172,7 @@ def random_forest_view(self, data_key, model_key, timeoutSecs=300, print_params=
'model_key': model_key,
'out_of_bag_error_estimate': 1,
'class_weights': None,
'response_variable': -3, # put -3 here to make H2O blow up if it's not specified. required to be correct (same as RF)
'response_variable': None,
}
browseAlso = kwargs.pop('browseAlso',False)

Expand Down
28 changes: 18 additions & 10 deletions py/h2o_cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,16 +269,24 @@ def wait_for_live_port(ip, port, retries=3):
# see how it's used in tests named above
def delete_csv_key(csvFilename, importFullList):
# remove the original data key
for k in importFullList['succeeded']:
### print "possible delete:", deleteKey
# don't delete any ".hex" keys. the parse results above have .hex
# this is the name of the multi-file (it comes in as a single file?)
# This deletes the source key?
key = k['key']
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
### print "removeKeyResult:", h2o.dump_json(removeKeyResult)
# the list could be from hdfs/s3 or local. They have to different list structures
if 'succeeded' in importFullList:
kDict = importFullList['succeeded']
for k in kDict:
key = k['key']
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
elif 'keys' in importFullList:
kDict = importFullList['keys']
for k in kDict:
key = k
if csvFilename in key:
print "\nRemoving", key
removeKeyResult = h2o.nodes[0].remove_key(key=key)
else:
raise Exception ("Can't find 'files' or 'succeeded' in your file dict. why? not from hdfs/s3 or local?")


# checks the key distribution in the cloud, and prints warning if delta against avg
# is > expected
Expand Down
5 changes: 3 additions & 2 deletions py/h2o_glm.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,15 +336,16 @@ def simpleCheckGLMGrid(self, glmGridResult, colX=None, allowFailWarning=False, *

# get input from this.
# (missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
# h2o_cmd.columnInfoFromInspect(parseKey, exceptionOnMissingValues=False, timeoutSecs=300)
# h2o_cmd.columnInfoFromInspect(parseKey['destination_key',
# exceptionOnMissingValues=False, timeoutSecs=300)

def goodXFromColumnInfo(y,
num_cols=None, missingValuesDict=None, constantValuesDict=None, enumSizeDict=None, colTypeDict=None, colNameDict=None,
keepPattern=None, key=None, timeoutSecs=120, forRF=False):

y = str(y)

# if we pass a parseKey, means we want to get the info ourselves here
# if we pass a key, means we want to get the info ourselves here
if key is not None:
(missingValuesDict, constantValuesDict, enumSizeDict, colTypeDict, colNameDict) = \
h2o_cmd.columnInfoFromInspect(key, exceptionOnMissingValues=False, timeoutSecs=timeoutSecs)
Expand Down
29 changes: 22 additions & 7 deletions py/h2o_rf.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,32 @@ def simpleCheckRFView(node, rfv, noPrint=False, **kwargs):
time = response['time']

trees = rfv['trees'] # Dict
depth = trees['depth']
depth = trees['depth'] # Dict
# zero depth okay?
## if ' 0.0 ' in depth:
## raise Exception("depth in RFView seems wrong. depth:", depth)
leaves = trees['leaves']
leaves = trees['leaves'] # Dict
if ' 0.0 ' in leaves:
raise Exception("leaves in RFView seems wrong. leaves:", leaves)

print """
Leaves: {0} / {1} / {2}
Depth: {3} / {4} / {5}
mtry: {6}
Type: {7}
Err: {8} %
""".format(
rfv['trees']['leaves']['min'],
rfv['trees']['leaves']['mean'],
rfv['trees']['leaves']['max'],
rfv['trees']['depth']['min'],
rfv['trees']['depth']['mean'],
rfv['trees']['depth']['max'],
rfv['mtry'],
rfv['confusion_matrix']['type'],
rfv['confusion_matrix']['classification_error'] *100,
)

number_built = trees['number_built']
if (number_built<=0 or number_built>20000):
raise Exception("number_built in RFView seems wrong. number_built:", number_built)
Expand Down Expand Up @@ -160,6 +179,7 @@ def scoreRF(scoreParseKey, trainResult, **kwargs):

start = time.time()
data_key = scoreParseKey['destination_key']
# NOTE: response_variable is required, and passed from kwargs here
scoreResult = h2o_cmd.runRFView(None, data_key, rfModelKey, ntree, **kwargs)

rftime = time.time()-start
Expand Down Expand Up @@ -196,10 +216,6 @@ def pp_rf_result(rf):
mtry: {6}
Type: {7}
Err: {8} %
Time: {9} seconds
Confusion matrix:
{10}
""".format(
rf['trees']['leaves']['min'],
rf['trees']['leaves']['mean'],
Expand All @@ -210,6 +226,5 @@ def pp_rf_result(rf):
rf['mtry'],
rf['confusion_matrix']['type'],
rf['confusion_matrix']['classification_error'] *100,
rf['response']['time'],
cm)

50 changes: 39 additions & 11 deletions py/testdir_0xdata_only/mnist_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import numpy
from numpy import append, array, int8, uint8, zeros

DO_REALS=True

# gzip infile to gzfile
def file_gzip(infile, gzfile):
import gzip
Expand All @@ -16,7 +18,7 @@ def file_gzip(infile, gzfile):
zipped_file.close()
print "\nGzip:", gzfile, "done"

def read(digits, dataset = "training", path = "."):
def read(digits, dataset="training", path="."):
"""
Loads MNIST files into 3D numpy arrays
Expand All @@ -32,7 +34,7 @@ def read(digits, dataset = "training", path = "."):
fname_img = os.path.join(path, 't10k-images-idx3-ubyte')
fname_lbl = os.path.join(path, 't10k-labels-idx1-ubyte')
else:
raise ValueError, "dataset must be 'testing' or 'training'"
raise ValueError, "dataset must be 'testing' or 'training"

flbl = open(fname_lbl, 'rb')
magic_nr, size = struct.unpack(">II", flbl.read(8))
Expand All @@ -47,8 +49,13 @@ def read(digits, dataset = "training", path = "."):
ind = [ k for k in xrange(size) if lbl[k] in digits ]
N = len(ind)

images = zeros((N, rows, cols), dtype=uint8)
labels = zeros((N, 1), dtype=int8)
if DO_REALS:
images = zeros((N, rows, cols), dtype=float)
labels = zeros((N, 1), dtype=int8) # always need these to be int for H2O RF output
else:
images = zeros((N, rows, cols), dtype=int8)
labels = zeros((N, 1), dtype=int8)

for i in xrange(len(ind)):
images[i] = array(img[ ind[i]*rows*cols : (ind[i]+1)*rows*cols ]).reshape((rows, cols))
labels[i] = lbl[ind[i]]
Expand All @@ -60,36 +67,57 @@ def read(digits, dataset = "training", path = "."):
from pylab import *
# from numpy import *

def doit(f):
def doit(prefix, f):
print "we want all the images"
images, labels = read(range(10), f)
if DO_REALS:
# If you want the values as floats between 0.0 and 1.0, just do
images /= 255.0
print images[0]

print "labels.shape", labels.shape
print "images.shape", images.shape
print "images[0].shape", images[0].shape
(a,b,c) = images.shape
if DO_REALS:
# If you want the values as floats between 0.0 and 1.0, just do
images /= 255.0

imagesF = images.reshape(a,b*c)
labelsF = labels

# stick label and pixels together
bothF = numpy.concatenate((labelsF, imagesF), 1)
print "labelsF.shape", labelsF.shape
print "imagesF.shape", imagesF.shape
print "both.shape", bothF.shape
print "bothF.shape", bothF.shape

# the output label was first in the concatenate. do the same for header
headerList = ['label']
headerList += ['p' + str(i) for i in range(784)]
# comma separated!
header = ','.join(map(str,headerList))
print header # just so we can see it.
numpy.savetxt('mnist_'+ f + '.csv', bothF, header=header, delimiter=',', fmt='%d')
if DO_REALS:
# first has to be integer for stupid h2o rf output (doesn't take fp)
# have to create a format string for each one as a result!
fmt = ",".join(["%i"] + ["%f"] * imagesF.shape[1])
else:
fmt = '%d'
numpy.savetxt(prefix + f + '.csv', bothF, header=header, delimiter=',', fmt=fmt)

# create the two csv files, with filenames reflecting the DO_REALS mode
if DO_REALS:
    prefix = "mnist_reals_"
else:
    prefix = "mnist_"

doit(prefix, 'training')
doit(prefix, 'testing')

# we can copy this multiple times to get bigger parsed gz
file_gzip(prefix + 'training.csv', prefix + 'training.csv.gz')
file_gzip(prefix + 'testing.csv', prefix + 'testing.csv.gz')

# show merged images
if 1==0:
Expand Down
120 changes: 120 additions & 0 deletions py/testdir_0xdata_only/test_GLM_mnist_reals.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
import unittest
import random, sys, time, re
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_browse as h2b, h2o_import as h2i, h2o_glm, h2o_util
class Basic(unittest.TestCase):
    """GLM on the reals-valued MNIST csvs: one binomial fit per digit class."""

    def tearDown(self):
        # fail the test if anything bad showed up in sandbox stdout/stderr
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # assume we're at 0xdata with its hdfs namenode
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(1)
        else:
            # all hdfs info is done thru the hdfs_config michal's ec2 config sets up?
            h2o_hosts.build_cloud_with_hosts(1,
                # this is for our amazon ec hdfs
                # see https://github.com/0xdata/h2o/wiki/H2O-and-s3n
                hdfs_name_node='10.78.14.235:9000',
                hdfs_version='0.20.2')

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_GLM_mnist_reals(self):
        # (train gz, test gz, per-file timeout in seconds)
        importFolderPath = "/home/0xdiag/datasets/mnist"
        csvFilelist = [
            ("mnist_reals_training.csv.gz", "mnist_reals_testing.csv.gz", 600),
        ]
        # IMPORT**********************************************
        # since H2O deletes the source key, we should re-import every iteration if we re-use the src in the list
        importFolderResult = h2i.setupImportFolder(None, importFolderPath)
        ### print "importHDFSResult:", h2o.dump_json(importFolderResult)
        succeededList = importFolderResult['files']
        ### print "succeededList:", h2o.dump_json(succeededList)

        self.assertGreater(len(succeededList),1,"Should see more than 1 files in the import?")
        # why does this hang? can't look at storeview after import?
        print "\nTrying StoreView after the import folder"
        h2o_cmd.runStoreView(timeoutSecs=30)

        trial = 0
        for (trainCsvFilename, testCsvFilename, timeoutSecs) in csvFilelist:
            trialStart = time.time()

            # PARSE test****************************************
            testKey2 = testCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, testCsvFilename, importFolderPath,
                key2=testKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", testCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            print "We won't use this pruning of x on test data. See if it prunes the same as the training"
            y = 0 # first column is pixel value
            print "y:"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)

            # PARSE train****************************************
            trainKey2 = trainCsvFilename + "_" + str(trial) + ".hex"
            start = time.time()
            parseKey = h2i.parseImportFolderFile(None, trainCsvFilename, importFolderPath,
                key2=trainKey2, timeoutSecs=timeoutSecs)
            elapsed = time.time() - start
            print "parse end on ", trainCsvFilename, 'took', elapsed, 'seconds',\
                "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
            print "parse result:", parseKey['destination_key']

            # GLM****************************************
            # prune x down to the usable columns (drops constants/bad cols)
            print "This is the pruned x we'll use"
            x = h2o_glm.goodXFromColumnInfo(y, key=parseKey['destination_key'], timeoutSecs=300)
            print "x:", x

            params = {
                'x': x,
                'y': y,
                'case_mode': '=',     # binomial: class == case vs rest
                'case': 0,            # overwritten per-digit below
                'family': 'binomial',
                'lambda': 1.0E-5,
                'alpha': 0.0,
                'max_iter': 5,
                'thresholds': 0.5,
                'n_folds': 1,
                'weight': 1,
                'beta_epsilon': 1.0E-4,
            }

            # one binomial GLM per digit (one-vs-rest), then score on the test set
            for c in [0,1,2,3,4,5,6,7,8,9]:
                kwargs = params.copy()
                print "Trying binomial with case:", c
                kwargs['case'] = c

                timeoutSecs = 1800
                start = time.time()
                glm = h2o_cmd.runGLMOnly(parseKey=parseKey, timeoutSecs=timeoutSecs, pollTimeoutsecs=60, **kwargs)
                elapsed = time.time() - start
                print "GLM completed in", elapsed, "seconds.", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)

                h2o_glm.simpleCheckGLM(self, glm, None, noPrint=True, **kwargs)
                GLMModel = glm['GLMModel']
                modelKey = GLMModel['model_key']

                start = time.time()
                glmScore = h2o_cmd.runGLMScore(key=testKey2, model_key=modelKey, thresholds="0.5",
                    timeoutSecs=60)
                elapsed = time.time() - start
                print "GLMScore in", elapsed, "secs", \
                    "%d pct. of timeout" % ((elapsed*100)/timeoutSecs)
                h2o_glm.simpleCheckGLMScore(self, glmScore, **kwargs)

if __name__ == '__main__':
h2o.unit_main()
Loading

0 comments on commit 7c67304

Please sign in to comment.