
Commit

Merge branch 'master' of github.com:h2oai/h2o
tomkraljevic committed Dec 19, 2014
2 parents 8ea863e + 704541c commit b731076
Showing 7 changed files with 56 additions and 38 deletions.
1 change: 1 addition & 0 deletions Makefile
@@ -300,6 +300,7 @@ dw_3:
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/hadoop_tutorial.html
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/Ruser/Rinstall.html
 	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/ec2_build_ami.html
+	sed -i -e "s/SUBST_RELEASE_NAME/$(RELEASE_NAME)/g; s/SUBST_PROJECT_VERSION/$(PROJECT_VERSION)/g; s/SUBST_BUILD_NUMBER/$(BUILD_NUMBER)/g" $(BUILD_WEBSITE_DIR)/deployment/H2O_Hadoop_Mapr.html
 
 #
 # Set appropriately for your data size to quickly try out H2O.
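The added line wires the new MapR tutorial page into the same placeholder pipeline as the other website pages: at build time, sed globally replaces SUBST_RELEASE_NAME, SUBST_PROJECT_VERSION, and SUBST_BUILD_NUMBER in each HTML file. As a rough illustration of what those three s///g substitutions amount to (a sketch, not the build system itself; the example values are taken from the old hard-coded URL in the MapR doc below):

```python
# Sketch of the SUBST_* placeholder substitution the Makefile does with sed.
# Example values only; the real ones come from make variables.
subs = {
    "SUBST_RELEASE_NAME": "master",
    "SUBST_PROJECT_VERSION": "2.9.0.1624",
    "SUBST_BUILD_NUMBER": "1624",
}

def fill_template(text, subs):
    # Mirror sed's s/PLACEHOLDER/value/g: replace every occurrence.
    for placeholder, value in subs.items():
        text = text.replace(placeholder, value)
    return text

url = ("http://h2o-release.s3.amazonaws.com/h2o/"
       "SUBST_RELEASE_NAME/SUBST_BUILD_NUMBER/h2o-SUBST_PROJECT_VERSION.zip")
print(fill_template(url, subs))
# -> http://h2o-release.s3.amazonaws.com/h2o/master/1624/h2o-2.9.0.1624.zip
```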
8 changes: 4 additions & 4 deletions h2o-docs/source/deployment/H2O_Hadoop_Mapr.md
@@ -10,25 +10,25 @@ All mappers must be able to communicate with each other and need to run at the s
 0. Log in to the Hadoop cluster:
 
-	`ssh <username>@<HadoopClusterName>`
+	`ssh <username>@<HadoopNodeAddress>`
 
 	If you are asked if you want to continue connecting, enter `yes`.
 0. Enter the following:
 
-	`wget http://h2o-release.s3.amazonaws.com/h2o/master/1624/h2o-2.9.0.1624.zip`
+	`wget http://h2o-release.s3.amazonaws.com/h2o/SUBST_RELEASE_NAME/SUBST_BUILD_NUMBER/h2o-SUBST_PROJECT_VERSION.zip`
 
 0. Wait while H2O downloads - the progress bar indicates completion.
 
 	`100%[=================================>] 140,951,040 2.23M/s in 65s`
 
 0. Enter the following:
 
-	`unzip h2o-2.9.0.1624.zip`
+	`unzip h2o-SUBST_PROJECT_VERSION.zip`
 
 0. Wait while the H2O package unzips.
 0. On the Hadoop node, change the current directory to the location of the Hadoop and H2O driver jar files:
 
-	`cd h2o-2.9.0.1624/hadoop`
+	`cd h2o-SUBST_PROJECT_VERSION/hadoop`
 
 0. Enter the following:
 
16 changes: 8 additions & 8 deletions py/h2o_exec.py
@@ -78,7 +78,7 @@ def fill_in_expr_template(exprTemplate, colX=None, n=None, row=None, keyX=None,
     # just a string?
     execExpr = exprTemplate
     if colX is not None:
-        print "Assume colX %s is zero-based..added 1 for R based exec2" % colX
+        ### print "Assume colX %s is zero-based..added 1 for R based exec2" % colX
         execExpr = re.sub('<col1>', str(colX+1), execExpr)
         # this is just another value
         execExpr = re.sub('<col2>', str(colX+2), execExpr)
@@ -93,11 +93,11 @@ def fill_in_expr_template(exprTemplate, colX=None, n=None, row=None, keyX=None,
     execExpr = re.sub('<m>', str(m), execExpr)
     execExpr = re.sub('<m-1>', str(m-1), execExpr)
     ### verboseprint("\nexecExpr:", execExpr)
-    print "execExpr:", execExpr
+    ### print "execExpr:", execExpr
     return execExpr
 
 
-def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False):
+def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2oError=False, **kwargs):
     if not node:
         node = h2o_nodes.nodes[0]
     start = time.time()
@@ -129,7 +129,7 @@ def exec_expr(node=None, execExpr=None, resultKey=None, timeoutSecs=10, ignoreH2
         print "function return"
         result = resultExec['funstr']
     else:
-        print "scalar return"
+        ### print "scalar return"
         result = resultExec['scalar']
 
     return resultExec, result
@@ -192,7 +192,7 @@ def exec_expr_list_rand(lenNodes, exprList, keyX,
         print "Trial #", trial, "completed\n"
 
 def exec_expr_list_across_cols(lenNodes, exprList, keyX,
-    minCol=0, maxCol=55, timeoutSecs=10, incrementingResult=True):
+    minCol=0, maxCol=55, timeoutSecs=10, incrementingResult=True, **kwargs):
     colResultList = []
     for colX in range(minCol, maxCol):
         for i, exprTemplate in enumerate(exprList):
@@ -215,16 +215,16 @@ def exec_expr_list_across_cols(lenNodes, exprList, keyX,
                 resultKey = keyX
 
             # v2
-            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs)
-            print "\nexecResult:", dump_json(resultExec)
+            (resultExec, result) = exec_expr(h2o_nodes.nodes[execNode], execExpr, None, timeoutSecs, **kwargs)
+            # print "\nexecResult:", dump_json(resultExec)
 
             ### h2b.browseJsonHistoryAsUrlLastMatch("Inspect")
             # slows things down to check every iteration, but good for isolation
             if check_sandbox_for_errors():
                 raise Exception(
                     "Found errors in sandbox stdout or stderr, on trial #%s." % trial)
 
-            print "Column #", colX, "completed\n"
+            ### print "Column #", colX, "completed\n"
             colResultList.append(result)
 
     return colResultList
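The recurring change in this file threads **kwargs from exec_expr_list_across_cols through exec_expr, so callers can hand options such as print_params down to the underlying exec_query call without widening every intermediate signature. A minimal sketch of the pass-through pattern; the function bodies below are stand-ins, not the real h2o_exec/h2o_methods implementations:

```python
# Illustrative **kwargs pass-through; names mirror the real call chain,
# but the bodies are stand-ins.
def exec_query(expr, timeoutSecs=20, print_params=False):
    # The bottom of the chain is the only place that interprets options.
    if print_params:
        print("expr:", expr, "timeoutSecs:", timeoutSecs)
    return {"scalar": 42.0}  # stand-in for a real H2O response

def exec_expr(expr, timeoutSecs=10, **kwargs):
    # Forward unknown options untouched.
    return exec_query(expr, timeoutSecs, **kwargs)

def exec_expr_list_across_cols(exprs, **kwargs):
    # Outer helpers forward too; adding a new option later only
    # touches the bottom function.
    return [exec_expr(e, **kwargs)["scalar"] for e in exprs]

results = exec_expr_list_across_cols(["sum(r1[,1])"], print_params=True)
```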
2 changes: 1 addition & 1 deletion py/h2o_methods.py
@@ -557,7 +557,7 @@ def import_files(self, path, timeoutSecs=180):
         return a
 
     # 'destination_key', 'escape_nan' 'expression'
-    def exec_query(self, timeoutSecs=20, ignoreH2oError=False, print_params=True, **kwargs):
+    def exec_query(self, timeoutSecs=20, ignoreH2oError=False, print_params=False, **kwargs):
         # only v2 now
         params_dict = {
             'str': None,
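With the default flipped to False, exec_query now runs quietly; a caller that still wants the parameter dump can opt back in per call. A small usage sketch, assuming the harness's usual node handle from h2o_nodes; the expression string and key name are illustrative:

```python
import h2o_nodes

# Opt back into parameter logging for one call; the default is now quiet.
node = h2o_nodes.nodes[0]
result = node.exec_query(str='sum(r1[,1])', timeoutSecs=30, print_params=True)
```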
40 changes: 28 additions & 12 deletions py/testdir_multi_jvm/test_many_fp_formats_libsvm_2_fvec.py
@@ -2,17 +2,19 @@
 sys.path.extend(['.','..','../..','py'])
 import h2o, h2o_cmd, h2o_browse as h2b, h2o_import as h2i, h2o_exec as h2e, h2o_glm
 import h2o_util
+from collections import OrderedDict
 
 zeroList = [
     'Result0 = 0',
 ]
+# the first column should use this
 exprList = [
-    'Result<n> = sum(<keyX>[<col1>])',
+    'Result<n> = sum(<keyX>[,<col1>])',
 ]
 
 DO_SUMMARY = False
 DO_COMPARE_SUM = False
+DO_BAD_SEED = True
 
 def write_syn_dataset(csvPathname, rowCount, colCount, SEEDPERFILE, sel, distribution):
     # we can do all sorts of methods off the r object
@@ -41,6 +43,7 @@ def addRandValToRowStuff(colNumber, valMin, valMax, rowData, synColSumDict):
     classMin = -36
     classMax = 36
     dsf = open(csvPathname, "w+")
+    # ordinary dict
     synColSumDict = {0: 0} # guaranteed to have col 0 for output
     # even though we try to get a max colCount with random, we might fall short
     # track what max we really got
@@ -88,15 +91,19 @@ def tearDown(self):
     @classmethod
     def setUpClass(cls):
         global SEED
-        SEED = h2o.setup_random_seed()
+        if DO_BAD_SEED:
+            SEED = h2o.setup_random_seed(seed=5605820711843900818)
+        else:
+            SEED = h2o.setup_random_seed()
         h2o.init(2,java_heap_GB=5)
 
     @classmethod
     def tearDownClass(cls):
         # h2o.sleep(3600)
         h2o.tear_down_cloud()
 
     def test_many_fp_formats_libsvm_2_fvec(self):
-        # h2b.browseTheCloud()
+        h2b.browseTheCloud()
         SYNDATASETS_DIR = h2o.make_syn_dir()
         tryList = [
             (100, 10000, 'cA', 300, 'sparse50'),
@@ -146,18 +153,20 @@ def test_many_fp_formats_libsvm_2_fvec(self):
             if DO_COMPARE_SUM:
                 h2e.exec_zero_list(zeroList)
             colResultList = h2e.exec_expr_list_across_cols(None, exprList, selKey2, maxCol=colNumberMax+1,
-                timeoutSecs=timeoutSecs)
-            print "\n*************"
-            print "colResultList", colResultList
-            print "*************"
+                timeoutSecs=timeoutSecs, print_params=False)
+            #print "\n*************"
+            #print "colResultList", colResultList
+            #print "*************"
 
             self.assertEqual(rowCount, numRows, msg="generated %s rows, parsed to %s rows" % (rowCount, numRows))
             # need to fix this for compare to expected
             # we should be able to keep the list of fp sums per col above
             # when we generate the dataset
             ### print "\nsynColSumDict:", synColSumDict
 
-            for k,v in synColSumDict.iteritems():
+            sortedColSumDict = OrderedDict(sorted(synColSumDict.items()))
+            print sortedColSumDict
+            for k,v in sortedColSumDict.iteritems():
+                print k
                 if DO_COMPARE_SUM:
                     # k should be integers that match the number of cols
                     self.assertTrue(k>=0 and k<len(colResultList))
@@ -172,8 +181,15 @@
                 # enums don't have mean, but we're not enums
                 mean = float(inspect['cols'][k]['mean'])
                 # our fp formats in the syn generation sometimes only have two places?
-                self.assertAlmostEqual(mean, synMean, places=0,
-                    msg='col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))
+                if not h2o_util.approxEqual(mean, synMean, tol=1e-4):
+                    execExpr = 'sum(%s[,%s])' % (selKey2, k+1)
+                    resultExec = h2o_cmd.runExec(str=execExpr, timeoutSecs=300)
+                    print "Result of exec sum on failing col:..:", k, h2o.dump_json(resultExec)
+                    print "Result of remembered sum on failing col:..:", k, v
+                    print "Result of inspect mean * rowCount on failing col..:", mean * rowCount
+                    print "k: ", k, "mean: ", mean, "remembered sum/rowCount : ", synMean
+                    sys.stdout.flush()
+                    raise Exception('col %s mean %0.6f is not equal to generated mean %0.6f' % (k, mean, synMean))
 
                 naCnt = inspect['cols'][k]['naCnt']
                 self.assertEqual(0, naCnt,
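The last hunk replaces assertAlmostEqual(mean, synMean, places=0), which amounts to an absolute tolerance of about 0.5, with a relative check via h2o_util.approxEqual(tol=1e-4), and on failure dumps diagnostics (a fresh exec sum of the column, the remembered generator sum, and mean * rowCount) before raising. A rough sketch of such a relative comparison; approx_equal below is a hypothetical stand-in, and h2o_util.approxEqual's exact semantics may differ:

```python
# Hypothetical stand-in for h2o_util.approxEqual: relative tolerance,
# with a floor of 1.0 so values near zero still compare sensibly.
def approx_equal(a, b, tol=1e-4):
    return abs(a - b) <= tol * max(1.0, abs(a), abs(b))

mean, synMean = 12345.678, 12345.679
print(approx_equal(mean, synMean))   # True: relative error is ~1e-7

# assertAlmostEqual(mean, synMean, places=0) checks round(mean - synMean, 0) == 0,
# i.e. roughly |mean - synMean| <= 0.5: too loose for small means and prone
# to spurious failures from float rounding on large column sums.
```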
6 changes: 3 additions & 3 deletions src/main/java/water/fvec/CXDChunk.java
@@ -44,7 +44,7 @@ protected final double getFValue(int off){
     return (long)d;
   }
   @Override protected double atd_impl(int idx) {
-    int off = _offCache;
+    /* int off = _offCache;
     int prevIdx = getId(off);
     if(prevIdx == idx)
       return getFValue(off);
@@ -55,8 +55,8 @@ protected final double getFValue(int off){
         _offCache = (off += _ridsz + _valsz);
         return getFValue(off);
       }
-    }
-    off = findOffset(idx);
+    }*/
+    int off = findOffset(idx);
     if(getId(off) != idx)return 0;
     return getFValue(off);
   }
21 changes: 11 additions & 10 deletions src/main/java/water/fvec/CXIChunk.java
@@ -18,7 +18,7 @@ public class CXIChunk extends Chunk {
   protected transient int _ridsz; // byte size of stored (chunk-relative) row nums
   protected static final int OFF = 6;
   protected transient int _lastOff = OFF;
-
+  protected transient volatile int _offCache = OFF;
 
   private static final long [] NAS = {C1Chunk._NA,C2Chunk._NA,C4Chunk._NA,C8Chunk._NA};
 
@@ -57,7 +57,7 @@ protected CXIChunk(int len, int nzs, int valsz, byte [] buf){
   @Override boolean setNA_impl(int idx) { return false; }
 
   @Override protected long at8_impl(int idx) {
-    int off = _offCache;
+    /* int off = _offCache;
     int prevIdx = getId(off);
     if(prevIdx == idx)
       return getIValue(off);
@@ -68,10 +68,10 @@ protected CXIChunk(int len, int nzs, int valsz, byte [] buf){
         _offCache = (off += _ridsz + _valsz);
         return getIValue(off);
       }
-    }
-    off = findOffset(idx);
+    } */
+    int off = findOffset(idx);
     if(getId(off) != idx)return 0;
-    _offCache = off;
+    // _offCache = off;
     long v = getIValue(off);
     if( v== NAS[_valsz_log])
       throw new IllegalArgumentException("at8 but value is missing");
@@ -186,18 +186,19 @@ protected final int findOffset(int idx) {
     return this;
   }
 
-  protected transient volatile int _offCache = OFF;
+
   @Override public final int nextNZ(int rid){
-    if(rid == -1) {
+    final int off = rid == -1?OFF:findOffset(rid);
+    /* if(rid == -1) {
       _offCache = OFF;
      return getId(OFF);
     }
-    int off = _offCache;
+    int off = _offCache; */
     int x = getId(off);
-    if(x != rid) {
+    /* if(x != rid) {
       off = _offCache = rid == -1 ? OFF : findOffset(rid);
       x = getId(off);
-    }
+    }*/
     if(x > rid)return x;
     if(off < _mem.length - _ridsz - _valsz)
       return getId(off + _ridsz + _valsz);
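Both chunk classes stop using _offCache (a remembered offset of the most recently accessed element) and fall back to findOffset on every random access: look the row id up among the stored (row id, value) pairs and return 0 when the sparse chunk does not store that row. A rough Python model of that access pattern; the names are illustrative, not the real CXIChunk internals, which binary-search a packed byte array:

```python
import bisect

# Illustrative model of a sparse chunk: sorted row ids plus parallel values.
def sparse_at(row_ids, values, idx):
    # Analogue of findOffset(idx): binary search for the row id.
    pos = bisect.bisect_left(row_ids, idx)
    # Analogue of "if(getId(off) != idx) return 0": unstored rows are zero.
    if pos < len(row_ids) and row_ids[pos] == idx:
        return values[pos]
    return 0

print(sparse_at([2, 5, 9], [1.5, -3.0, 7.25], 5))  # -3.0 (stored row)
print(sparse_at([2, 5, 9], [1.5, -3.0, 7.25], 4))  # 0 (row not stored)
```

The commit does not say why the cache was disabled; it turned repeated sequential reads into O(1) hops instead of O(log n) searches, and the volatile declaration hints that shared mutable state across reader threads may have been the concern, but that is a guess.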
