Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
tomasnykodym committed Oct 19, 2013
2 parents 301e796 + bae348c commit cb60500
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 23 deletions.
21 changes: 11 additions & 10 deletions bench/BMLogs/bigLogger.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ function echoLine {
fi
fi
}

checkDExists ${OUTDIR}
checkDExists ${rawLogs}
checkDExists ${rawLogs}/procstat
Expand All @@ -78,11 +79,10 @@ do
PREVTOTALS[$i]=0
done

start=`date +%s`
start=`cat starttime`
while :; do
#dump raw logs first
ts=`date +"%Y-%m-%d-%H-%M-%S"`
echo $ts
cat /proc/stat >> ${rawLogs}/procstat/${ts}_procstat_${mach}
cat /proc/meminfo >> ${rawLogs}/meminfo/${ts}_meminfo_${mach}
cat /proc/net/dev >> ${rawLogs}/netdev/${ts}_netdev_${mach}
Expand Down Expand Up @@ -124,16 +124,17 @@ while :; do
echo $(( `date +%s` - $start )),$lineidle >> $idlePerfFile
echo $(( `date +%s` - $start )),$lineiowait >> $iowaitPerfFile

cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > memTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > recTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > traTMP
echoLine memTMP $start $memPerfFile 1 1
echoLine recTMP $start $netReceivePerfFile 0
echoLine traTMP $start $netTransmitPerfFile 0
cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > bmemTMP
echo $pwd
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > brecTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > btraTMP
echoLine bmemTMP $start $memPerfFile 1 1
echoLine brecTMP $start $netReceivePerfFile 0
echoLine btraTMP $start $netTransmitPerfFile 0
#get top 10 processes from top and then just store them, may/not be interesting...
ti="$(( `date +%s` - ${start} ))"
top -b | head -n 17 | tail -n 10 | awk -v t=$ti -F' ' 'OFS="," {print t,$1,$2,$6,$9,$10,$12}' >> $topPerfFile
vmstat | tail -n 1 | awk -v t=$ti -F' ' 'OFS="," {print t,$7,$8}' >> $swapPerfFile
rm *TMP
sleep 1
rm b*TMP
sleep 30
done
29 changes: 19 additions & 10 deletions bench/BMLogs/littleLogger.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#!/bin/bash

OUTDIR='LittleLoggerFiles'
mkdir ${OUTDIR}

if [ ! -d ${OUTDIR} ]
then
mkdir ${OUTDIR}
fi

#last 3 digits of inet addr
mach=`ifconfig | grep -o "inet addr:192.168.1.[0-9]*" | grep -o 192.168.1.* | awk -F'.' '{print $4}'`
Expand All @@ -20,7 +24,12 @@ swapPerfFile=${OUTDIR}/$1-`date +%Y-%m-%d`"-sisoPerf_"$mach".csv"
cpuheader='time(s)'
head -n 33 /proc/stat | tail -n 32 | awk -F' ' 'OFS="," {print $1}' > tmpfile
cpuheader=$cpuheader,`./transpose.sh tmpfile`
rm tmpfile

if [ -a tmpfile ]
then
rm tmpfile
fi

memheader='time(s),MemTotal,MemFree,Cached,Writeback'
topheader='time(s),PID,USER,RES,%CPU,%MEM,COMMAND'
netheader='time(s),bytes,packets,errs,drop'
Expand Down Expand Up @@ -65,7 +74,7 @@ do
PREVTOTALS[$i]=0
done

start=`date +%s`
start=`cat starttime`
while :; do
a=1
for i in {0..34}
Expand Down Expand Up @@ -103,17 +112,17 @@ while :; do
echo $(( `date +%s` - $start )),$lineidle >> $idlePerfFile
echo $(( `date +%s` - $start )),$lineiowait >> $iowaitPerfFile

cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > memTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > recTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > traTMP
echoLine memTMP $start $memPerfFile 1 1
echoLine recTMP $start $netReceivePerfFile 0
echoLine traTMP $start $netTransmitPerfFile 0
cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > lmemTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > lrecTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > ltraTMP
echoLine lmemTMP $start $memPerfFile 1 1
echoLine lrecTMP $start $netReceivePerfFile 0
echoLine ltraTMP $start $netTransmitPerfFile 0
#get top 10 processes from top and then just store them, may/not be interesting...
ti="$(( `date +%s` - ${start} ))"
top -b | head -n 17 | tail -n 10 | awk -v t=$ti -F' ' 'OFS="," {print t,$1,$2,$6,$9,$10,$12}' >> $topPerfFile
vmstat | tail -n 1 | awk -v t=$ti -F' ' 'OFS="," {print t,$7,$8}' >> $swapPerfFile
rm *TMP
rm l*TMP
sleep 30
done

Expand Down
1 change: 1 addition & 0 deletions bench/BMLogs/starttime
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1382158596
15 changes: 13 additions & 2 deletions bench/runBench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ function doAlgo {
#sudo bash -c "sync; echo 3 > /proc/sys/vm/drop_caches"

echo "Running $1 benchmark..."
echo "Changing little logger phase..."
bash startLoggers.sh ${JSON} changePhase $1

pyScript="BMscripts/"$1"Bench.py"

wait
if [ ! $1 = "bigkmeans" ]
then
python ${pyScript} -cj BMscripts/${JSON} ${h2oBuild}
Expand Down Expand Up @@ -110,15 +112,24 @@ if [ ! -d ${benchmarks}/${h2oBuild}/${DATE} ]; then
mkdir -p ${benchmarks}/${h2oBuild}/${DATE}
fi

#global starttime out to all loggers
# Record a single shared epoch timestamp so every logger (local and remote)
# computes elapsed seconds from the same origin.
starttime=`date +%s`
echo "$starttime" > BMLogs/starttime

#Gentlemen...Start your loggers!
# BUGFIX: the second invocation previously passed {$JSON} (a typo — braces in
# the wrong place), so the little loggers were handed the literal string
# "{<value>}" instead of the JSON config name.
bash startLoggers.sh "${JSON}" big
bash startLoggers.sh "${JSON}" little

if [ ! "$TEST" = "all" ]
then
    echo "$TEST"
    doAlgo "$TEST"
else
    # TEST == "all": executed as a command — presumably a function defined
    # earlier in this script that runs every benchmark; verify against the
    # full file.
    $TEST
fi
wait

bash startLoggers.sh "${JSON}" stop_

#remove annoying useless files
#rm pytest*flatfile*
#rm benchmark*log
Expand Down
65 changes: 65 additions & 0 deletions bench/startLoggers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

#ssh into each of the machines from ${JSON} (passed as first argument)
# The python one-liner prints the "ip" entry of the JSON config as a Python
# list repr (e.g. [u'192.168.1.161', u'192.168.1.162']); awk then strips the
# spaces, u prefixes, quotes and brackets, leaving a comma-separated string
# of addresses.
MACHINES=`cat BMscripts/$1 | python -c 'import sys, json; print json.load(sys.stdin)[sys.argv[1]]' ip | awk -F, 'OFS="," {gsub("[ u\x27\[\]]","", $0); print}'`
# Split the comma-separated string into a bash array of host addresses.
IFS=","
MACHINES=($MACHINES)

# Push the shared starttime file to every benchmark machine so the big
# loggers all measure elapsed time from the same origin.
# Reads global: MACHINES (array of host addresses).
function startBigLoggers {
    local host
    for host in "${MACHINES[@]}"
    do
        echo "Starting bigLogger on ${host}"
        scp BMLogs/starttime "spencer@${host}:/home/spencer/h2o/bench/BMLogs"
        # Launching the logger itself is currently disabled:
        #ssh spencer@$host "cd /home/spencer/h2o/bench/BMLogs; bash bigLogger.sh" &
    done
}

# Start a littleLogger on every benchmark machine in the background,
# forwarding the current benchmark phase name.
# Arguments: $1 - phase name passed through to the remote littleLogger.sh.
# Reads global: MACHINES (array of host addresses).
function startLittleLoggers {
    local host
    for host in "${MACHINES[@]}"
    do
        echo "Starting littleLogger on ${host} on phase $1"
        ssh "spencer@${host}" "cd /home/spencer/h2o/bench/BMLogs; bash littleLogger.sh $1" &
    done
}

function stopLittleLoggers {
for i in ${MACHINES[@]}
do
ssh spencer@$i ps ax|grep bash|grep littleLogger|awk '{print $1}'| xargs kill
done
}

function stopAllLoggers {
for i in ${MACHINES[@]}
do
ssh spencer@$i ps ax|grep bash|grep Logger|awk '{print $1}'| xargs kill
done
}

# Restart the little loggers so their output is tagged with a new phase.
# Arguments: $1 - the new phase (benchmark name) forwarded to
#                 startLittleLoggers.
# (Dropped the original's unused `newPhase=$1` assignment.)
function changePhase {
    echo "Stopping little loggers"
    stopLittleLoggers >/dev/null
    startLittleLoggers "$1" >/dev/null
}

# Dispatch on the action keyword ($2: big | little | changePhase | stop_).
# BUGFIX: quoting "$2" (via case) keeps this from erroring out when the
# action argument is missing; the original chain of unquoted `[ $2 = ... ]`
# tests produced "unary operator expected" errors on an empty $2.
case "$2" in
    big)
        startBigLoggers >/dev/null
        ;;
    little)
        startLittleLoggers START >/dev/null
        ;;
    changePhase)
        # $3 is the new phase name to tag little-logger output with.
        changePhase "$3" >/dev/null
        ;;
    stop_)
        stopAllLoggers
        ;;
esac

3 changes: 2 additions & 1 deletion py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,7 +1863,8 @@ def GLM_shared(self, key,
'link': 'familyDefault',
}
else:
params_dict = {'vresponse' : None,
params_dict = {'source' : key,
'vresponse' : None,
'ignored_cols' : None,
'family' : None,
'lambda' : None,
Expand Down
67 changes: 67 additions & 0 deletions py/testdir_single_jvm/test_GLM2_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import unittest, random, sys, time
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i, h2o_exec, h2o_glm, h2o_jobs

class Basic(unittest.TestCase):
    """Smoke test for the beta (GLM2) REST API against smalldata prostate.

    Builds a one-node cloud, parses logreg/prostate.csv, launches a binomial
    GLM2 job without polling, then waits for all jobs to finish.  No model
    quality is asserted; tearDown only scans the sandbox for errors.
    """

    def tearDown(self):
        # After every test, scan the sandbox logs for stack traces/errors.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Build either a local one-node cloud or a remote one from the hosts
        # config, each with a 10 GB java heap.
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(node_count=1, java_heap_GB=10)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=10)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_GLM_prostate(self):
        # Use the beta (v2) REST endpoints for parse/inspect/GLM.
        h2o.beta_features=True
        importFolderPath = "logreg"
        csvFilename = 'prostate.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Kick off the parse without polling, then wait on the jobs list.
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=180, noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print inspect
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # GLM parameters: predict CAPSULE, ignore the ID column.
        # Values are passed as strings, matching what the REST layer expects.
        x = 'ID'
        y = 'CAPSULE'
        family = 'binomial'
        alpha = '0.5'
        lambda_ = '1E-4'
        nfolds = '5'
        # case_mode/case_val are defined but intentionally left out of kwargs
        # below (commented) — presumably unsupported by GLM2 at this point;
        # verify before re-enabling.
        case_mode = '='
        case_val = '1'
        f = 'prostate'

        kwargs = { 'vresponse'       : y,
                   'ignored_cols'    : x,
                   'family'          : family,
                   'lambda'          : lambda_,
                   'alpha'           : alpha,
                   'n_folds'         : nfolds,
                   #'case_mode'      : case_mode,
                   #'case_val'       : case_val,
                   'destination_key' : "GLM("+f+")",
                 }
        timeoutSecs = 60

        # Launch GLM without polling, then wait for the job to complete.
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs)

        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

# Allow running this test module directly from the command line.
if __name__ == '__main__':
    h2o.unit_main()
87 changes: 87 additions & 0 deletions py/testdir_single_jvm/test_KMeans2_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import unittest, time, sys
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_kmeans, h2o_hosts, h2o_import as h2i, h2o_jobs

class Basic(unittest.TestCase):
    """Beta-features KMeans2 tests against two small logreg datasets.

    The B/C prefixes in the test method names exploit unittest's default
    alphabetical ordering so benign runs before prostate.  Each test checks
    that a fixed-seed KMeans run reproduces known cluster centers.
    """

    def tearDown(self):
        # After every test, scan the sandbox logs for stack traces/errors.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Build either a local one-node cloud or a remote one from the hosts
        # config.
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(1)
        else:
            h2o_hosts.build_cloud_with_hosts(1)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_B_kmeans_benign(self):
        # Use the beta (v2) REST endpoints.
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename

        # Kick off the parse without polling, then wait on the jobs list.
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # Expected (center coordinates, row count, error) triples for the
        # fixed seed below.
        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k' : 3,
                      'initialization' : 'Furthest',
                      'ignored_cols_by_name' : None,
                      'destination_key' : 'benign_k.hex',
                      'max_iter' : 50,
                      # fixed seed so the expected centers above reproduce
                      'seed' : 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)


    def test_C_kmeans_prostate(self):
        # Use the beta (v2) REST endpoints.
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # NOTE(review): unlike test_B this parse polls (no noPoll=True), so
        # the pollWaitJobs that follows is presumably a harmless no-op.
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # loop, to see if we get same centers
        # Expected (center coordinates, row count, error) triples for the
        # fixed seed below.
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            params = {'k' : 3,
                      'initialization' : 'Furthest',
                      'ignored_cols_by_name' : "ID",
                      'destination_key' : 'prostate_k.hex',
                      'max_iter' : 100,
                      # fixed seed so the expected centers above reproduce
                      'seed' : 265211114317615310
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

# Allow running this test module directly from the command line.
if __name__ == '__main__':
    h2o.unit_main()

0 comments on commit cb60500

Please sign in to comment.