Skip to content

Commit

Permalink
Merge branch 'master' of github.com:0xdata/h2o
Browse files Browse the repository at this point in the history
  • Loading branch information
tomasnykodym committed Oct 19, 2013
2 parents 301e796 + bae348c commit cb60500
Show file tree
Hide file tree
Showing 8 changed files with 265 additions and 23 deletions.
21 changes: 11 additions & 10 deletions bench/BMLogs/bigLogger.sh
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ function echoLine {
fi
fi
}

checkDExists ${OUTDIR}
checkDExists ${rawLogs}
checkDExists ${rawLogs}/procstat
Expand All @@ -78,11 +79,10 @@ do
PREVTOTALS[$i]=0
done

start=`date +%s`
start=`cat starttime`
while :; do
#dump raw logs first
ts=`date +"%Y-%m-%d-%H-%M-%S"`
echo $ts
cat /proc/stat >> ${rawLogs}/procstat/${ts}_procstat_${mach}
cat /proc/meminfo >> ${rawLogs}/meminfo/${ts}_meminfo_${mach}
cat /proc/net/dev >> ${rawLogs}/netdev/${ts}_netdev_${mach}
Expand Down Expand Up @@ -124,16 +124,17 @@ while :; do
echo $(( `date +%s` - $start )),$lineidle >> $idlePerfFile
echo $(( `date +%s` - $start )),$lineiowait >> $iowaitPerfFile

cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > memTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > recTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > traTMP
echoLine memTMP $start $memPerfFile 1 1
echoLine recTMP $start $netReceivePerfFile 0
echoLine traTMP $start $netTransmitPerfFile 0
cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > bmemTMP
echo $pwd
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > brecTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > btraTMP
echoLine bmemTMP $start $memPerfFile 1 1
echoLine brecTMP $start $netReceivePerfFile 0
echoLine btraTMP $start $netTransmitPerfFile 0
#get top 10 processes from top and then just store them, may/not be interesting...
ti="$(( `date +%s` - ${start} ))"
top -b | head -n 17 | tail -n 10 | awk -v t=$ti -F' ' 'OFS="," {print t,$1,$2,$6,$9,$10,$12}' >> $topPerfFile
vmstat | tail -n 1 | awk -v t=$ti -F' ' 'OFS="," {print t,$7,$8}' >> $swapPerfFile
rm *TMP
sleep 1
rm b*TMP
sleep 30
done
29 changes: 19 additions & 10 deletions bench/BMLogs/littleLogger.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
#!/bin/bash

OUTDIR='LittleLoggerFiles'
mkdir ${OUTDIR}

if [ ! -d ${OUTDIR} ]
then
mkdir ${OUTDIR}
fi

#last 3 digits of inet addr
mach=`ifconfig | grep -o "inet addr:192.168.1.[0-9]*" | grep -o 192.168.1.* | awk -F'.' '{print $4}'`
Expand All @@ -20,7 +24,12 @@ swapPerfFile=${OUTDIR}/$1-`date +%Y-%m-%d`"-sisoPerf_"$mach".csv"
cpuheader='time(s)'
head -n 33 /proc/stat | tail -n 32 | awk -F' ' 'OFS="," {print $1}' > tmpfile
cpuheader=$cpuheader,`./transpose.sh tmpfile`
rm tmpfile

if [ -a tmpfile ]
then
rm tmpfile
fi

memheader='time(s),MemTotal,MemFree,Cached,Writeback'
topheader='time(s),PID,USER,RES,%CPU,%MEM,COMMAND'
netheader='time(s),bytes,packets,errs,drop'
Expand Down Expand Up @@ -65,7 +74,7 @@ do
PREVTOTALS[$i]=0
done

start=`date +%s`
start=`cat starttime`
while :; do
a=1
for i in {0..34}
Expand Down Expand Up @@ -103,17 +112,17 @@ while :; do
echo $(( `date +%s` - $start )),$lineidle >> $idlePerfFile
echo $(( `date +%s` - $start )),$lineiowait >> $iowaitPerfFile

cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > memTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > recTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > traTMP
echoLine memTMP $start $memPerfFile 1 1
echoLine recTMP $start $netReceivePerfFile 0
echoLine traTMP $start $netTransmitPerfFile 0
cat /proc/meminfo | awk -F' ' 'OFS="," {gsub(":","", $1); print $2}' > lmemTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $2,$3,$4,$5}' > lrecTMP
grep lo /proc/net/dev | awk -F' ' 'OFS="," {print $10,$11,$12,$13}' > ltraTMP
echoLine lmemTMP $start $memPerfFile 1 1
echoLine lrecTMP $start $netReceivePerfFile 0
echoLine ltraTMP $start $netTransmitPerfFile 0
#get top 10 processes from top and then just store them, may/not be interesting...
ti="$(( `date +%s` - ${start} ))"
top -b | head -n 17 | tail -n 10 | awk -v t=$ti -F' ' 'OFS="," {print t,$1,$2,$6,$9,$10,$12}' >> $topPerfFile
vmstat | tail -n 1 | awk -v t=$ti -F' ' 'OFS="," {print t,$7,$8}' >> $swapPerfFile
rm *TMP
rm l*TMP
sleep 30
done

Expand Down
1 change: 1 addition & 0 deletions bench/BMLogs/starttime
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
1382158596
15 changes: 13 additions & 2 deletions bench/runBench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,11 @@ function doAlgo {
#sudo bash -c "sync; echo 3 > /proc/sys/vm/drop_caches"

echo "Running $1 benchmark..."
echo "Changing little logger phase..."
bash startLoggers.sh ${JSON} changePhase $1

pyScript="BMscripts/"$1"Bench.py"

wait
if [ ! $1 = "bigkmeans" ]
then
python ${pyScript} -cj BMscripts/${JSON} ${h2oBuild}
Expand Down Expand Up @@ -110,15 +112,24 @@ if [ ! -d ${benchmarks}/${h2oBuild}/${DATE} ]; then
mkdir -p ${benchmarks}/${h2oBuild}/${DATE}
fi

#global starttime out to all loggers
# Record a single shared epoch timestamp so every logger (local and remote)
# computes elapsed seconds from the same origin.
starttime=`date +%s`
echo "$starttime" > BMLogs/starttime

#Gentlemen...Start your loggers!
# BUGFIX: the second invocation previously passed {$JSON} (a typo — braces in
# the wrong place), so the little loggers were handed the literal string
# "{<value>}" instead of the JSON config name.
bash startLoggers.sh "${JSON}" big
bash startLoggers.sh "${JSON}" little

if [ ! "$TEST" = "all" ]
then
    echo "$TEST"
    doAlgo "$TEST"
else
    # TEST == "all": executed as a command — presumably a function defined
    # earlier in this script that runs every benchmark; verify against the
    # full file.
    $TEST
fi
wait

bash startLoggers.sh "${JSON}" stop_

#remove annoying useless files
#rm pytest*flatfile*
#rm benchmark*log
Expand Down
65 changes: 65 additions & 0 deletions bench/startLoggers.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
#!/bin/bash

#ssh into each of the machines from ${JSON} (passed as first argument)
# The python one-liner prints the "ip" entry of the JSON config as a Python
# list repr (e.g. [u'192.168.1.161', u'192.168.1.162']); awk then strips the
# spaces, u prefixes, quotes and brackets, leaving a comma-separated string
# of addresses.
MACHINES=`cat BMscripts/$1 | python -c 'import sys, json; print json.load(sys.stdin)[sys.argv[1]]' ip | awk -F, 'OFS="," {gsub("[ u\x27\[\]]","", $0); print}'`
# Split the comma-separated string into a bash array of host addresses.
IFS=","
MACHINES=($MACHINES)

# Push the shared starttime file to every benchmark machine so the big
# loggers all measure elapsed time from the same origin.
# Reads global: MACHINES (array of host addresses).
function startBigLoggers {
    local host
    for host in "${MACHINES[@]}"
    do
        echo "Starting bigLogger on ${host}"
        scp BMLogs/starttime "spencer@${host}:/home/spencer/h2o/bench/BMLogs"
        # Launching the logger itself is currently disabled:
        #ssh spencer@$host "cd /home/spencer/h2o/bench/BMLogs; bash bigLogger.sh" &
    done
}

# Start a littleLogger on every benchmark machine in the background,
# forwarding the current benchmark phase name.
# Arguments: $1 - phase name passed through to the remote littleLogger.sh.
# Reads global: MACHINES (array of host addresses).
function startLittleLoggers {
    local host
    for host in "${MACHINES[@]}"
    do
        echo "Starting littleLogger on ${host} on phase $1"
        ssh "spencer@${host}" "cd /home/spencer/h2o/bench/BMLogs; bash littleLogger.sh $1" &
    done
}

function stopLittleLoggers {
for i in ${MACHINES[@]}
do
ssh spencer@$i ps ax|grep bash|grep littleLogger|awk '{print $1}'| xargs kill
done
}

function stopAllLoggers {
for i in ${MACHINES[@]}
do
ssh spencer@$i ps ax|grep bash|grep Logger|awk '{print $1}'| xargs kill
done
}

# Restart the little loggers so their output is tagged with a new phase.
# Arguments: $1 - the new phase (benchmark name) forwarded to
#                 startLittleLoggers.
# (Dropped the original's unused `newPhase=$1` assignment.)
function changePhase {
    echo "Stopping little loggers"
    stopLittleLoggers >/dev/null
    startLittleLoggers "$1" >/dev/null
}

# Dispatch on the action keyword ($2: big | little | changePhase | stop_).
# BUGFIX: quoting "$2" (via case) keeps this from erroring out when the
# action argument is missing; the original chain of unquoted `[ $2 = ... ]`
# tests produced "unary operator expected" errors on an empty $2.
case "$2" in
    big)
        startBigLoggers >/dev/null
        ;;
    little)
        startLittleLoggers START >/dev/null
        ;;
    changePhase)
        # $3 is the new phase name to tag little-logger output with.
        changePhase "$3" >/dev/null
        ;;
    stop_)
        stopAllLoggers
        ;;
esac

3 changes: 2 additions & 1 deletion py/h2o.py
Original file line number Diff line number Diff line change
Expand Up @@ -1863,7 +1863,8 @@ def GLM_shared(self, key,
'link': 'familyDefault',
}
else:
params_dict = {'vresponse' : None,
params_dict = {'source' : key,
'vresponse' : None,
'ignored_cols' : None,
'family' : None,
'lambda' : None,
Expand Down
67 changes: 67 additions & 0 deletions py/testdir_single_jvm/test_GLM2_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import unittest, random, sys, time
sys.path.extend(['.','..','py'])

import h2o, h2o_cmd, h2o_hosts, h2o_import as h2i, h2o_exec, h2o_glm, h2o_jobs

class Basic(unittest.TestCase):
    """Smoke test for the beta (GLM2) REST API against smalldata prostate.

    Builds a one-node cloud, parses logreg/prostate.csv, launches a binomial
    GLM2 job without polling, then waits for all jobs to finish.  No model
    quality is asserted; tearDown only scans the sandbox for errors.
    """

    def tearDown(self):
        # After every test, scan the sandbox logs for stack traces/errors.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Build either a local one-node cloud or a remote one from the hosts
        # config, each with a 10 GB java heap.
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(node_count=1, java_heap_GB=10)
        else:
            h2o_hosts.build_cloud_with_hosts(node_count=1, java_heap_GB=10)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_GLM_prostate(self):
        # Use the beta (v2) REST endpoints for parse/inspect/GLM.
        h2o.beta_features=True
        importFolderPath = "logreg"
        csvFilename = 'prostate.csv'
        csvPathname = importFolderPath + "/" + csvFilename
        hex_key = csvFilename + ".hex"

        # Kick off the parse without polling, then wait on the jobs list.
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname, schema='local', hex_key=hex_key,
            timeoutSecs=180, noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
        inspect = h2o_cmd.runInspect(None, parseResult['destination_key'])
        print inspect
        print "\n" + csvPathname, \
            " numRows:", "{:,}".format(inspect['numRows']), \
            " numCols:", "{:,}".format(inspect['numCols'])

        # GLM parameters: predict CAPSULE, ignore the ID column.
        # Values are passed as strings, matching what the REST layer expects.
        x = 'ID'
        y = 'CAPSULE'
        family = 'binomial'
        alpha = '0.5'
        lambda_ = '1E-4'
        nfolds = '5'
        # case_mode/case_val are defined but intentionally left out of kwargs
        # below (commented) — presumably unsupported by GLM2 at this point;
        # verify before re-enabling.
        case_mode = '='
        case_val = '1'
        f = 'prostate'

        kwargs = { 'vresponse'       : y,
                   'ignored_cols'    : x,
                   'family'          : family,
                   'lambda'          : lambda_,
                   'alpha'           : alpha,
                   'n_folds'         : nfolds,
                   #'case_mode'      : case_mode,
                   #'case_val'       : case_val,
                   'destination_key' : "GLM("+f+")",
                 }
        timeoutSecs = 60

        # Launch GLM without polling, then wait for the job to complete.
        start = time.time()
        glm = h2o_cmd.runGLM(parseResult=parseResult, timeoutSecs=timeoutSecs, pollTimeoutSecs=180, noPoll=True, **kwargs)

        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

# Allow running this test module directly from the command line.
if __name__ == '__main__':
    h2o.unit_main()
87 changes: 87 additions & 0 deletions py/testdir_single_jvm/test_KMeans2_basic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import unittest, time, sys
sys.path.extend(['.','..','py'])
import h2o, h2o_cmd, h2o_kmeans, h2o_hosts, h2o_import as h2i, h2o_jobs

class Basic(unittest.TestCase):
    """Beta-features KMeans2 tests against two small logreg datasets.

    The B/C prefixes in the test method names exploit unittest's default
    alphabetical ordering so benign runs before prostate.  Each test checks
    that a fixed-seed KMeans run reproduces known cluster centers.
    """

    def tearDown(self):
        # After every test, scan the sandbox logs for stack traces/errors.
        h2o.check_sandbox_for_errors()

    @classmethod
    def setUpClass(cls):
        # Build either a local one-node cloud or a remote one from the hosts
        # config.
        global localhost
        localhost = h2o.decide_if_localhost()
        if (localhost):
            h2o.build_cloud(1)
        else:
            h2o_hosts.build_cloud_with_hosts(1)

    @classmethod
    def tearDownClass(cls):
        h2o.tear_down_cloud()

    def test_B_kmeans_benign(self):
        # Use the beta (v2) REST endpoints.
        h2o.beta_features = True
        csvPathname = "logreg"
        csvFilename = "benign.csv"
        print "\nStarting", csvFilename

        # Kick off the parse without polling, then wait on the jobs list.
        parseResult = h2i.import_parse(bucket='smalldata', path=csvPathname + "/"+csvFilename, schema='local', hex_key=csvFilename+".hex", noPoll=True, doSummary=False)
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # Expected (center coordinates, row count, error) triples for the
        # fixed seed below.
        expected = [
            ([24.538961038961038, 2.772727272727273, 46.89032467532467, 0.1266233766233766, 12.012142857142857, 1.0105194805194804, 1.5222727272727272, 22.26039690646432, 12.582467532467534, 0.5275062016635049, 2.9477601050634767, 162.52136363636365, 41.94558441558441, 1.661883116883117], 77, 46889.32010560476) ,
            ([25.587719298245613, 2.2719298245614037, 45.64035087719298, 0.35964912280701755, 13.026315789473685, 1.4298245614035088, 1.3070175438596492, 24.393307707470925, 13.333333333333334, 0.5244431302976542, 2.7326039818647745, 122.46491228070175, 40.973684210526315, 1.6754385964912282], 114, 64011.20272144667) ,
            ([30.833333333333332, 2.9166666666666665, 46.833333333333336, 0.0, 13.083333333333334, 1.4166666666666667, 1.5833333333333333, 24.298220973782772, 11.666666666666666, 0.37640449438202245, 3.404494382022472, 224.91666666666666, 39.75, 1.4166666666666667], 12, 13000.485226507595) ,

        ]
        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)

        # loop, to see if we get same centers
        for trial in range(2):
            params = {'k' : 3,
                      'initialization' : 'Furthest',
                      'ignored_cols_by_name' : None,
                      'destination_key' : 'benign_k.hex',
                      'max_iter' : 50,
                      # fixed seed so the expected centers above reproduce
                      'seed' : 265211114317615310,
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)


    def test_C_kmeans_prostate(self):
        # Use the beta (v2) REST endpoints.
        h2o.beta_features = True
        csvFilename = "prostate.csv"
        print "\nStarting", csvFilename
        # NOTE(review): unlike test_B this parse polls (no noPoll=True), so
        # the pollWaitJobs that follows is presumably a harmless no-op.
        parseResult = h2i.import_parse(bucket='smalldata', path='logreg/'+csvFilename, schema='local', hex_key=csvFilename+".hex")
        h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)

        # loop, to see if we get same centers
        # Expected (center coordinates, row count, error) triples for the
        # fixed seed below.
        expected = [
            ([55.63235294117647], 68, 667.8088235294117) ,
            ([63.93984962406015], 133, 611.5187969924812) ,
            ([71.55307262569832], 179, 1474.2458100558654) ,
        ]

        # all are multipliers of expected tuple value
        allowedDelta = (0.01, 0.01, 0.01)
        for trial in range(2):
            params = {'k' : 3,
                      'initialization' : 'Furthest',
                      'ignored_cols_by_name' : "ID",
                      'destination_key' : 'prostate_k.hex',
                      'max_iter' : 100,
                      # fixed seed so the expected centers above reproduce
                      'seed' : 265211114317615310
                     }
            kwargs = params.copy()
            kmeans = h2o_cmd.runKMeans(parseResult=parseResult, timeoutSecs=5, **kwargs)
            (centers, tupleResultList) = h2o_kmeans.bigCheckResults(self, kmeans, csvFilename, parseResult, 'd', **kwargs)
            h2o_jobs.pollWaitJobs(timeoutSecs=300, pollTimeoutSecs=300, retryDelaySecs=5)
            h2o_kmeans.compareResultsToExpected(self, tupleResultList, expected, allowedDelta, trial=trial)

# Allow running this test module directly from the command line.
if __name__ == '__main__':
    h2o.unit_main()

0 comments on commit cb60500

Please sign in to comment.