Commit 75ed805

Merge branch 'master' of https://github.com/0xdata/h2o

dearirenelang committed Feb 20, 2014
2 parents 32361b4 + e9f46e6
Showing 27 changed files with 214 additions and 189 deletions.
3 changes: 2 additions & 1 deletion R/Makefile
@@ -128,7 +128,8 @@ endif
# Build binary for each OS
rm -rf $(TMP_BUILD_DIR)
mkdir -p $(TMP_BUILD_DIR)
-R CMD INSTALL -l $(TMP_BUILD_DIR) --build h2oRClient-package
+[ -x "`which gnutar 2>/dev/null`" ] || echo 'Note: gnutar not found; package install in R may fail in the next step'
+R CMD INSTALL -l $(TMP_BUILD_DIR) --build h2oRClient-package || echo 'If you got an error like "Dependency foo is not available for package h2oRClient" you need to install the required R package by running R and executing the R command: install.packages("foo")'

ifneq ($(OS),Windows_NT)
# rm -rf h2oRClient
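The two hints added above point at common failure modes on OS X. A sketch of the corresponding fixes (the Homebrew formula name and the R package name are illustrative assumptions, not taken from this repository):

    $ brew install gnu-tar        # installs GNU tar as 'gtar'; link or alias it as 'gnutar' if needed
    $ R -e 'install.packages("foo", repos="http://cran.r-project.org")'    # replace "foo" with the dependency named in the error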
29 changes: 21 additions & 8 deletions h2o-docs/source/developuser/quickstart_mac.rst
@@ -60,7 +60,7 @@ Downloading and Building H\ :sub:`2`\ O

$ git clone https://github.com/0xdata/h2o.git

-9. Build H\ :sub:`2`\ O from source. After the build finishes, some JUnit tests will run automatically.
+9. Build H\ :sub:`2`\ O from source. After the build finishes, some JUnit tests will run automatically. Note that if you normally compile a different way (e.g., with an IDE), you may not have built the Hadoop driver jars that are created when building with make:

::

@@ -86,7 +86,11 @@ Installing Hadoop on a Mac

12. Configure Hadoop (modify the file paths or version number if applicable):

-Modify ``/usr/local/Cellar/hadoop/1.2.1/libexec/conf/core-site.xml`` to contain the following:
+Note:
+In Hadoop 1.x these files are found in, e.g., ``/usr/local/Cellar/hadoop/1.2.1/libexec/conf/``.
+In Hadoop 2.x these files are found in, e.g., ``/usr/local/Cellar/hadoop/2.2.0/libexec/etc/hadoop/``.

+Modify ``core-site.xml`` to contain the following:

::

@@ -96,8 +100,8 @@ Installing Hadoop on a Mac
<value>hdfs://localhost:8020</value>
</property>
</configuration>
-Modify ``/usr/local/Cellar/hadoop/1.2.1/libexec/conf/mapred-site.xml`` to contain the following:

+Modify ``mapred-site.xml`` to contain the following (NOTE: you may need to create the file from ``mapred-site.xml.template``; see the example after this block):

::

@@ -112,7 +116,7 @@ Installing Hadoop on a Mac
</property>
</configuration>
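If ``mapred-site.xml`` does not exist yet, a typical way to create it from the shipped template (the path assumes the Hadoop 2.x Homebrew layout noted above; adjust for your version)::

   $ cd /usr/local/Cellar/hadoop/2.2.0/libexec/etc/hadoop
   $ cp mapred-site.xml.template mapred-site.xml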

-Modify ``/usr/local/Cellar/hadoop/1.2.1/libexec/conf/hdfs-site.xml`` to contain the following:
+Modify ``hdfs-site.xml`` to contain the following:

::

@@ -123,18 +127,27 @@ Installing Hadoop on a Mac
</property>
</configuration>

-13. Optional: Enable password-less SSH from localhost to localhost for convenience
+13. Optional: Enable password-less SSH from localhost to localhost for convenience.

+First enable Remote Login in the Sharing pane of System Preferences, and then:

::

$ brew install ssh-copy-id
$ ssh-keygen
$ ssh-copy-id -i ~/.ssh/id_rsa.pub localhost
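
A quick way to confirm that password-less login works before moving on (a verification step assumed here, not part of the original instructions)::

   $ ssh localhost        # should log in without prompting for a password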

-14. Start Hadoop MapReduce services
+14. Start Hadoop MapReduce services, e.g.:

::

$ /usr/local/Cellar/hadoop/1.2.1/bin/start-all.sh
+or

+::

+$ /usr/local/Cellar/hadoop/2.2.0/sbin/start-dfs.sh
+$ /usr/local/Cellar/hadoop/2.2.0/sbin/start-yarn.sh

15. Verify that Hadoop is up and running by checking the output of ``jps`` (look for NameNode, DataNode, JobTracker, TaskTracker)

@@ -148,7 +161,7 @@ Installing Hadoop on a Mac
81655 DataNode
81928 TaskTracker

-16. Format HDFS and leave the safe mode
+16. Format HDFS and leave the safe mode.

::
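The commands for this step are collapsed in this view; a typical sequence under the Homebrew Hadoop 1.x layout used above (an assumed illustration, not the documented text) would be::

   $ /usr/local/Cellar/hadoop/1.2.1/bin/hadoop namenode -format
   $ /usr/local/Cellar/hadoop/1.2.1/bin/hadoop dfsadmin -safemode leave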
4 changes: 2 additions & 2 deletions h2o-samples/src/main/java/samples/Frames.java
@@ -20,7 +20,7 @@ public static void main(String[] args) throws Exception {
}

@Override
-protected Status exec() {
+protected JobState exec() {
// From file
parse(new File(VM.h2oFolder(), "smalldata/iris/iris.csv"));

@@ -35,7 +35,7 @@ protected Status exec() {
Key key = Key.make("MyFrame");
UKV.put(key, frame);

-return Status.Done;
+return JobState.DONE;
}

/**
4 changes: 2 additions & 2 deletions h2o-samples/src/main/java/samples/LoadDatasets.java
@@ -293,8 +293,8 @@ void load() {
TestUtil.parseFromH2OFolder("smalldata/./zipcodes");
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
load();
-return Status.Running;
+return JobState.RUNNING;
}
}
4 changes: 2 additions & 2 deletions h2o-samples/src/main/java/samples/MapReduce.java
@@ -24,7 +24,7 @@ public static void main(String[] args) throws Exception {
}

@Override
-protected Status exec() {
+protected JobState exec() {
// Parse a dataset into a Frame, H2O's distributed table-like data structure
File file = new File(VM.h2oFolder(), "smalldata/iris/iris.csv");
Frame frame = Frames.parse(file);
@@ -44,7 +44,7 @@ protected Status exec() {
// At this point, all task instances have been merged by their 'reduce' method. We
// are back to a state where only one instance exist, and it contains the overall sum.
System.out.println("Sum is " + sum.value);
-return Status.Done;
+return JobState.DONE;
}

static class Sum extends MRTask2<Sum> {
4 changes: 2 additions & 2 deletions h2o-samples/src/main/java/samples/MapReduceKMeans.java
@@ -16,7 +16,7 @@ public static void main(String[] args) throws Exception {
samples.launchers.CloudProcess.launch(MapReduceKMeans.class, 2);
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
// Load and parse a file. Data is distributed to other nodes in a round-robin way
Key file = NFSFileVec.make(new File("../lib/resources/datasets/gaussian.csv"));
Frame frame = ParseDataset2.parse(Key.make("test"), new Key[] { file });
@@ -62,7 +62,7 @@ public static void main(String[] args) throws Exception {
System.out.print(df.format(clusters[c][v]) + ", ");
System.out.println("");
}
-return Status.Done;
+return JobState.DONE;
}

/**
4 changes: 2 additions & 2 deletions h2o-samples/src/main/java/samples/NeuralNetMnist.java
@@ -110,7 +110,7 @@ protected void startTraining(Layer[] ls) {
_trainer.start();
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
final double fraction = 1.0;
final long seed = 0xC0FFEE;
load(fraction, seed);
@@ -160,7 +160,7 @@ protected void startTraining(Layer[] ls) {
}
}, 0, 10);
startTraining(ls);
-return Status.Running;
+return JobState.RUNNING;
}

// Remaining code was used to shuffle & convert to CSV
22 changes: 11 additions & 11 deletions src/main/java/hex/GridSearch.java
@@ -19,7 +19,7 @@ public class GridSearch extends Job {
public GridSearch(){

}
-@Override protected Status exec() {
+@Override protected JobState exec() {
UKV.put(destination_key, this);
int max = jobs[0].gridParallelism();
int head = 0, tail = 0;
@@ -34,7 +34,7 @@ public GridSearch(){
}
}
}
-return Status.Done;
+return JobState.DONE;
}

@Override protected void onCancelled() {
@@ -96,8 +96,8 @@ public static class GridSearchProgress extends Progress2 {
for( Job job : jobs ) {
JobInfo info = new JobInfo();
info._job = job;
-if(job.destination_key != null){
-Object value = UKV.get(job.destination_key);
+if(job.dest() != null){
+Object value = UKV.get(job.dest());
info._model = value instanceof Model ? (Model) value : null;
if( info._model != null ) {
info._cm = info._model.cm();
@@ -138,18 +138,18 @@ public static class GridSearchProgress extends Progress2 {
if( perf != null )
sb.append("<td>").append(speed).append("</td>");
String link = "";
-if( info._job.start_time != 0 && DKV.get(info._job.destination_key) != null ) {
-link = info._job.destination_key.toString();
+if( info._job.start_time != 0 && DKV.get(info._job.dest()) != null ) {
+link = info._job.dest().toString();
if( info._model instanceof GBMModel )
-link = GBMModelView.link(link, info._job.destination_key);
+link = GBMModelView.link(link, info._job.dest());
else if( info._model instanceof DRFModel )
-link = DRFModelView.link(link, info._job.destination_key);
+link = DRFModelView.link(link, info._job.dest());
else if( info._model instanceof NeuralNetModel )
-link = NeuralNetModelView.link(link, info._job.destination_key);
+link = NeuralNetModelView.link(link, info._job.dest());
if( info._model instanceof KMeans2Model )
-link = KMeans2ModelView.link(link, info._job.destination_key);
+link = KMeans2ModelView.link(link, info._job.dest());
else
-link = Inspect.link(link, info._job.destination_key);
+link = Inspect.link(link, info._job.dest());
}
sb.append("<td>").append(link).append("</td>");

6 changes: 3 additions & 3 deletions src/main/java/hex/KMeans2.java
@@ -43,7 +43,7 @@ public KMeans2() {
description = "K-means";
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
source.read_lock(self());
String sourceArg = input("source");
Key sourceKey = null;
@@ -107,7 +107,7 @@ public KMeans2() {
clusters = Utils.append(clusters, sampler._sampled);

if( !isRunning(self()) )
-return Status.Done;
+return JobState.DONE;
model.centers = normalize ? denormalize(clusters, vecs) : clusters;
model.total_within_SS = sqr._sqr;
model.iterations++;
@@ -154,7 +154,7 @@ public KMeans2() {
}
model.unlock(self());
source.unlock(self());
-return Status.Done;
+return JobState.DONE;
}

@Override protected Response redirect() {
5 changes: 3 additions & 2 deletions src/main/java/hex/drf/DRF.java
@@ -145,16 +145,17 @@ public static String link(Key k, String content) {
Log.info(" seed: " + _seed);
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
logStart();
buildModel();
-return Status.Done;
+return JobState.DONE;
}

@Override protected Response redirect() {
return DRFProgressPage.redirect(this, self(), dest());
}

+@SuppressWarnings("unused")
@Override protected void init() {
super.init();
// Initialize local variables
4 changes: 2 additions & 2 deletions src/main/java/hex/gbm/GBM.java
@@ -115,10 +115,10 @@ public static String link(Key k, String content) {
Log.info(" learn_rate: " + learn_rate);
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
logStart();
buildModel();
-return Status.Done;
+return JobState.DONE;
}

@Override public int gridParallelism() {
2 changes: 1 addition & 1 deletion src/main/java/hex/glm/GLM2.java
@@ -193,7 +193,7 @@ public static Job gridSearch(Key jobKey, Key destinationKey, DataInfo dinfo, GLM
if(destination_key == null)destination_key = Key.make("GLMGridModel_"+Key.make());
if(job_key == null)job_key = Key.make("GLMGridJob_"+Key.make());
Job j = gridSearch(self(),destination_key, _dinfo, _glm, lambda, alpha,n_folds);
-return GLMGridView.redirect(this,j.destination_key);
+return GLMGridView.redirect(this,j.dest());
} else {
if(destination_key == null)destination_key = Key.make("GLMModel_"+Key.make());
if(job_key == null)job_key = Key.make("GLM2Job_"+Key.make());
4 changes: 2 additions & 2 deletions src/main/java/hex/pca/PCA.java
@@ -56,7 +56,7 @@ public PCA(String desc, Key dest, Frame src, int max_pc, double tolerance, boole
this.standardize = standardize;
}

-@Override protected Status exec() {
+@Override protected JobState exec() {
Frame fr = selectFrame(source);
Vec[] vecs = fr.vecs();

@@ -81,7 +81,7 @@ public PCA(String desc, Key dest, Frame src, int max_pc, double tolerance, boole
PCAModel myModel = buildModel(dinfo, tsk);
myModel.delete_and_lock(self());
myModel.unlock(self());
-return Status.Done;
+return JobState.DONE;
}

@Override protected void init() {
4 changes: 2 additions & 2 deletions src/main/java/hex/pca/PCAImpute.java
@@ -21,10 +21,10 @@ public class PCAImpute extends FrameJob {
@API(help = "Scale columns by their standard deviations", filter = Default.class)
boolean scale = true;

-@Override protected Status exec() {
+@Override protected JobState exec() {
Frame fr = source;
new Frame(destination_key,fr._names.clone(),fr.vecs().clone()).delete_and_lock(null).unlock(null);
-return Status.Done;
+return JobState.DONE;
}

@Override protected void init() {
9 changes: 3 additions & 6 deletions src/main/java/hex/pca/PCAScore.java
@@ -2,11 +2,8 @@

import hex.FrameTask;
import hex.FrameTask.DataInfo;

-import java.util.Arrays;
-
-import water.Job.*;
import water.*;
+import water.Job.FrameJob;
import water.api.DocGen;
import water.fvec.*;
import water.util.RString;
@@ -31,7 +28,7 @@ public class PCAScore extends FrameJob {
@API(help = "Number of principal components to return", filter = Default.class, lmin = 1, lmax = 10000)
int num_pc = 1;

-@Override protected Status exec() {
+@Override protected JobState exec() {
// Note: Source data MUST contain all features (matched by name) used to build PCA model!
// If additional columns exist in source, they are automatically ignored in scoring
new Frame(destination_key, new String[0], new Vec[0]).delete_and_lock(self());
@@ -47,7 +44,7 @@ public class PCAScore extends FrameJob {
domains[i] = null;
}
tsk.outputFrame(destination_key, names, domains).unlock(self());
-return Status.Done;
+return JobState.DONE;
}

@Override protected void init() {
6 changes: 3 additions & 3 deletions src/main/java/hex/rf/ConfusionTask.java
@@ -103,10 +103,10 @@ public static void remove(RFModel model, Key datakey, int classcol, boolean comp
/**Apply a model to a dataset to produce a Confusion Matrix. To support
incremental & repeated model application, hash the model & data and look
for that Key to already exist, returning a prior CM if one is available.*/
-static public CMJob make(RFModel model, Key datakey, int classcol, double[] classWt, boolean computeOOB) {
+static public Job make(RFModel model, Key datakey, int classcol, double[] classWt, boolean computeOOB) {
return make(model, model.size(), datakey, classcol, classWt, computeOOB);
}
-static public CMJob make(final RFModel model, final int modelSize, final Key datakey, final int classcol, final double[] classWt, final boolean computeOOB) {
+static public Job make(final RFModel model, final int modelSize, final Key datakey, final int classcol, final double[] classWt, final boolean computeOOB) {
// Create a unique key for CM regarding given RFModel, validation data and parameters
final Key cmKey = keyForCM(model._key, modelSize, datakey, classcol, computeOOB);
// Start a new job if CM is not yet computed
@@ -145,7 +145,7 @@ static public CMJob make(final RFModel model, final int modelSize, final Key dat
return cmJob;
} else {
// We should return Job which is/was computing the CM with given cmKey
-return (CMJob) Job.findJobByDest(cmKey);
+return Job.findJobByDest(cmKey);
}
}
