@@ -8,7 +8,8 @@
 import water.*;
 import water.Job.ColumnsJob;
 import water.api.*;
-import water.fvec.*;
+import water.fvec.Chunk;
+import water.fvec.Vec;
 import water.util.Utils;
 
 /**
@@ -49,14 +50,13 @@ public KMeans2() {
     for( int i = 0; i < cols.length; i++ )
       names[i] = source._names[cols[i]];
     Vec[] vecs = selectVecs(source);
-    Frame frame = new Frame(names, vecs);
     // Fill-in response based on K
-    Vec response = frame.anyVec().makeZero();
-    response._domain = new String[k];
-    for( int i = 0; i < response._domain.length; i++ )
-      response._domain[i] = "Cluster " + i;
-    frame.add("response", response);
-    KMeans2Model model = new KMeans2Model(destination_key, sourceKey, frame);
+    String[] domain = new String[k];
+    for( int i = 0; i < domain.length; i++ )
+      domain[i] = "Cluster " + i;
+    String[] namesResp = Utils.append(names, "response");
+    String[][] domaiResp = (String[][]) Utils.append(source.domains(), (Object) domain);
+    KMeans2Model model = new KMeans2Model(destination_key, sourceKey, namesResp, domaiResp);
 
     double[] subs = null, muls = null;
     if( normalize ) {
@@ -101,7 +101,7 @@ public KMeans2() {
       sampler._subs = subs;
       sampler._muls = muls;
       sampler.doAll(vecs);
-      clusters = DRemoteTask.merge(clusters, sampler._sampled);
+      clusters = Utils.append(clusters, sampler._sampled);
 
       if( cancelled() )
         return;
@@ -120,15 +120,11 @@ public KMeans2() {
       task._subs = subs;
       task._muls = muls;
       task.doAll(vecs);
-      for( int cluster = 0; cluster < clusters.length; cluster++ ) {
-        if( task._counts[cluster] > 0 ) {
-          for( int vec = 0; vec < vecs.length; vec++ ) {
-            double value = task._sums[cluster][vec] / task._counts[cluster];
-            clusters[cluster][vec] = value;
-          }
-        }
-      }
-      model.clusters = normalize ? denormalize(clusters, vecs) : clusters;
+      model.clusters = normalize ? denormalize(task._means, vecs) : task._means;
+      for( int clu = 0; clu < task._sigms.length; clu++ )
+        for( int col = 0; col < task._sigms[clu].length; col++ )
+          task._sigms[clu][col] = task._sigms[clu][col] / (task._rows[clu] - 1);
+      model.variances = task._sigms;
       model.error = task._sqr;
       model.iterations++;
       UKV.put(destination_key, model);
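The driver-side loop added above turns the per-cluster sums of squared deviations accumulated in `task._sigms` into unbiased sample variances by dividing by `task._rows[clu] - 1` (Bessel's correction) before storing them on the model. A minimal standalone sketch of that finalization, with hypothetical names that are not part of the diff and an extra guard for clusters holding fewer than two rows:

```java
// Sketch only, not H2O code: convert per-cluster sums of squared deviations
// into unbiased sample variances, mirroring the division by (_rows[clu] - 1) above.
class VarianceFinalizeSketch {
  static double[][] finalizeVariances(double[][] sigms, long[] rows) {
    double[][] variances = new double[sigms.length][];
    for( int clu = 0; clu < sigms.length; clu++ ) {
      variances[clu] = new double[sigms[clu].length];
      for( int col = 0; col < sigms[clu].length; col++ )
        variances[clu][col] = rows[clu] > 1 ? sigms[clu][col] / (rows[clu] - 1) : 0;
    }
    return variances;
  }
}
```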
@@ -141,7 +137,9 @@ public KMeans2() {
 
   @Override protected Response redirect() {
     String n = KMeans2Progress.class.getSimpleName();
-    return new Response(Response.Status.redirect, this, -1, -1, n, "job", job_key, "dst_key", destination_key);
+    return new Response(Response.Status.redirect, this, -1, -1, n, //
+        "job_key", job_key, //
+        "destination_key", destination_key);
   }
 
   public static class KMeans2Progress extends Progress2 {
@@ -204,7 +202,7 @@ public static class KMeans2Model extends Model implements Progress {
     @API(help = "Sum of min square distances")
     public double error;
 
-    @API(help = "Whether data should be normalized")
+    @API(help = "Whether data was normalized")
     public boolean normalized;
 
     @API(help = "Maximum number of iterations before stopping")
@@ -213,11 +211,14 @@ public static class KMeans2Model extends Model implements Progress {
     @API(help = "Iterations the algorithm ran")
     public int iterations;
 
+    @API(help = "In-cluster variances")
+    public double[][] variances;
+
     private transient double[] _subs, _muls; // Normalization
     private transient double[][] _normClust;
 
-    public KMeans2Model(Key selfKey, Key dataKey, Frame fr) {
-      super(selfKey, dataKey, fr);
+    public KMeans2Model(Key selfKey, Key dataKey, String names[], String domains[][]) {
+      super(selfKey, dataKey, names, domains);
     }
 
     @Override public float progress() {
@@ -301,47 +302,65 @@ public static class Sampler extends MRTask2<Sampler> {
     }
 
     @Override public void reduce(Sampler other) {
-      _sampled = DRemoteTask.merge(_sampled, other._sampled);
+      _sampled = Utils.append(_sampled, other._sampled);
     }
   }
 
   public static class Lloyds extends MRTask2<Lloyds> {
     // IN
     double[][] _clusters;
-    double[] _subs, _muls; // Normalization
+    double[] _subs, _muls;     // Normalization
 
     // OUT
-    double[][] _sums;          // Sum of (normalized) features in each cluster
-    int[] _counts;             // Count of rows in cluster
-    double _sqr;               // Total sqr distance
+    double[][] _means, _sigms; // Means and sigma for each cluster
+    long[] _rows;              // Rows per cluster
+    double _sqr;               // Total sqr distance
 
     @Override public void map(Chunk[] cs) {
-      double[] values = new double[_clusters[0].length];
-      _sums = new double[_clusters.length][values.length];
-      _counts = new int[_clusters.length];
-      ClusterDist cd = new ClusterDist();
+      _means = new double[_clusters.length][_clusters[0].length];
+      _sigms = new double[_clusters.length][_clusters[0].length];
+      _rows = new long[_clusters.length];
 
       // Find closest cluster for each row
+      double[] values = new double[_clusters[0].length];
+      ClusterDist cd = new ClusterDist();
+      int[] clusters = new int[cs[0]._len];
       for( int row = 0; row < cs[0]._len; row++ ) {
         data(values, cs, row, _subs, _muls);
         closest(_clusters, values, cd);
-        int cluster = cd._cluster;
+        int clu = clusters[row] = cd._cluster;
         _sqr += cd._dist;
-        if( cluster == -1 )
+        if( clu == -1 )
           continue; // Ignore broken row
 
         // Add values and increment counter for chosen cluster
-        Utils.add(_sums[cluster], values);
-        _counts[cluster]++;
+        for( int col = 0; col < values.length; col++ )
+          _means[clu][col] += values[col];
+        _rows[clu]++;
+      }
+      for( int clu = 0; clu < _means.length; clu++ )
+        for( int col = 0; col < _means[clu].length; col++ )
+          _means[clu][col] /= _rows[clu];
+      // Second pass for in-cluster variances
+      for( int row = 0; row < cs[0]._len; row++ ) {
+        int clu = clusters[row];
+        if( clu == -1 )
+          continue;
+        data(values, cs, row, _subs, _muls);
+        for( int col = 0; col < values.length; col++ ) {
+          double delta = values[col] - _means[clu][col];
+          _sigms[clu][col] += delta * delta;
+        }
       }
       _clusters = null;
       _subs = _muls = null;
     }
 
-    @Override public void reduce(Lloyds other) {
-      Utils.add(_sums, other._sums);
-      Utils.add(_counts, other._counts);
-      _sqr += other._sqr;
+    @Override public void reduce(Lloyds mr) {
+      for( int clu = 0; clu < _means.length; clu++ )
+        Layer.Stats.reduce(_means[clu], _sigms[clu], _rows[clu], mr._means[clu], mr._sigms[clu], mr._rows[clu]);
+      Utils.add(_rows, mr._rows);
+      _sqr += mr._sqr;
     }
   }
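The rewritten `reduce` combines per-chunk statistics cluster by cluster through `Layer.Stats.reduce`, whose implementation lives outside this diff. Below is a hedged sketch of the standard pairwise combine (Chan et al.) that such a merge typically performs on two partial means and sums of squared deviations; the class name, method name, and in-place convention are assumptions for illustration, not the actual `Layer.Stats` API:

```java
// Hedged sketch, not the actual Layer.Stats.reduce: pairwise merge of partial
// means and sums of squared deviations (Chan et al.), written into the A-side
// arrays to match the in-place style of MRTask2 reduce().
class StatsMergeSketch {
  static void merge(double[] meansA, double[] sigmsA, long rowsA,
                    double[] meansB, double[] sigmsB, long rowsB) {
    long n = rowsA + rowsB;
    if( n == 0 ) return;
    for( int col = 0; col < meansA.length; col++ ) {
      double delta = meansB[col] - meansA[col];
      // Combined sum of squared deviations adds a cross term for the shifted means
      sigmsA[col] += sigmsB[col] + delta * delta * rowsA * rowsB / n;
      // Combined mean is the row-weighted average of the two partial means
      meansA[col] = (meansA[col] * rowsA + meansB[col] * rowsB) / n;
    }
  }
}
```

Merging the partial statistics this way avoids a second pass over the combined rows, so each Lloyds instance only ever visits its own chunks.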