Skip to content

Commit

Permalink
KMeans2 tests and fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
cypof committed Oct 22, 2013
1 parent 59a9dd1 commit 2fe96a8
Show file tree
Hide file tree
Showing 20 changed files with 359 additions and 253 deletions.
41 changes: 22 additions & 19 deletions experiments/src/main/java/hex/MnistDist16x.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,21 @@

import water.*;
import water.deploy.Cloud;
import water.fvec.NFSFileVec;
import water.fvec.ParseDataset2;
import water.fvec.*;
import water.util.Log;
import water.util.Utils;

public class MnistDist16x {
/**
 * Entry point: builds a Cloud description (host addresses, rsync include lists),
 * then starts it with the JVM options and node options assembled below.
 *
 * NOTE(review): this region is a rendered diff without +/- markers. The lines using
 * the underscore-prefixed fields (_publicIPs, _clientRSyncIncludes,
 * _fannedRSyncIncludes) appear to be the removed side; the lines using publicIPs,
 * clientRSyncIncludes, fannedRSyncIncludes and cloud.jdk appear to be their
 * replacements. As shown, the text is not compilable (e.g. `node` is declared twice).
 */
public static void main(String[] args) throws Exception {
Cloud cloud = new Cloud();
// One address per host: 192.168.1.(161 + i) for i in [LOW, LOW + LEN).
// LOW and LEN are declared outside the visible part of this file.
for( int i = LOW; i < LOW + LEN; i++ )
cloud._publicIPs.add("192.168.1." + (161 + i));
cloud._clientRSyncIncludes.add("../libs/jdk");
cloud._clientRSyncIncludes.add("smalldata");
cloud._clientRSyncIncludes.add("experiments/target");
cloud._fannedRSyncIncludes.add("jdk");
cloud._fannedRSyncIncludes.add("smalldata");
cloud.publicIPs.add("192.168.1." + (161 + i));
cloud.clientRSyncIncludes.add("smalldata");
cloud.clientRSyncIncludes.add("experiments/target");
cloud.fannedRSyncIncludes.add("smalldata");
// In this variant the JDK location is a dedicated field rather than an rsync include entry.
cloud.jdk = "../libs/jdk";
// Both option strings are split on single spaces before being handed to start().
String java = "-ea -Xmx120G -Dh2o.debug";
String node = "-mainClass " + MnistDist16x.UserCode.class.getName() + " -beta";
String node = "-mainClass " + UserCode.class.getName() + " -beta";
cloud.start(java.split(" "), node.split(" "));
}

Expand All @@ -29,22 +28,25 @@ public static class UserCode {
public static void userMain(String[] args) throws Exception {
H2O.main(args);

Log.info("blah: " + System.getProperty("java.home"));
Log.info("java: " + System.getProperty("java.home"));

TestUtil.stall_till_cloudsize(LEN);
//Sample08_DeepNeuralNet_EC2.run();
//Sample07_NeuralNet_Mnist8m.run();
//Sample07_NeuralNet_Mnist.run();
//new Sample07_NeuralNetLowLevel().run();

File f = new File("smalldata/mnist/train.csv.gz");
Key dest = Key.make("train.hex");
Key fkey = NFSFileVec.make(f);
ParseDataset2.parse(dest, new Key[] { fkey });
Key fkey = NFSFileVec.make(new File("/home/0xdiag/home-0xdiag-datasets/mnist/mnist8m.csv"));
Key mnist8m = Key.make("mnist8m.csv");
Frame frame = ParseDataset2.parse(mnist8m, new Key[] { fkey });

f = new File("smalldata/mnist/test.csv.gz");
dest = Key.make("test.hex");
fkey = NFSFileVec.make(f);
ParseDataset2.parse(dest, new Key[] { fkey });
Vec response = frame.vecs()[0];
Vec[] vecs = Utils.remove(frame.vecs(), 0);
Key train = Key.make("train.hex");
UKV.put(train, new Frame(frame.names(), Utils.append(vecs, response)));

Key dest = Key.make("test.hex");
Key ftest = NFSFileVec.make(new File("smalldata/mnist/test.csv.gz"));
ParseDataset2.parse(dest, new Key[] { ftest });

// Basic visualization of images and weights
// JFrame frame = new JFrame("H2O");
Expand All @@ -54,6 +56,7 @@ public static void userMain(String[] args) throws Exception {
// frame.pack();
// frame.setLocationRelativeTo(null);
// frame.setVisible(true);
Log.info("Ready");
}
}
}
4 changes: 2 additions & 2 deletions src/main/java/hex/GridSearch.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ public class GridSearch extends Job {

/**
 * Builds a redirect Response that targets GridSearchProgress (by simple class name)
 * and passes along this job's job_key and destination_key.
 *
 * NOTE(review): rendered diff without +/- markers. The two return statements are the
 * before/after of one line; they differ only in the argument names
 * ("job"/"dst_key" versus "job_key"/"destination_key").
 */
@Override public Response redirect() {
String n = GridSearchProgress.class.getSimpleName();
return new Response(Response.Status.redirect, this, -1, -1, n, "job", job_key, "dst_key", destination_key);
return new Response(Response.Status.redirect, this, -1, -1, n, "job_key", job_key, "destination_key", destination_key);
}

public static class GridSearchProgress extends Progress2 {
Expand Down Expand Up @@ -105,7 +105,7 @@ public static class GridSearchProgress extends Progress2 {
sb.append("<td>").append(speed).append("</td>");

String link = info._job.destination_key.toString();
if( info._job.start_time != 0 ) {
if( info._job.start_time != 0 && DKV.get(info._job.destination_key) != null ) {
if( info._model instanceof GBMModel )
link = GBMModelView.link(link, info._job.destination_key);
else if( info._model instanceof NeuralNetModel )
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/hex/KMeans.java
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ private void run(KMeansModel res, ValueArray va, int k, Initialization init, int
sampler._probability = k * 3; // Over-sampling
sampler._seed = res._randSeed;
sampler.invoke(va._key);
clusters = DRemoteTask.merge(clusters, sampler._clust2);
clusters = Utils.append(clusters, sampler._clust2);

if( cancelled() ) {
remove();
Expand Down Expand Up @@ -210,7 +210,7 @@ public static class Sampler extends MRTask {

/**
 * Combines another Sampler's _clust2 into this one: if this task has no _clust2 yet
 * it takes the other task's array as-is, otherwise the two arrays are combined.
 *
 * NOTE(review): rendered diff without +/- markers. The two assignments are the
 * before/after of one line; the combining call changes from merge(...) to
 * Utils.append(...). Presumably both concatenate the arrays; confirm against Utils.
 */
@Override public void reduce(DRemoteTask rt) {
Sampler task = (Sampler) rt;
_clust2 = _clust2 == null ? task._clust2 : merge(_clust2, task._clust2);
_clust2 = _clust2 == null ? task._clust2 : Utils.append(_clust2, task._clust2);
}
}

Expand Down
97 changes: 58 additions & 39 deletions src/main/java/hex/KMeans2.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import water.*;
import water.Job.ColumnsJob;
import water.api.*;
import water.fvec.*;
import water.fvec.Chunk;
import water.fvec.Vec;
import water.util.Utils;

/**
Expand Down Expand Up @@ -49,14 +50,13 @@ public KMeans2() {
for( int i = 0; i < cols.length; i++ )
names[i] = source._names[cols[i]];
Vec[] vecs = selectVecs(source);
Frame frame = new Frame(names, vecs);
// Fill-in response based on K
Vec response = frame.anyVec().makeZero();
response._domain = new String[k];
for( int i = 0; i < response._domain.length; i++ )
response._domain[i] = "Cluster " + i;
frame.add("response", response);
KMeans2Model model = new KMeans2Model(destination_key, sourceKey, frame);
String[] domain = new String[k];
for( int i = 0; i < domain.length; i++ )
domain[i] = "Cluster " + i;
String[] namesResp = Utils.append(names, "response");
String[][] domaiResp = (String[][]) Utils.append(source.domains(), (Object) domain);
KMeans2Model model = new KMeans2Model(destination_key, sourceKey, namesResp, domaiResp);

double[] subs = null, muls = null;
if( normalize ) {
Expand Down Expand Up @@ -101,7 +101,7 @@ public KMeans2() {
sampler._subs = subs;
sampler._muls = muls;
sampler.doAll(vecs);
clusters = DRemoteTask.merge(clusters, sampler._sampled);
clusters = Utils.append(clusters, sampler._sampled);

if( cancelled() )
return;
Expand All @@ -120,15 +120,11 @@ public KMeans2() {
task._subs = subs;
task._muls = muls;
task.doAll(vecs);
for( int cluster = 0; cluster < clusters.length; cluster++ ) {
if( task._counts[cluster] > 0 ) {
for( int vec = 0; vec < vecs.length; vec++ ) {
double value = task._sums[cluster][vec] / task._counts[cluster];
clusters[cluster][vec] = value;
}
}
}
model.clusters = normalize ? denormalize(clusters, vecs) : clusters;
model.clusters = normalize ? denormalize(task._means, vecs) : task._means;
for( int clu = 0; clu < task._sigms.length; clu++ )
for( int col = 0; col < task._sigms[clu].length; col++ )
task._sigms[clu][col] = task._sigms[clu][col] / (task._rows[clu] - 1);
model.variances = task._sigms;
model.error = task._sqr;
model.iterations++;
UKV.put(destination_key, model);
Expand All @@ -141,7 +137,9 @@ public KMeans2() {

/**
 * Builds a redirect Response that targets KMeans2Progress (by simple class name)
 * and passes along this job's job_key and destination_key.
 *
 * NOTE(review): rendered diff without +/- markers. The single-line return is the
 * earlier form; the three-line return that follows is its replacement and uses the
 * argument names "job_key"/"destination_key" instead of "job"/"dst_key".
 */
@Override protected Response redirect() {
String n = KMeans2Progress.class.getSimpleName();
return new Response(Response.Status.redirect, this, -1, -1, n, "job", job_key, "dst_key", destination_key);
return new Response(Response.Status.redirect, this, -1, -1, n, //
"job_key", job_key, //
"destination_key", destination_key);
}

public static class KMeans2Progress extends Progress2 {
Expand Down Expand Up @@ -204,7 +202,7 @@ public static class KMeans2Model extends Model implements Progress {
@API(help = "Sum of min square distances")
public double error;

@API(help = "Whether data should be normalized")
@API(help = "Whether data was normalized")
public boolean normalized;

@API(help = "Maximum number of iterations before stopping")
Expand All @@ -213,11 +211,14 @@ public static class KMeans2Model extends Model implements Progress {
@API(help = "Iterations the algorithm ran")
public int iterations;

@API(help = "In-cluster variances")
public double[][] variances;

private transient double[] _subs, _muls; // Normalization
private transient double[][] _normClust;

/**
 * Constructor; forwards all of its arguments unchanged to the superclass constructor.
 *
 * NOTE(review): rendered diff without +/- markers. The first signature/super call
 * (taking a Frame) is the earlier form; the second (taking column names and
 * per-column domains) is its replacement.
 */
public KMeans2Model(Key selfKey, Key dataKey, Frame fr) {
super(selfKey, dataKey, fr);
public KMeans2Model(Key selfKey, Key dataKey, String names[], String domains[][]) {
super(selfKey, dataKey, names, domains);
}

@Override public float progress() {
Expand Down Expand Up @@ -301,47 +302,65 @@ public static class Sampler extends MRTask2<Sampler> {
}

@Override public void reduce(Sampler other) {
_sampled = DRemoteTask.merge(_sampled, other._sampled);
_sampled = Utils.append(_sampled, other._sampled);
}
}

/**
 * Map/reduce task for one Lloyd's iteration: map() assigns every row of a chunk to
 * its closest cluster and accumulates per-cluster statistics; reduce() combines the
 * statistics of two tasks.
 *
 * NOTE(review): this region is a rendered diff without +/- markers, so removed and
 * added lines appear back to back and the text is not compilable as shown. The lines
 * working with _sums/_counts appear to be the removed side; the lines working with
 * _means/_sigms/_rows appear to be their replacements.
 */
public static class Lloyds extends MRTask2<Lloyds> {
// IN
double[][] _clusters;
double[] _subs, _muls; // Normalization
double[] _subs, _muls; // Normalization

// OUT
double[][] _sums; // Sum of (normalized) features in each cluster
int[] _counts; // Count of rows in cluster
double _sqr; // Total sqr distance
double[][] _means, _sigms; // Means and sigma for each cluster
long[] _rows; // Rows per cluster
double _sqr; // Total sqr distance

/**
 * Processes one chunk set: finds the closest cluster per row, then (in the
 * _means/_sigms variant) computes chunk-local per-cluster means and, in a second
 * pass over the same rows, per-cluster sums of squared deviations from those means.
 */
@Override public void map(Chunk[] cs) {
double[] values = new double[_clusters[0].length];
_sums = new double[_clusters.length][values.length];
_counts = new int[_clusters.length];
ClusterDist cd = new ClusterDist();
// Output arrays are indexed [cluster][column]; _rows is indexed by cluster.
_means = new double[_clusters.length][_clusters[0].length];
_sigms = new double[_clusters.length][_clusters[0].length];
_rows = new long[_clusters.length];

// Find closest cluster for each row
double[] values = new double[_clusters[0].length];
ClusterDist cd = new ClusterDist();
// Remembers each row's assignment so the second pass does not call closest() again.
int[] clusters = new int[cs[0]._len];
for( int row = 0; row < cs[0]._len; row++ ) {
data(values, cs, row, _subs, _muls);
closest(_clusters, values, cd);
int cluster = cd._cluster;
int clu = clusters[row] = cd._cluster;
// NOTE(review): the distance is added before the -1 check below, so rows that
// are then skipped still contribute cd._dist to _sqr — confirm this is intended.
_sqr += cd._dist;
if( cluster == -1 )
if( clu == -1 )
continue; // Ignore broken row

// Add values and increment counter for chosen cluster
Utils.add(_sums[cluster], values);
_counts[cluster]++;
for( int col = 0; col < values.length; col++ )
_means[clu][col] += values[col];
_rows[clu]++;
}
// Turn the per-cluster sums into chunk-local means.
// NOTE(review): a cluster that received no rows in this chunk has _rows[clu] == 0,
// so this floating-point division yields NaN (0.0 / 0) for its means — confirm that
// the merge in reduce() tolerates that.
for( int clu = 0; clu < _means.length; clu++ )
for( int col = 0; col < _means[clu].length; col++ )
_means[clu][col] /= _rows[clu];
// Second pass for in-cluster variances
for( int row = 0; row < cs[0]._len; row++ ) {
int clu = clusters[row];
if( clu == -1 )
continue;
data(values, cs, row, _subs, _muls);
for( int col = 0; col < values.length; col++ ) {
double delta = values[col] - _means[clu][col];
// Accumulates squared deviations only; no division by the row count happens here.
_sigms[clu][col] += delta * delta;
}
}
// Inputs are cleared once the chunk is processed.
// Presumably so they are not sent back with the results — TODO confirm.
_clusters = null;
_subs = _muls = null;
}

/**
 * Combines another task's results into this one. The earlier form adds the sums,
 * counts and total squared distance element-wise; the replacement delegates the
 * per-cluster merge of means and squared deviations to Layer.Stats.reduce and then
 * adds the row counts and the total squared distance.
 */
@Override public void reduce(Lloyds other) {
Utils.add(_sums, other._sums);
Utils.add(_counts, other._counts);
_sqr += other._sqr;
@Override public void reduce(Lloyds mr) {
// Row counts are passed in before they are summed below, so the helper sees each
// side's own count. NOTE(review): Layer.Stats.reduce is not visible here — confirm
// it handles a side whose row count is 0.
for( int clu = 0; clu < _means.length; clu++ )
Layer.Stats.reduce(_means[clu], _sigms[clu], _rows[clu], mr._means[clu], mr._sigms[clu], mr._rows[clu]);
Utils.add(_rows, mr._rows);
_sqr += mr._sqr;
}
}

Expand Down
83 changes: 0 additions & 83 deletions src/main/java/hex/KMeansGrid.java

This file was deleted.

Loading

0 comments on commit 2fe96a8

Please sign in to comment.