Add isFloat test to vecs
Also drop "sum" on vecs (should have recursive mean calc instead)
GBM.Hist now takes in the response column as well
cliffclick committed Jun 23, 2013
1 parent c959cd2 commit bd01401
Showing 17 changed files with 282 additions and 30 deletions.
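
The "recursive mean calc" mentioned in the commit message is the running (Welford-style) mean/variance update from the John D. Cook reference cited in the new Histogram code below. As a minimal, hedged sketch of that update rule — illustrative only, not code from this commit:

// Minimal sketch of a recursive (Welford) mean/variance accumulator.
// Illustrative only; the class and method names are not from this commit.
class RunningStat {
  private long   _n;   // samples seen so far
  private double _m;   // running mean
  private double _s;   // running sum of squared deviations, i.e. variance*(n-1)

  void add(double y) {
    _n++;
    double oldM = _m;
    _m = oldM + (y - oldM) / _n;        // newM = oldM + (y-oldM)/k
    _s = _s + (y - oldM) * (y - _m);    // newS = oldS + (y-oldM)*(y-newM)
  }
  double mean() { return _m; }
  double var()  { return _n > 1 ? _s / (_n - 1) : 0; }
}

This is the same shape as the per-bin update in Histogram.incr() below, and it lets the mean roll up without keeping a raw sum.
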
2 changes: 1 addition & 1 deletion prj.el
@@ -7,7 +7,7 @@
'(jde-run-option-debug nil)
'(jde-run-option-vm-args nil)
'(jde-compile-option-directory "./target/classes")
'(jde-run-option-application-args (quote ("-mainClass" "org.junit.runner.JUnitCore" "water.fvec.NewVectorTest" "water.fvec.NATest" "water.fvec.FVecTest" "water.fvec.GBMTest")))
'(jde-run-option-application-args (quote ("-mainClass" "org.junit.runner.JUnitCore" "water.fvec.NewVectorTest" "water.fvec.NATest" "water.fvec.FVecTest" "hex.GBMTest")))
'(jde-debugger (quote ("JDEbug")))
'(jde-compile-option-source (quote ("1.6")))
'(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar")))
230 changes: 230 additions & 0 deletions src/main/java/hex/#GBM.java#
@@ -0,0 +1,230 @@
package hex;

import java.util.Arrays;
import java.util.Formatter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
import water.util.Log;

public class GBM extends Job {
public static final String KEY_PREFIX = "__GBMModel_";

public static final Key makeKey() { return Key.make(KEY_PREFIX + Key.make()); }
private GBM(Key dest, Frame fr) { super("GBM "+fr, dest); }
// Called from a non-FJ thread; makes a GBM and hands it over to FJ threads
public static GBM start(Key dest, final Frame fr) {
final GBM job = new GBM(dest, fr);
H2O.submitTask(job.start(new H2OCountedCompleter() {
@Override public void compute2() { job.run(fr); tryComplete(); }
}));
return job;
}

// ==========================================================================

// Compute a GBM tree.

// Start by splitting all the data according to some criteria (minimize
// variance at the leaves). Record on each row which split it goes to, and
// assign a split number to it (for next pass). On *this* pass, use the
// split-number to build a per-split histogram, with a per-histogram-bucket
// variance.

int _splitLevel; // Tree split level.

// Number of active splits at this level
int _numSplits;

// A histogram; one per split and each one has Histogram.BINS bins.
Histogram _hists[];

// Compute a single GBM tree
private void run(Frame fr) {
// Initially setup as-if an empty-split had just happened
_numSplits = 1;
final int ncols = fr._vecs.length; // Last column is the response column
double mins[] = new double[ncols], maxs[] = new double[ncols];
for( int i=0; i<ncols; i++ ) {
mins[i] = fr._vecs[i].min();
maxs[i] = fr._vecs[i].max();
}
Histogram hist = new Histogram(fr._names,fr._vecs[0].length(), mins, maxs);
Log.unwrap(System.out,hist.toString()+"\n");

double[] ds = new double[ncols];
for( int j=0; j<fr._vecs[0].length(); j++ ) {
for( int i=0; i<ncols; i++ )
ds[i] = fr._vecs[i].at(j);
//Log.unwrap(System.out,Arrays.toString(ds));
hist.incr(ds);
//Log.unwrap(System.out,hist.toString()+"\n");
}

Log.unwrap(System.out,hist.toString()+"\n");
StringBuilder sb = new StringBuilder();
for( int i=0; i<ncols-1; i++ )
sb.append(i).append("=").append(hist.score(i)).append(" ");
Log.unwrap(System.out,sb.toString());
Log.unwrap(System.out,"Best split is column "+hist.bestSplit());

//while( true ) {
//
// // Build an array of histograms, one histogram per split.
// // The histogram array is "ragged" - we use smaller arrays
// // to avoid having too few elements per bin.
// _hists = new Histogram[_numRowsPerSplit];
// for( int i=0; i<_numRowsPerSplit; i++ )
// _hists[i] = new Histogram(_numRowsPerSplit[i],x);
//
//
//}
}

// --------------------------------------------------------------------------
// A Histogram over a particular Split. The histogram runs from min to max
// for each column (i.e., we actually make #cols histograms in parallel), and
// is given the number of elements that will land in some bin (for small
// enough elements, we make fewer bins). Each column's range is independent
// and recomputed at each split/histogram
private static class Histogram extends Iced {
public static final int BINS=4;
transient final String[] _names; // Column names
public final double[] _steps; // Linear interpolation step per bin
public final long [][] _bins; // Bins by column, then bin
public final double[][] _Ms; // Rolling mean, per-column-per-bin
public final double[][] _Ss; // Rolling var , per-column-per-bin
public final double[][] _mins, _maxs; // Per-column-per-bin min/max

public Histogram( String[] names, long nelems, double[] mins, double[] maxs ) {
assert nelems >= 0;
_names = names;
int nbins = Math.max((int)Math.min(BINS,nelems),1);
int ncols = mins.length-1; // Last column is the response column
assert maxs[ncols] > mins[ncols] : "Caller ensures max>min, since if max==min the column is all constants";
_steps= new double[ncols];
_bins = new long [ncols][nbins]; // Counts per bin
_Ms = new double[ncols][nbins]; // Rolling bin mean
_Ss = new double[ncols][nbins]; // Rolling bin Variance*(cnt-1)
_mins = new double[ncols][nbins]; // Rolling min per-bin
_maxs = new double[ncols][nbins]; // Rolling max per-bin
for( int i=0; i<ncols; i++ ) {
assert maxs[i] > mins[i] : "Caller ensures max>min, since if max==min the column is all constants";
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.




_steps[i] = (maxs[i]-mins[i])/nbins; // Step size for linear interpolation
for( int j=0; j<nbins; j++ ) { // Set bad bounds for min/max
_mins[i][j] = Double.MAX_VALUE;
_maxs[i][j] = -Double.MAX_VALUE;
}
_mins[i][ 0] = mins[i]; // Know better bounds for whole column min/max
_maxs[i][nbins-1] = maxs[i];
}
}

// Add 1 count to bin specified by double, for all doubles in a row.
// Simple linear interpolation to specify bin. Last column is response
// variable; also add to the variance per-bin using the recursive strategy.
// http://www.johndcook.com/standard_deviation.html
void incr( double[] ds ) {
assert _bins.length == ds.length-1;
double y = ds[ds.length-1];
int nbins = _bins[0].length;
for( int i=0; i<ds.length-1; i++ ) {
double d = ds[i];
int idx = _steps[i] <= 0.0 ? 0 : (int)((d-_mins[i][0])/_steps[i]);
int idx2 = Math.max(Math.min(idx,nbins-1),0); // saturate at bounds
_bins[i][idx2]++;
// Track actual lower/upper bound per-bin
if( d < _mins[i][idx2] ) _mins[i][idx2] = d;
if( d > _maxs[i][idx2] ) _maxs[i][idx2] = d;
// Recursive mean & variance
// http://www.johndcook.com/standard_deviation.html
long k = _bins[i][idx2];
double oldM = _Ms[i][idx2], newM = oldM + (y-oldM)/k;
double oldS = _Ss[i][idx2], newS = oldS + (y-oldM)*(y-newM);
_Ms[i][idx2] = newM;
_Ss[i][idx2] = newS;
}
}
double mean( int col, int bin ) { return _Ms[col][bin]; }
double var ( int col, int bin ) {
return _bins[col][bin] > 1 ? _Ss[col][bin]/(_bins[col][bin]-1) : 0;
}

// Compute a "score" for a column; lower score "wins" (is a better split).
// Score is related to variance; a lower variance is better. For now
// return the sum of variance across the column, divided by the mean.
// Dividing normalizes the column to other columns.
double score( int col ) {
double sum = 0;
int ncols = _bins[0].length;
for( int i=0; i<ncols; i++ ) {
double m = mean(col,i);
double x = m==0.0 ? 0 : var(col,i)/m;
sum += x;
}
return sum;
}

// Find the column with the best split (lowest score)
int bestSplit() {
double bs = Double.MAX_VALUE;
int idx = -1;
int ncols = _bins.length;
for( int i=0; i<ncols; i++ ) {
double s = score(i);
if( s < bs ) { bs = s; idx = i; }
}
return idx;
}

// Pretty-print a histogram
@Override public String toString() {
final String colPad=" ";
final int cntW=4, mmmW=4, varW=4;
final int colW=cntW+1+mmmW+1+mmmW+1+mmmW+1+varW;
StringBuilder sb = new StringBuilder();
for( int j=0; j<_bins.length; j++ )
p(sb,_names[j],colW).append(colPad);
sb.append('\n');
for( int j=0; j<_bins.length; j++ ) {
p(sb,"cnt" ,cntW).append('/');
p(sb,"min" ,mmmW).append('/');
p(sb,"max" ,mmmW).append('/');
p(sb,"mean",mmmW).append('/');
p(sb,"var" ,varW).append(colPad);
}
sb.append('\n');
Formatter f = new Formatter(sb);
for( int i=0; i<_bins[0].length; i++ ) {
for( int j=0; j<_bins.length; j++ ) {
p(sb,Long.toString(_bins[j][i]),cntW).append('/');
p(sb, _mins[j][i] ,mmmW).append('/');
p(sb, _maxs[j][i] ,mmmW).append('/');
p(sb, mean(j, i) ,mmmW).append('/');
p(sb, var (j, i) ,varW).append(colPad);
}
sb.append('\n');
}
return sb.toString();
}
static private StringBuilder p(StringBuilder sb, String s, int w) {
return sb.append(Log.fixedLength(s,w));
}
static private StringBuilder p(StringBuilder sb, double d, int w) {
String s = Double.isNaN(d) ? "NaN" :
((d==Double.MAX_VALUE || d==-Double.MAX_VALUE) ? " -" :
Double.toString(d));
if( s.length() <= w ) return p(sb,s,w);
s = String.format("%4.1f",d);
if( s.length() > w )
s = String.format("%4.0f",d);
return sb.append(s);
}
}
}
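
To summarize the flow of run() and Histogram above in one self-contained place, here is a hedged sketch of the same idea: bin each predictor column by linear interpolation over [min,max], keep a running mean/variance of the response per bin, score a column as the sum of var/mean over its bins, and pick the lowest score. Class and field names here (SplitScorer, etc.) are illustrative and not part of the H2O sources above.

// Illustrative sketch only -- a condensed, standalone version of the
// histogram/score/bestSplit logic above, not H2O code.
class SplitScorer {
  static final int BINS = 4;
  final double[]   _min, _step;   // per predictor column
  final long[][]   _cnt;          // [col][bin] counts
  final double[][] _m, _s;        // [col][bin] running mean and S (= var*(cnt-1)) of the response

  SplitScorer(double[] mins, double[] maxs) {
    int ncols = mins.length;
    _min  = mins.clone();
    _step = new double[ncols];
    _cnt  = new long  [ncols][BINS];
    _m    = new double[ncols][BINS];
    _s    = new double[ncols][BINS];
    for (int i = 0; i < ncols; i++) _step[i] = (maxs[i] - mins[i]) / BINS;
  }

  // row holds the predictor values, y the response for one observation
  void add(double[] row, double y) {
    for (int i = 0; i < row.length; i++) {
      int b = _step[i] <= 0 ? 0 : (int) ((row[i] - _min[i]) / _step[i]);
      b = Math.max(Math.min(b, BINS - 1), 0);                 // saturate at the edges
      long k = ++_cnt[i][b];
      double oldM = _m[i][b], newM = oldM + (y - oldM) / k;   // Welford update, as sketched earlier
      _s[i][b] += (y - oldM) * (y - newM);
      _m[i][b]  = newM;
    }
  }

  double score(int col) {                                     // lower is better
    double sum = 0;
    for (int b = 0; b < BINS; b++) {
      double mean = _m[col][b];
      double var  = _cnt[col][b] > 1 ? _s[col][b] / (_cnt[col][b] - 1) : 0;
      sum += (mean == 0) ? 0 : var / mean;
    }
    return sum;
  }

  int bestSplit() {
    int best = -1; double bs = Double.MAX_VALUE;
    for (int c = 0; c < _cnt.length; c++) {
      double s = score(c);
      if (s < bs) { bs = s; best = c; }
    }
    return best;
  }
}

One design note visible in the code above: the real Histogram also tracks per-bin min/max, so a later pass can re-bin each split over tighter, recomputed ranges.
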
20 changes: 14 additions & 6 deletions src/main/java/hex/GBM.java
@@ -43,7 +43,7 @@ public static GBM start(Key dest, final Frame fr) {
private void run(Frame fr) {
// Initially setup as-if an empty-split had just happened
_numSplits = 1;
final int ncols = fr._vecs.length-1; // Last column is the response column
final int ncols = fr._vecs.length; // Last column is the response column
double mins[] = new double[ncols], maxs[] = new double[ncols];
for( int i=0; i<ncols; i++ ) {
mins[i] = fr._vecs[i].min();
@@ -52,9 +52,9 @@ private void run(Frame fr) {
Histogram hist = new Histogram(fr._names,fr._vecs[0].length(), mins, maxs);
Log.unwrap(System.out,hist.toString()+"\n");

double[] ds = new double[ncols+1];
double[] ds = new double[ncols];
for( int j=0; j<fr._vecs[0].length(); j++ ) {
for( int i=0; i<ncols+1; i++ )
for( int i=0; i<ncols; i++ )
ds[i] = fr._vecs[i].at(j);
//Log.unwrap(System.out,Arrays.toString(ds));
hist.incr(ds);
@@ -63,7 +63,7 @@ private void run(Frame fr) {

Log.unwrap(System.out,hist.toString()+"\n");
StringBuilder sb = new StringBuilder();
for( int i=0; i<ncols; i++ )
for( int i=0; i<ncols-1; i++ )
sb.append(i).append("=").append(hist.score(i)).append(" ");
Log.unwrap(System.out,sb.toString());
Log.unwrap(System.out,"Best split is column "+hist.bestSplit());
@@ -81,6 +81,7 @@ private void run(Frame fr) {
//}
}

// --------------------------------------------------------------------------
// A Histogram over a particular Split. The histogram runs from min to max
// for each column (i.e., we actually make #cols histograms in parallel), and
// is given the number of elements that will land in some bin (for small
@@ -99,15 +100,22 @@ public Histogram( String[] names, long nelems, double[] mins, double[] maxs ) {
assert nelems >= 0;
_names = names;
int nbins = Math.max((int)Math.min(BINS,nelems),1);
int ncols = mins.length;
int ncols = mins.length-1; // Last column is the response column
assert maxs[ncols] > mins[ncols] : "Caller ensures max>min, since if max==min the column is all constants";
_steps= new double[ncols];
_bins = new long [ncols][nbins]; // Counts per bin
_Ms = new double[ncols][nbins]; // Rolling bin mean
_Ss = new double[ncols][nbins]; // Rolling bin Variance*(cnt-1)
_mins = new double[ncols][nbins]; // Rolling min per-bin
_maxs = new double[ncols][nbins]; // Rolling max per-bin
for( int i=0; i<ncols; i++ ) {
assert maxs[i] >= mins[i];
assert maxs[i] > mins[i] : "Caller ensures max>min, since if max==min the column is all constants";
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.




_steps[i] = (maxs[i]-mins[i])/nbins; // Step size for linear interpolation
for( int j=0; j<nbins; j++ ) { // Set bad bounds for min/max
_mins[i][j] = Double.MAX_VALUE;
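
The net effect of this hunk is the convention change called out in the commit message: the row buffer ds now has one slot per Frame vec, with the response in the last slot, while Histogram allocates bins only for the ncols-1 predictor columns. A tiny illustration with made-up values:

// Illustrative row layout after this change: predictor columns first, response last.
double[] ds = { 5.1, 3.5, 1.4, 0.2,   // predictor values (hypothetical)
                0.0 };                // response, read as ds[ds.length-1] in Histogram.incr()
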
9 changes: 5 additions & 4 deletions src/main/java/water/fvec/AppendableVec.java
@@ -16,23 +16,24 @@
// other Vec type. NEW Vectors do NOT support reads!
public class AppendableVec extends Vec {
long _espc[];
boolean _hasFloat; // True if we found a float chunk

AppendableVec( Key key ) {
super(key,null,Double.MAX_VALUE,Double.MIN_VALUE,0);
super(key,null,false,Double.MAX_VALUE,Double.MIN_VALUE);
_espc = new long[4];
}

// A NewVector chunk was "closed" - completed. Add its info to the roll-up.
// This call is made in parallel across all node-local created chunks, but is
// not called distributed.
synchronized void closeChunk( int cidx, int len, double min, double max, double sum ) {
synchronized void closeChunk( int cidx, int len, boolean hasFloat, double min, double max ) {
while( cidx >= _espc.length )
_espc = Arrays.copyOf(_espc,_espc.length<<1);
_espc[cidx] = len;
_hasFloat |= hasFloat;
// Roll-up totals for each chunk as it closes
if( min < _min ) _min = min;
if( max > _max ) _max = max;
_sum += sum;
}

// Class 'reduce' call on new vectors; to combine the roll-up info.
@@ -71,7 +72,7 @@ public Vec close() {
}
espc[nchunk]=x; // Total element count in last
// Replacement plain Vec for AppendableVec.
Vec vec = new Vec(_key,espc,_min,_max,_sum);
Vec vec = new Vec(_key,espc,!_hasFloat,_min,_max);
DKV.put(_key,vec); // Inject the header
return vec;
}
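
The diff above swaps the dropped _sum roll-up for a _hasFloat flag: each chunk reports whether it held fractional values when it closes, the appendable vec ORs those reports together, and the finished Vec is then built with !_hasFloat (presumably an "is integer" flag) in place of the old sum. A hedged sketch of that roll-up pattern, with illustrative names only:

// Sketch of the hasFloat roll-up (illustrative names, not the H2O API).
class VecBuilder {
  private boolean _hasFloat;                      // true once any chunk saw a fractional value
  private double  _min =  Double.MAX_VALUE;
  private double  _max = -Double.MAX_VALUE;

  // Mirrors AppendableVec.closeChunk above: called once per closing chunk.
  synchronized void closeChunk(boolean chunkHasFloat, double min, double max) {
    _hasFloat |= chunkHasFloat;
    if (min < _min) _min = min;
    if (max > _max) _max = max;
  }

  // The finished vector can then advertise integer-only storage,
  // analogous to new Vec(_key, espc, !_hasFloat, _min, _max) above.
  boolean isInt() { return !_hasFloat; }
}
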
2 changes: 1 addition & 1 deletion src/main/java/water/fvec/ByteVec.java
@@ -8,7 +8,7 @@
// A vector of plain Bytes.
public class ByteVec extends Vec {

ByteVec( Key key, long espc[] ) { super(key,espc,Double.NaN,Double.NaN,Double.NaN); }
ByteVec( Key key, long espc[] ) { super(key,espc,true,Double.NaN,Double.NaN); }

public C1Chunk elem2BV( int cidx ) { return (C1Chunk)super.elem2BV(cidx); }

1 change: 1 addition & 0 deletions src/main/java/water/fvec/C1Chunk.java
@@ -16,6 +16,7 @@ public class C1Chunk extends Chunk {
return (res == _NA)?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C1Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C1SChunk.java
@@ -22,6 +22,7 @@ public class C1SChunk extends Chunk {
return (res == _NA)?_vec._fNA:(res+_bias)*_scale;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return _scale < 1.0; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C1SChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
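
For the scaled chunks (C1SChunk here and C2SChunk below), hasFloat() returns _scale < 1.0: values are decoded as (stored + bias) * _scale, so a scale below one is what introduces fractional results. A tiny illustration under that reading, with made-up values:

// Illustration only: how a scaled chunk's decode step implies hasFloat().
public class ScaledChunkDemo {
  public static void main(String[] args) {
    long   stored = 123;                      // compactly stored integer
    double bias   = 0.0, scale = 0.01;        // two decimal places folded into the encoding
    double value  = (stored + bias) * scale;  // decodes to 1.23
    boolean hasFloat = scale < 1.0;           // true: this column needs floating point
    System.out.println(value + " hasFloat=" + hasFloat);
  }
}
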
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C2Chunk.java
@@ -16,6 +16,7 @@ public class C2Chunk extends Chunk {
return res == _NA?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C2Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C2SChunk.java
@@ -22,6 +22,7 @@ public class C2SChunk extends Chunk {
return (res == _NA)?_vec._fNA:(res + _bias)*_scale;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return _scale < 1.0; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C2SChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C4Chunk.java
@@ -15,6 +15,7 @@ public class C4Chunk extends Chunk {
return res == _NA?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C4Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C4FChunk.java
@@ -14,6 +14,7 @@ public class C4FChunk extends Chunk {
return Float.isNaN(res)?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return true; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C4FChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
(Remaining changed files in this commit are not shown.)
