Add isFloat test to vecs
Also drop "sum" on vecs (should have recursive mean calc instead)
GBM.Hist now takes in the response column as well
cliffclick committed Jun 23, 2013
1 parent c959cd2 commit bd01401
Showing 17 changed files with 282 additions and 30 deletions.
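
The "recursive mean calc" mentioned in the commit message is the running (Welford-style) mean/variance update from the John D. Cook reference cited in the new Histogram code below. As a minimal, hedged sketch of that update rule — illustrative only, not code from this commit:

// Minimal sketch of a recursive (Welford) mean/variance accumulator.
// Illustrative only; the class and method names are not from this commit.
class RunningStat {
  private long   _n;   // samples seen so far
  private double _m;   // running mean
  private double _s;   // running sum of squared deviations, i.e. variance*(n-1)

  void add(double y) {
    _n++;
    double oldM = _m;
    _m = oldM + (y - oldM) / _n;        // newM = oldM + (y-oldM)/k
    _s = _s + (y - oldM) * (y - _m);    // newS = oldS + (y-oldM)*(y-newM)
  }
  double mean() { return _m; }
  double var()  { return _n > 1 ? _s / (_n - 1) : 0; }
}

This is the same shape as the per-bin update in Histogram.incr() below, and it lets the mean roll up without keeping a raw sum.
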
2 changes: 1 addition & 1 deletion prj.el
@@ -7,7 +7,7 @@
'(jde-run-option-debug nil)
'(jde-run-option-vm-args nil)
'(jde-compile-option-directory "./target/classes")
'(jde-run-option-application-args (quote ("-mainClass" "org.junit.runner.JUnitCore" "water.fvec.NewVectorTest" "water.fvec.NATest" "water.fvec.FVecTest" "water.fvec.GBMTest")))
'(jde-run-option-application-args (quote ("-mainClass" "org.junit.runner.JUnitCore" "water.fvec.NewVectorTest" "water.fvec.NATest" "water.fvec.FVecTest" "hex.GBMTest")))
'(jde-debugger (quote ("JDEbug")))
'(jde-compile-option-source (quote ("1.6")))
'(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar")))
230 changes: 230 additions & 0 deletions src/main/java/hex/#GBM.java#
@@ -0,0 +1,230 @@
package hex;

import java.util.Arrays;
import java.util.Formatter;
import water.*;
import water.H2O.H2OCountedCompleter;
import water.fvec.*;
import water.util.Log;

public class GBM extends Job {
public static final String KEY_PREFIX = "__GBMModel_";

public static final Key makeKey() { return Key.make(KEY_PREFIX + Key.make()); }
private GBM(Key dest, Frame fr) { super("GBM "+fr, dest); }
// Called from a non-FJ thread; makes a GBM and hands it over to FJ threads
public static GBM start(Key dest, final Frame fr) {
final GBM job = new GBM(dest, fr);
H2O.submitTask(job.start(new H2OCountedCompleter() {
@Override public void compute2() { job.run(fr); tryComplete(); }
}));
return job;
}

// ==========================================================================

// Compute a GBM tree.

// Start by splitting all the data according to some criteria (minimize
// variance at the leaves). Record on each row which split it goes to, and
// assign a split number to it (for next pass). On *this* pass, use the
// split-number to build a per-split histogram, with a per-histogram-bucket
// variance.

int _splitLevel; // Tree split level.

// Number of active splits at this level
int _numSplits;

// A histogram; one per split and each one has Histogram.BINS bins.
Histogram _hists[];

// Compute a single GBM tree
private void run(Frame fr) {
// Initially setup as-if an empty-split had just happened
_numSplits = 1;
final int ncols = fr._vecs.length; // Last column is the response column
double mins[] = new double[ncols], maxs[] = new double[ncols];
for( int i=0; i<ncols; i++ ) {
mins[i] = fr._vecs[i].min();
maxs[i] = fr._vecs[i].max();
}
Histogram hist = new Histogram(fr._names,fr._vecs[0].length(), mins, maxs);
Log.unwrap(System.out,hist.toString()+"\n");

double[] ds = new double[ncols];
for( int j=0; j<fr._vecs[0].length(); j++ ) {
for( int i=0; i<ncols; i++ )
ds[i] = fr._vecs[i].at(j);
//Log.unwrap(System.out,Arrays.toString(ds));
hist.incr(ds);
//Log.unwrap(System.out,hist.toString()+"\n");
}

Log.unwrap(System.out,hist.toString()+"\n");
StringBuilder sb = new StringBuilder();
for( int i=0; i<ncols-1; i++ )
sb.append(i).append("=").append(hist.score(i)).append(" ");
Log.unwrap(System.out,sb.toString());
Log.unwrap(System.out,"Best split is column "+hist.bestSplit());

//while( true ) {
//
// // Build an array of histograms, one histogram per split.
// // The histogram array is "ragged" - we use smaller arrays
// // to avoid having too few elements per bin.
// _hists = new Histogram[_numRowsPerSplit];
// for( int i=0; i<_numRowsPerSplit; i++ )
// _hists[i] = new Histogram(_numRowsPerSplit[i],x);
//
//
//}
}

// --------------------------------------------------------------------------
// A Histogram over a particular Split. The histogram runs from min to max
// for each column (i.e., we actually make #cols histograms in parallel), and
// is given the number of elements that will land in some bin (for small
// enough elements, we make fewer bins). Each column's range is independent
// and recomputed at each split/histogram
private static class Histogram extends Iced {
public static final int BINS=4;
transient final String[] _names; // Column names
public final double[] _steps; // Linear interpolation step per bin
public final long [][] _bins; // Bins by column, then bin
public final double[][] _Ms; // Rolling mean, per-column-per-bin
public final double[][] _Ss; // Rolling var , per-column-per-bin
public final double[][] _mins, _maxs; // Per-column-per-bin min/max

public Histogram( String[] names, long nelems, double[] mins, double[] maxs ) {
assert nelems >= 0;
_names = names;
int nbins = Math.max((int)Math.min(BINS,nelems),1);
int ncols = mins.length-1; // Last column is the response column
assert maxs[ncols] > mins[ncols] : "Caller ensures max>min, since if max==min the column is all constants";
_steps= new double[ncols];
_bins = new long [ncols][nbins]; // Counts per bin
_Ms = new double[ncols][nbins]; // Rolling bin mean
_Ss = new double[ncols][nbins]; // Rolling bin Variance*(cnt-1)
_mins = new double[ncols][nbins]; // Rolling min per-bin
_maxs = new double[ncols][nbins]; // Rolling max per-bin
for( int i=0; i<ncols; i++ ) {
assert maxs[i] > mins[i] : "Caller ensures max>min, since if max==min the column is all constants";
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.




_steps[i] = (maxs[i]-mins[i])/nbins; // Step size for linear interpolation
for( int j=0; j<nbins; j++ ) { // Set bad bounds for min/max
_mins[i][j] = Double.MAX_VALUE;
_maxs[i][j] = -Double.MAX_VALUE;
}
_mins[i][ 0] = mins[i]; // Know better bounds for whole column min/max
_maxs[i][nbins-1] = maxs[i];
}
}

// Add 1 count to bin specified by double, for all doubles in a row.
// Simple linear interpolation to specify bin. Last column is response
// variable; also add to the variance per-bin using the recursive strategy.
// http://www.johndcook.com/standard_deviation.html
void incr( double[] ds ) {
assert _bins.length == ds.length-1;
double y = ds[ds.length-1];
int nbins = _bins[0].length;
for( int i=0; i<ds.length-1; i++ ) {
double d = ds[i];
int idx = _steps[i] <= 0.0 ? 0 : (int)((d-_mins[i][0])/_steps[i]);
int idx2 = Math.max(Math.min(idx,nbins-1),0); // saturate at bounds
_bins[i][idx2]++;
// Track actual lower/upper bound per-bin
if( d < _mins[i][idx2] ) _mins[i][idx2] = d;
if( d > _maxs[i][idx2] ) _maxs[i][idx2] = d;
// Recursive mean & variance
// http://www.johndcook.com/standard_deviation.html
long k = _bins[i][idx2];
double oldM = _Ms[i][idx2], newM = oldM + (y-oldM)/k;
double oldS = _Ss[i][idx2], newS = oldS + (y-oldM)*(y-newM);
_Ms[i][idx2] = newM;
_Ss[i][idx2] = newS;
}
}
double mean( int col, int bin ) { return _Ms[col][bin]; }
double var ( int col, int bin ) {
return _bins[col][bin] > 1 ? _Ss[col][bin]/(_bins[col][bin]-1) : 0;
}

// Compute a "score" for a column; lower score "wins" (is a better split).
// Score is related to variance; a lower variance is better. For now
// return the sum of variance across the column, divided by the mean.
// Dividing normalizes the column to other columns.
double score( int col ) {
double sum = 0;
int ncols = _bins[0].length;
for( int i=0; i<ncols; i++ ) {
double m = mean(col,i);
double x = m==0.0 ? 0 : var(col,i)/m;
sum += x;
}
return sum;
}

// Find the column with the best split (lowest score)
int bestSplit() {
double bs = Double.MAX_VALUE;
int idx = -1;
int ncols = _bins.length;
for( int i=0; i<ncols; i++ ) {
double s = score(i);
if( s < bs ) { bs = s; idx = i; }
}
return idx;
}

// Pretty-print a histogram
@Override public String toString() {
final String colPad=" ";
final int cntW=4, mmmW=4, varW=4;
final int colW=cntW+1+mmmW+1+mmmW+1+mmmW+1+varW;
StringBuilder sb = new StringBuilder();
for( int j=0; j<_bins.length; j++ )
p(sb,_names[j],colW).append(colPad);
sb.append('\n');
for( int j=0; j<_bins.length; j++ ) {
p(sb,"cnt" ,cntW).append('/');
p(sb,"min" ,mmmW).append('/');
p(sb,"max" ,mmmW).append('/');
p(sb,"mean",mmmW).append('/');
p(sb,"var" ,varW).append(colPad);
}
sb.append('\n');
Formatter f = new Formatter(sb);
for( int i=0; i<_bins[0].length; i++ ) {
for( int j=0; j<_bins.length; j++ ) {
p(sb,Long.toString(_bins[j][i]),cntW).append('/');
p(sb, _mins[j][i] ,mmmW).append('/');
p(sb, _maxs[j][i] ,mmmW).append('/');
p(sb, mean(j, i) ,mmmW).append('/');
p(sb, var (j, i) ,varW).append(colPad);
}
sb.append('\n');
}
return sb.toString();
}
static private StringBuilder p(StringBuilder sb, String s, int w) {
return sb.append(Log.fixedLength(s,w));
}
static private StringBuilder p(StringBuilder sb, double d, int w) {
String s = Double.isNaN(d) ? "NaN" :
((d==Double.MAX_VALUE || d==-Double.MAX_VALUE) ? " -" :
Double.toString(d));
if( s.length() <= w ) return p(sb,s,w);
s = String.format("%4.1f",d);
if( s.length() > w )
s = String.format("%4.0f",d);
return sb.append(s);
}
}
}
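
To summarize the flow of run() and Histogram above in one self-contained place, here is a hedged sketch of the same idea: bin each predictor column by linear interpolation over [min,max], keep a running mean/variance of the response per bin, score a column as the sum of var/mean over its bins, and pick the lowest score. Class and field names here (SplitScorer, etc.) are illustrative and not part of the H2O sources above.

// Illustrative sketch only -- a condensed, standalone version of the
// histogram/score/bestSplit logic above, not H2O code.
class SplitScorer {
  static final int BINS = 4;
  final double[]   _min, _step;   // per predictor column
  final long[][]   _cnt;          // [col][bin] counts
  final double[][] _m, _s;        // [col][bin] running mean and S (= var*(cnt-1)) of the response

  SplitScorer(double[] mins, double[] maxs) {
    int ncols = mins.length;
    _min  = mins.clone();
    _step = new double[ncols];
    _cnt  = new long  [ncols][BINS];
    _m    = new double[ncols][BINS];
    _s    = new double[ncols][BINS];
    for (int i = 0; i < ncols; i++) _step[i] = (maxs[i] - mins[i]) / BINS;
  }

  // row holds the predictor values, y the response for one observation
  void add(double[] row, double y) {
    for (int i = 0; i < row.length; i++) {
      int b = _step[i] <= 0 ? 0 : (int) ((row[i] - _min[i]) / _step[i]);
      b = Math.max(Math.min(b, BINS - 1), 0);                 // saturate at the edges
      long k = ++_cnt[i][b];
      double oldM = _m[i][b], newM = oldM + (y - oldM) / k;   // Welford update, as sketched earlier
      _s[i][b] += (y - oldM) * (y - newM);
      _m[i][b]  = newM;
    }
  }

  double score(int col) {                                     // lower is better
    double sum = 0;
    for (int b = 0; b < BINS; b++) {
      double mean = _m[col][b];
      double var  = _cnt[col][b] > 1 ? _s[col][b] / (_cnt[col][b] - 1) : 0;
      sum += (mean == 0) ? 0 : var / mean;
    }
    return sum;
  }

  int bestSplit() {
    int best = -1; double bs = Double.MAX_VALUE;
    for (int c = 0; c < _cnt.length; c++) {
      double s = score(c);
      if (s < bs) { bs = s; best = c; }
    }
    return best;
  }
}

One design note visible in the code above: the real Histogram also tracks per-bin min/max, so a later pass can re-bin each split over tighter, recomputed ranges.
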
20 changes: 14 additions & 6 deletions src/main/java/hex/GBM.java
@@ -43,7 +43,7 @@ public static GBM start(Key dest, final Frame fr) {
private void run(Frame fr) {
// Initially setup as-if an empty-split had just happened
_numSplits = 1;
final int ncols = fr._vecs.length-1; // Last column is the response column
final int ncols = fr._vecs.length; // Last column is the response column
double mins[] = new double[ncols], maxs[] = new double[ncols];
for( int i=0; i<ncols; i++ ) {
mins[i] = fr._vecs[i].min();
@@ -52,9 +52,9 @@ private void run(Frame fr) {
Histogram hist = new Histogram(fr._names,fr._vecs[0].length(), mins, maxs);
Log.unwrap(System.out,hist.toString()+"\n");

double[] ds = new double[ncols+1];
double[] ds = new double[ncols];
for( int j=0; j<fr._vecs[0].length(); j++ ) {
for( int i=0; i<ncols+1; i++ )
for( int i=0; i<ncols; i++ )
ds[i] = fr._vecs[i].at(j);
//Log.unwrap(System.out,Arrays.toString(ds));
hist.incr(ds);
@@ -63,7 +63,7 @@ private void run(Frame fr) {

Log.unwrap(System.out,hist.toString()+"\n");
StringBuilder sb = new StringBuilder();
for( int i=0; i<ncols; i++ )
for( int i=0; i<ncols-1; i++ )
sb.append(i).append("=").append(hist.score(i)).append(" ");
Log.unwrap(System.out,sb.toString());
Log.unwrap(System.out,"Best split is column "+hist.bestSplit());
@@ -81,6 +81,7 @@ private void run(Frame fr) {
//}
}

// --------------------------------------------------------------------------
// A Histogram over a particular Split. The histogram runs from min to max
// for each column (i.e., we actually make #cols histograms in parallel), and
// is given the number of elements that will land in some bin (for small
@@ -99,15 +100,22 @@ public Histogram( String[] names, long nelems, double[] mins, double[] maxs ) {
assert nelems >= 0;
_names = names;
int nbins = Math.max((int)Math.min(BINS,nelems),1);
int ncols = mins.length;
int ncols = mins.length-1; // Last column is the response column
assert maxs[ncols] > mins[ncols] : "Caller ensures max>min, since if max==min the column is all constants";
_steps= new double[ncols];
_bins = new long [ncols][nbins]; // Counts per bin
_Ms = new double[ncols][nbins]; // Rolling bin mean
_Ss = new double[ncols][nbins]; // Rolling bin Variance*(cnt-1)
_mins = new double[ncols][nbins]; // Rolling min per-bin
_maxs = new double[ncols][nbins]; // Rolling max per-bin
for( int i=0; i<ncols; i++ ) {
assert maxs[i] >= mins[i];
assert maxs[i] > mins[i] : "Caller ensures max>min, since if max==min the column is all constants";
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.




_steps[i] = (maxs[i]-mins[i])/nbins; // Step size for linear interpolation
for( int j=0; j<nbins; j++ ) { // Set bad bounds for min/max
_mins[i][j] = Double.MAX_VALUE;
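
The net effect of this hunk is the convention change called out in the commit message: the row buffer ds now has one slot per Frame vec, with the response in the last slot, while Histogram allocates bins only for the ncols-1 predictor columns. A tiny illustration with made-up values:

// Illustrative row layout after this change: predictor columns first, response last.
double[] ds = { 5.1, 3.5, 1.4, 0.2,   // predictor values (hypothetical)
                0.0 };                // response, read as ds[ds.length-1] in Histogram.incr()
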
9 changes: 5 additions & 4 deletions src/main/java/water/fvec/AppendableVec.java
@@ -16,23 +16,24 @@
// other Vec type. NEW Vectors do NOT support reads!
public class AppendableVec extends Vec {
long _espc[];
boolean _hasFloat; // True if we found a float chunk

AppendableVec( Key key ) {
super(key,null,Double.MAX_VALUE,Double.MIN_VALUE,0);
super(key,null,false,Double.MAX_VALUE,Double.MIN_VALUE);
_espc = new long[4];
}

// A NewVector chunk was "closed" - completed. Add its info to the roll-up.
// This call is made in parallel across all node-local created chunks, but is
// not called distributed.
synchronized void closeChunk( int cidx, int len, double min, double max, double sum ) {
synchronized void closeChunk( int cidx, int len, boolean hasFloat, double min, double max ) {
while( cidx >= _espc.length )
_espc = Arrays.copyOf(_espc,_espc.length<<1);
_espc[cidx] = len;
_hasFloat |= hasFloat;
// Roll-up totals for each chunk as it closes
if( min < _min ) _min = min;
if( max > _max ) _max = max;
_sum += sum;
}

// Class 'reduce' call on new vectors; to combine the roll-up info.
@@ -71,7 +72,7 @@ public Vec close() {
}
espc[nchunk]=x; // Total element count in last
// Replacement plain Vec for AppendableVec.
Vec vec = new Vec(_key,espc,_min,_max,_sum);
Vec vec = new Vec(_key,espc,!_hasFloat,_min,_max);
DKV.put(_key,vec); // Inject the header
return vec;
}
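
The diff above swaps the dropped _sum roll-up for a _hasFloat flag: each chunk reports whether it held fractional values when it closes, the appendable vec ORs those reports together, and the finished Vec is then built with !_hasFloat (presumably an "is integer" flag) in place of the old sum. A hedged sketch of that roll-up pattern, with illustrative names only:

// Sketch of the hasFloat roll-up (illustrative names, not the H2O API).
class VecBuilder {
  private boolean _hasFloat;                      // true once any chunk saw a fractional value
  private double  _min =  Double.MAX_VALUE;
  private double  _max = -Double.MAX_VALUE;

  // Mirrors AppendableVec.closeChunk above: called once per closing chunk.
  synchronized void closeChunk(boolean chunkHasFloat, double min, double max) {
    _hasFloat |= chunkHasFloat;
    if (min < _min) _min = min;
    if (max > _max) _max = max;
  }

  // The finished vector can then advertise integer-only storage,
  // analogous to new Vec(_key, espc, !_hasFloat, _min, _max) above.
  boolean isInt() { return !_hasFloat; }
}
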
2 changes: 1 addition & 1 deletion src/main/java/water/fvec/ByteVec.java
@@ -8,7 +8,7 @@
// A vector of plain Bytes.
public class ByteVec extends Vec {

ByteVec( Key key, long espc[] ) { super(key,espc,Double.NaN,Double.NaN,Double.NaN); }
ByteVec( Key key, long espc[] ) { super(key,espc,true,Double.NaN,Double.NaN); }

public C1Chunk elem2BV( int cidx ) { return (C1Chunk)super.elem2BV(cidx); }

1 change: 1 addition & 0 deletions src/main/java/water/fvec/C1Chunk.java
@@ -16,6 +16,7 @@ public class C1Chunk extends Chunk {
return (res == _NA)?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C1Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C1SChunk.java
@@ -22,6 +22,7 @@ public class C1SChunk extends Chunk {
return (res == _NA)?_vec._fNA:(res+_bias)*_scale;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return _scale < 1.0; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C1SChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
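
For the scaled chunks (C1SChunk here and C2SChunk below), hasFloat() returns _scale < 1.0: values are decoded as (stored + bias) * _scale, so a scale below one is what introduces fractional results. A tiny illustration under that reading, with made-up values:

// Illustration only: how a scaled chunk's decode step implies hasFloat().
public class ScaledChunkDemo {
  public static void main(String[] args) {
    long   stored = 123;                      // compactly stored integer
    double bias   = 0.0, scale = 0.01;        // two decimal places folded into the encoding
    double value  = (stored + bias) * scale;  // decodes to 1.23
    boolean hasFloat = scale < 1.0;           // true: this column needs floating point
    System.out.println(value + " hasFloat=" + hasFloat);
  }
}
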
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C2Chunk.java
@@ -16,6 +16,7 @@ public class C2Chunk extends Chunk {
return res == _NA?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C2Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C2SChunk.java
@@ -22,6 +22,7 @@ public class C2SChunk extends Chunk {
return (res == _NA)?_vec._fNA:(res + _bias)*_scale;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return _scale < 1.0; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C2SChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C4Chunk.java
@@ -15,6 +15,7 @@ public class C4Chunk extends Chunk {
return res == _NA?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return false; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C4Chunk read(AutoBuffer bb) {
_mem = bb.bufClose();
1 change: 1 addition & 0 deletions src/main/java/water/fvec/C4FChunk.java
@@ -14,6 +14,7 @@ public class C4FChunk extends Chunk {
return Float.isNaN(res)?_vec._fNA:res;
}
@Override void append2 ( long l, int exp ) { throw H2O.fail(); }
@Override boolean hasFloat() { return true; }
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem,_mem.length); }
@Override public C4FChunk read(AutoBuffer bb) {
_mem = bb.bufClose();
(Remaining changed files in this commit are not shown.)
