Skip to content

Commit

Permalink
Better caching, more slice smarts
Browse files Browse the repository at this point in the history
Do DeepSlice rollups BEFORE the MR job.
Better handling of printing Frames with no names.
  • Loading branch information
cliffclick committed Apr 3, 2014
1 parent 9f9a926 commit 641ddfb
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 19 deletions.
20 changes: 14 additions & 6 deletions src/main/java/water/fvec/Frame.java
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,9 @@ public Frame add( String name, Vec vec ) {
/** Appends an entire Frame */
public Frame add( Frame fr, String names[] ) {
assert _vecs.length==0 || anyVec().group().equals(fr.anyVec().group()) : "Adding a vector from different vector group. Current frame contains "+Arrays.toString(_names)+ " vectors. New frame contains "+Arrays.toString(fr.names()) + " vectors.";
for( String name : names )
if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame");
if( _names != null && fr._names != null )
for( String name : names )
if( find(name) != -1 ) throw new IllegalArgumentException("Duplicate name '"+name+"' in Frame");
final int len0= _names!=null ? _names.length : 0;
final int len1= names!=null ? names.length : 0;
final int len = len0+len1;
Expand Down Expand Up @@ -660,12 +661,12 @@ else if (ocols instanceof Frame) {
// Do Da Slice
// orows is either a long[] or a Vec
if (orows == null)
return new DeepSlice((long[])orows,c2).doAll(c2.length,this).outputFrame(names(c2),domains(c2));
return new DeepSlice((long[])orows,c2,vecs()).doAll(c2.length,this).outputFrame(names(c2),domains(c2));
else if (orows instanceof long[]) {
final long CHK_ROWS=1000000;
long[] rows = (long[])orows;
if( rows.length==0 || rows[0] < 0 )
return new DeepSlice(rows,c2).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
return new DeepSlice(rows,c2,vecs()).doAll(c2.length, this).outputFrame(names(c2), domains(c2));
// Vec'ize the index array
Futures fs = new Futures();
AppendableVec av = new AppendableVec("rownames");
Expand Down Expand Up @@ -745,8 +746,15 @@ private static class Slice extends MRTask2<Slice> {
private static class DeepSlice extends MRTask2<DeepSlice> {
final int _cols[];
final long _rows[];
final byte _isInt[];
boolean _ex = true;
DeepSlice( long rows[], int cols[]) { _cols=cols; _rows=rows;}
/** Builds a deep-slice task over the given row and column selection.
 *  The integer-ness of every selected column is captured here, once, at
 *  construction time and stored in {@code _isInt}.
 *  @param rows row selection, stored as-is (callers may pass null)
 *  @param cols indices into {@code vecs} of the columns to slice
 *  @param vecs the Vecs the column indices refer to
 */
DeepSlice( long rows[], int cols[], Vec vecs[] ) {
  _cols=cols;
  _rows=rows;
  // map() reads the flag as _isInt[_cols[i]], i.e. keyed by the source-column
  // index.  The table therefore has one slot per source Vec and is filled at
  // the column index itself; keying it by position within cols would make
  // that lookup read the wrong slot or run off the end of the array.
  _isInt = new byte[vecs.length];
  for( int c : cols )
    _isInt[c] = (byte)(vecs[c].isInt() ? 1 : 0);
}
@Override public void map( Chunk chks[], NewChunk nchks[] ) {
long rstart = chks[0]._start;
int rlen = chks[0]._len; // Total row count
Expand Down Expand Up @@ -791,7 +799,7 @@ private static class DeepSlice extends MRTask2<DeepSlice> {
for( int i=0; i<_cols.length; i++ ) {
Chunk oc = chks[_cols[i]];
NewChunk nc = nchks[ i ];
if( oc._vec.isInt() ) { // Slice on integer columns
if( _isInt[_cols[i]] == 1 ) { // Slice on integer columns
for( int j=rlo; j<rhi; j++ )
if( oc.isNA0(j) ) nc.addNA();
else nc.addNum(oc.at80(j),0);
Expand Down
26 changes: 13 additions & 13 deletions src/main/java/water/fvec/Vec.java
Original file line number Diff line number Diff line change
Expand Up @@ -328,7 +328,8 @@ Vec setRollupStats( RollupStats rs ) {
// the rollup in the background. *Always* returns "this".
public Vec rollupStats(Futures fs) {
Vec vthis = DKV.get(_key).get();
if( vthis._naCnt==-2 ) throw new IllegalArgumentException("Cannot ask for roll-up stats while the vector is being actively written.");
if( vthis._naCnt==-2 )
throw new IllegalArgumentException("Cannot ask for roll-up stats while the vector is being actively written.");
if( vthis._naCnt>= 0 ) { // KV store has a better answer
if( vthis == this ) return this;
_min = vthis._min; _max = vthis._max;
Expand Down Expand Up @@ -534,24 +535,23 @@ public Chunk chunkForChunkIdx(int cidx) {
return c;
}
/** The Chunk for a row#. Warning: this loads the data locally! */
public final Chunk chunkForRow(long i) {
return chunkForChunkIdx(elem2ChunkIdx(i));
}
/** Uncached lookup: maps row {@code i} to its chunk index and fetches that Chunk. */
private Chunk chunkForRow_impl(long i) {
  final int cidx = elem2ChunkIdx(i);
  return chunkForChunkIdx(cidx);
}

// Cache of last Chunk accessed via at/set api
transient Chunk _cache;
private Chunk c(long i) {
/** The Chunk for a row#. Warning: this loads the data locally! */
public final Chunk chunkForRow(long i) {
Chunk c = _cache;
return (c != null && c._chk2==null && c._start <= i && i < c._start+c._len) ? c : (_cache = chunkForRow(i));
return (c != null && c._chk2==null && c._start <= i && i < c._start+c._len) ? c : (_cache = chunkForRow_impl(i));
}
/** Fetch element the slow way, as a long. Floating point values are
* silently rounded to an integer. Throws if the value is missing. */
public final long at8( long i ) { return c(i).at8(i); }
public final long at8( long i ) { return chunkForRow(i).at8(i); }
/** Fetch element the slow way, as a double. Missing values are
* returned as Double.NaN instead of throwing. */
public final double at( long i ) { return c(i).at(i); }
public final double at( long i ) { return chunkForRow(i).at(i); }
/** Fetch the missing-status the slow way. */
public final boolean isNA(long row){ return c(row).isNA(row); }
public final boolean isNA(long row){ return chunkForRow(row).isNA(row); }


/** Write element the slow way, as a long. There is no way to write a
Expand All @@ -561,18 +561,18 @@ private Chunk c(long i) {
* common compatible data representation.
*
* */
public final long set( long i, long l) {return c(i).set(i,l);}
public final long set( long i, long l) {return chunkForRow(i).set(i,l);}

/** Write element the slow way, as a double. Double.NaN will be treated as
* a set of a missing element.
* */
public final double set( long i, double d) {return c(i).set(i,d);}
public final double set( long i, double d) {return chunkForRow(i).set(i,d);}
/** Write element the slow way, as a float. Float.NaN will be treated as
* a set of a missing element.
* */
public final float set( long i, float f) {return c(i).set(i,f);}
public final float set( long i, float f) {return chunkForRow(i).set(i,f);}
/** Set the element as missing the slow way. */
public final boolean setNA( long i ) { return c(i).setNA(i);}
public final boolean setNA( long i ) { return chunkForRow(i).setNA(i);}

/** Pretty print the Vec: [#elems, min/mean/max]{chunks,...} */
@Override public String toString() {
Expand Down

0 comments on commit 641ddfb

Please sign in to comment.