Skip to content

Commit

Permalink
Simple compressed boolean chunk
Browse files Browse the repository at this point in the history
 - compression distinguishes between data with and without NA
 - JUnit test verifying correctness of the compression
   - the JUnit test exercises only the compression and does not
     use the rest of the H2O infrastructure. It mocks the required
     classes with the help of Mockito
  • Loading branch information
mmalohlava committed Jul 13, 2013
1 parent f9be085 commit f478cb7
Show file tree
Hide file tree
Showing 5 changed files with 217 additions and 0 deletions.
1 change: 1 addition & 0 deletions .classpath
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
<classpathentry kind="lib" path="lib/jama/Jama.jar"/>
<classpathentry kind="lib" path="lib/javassist.jar" sourcepath="lib/javassist-sources.jar"/>
<classpathentry kind="lib" path="lib/apache/commons-codec-1.4.jar" sourcepath="lib/apache/commons-codec-1.4-sources.zip"/>
<classpathentry kind="lib" path="lib/mockito/mockito-all-1.9.5.jar"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/jets3t/commons-httpclient-3.1.jar"/>
<classpathentry kind="lib" path="lib/jets3t/jets3t-0.6.1.jar"/>
Expand Down
Binary file added lib/mockito/mockito-all-1.9.5.jar
Binary file not shown.
84 changes: 84 additions & 0 deletions src/main/java/water/fvec/CBSChunk.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
/**
*
*/
package water.fvec;

import water.AutoBuffer;
import water.H2O;

/**
 * A simple chunk for boolean values; in effect a packed bit vector.
 *
 * <p>Every value occupies {@code _bpv} bits: 1 bit when only 0/1 have to be
 * stored, or 2 bits when NA (encoded as {@link #_NA}) must be representable
 * as well.
 *
 * <p>Layout of {@code _mem}: byte 0 holds the gap (number of unused bits in
 * the payload), byte 1 holds the bits-per-value, and the packed values start
 * at index {@link #OFF}. Within a byte, values are stored from the most
 * significant bit downwards.
 */
public class CBSChunk extends Chunk {
  /** Internal (2-bit) representation of NA. */
  static protected final byte _NA = 0x02;
  /** Number of header bytes (gap, bpv) preceding the packed values in {@code _mem}. */
  static final int OFF = 2;

  /** Bits per value: 1 or 2. */
  protected byte _bpv;
  /** Number of payload bits that do not hold a value. */
  protected byte _gap;

  /**
   * Wraps an already packed buffer.
   *
   * @param bs  packed buffer including the {@link #OFF} header bytes
   * @param gap number of unused payload bits; must be below 8
   * @param bpv bits per value; must be 1 or 2
   */
  public CBSChunk(byte[] bs, byte gap, byte bpv) {
    assert gap < 8; assert bpv == 1 || bpv == 2;
    _mem = bs; _start = -1; _gap = gap; _bpv = bpv;
    _len = valueCount(_mem.length, _gap, _bpv);
  }

  /** Number of values held by a buffer of {@code memLen} bytes with the given gap and bpv. */
  private static int valueCount(int memLen, int gap, int bpv) {
    return ((memLen - OFF)*8 - gap) / bpv;
  }

  /** Returns the value at {@code idx} as a double; NA maps to {@code _vec._fNA}. */
  @Override protected double atd_impl(int idx) {
    byte b = atb(idx);
    return b == _NA ? _vec._fNA : b;
  }

  /** Returns the value at {@code idx} as a long; NA maps to {@code _vec._iNA}. */
  @Override protected long at8_impl(int idx) {
    byte b = atb(idx);
    return b == _NA ? _vec._iNA : b;
  }

  /**
   * Extracts the raw stored value at {@code idx}.
   *
   * @return 0 or 1 for 1-bit chunks; 0, 1, 2 ({@link #_NA}) or 3 for 2-bit chunks
   */
  protected byte atb(int idx) {
    int vpb = 8 / _bpv;            // values per byte
    int bix = OFF + idx / vpb;     // index of the byte holding the value
    int off = _bpv * (idx % vpb);  // bit offset of the value inside that byte
    byte b = _mem[bix];
    switch( _bpv ) {
      case 1: return read1b(b, off);
      case 2: return read2b(b, off);
      default: H2O.fail();         // unsupported bits-per-value
    }
    return -1;                     // reached only if H2O.fail() returns normally
  }

  // Updates in place are not supported; both setters report failure.
  @Override boolean set8_impl(int idx, long l) { return false; }
  @Override boolean set8_impl(int idx, double d) { return false; }
  @Override boolean hasFloat () { return false; }

  @Override NewChunk inflate_impl(NewChunk nc) {
    throw H2O.unimpl();
  }

  /** Serializes the whole buffer (header and payload). */
  @Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem, _mem.length); }

  /** Restores the chunk from a buffer; gap and bpv are taken from the two header bytes. */
  @Override public Chunk read(AutoBuffer bb) {
    _mem = bb.bufClose();
    _start = -1;
    _gap = _mem[0];
    _bpv = _mem[1];
    _len = valueCount(_mem.length, _gap, _bpv);
    return this;
  }

  /**
   * Writes the lowest bit of {@code val} into {@code b} at bit offset
   * {@code off} (0 = most significant bit) and returns the updated byte.
   * The target bit is cleared first, so a 0 really overwrites a previously
   * stored 1.
   */
  public static byte write1b(byte b, byte val, int off) {
    final int shift = 7 - off;
    return (byte) ((b & ~(0x1 << shift)) | ((val & 0x1) << shift));
  }

  /**
   * Writes the lowest two bits of {@code val} into {@code b} at bit offset
   * {@code off} (0 = the two most significant bits) and returns the updated
   * byte. The target bits are cleared first, so the previous content of the
   * field is fully replaced.
   */
  public static byte write2b(byte b, byte val, int off) {
    final int shift = 6 - off;     // 0000 00xx << (6-off)
    return (byte) ((b & ~(0x3 << shift)) | ((val & 0x3) << shift));
  }

  /** Reads 1 bit from {@code b} at the given bit offset. */
  public static byte read1b(byte b, int off) { return (byte) ((b >> (7-off)) & 0x1); }
  /** Reads 2 bits from {@code b} at the given bit offset. */
  public static byte read2b(byte b, int off) { return (byte) ((b >> (6-off)) & 0x3); }

  /**
   * Returns the number of bytes needed to store {@code values} items when
   * each item takes {@code bpv} bits, i.e. {@code ceil(values*bpv / 8)}.
   * The product is computed in {@code long} so it cannot overflow.
   */
  public static int clen(int values, int bpv) {
    return (int) (((long) values * bpv + 7) >> 3);
  }
}
38 changes: 38 additions & 0 deletions src/main/java/water/fvec/NewChunk.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,14 @@ Chunk compress() {
return new C0LChunk((long)_min,_len);
}

// Boolean column? (or in general two value column)
if (lemax-lemin == 1 && lemin == 0) {
int bpv = _naCnt > 0 ? 2 : 1;
byte[] cbuf = bufB(CBSChunk.OFF, bpv);
return new CBSChunk(cbuf, cbuf[0], cbuf[1]);
}


// Exponent scaling: replacing numbers like 1.3 with 13e-1. '13' fits in a
// byte and we scale the column by 0.1. A set of numbers like
// {1.2,23,0.34} then is normalized to always be represented with 2 digits
Expand Down Expand Up @@ -232,6 +240,36 @@ private byte[] bufF( int log ) {
return bs;
}

// Compute compressed boolean buffer
// Packs the chunk's values into a bit vector using bpv bits per value.
// The first two bytes of the result carry the header: [0] = gap (number of
// unused bits in the last byte), [1] = bpv. Packed data starts at index 'off'.
private byte[] bufB(int off, int bpv) {
  assert bpv == 1 || bpv == 2 : "Only bit vectors with/without NA are supported";
  final byte[] packed = new byte[off + CBSChunk.clen(_len, bpv)];
  final int lastFit = 8 - bpv; // highest bit offset at which one more value still fits
  int bitPos = 0;              // bit offset of the next value inside 'pending'
  int slot = off;              // index in 'packed' receiving the next completed byte
  byte pending = 0;            // byte currently being assembled
  for (int row = 0; row < _len; row++) {
    final byte v;
    if (isNA(row)) v = CBSChunk._NA;
    else           v = (byte) _ls[row];
    if (bpv == 1) {
      // The 1-bit encoding has no room for NA
      assert v!=CBSChunk._NA;
      pending = CBSChunk.write1b(pending, v, bitPos);
    } else if (bpv == 2) {
      pending = CBSChunk.write2b(pending, v, bitPos);
    }
    bitPos += bpv;
    if (bitPos > lastFit) {
      // Byte is full: store it and start a fresh one
      packed[slot++] = pending;
      pending = 0;
      bitPos = 0;
    }
  }
  // Header: gap = number of unfilled bits of the last byte, followed by bpv
  packed[0] = (byte) (bitPos == 0 ? 0 : 8-bitPos);
  packed[1] = (byte) bpv;
  // Flush the trailing, partially filled byte
  if (bitPos > 0) packed[slot] = pending;
  return packed;
}

// Set & At on NewChunks are weird: only used after inflating some other
// chunk. At this point the NewChunk is full size, no more appends allowed,
// and the xs exponent array should be only full of zeros. Accesses must be
Expand Down
94 changes: 94 additions & 0 deletions src/test/java/water/fvec/CBSChunkTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package water.fvec;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.mockito.Matchers.any;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

import org.junit.Ignore;
import org.junit.Test;

import water.Futures;

/**
 * Tests for the {@link CBSChunk} compression.
 *
 * <p>The objective is to verify the compression method, not the H2O environment.
 *
 * <p>NOTE: The test attempts not to require the H2O infrastructure to run.
 * It uses Mockito (perhaps PowerMock in the future) to stand in for the
 * surrounding vector classes. The expectation mechanism is slightly misused
 * here, since it only serves to avoid a DKV call.
 */
public class CBSChunkTest {

  /**
   * Compresses the given raw data and checks the resulting {@link CBSChunk}.
   *
   * @param ls      raw values
   * @param xs      per-value markers; a non-zero entry means the value is expected to read back as NA
   * @param expBpv  expected bits per value
   * @param expGap  expected number of unused bits
   * @param expClen expected payload length in bytes (header excluded)
   * @param expNA   expected number of NAs in the input
   */
  void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expNA) {
    // The underlying vectors are not under test here, only chunk
    // compression is, so they are replaced by mocks.
    final AppendableVec appendable = mock(AppendableVec.class);
    // The vector handed back when the appendable vector gets closed.
    final Vec closedVec = mock(Vec.class);
    closedVec.setNAs(Double.NaN, Long.MIN_VALUE);
    when(appendable.close(any(Futures.class))).thenReturn(closedVec);

    // Build the uncompressed chunk directly from the raw arrays
    final NewChunk raw = new NewChunk(appendable,0);
    raw._ls = ls;
    raw._xs = xs;
    raw._len = ls.length;
    // Count the NAs by hand
    for (int row = 0; row < ls.length; row++) {
      if (raw.isNA(row)) raw._naCnt++;
    }
    assertEquals(expNA, raw._naCnt);

    // Compress and attach the mocked vector
    final Chunk compressed = raw.compress();
    compressed._vec = appendable.close(new Futures());

    assertTrue( "Found chunk class "+compressed.getClass()+" but expected " + CBSChunk.class, compressed instanceof CBSChunk );
    assertEquals(raw._len, compressed._len);
    final CBSChunk bits = (CBSChunk) compressed;
    assertEquals(expGap, bits._gap);
    assertEquals(expBpv, bits._bpv);
    assertEquals(expClen, compressed._mem.length - CBSChunk.OFF);

    // Finally, every value must decompress to what was put in
    for (int row = 0; row < ls.length; row++) {
      final long expected = xs[row]==0 ? ls[row] : Long.MIN_VALUE;
      assertEquals(expected, compressed.at80(row));
    }
  }

  /** One bit per value: the encoding used for data without NAs. */
  @Test @Ignore public void test1BPV() {
    // Only 4 bits of a single byte are used
    testImpl(new long[] {0,0,0,1},
             new int [] {0,0,0,0},
             1, 4, 1, 0);
    // Exactly one full byte
    testImpl(new long[] {1,0,0,0,1,1,1,0},
             new int [] {0,0,0,0,0,0,0,0},
             1, 0, 1, 0);
    // One bit spills over into a second byte
    testImpl(new long[] {1,0,0,0,1,1,1,0, 1},
             new int [] {0,0,0,0,0,0,0,0, 0},
             1, 7, 2, 0);
  }

  /** Two bits per value: the encoding used for data containing NAs. */
  @Test public void test2BPV() {
    // 3 values (2*3 bits) in one byte, one NA
    testImpl(new long[] {0,0,1},
             new int [] {0,1,0},
             2, 2, 1, 1);
    // Exactly one full byte, one NA
    testImpl(new long[] {1,0,0,1},
             new int [] {0,1,0,0},
             2, 0, 1, 1);
    // 4 bits spill over into a second byte, one NA
    testImpl(new long[] {1,0,0,1, 0,0},
             new int [] {0,0,1,0, 0,0},
             2, 4, 2, 1);
    // Two full bytes, 5 NAs
    testImpl(new long[] {0,0,0,1, 0,0,1,0},
             new int [] {1,1,1,0, 0,1,0,1},
             2, 0, 2, 5);
  }
}

0 comments on commit f478cb7

Please sign in to comment.