forked from h2oai/h2o-2
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- compression distinguishes between data with/without NA - JUnit test verifying correctness of compression - the JUnit test exercises only the compression and does not use the rest of the H2O infrastructure; it mocks the required classes with the help of Mockito
- Loading branch information
1 parent
f9be085
commit f478cb7
Showing
5 changed files
with
217 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
/** | ||
* | ||
*/ | ||
package water.fvec; | ||
|
||
import water.AutoBuffer; | ||
import water.H2O; | ||
|
||
/** A simple chunk for boolean values. In fact simple bit vector. | ||
* Each boolean is represented by 2bits since we need to represent | ||
* NA. | ||
*/ | ||
public class CBSChunk extends Chunk { | ||
static protected final byte _NA = 0x02; // Internal representation of NA | ||
static final int OFF = 2; | ||
|
||
protected byte _bpv; | ||
protected byte _gap; | ||
public CBSChunk(byte[] bs, byte gap, byte bpv) { | ||
assert gap < 8; assert bpv == 1 || bpv == 2; | ||
_mem = bs; _start = -1; _gap = gap; _bpv = bpv; | ||
_len = ((_mem.length - OFF)*8 - _gap) / _bpv; // number of boolean items | ||
} | ||
@Override protected double atd_impl(int idx) { | ||
byte b = atb(idx); | ||
return b == _NA ? _vec._fNA : b; | ||
} | ||
@Override protected long at8_impl(int idx) { | ||
byte b = atb(idx); | ||
return b == _NA ? _vec._iNA : b; | ||
} | ||
protected byte atb(int idx) { | ||
int vpb = 8 / _bpv; // values per byte | ||
int bix = OFF + idx / vpb; // byte index | ||
int off = _bpv * (idx % vpb); | ||
byte b = _mem[bix]; | ||
switch( _bpv ) { | ||
case 1: return read1b(b, off); | ||
case 2: return read2b(b, off); | ||
default: H2O.fail(); | ||
} | ||
return -1; | ||
} | ||
@Override boolean set8_impl(int idx, long l) { return false; } | ||
@Override boolean set8_impl(int idx, double d) { return false; } | ||
@Override boolean hasFloat () { return false; } | ||
|
||
@Override NewChunk inflate_impl(NewChunk nc) { | ||
throw H2O.unimpl(); | ||
} | ||
|
||
@Override public AutoBuffer write(AutoBuffer bb) { return bb.putA1(_mem, _mem.length); } | ||
|
||
@Override public Chunk read(AutoBuffer bb) { | ||
_mem = bb.bufClose(); | ||
_start = -1; | ||
_gap = _mem[0]; | ||
_bpv = _mem[1]; | ||
_len = ((_mem.length - OFF)*8 - _gap) / _bpv; | ||
return this; | ||
} | ||
|
||
/** Writes 1bit from value into b at given offset and return b */ | ||
public static byte write1b(byte b, byte val, int off) { | ||
val = (byte) ((val & 0x1) << (7-off)); | ||
return (byte) (b | val); | ||
} | ||
/** Writes 2bits from value into b at given offset and return b */ | ||
public static byte write2b(byte b, byte val, int off) { | ||
val = (byte) ((val & 0x3) << (6-off)); // 0000 00xx << (6-off) | ||
return (byte) (b | val); | ||
} | ||
|
||
/** Reads 1bit from given b in given offset. */ | ||
public static byte read1b(byte b, int off) { return (byte) ((b >> (7-off)) & 0x1); } | ||
/** Reads 1bit from given b in given offset. */ | ||
public static byte read2b(byte b, int off) { return (byte) ((b >> (6-off)) & 0x3); } | ||
|
||
/** Returns compressed len of the given array length if the value if represented by bpv-bits. */ | ||
public static int clen(int values, int bpv) { | ||
int len = (values*bpv) >> 3; | ||
return values*bpv % 8 == 0 ? len : len + 1; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
package water.fvec; | ||
|
||
import static org.junit.Assert.assertEquals; | ||
import static org.junit.Assert.assertTrue; | ||
import static org.mockito.Matchers.any; | ||
import static org.mockito.Mockito.mock; | ||
import static org.mockito.Mockito.when; | ||
|
||
import org.junit.Ignore; | ||
import org.junit.Test; | ||
|
||
import water.Futures; | ||
|
||
/** Test for CBSChunk implementation. | ||
* | ||
* The objective of the test is to verify compression method, not the H2O environment. | ||
* | ||
* NOTE: The test is attempt to not require H2O infrastructure to run. | ||
* It tries to use Mockito (perhaps PowerMock in the future) to wrap | ||
* expected results. In this case expectation is little bit missused | ||
* since it is used to avoid DKV call. | ||
* */ | ||
public class CBSChunkTest { | ||
|
||
void testImpl(long[] ls, int[] xs, int expBpv, int expGap, int expClen, int expNA) { | ||
// The following code mock underlying vector since we are not | ||
// tested them (=we are not interested in them), but chunk compression. | ||
// Mock the appendable vector. | ||
AppendableVec av = mock(AppendableVec.class); | ||
// Create an expectation - I know what should I expect | ||
// after closing the appendable vector. | ||
Vec vv = mock(Vec.class); | ||
vv.setNAs(Double.NaN, Long.MIN_VALUE); | ||
when(av.close(any(Futures.class))).thenReturn(vv); | ||
|
||
// Create a new chunk | ||
NewChunk nc = new NewChunk(av,0); | ||
nc._ls = ls; | ||
nc._xs = xs; | ||
nc._len = ls.length; | ||
for (int i=0;i<ls.length; i++) nc._naCnt += nc.isNA(i) ? 1 : 0; // Compute number of NAs | ||
assertEquals(expNA, nc._naCnt); | ||
// Compress chunk | ||
Chunk cc = nc.compress(); | ||
cc._vec = av.close(new Futures()); | ||
|
||
assertTrue( "Found chunk class "+cc.getClass()+" but expected " + CBSChunk.class, CBSChunk.class.isInstance(cc) ); | ||
assertEquals(nc._len, cc._len); | ||
assertEquals(expGap, ((CBSChunk)cc)._gap); | ||
assertEquals(expBpv, ((CBSChunk)cc)._bpv); | ||
assertEquals(expClen, cc._mem.length - CBSChunk.OFF); | ||
// Also, we can decompress correctly | ||
for( int i=0; i<ls.length; i++ ) | ||
assertEquals(xs[i]==0 ? ls[i] : Long.MIN_VALUE, cc.at80(i)); | ||
} | ||
|
||
// Test one bit per value compression which is used | ||
// for data without NAs | ||
@Test @Ignore public void test1BPV() { | ||
// Simple case only compressing into 4bits of one byte | ||
testImpl(new long[] {0,0,0,1}, | ||
new int [] {0,0,0,0}, | ||
1, 4, 1, 0); | ||
// Filling whole byte | ||
testImpl(new long[] {1,0,0,0,1,1,1,0}, | ||
new int [] {0,0,0,0,0,0,0,0}, | ||
1, 0, 1, 0); | ||
// Crossing the border of two bytes by 1bit | ||
testImpl(new long[] {1,0,0,0,1,1,1,0, 1}, | ||
new int [] {0,0,0,0,0,0,0,0, 0}, | ||
1, 7, 2, 0); | ||
} | ||
|
||
// Test two bits per value compression used for case with NAs | ||
// used for data containing NAs | ||
@Test public void test2BPV() { | ||
// Simple case only compressing 2*3bits into 1byte including 1 NA | ||
testImpl(new long[] {0,0,1}, | ||
new int [] {0,1,0}, | ||
2, 2, 1, 1); | ||
// Filling whole byte, one NA | ||
testImpl(new long[] {1,0,0,1}, | ||
new int [] {0,1,0,0}, | ||
2, 0, 1, 1); | ||
// crossing the border of two bytes by 4bits, one NA | ||
testImpl(new long[] {1,0,0,1, 0,0}, | ||
new int [] {0,0,1,0, 0,0}, | ||
2, 4, 2, 1); | ||
// Two full bytes, 5 NAs | ||
testImpl(new long[] {0,0,0,1, 0,0,1,0}, | ||
new int [] {1,1,1,0, 0,1,0,1}, | ||
2, 0, 2, 5); | ||
} | ||
} |