Skip to content

Commit

Permalink
Fix of HEX-1351.
Browse files Browse the repository at this point in the history
The model adapt corrected again - it missed corner case when a column
contained negative values which were transformed to enum.
Furthermore, double call of makeTransf led to memory leak of a key.
  • Loading branch information
mmalohlava committed Nov 28, 2013
1 parent 9d05b41 commit 8a87b22
Show file tree
Hide file tree
Showing 11 changed files with 115 additions and 40 deletions.
File renamed without changes.
File renamed without changes.
3 changes: 3 additions & 0 deletions smalldata/test/classifier/coldom_test_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
A,b,-1,0,1
A,a,-1,0,0
D,b,-1,3,1
File renamed without changes.
13 changes: 13 additions & 0 deletions smalldata/test/classifier/coldom_train_3.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
A,a,-1
A,b,1
A,c,1
A,d,1
B,a,-1
B,b,-1
B,c,1
A,d,1
C,a,-1
C,b,-1
C,c,-1
A,d,1
D,a,-1
33 changes: 21 additions & 12 deletions src/main/java/water/Model.java
Original file line number Diff line number Diff line change
Expand Up @@ -212,27 +212,36 @@ private int[][] adapt( String names[], String domains[][], boolean exact) {
* frame which contains only vectors which where adapted (the purpose of the
* second frame is to delete all adapted vectors with deletion of the
* frame). */
public Frame[] adapt( Frame fr, boolean exact) {
Frame vfr = new Frame(fr);
public Frame[] adapt( final Frame fr, boolean exact) {
Frame vfr = new Frame(fr); // To avoid modification of original frame fr
int ridx = vfr.find(_names[_names.length-1]);
if(ridx != -1 && ridx != vfr._names.length-1){ // put response to the end
if(ridx != -1 && ridx != vfr._names.length-1){ // Unify frame - put response to the end
String n = vfr._names[ridx];
vfr.add(n,vfr.remove(ridx));
}
int n = ridx == -1?_names.length-1:_names.length;
String [] names = Arrays.copyOf(_names, n);
vfr = vfr.subframe(names);
Vec [] frvecs = vfr.vecs();
vfr = vfr.subframe(names); // select only supported columns, if column is missing Exception is thrown
Vec[] frvecs = vfr.vecs();
boolean[] toEnum = new boolean[frvecs.length];
if(!exact) for(int i = 0; i < n;++i)
if(_domains[i] != null && !frvecs[i].isEnum())
if(_domains[i] != null && !frvecs[i].isEnum()) {// if model expects domain but input frame does not have domain => switch vector to enum
frvecs[i] = frvecs[i].toEnum();
toEnum[i] = true;
}
int map[][] = adapt(names,vfr.domains(),exact);
ArrayList<Vec> avecs = new ArrayList<Vec>();
ArrayList<String> anames = new ArrayList<String>();
for( int c=0; c<map.length; c++ ) // iterate over columns
if(map[c] != null){
avecs.add(frvecs[c] = frvecs[c].makeTransf(map[c]));
anames.add(names[c]);
assert map.length == names.length; // Be sure that adapt call above do not skip any column
ArrayList<Vec> avecs = new ArrayList<Vec>(); // adapted vectors
ArrayList<String> anames = new ArrayList<String>(); // names for adapted vector

for( int c=0; c<map.length; c++ ) // Iterate over columns
if(map[c] != null) { // Column needs adaptation
Vec adaptedVec = null;
if (toEnum[c]) { // Vector was flipped to column already, compose transformation
adaptedVec = TransfVec.compose( (TransfVec) frvecs[c], map[c], false );
} else adaptedVec = frvecs[c].makeTransf(map[c]);
avecs.add(frvecs[c] = adaptedVec);
anames.add(names[c]); // Collect right names
}
return new Frame[] { new Frame(names,frvecs), new Frame(anames.toArray(new String[anames.size()]), avecs.toArray(new Vec[avecs.size()])) };
}
Expand Down
39 changes: 31 additions & 8 deletions src/main/java/water/fvec/TransfVec.java
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
package water.fvec;

import water.*;
import water.util.Utils;

/**
* Dummy vector transforming values of given vector according to given domain mapping.
*/
public class TransfVec extends Vec {
final Key _masterVecKey;
final int[] _domMap;
final int _domMin;
transient Vec _masterVec;

public TransfVec(Key masterVecKey, int[] domMap, Key key, long[] espc) {
this(masterVecKey, domMap, null, key, espc);
public TransfVec(Key masterVecKey, int[] domMap, int domMin, Key key, long[] espc) {
this(masterVecKey, domMap, domMin, null, key, espc);
}
public TransfVec(Key masterVecKey, int[] domMap, String[] domain, Key key, long[] espc) {
public TransfVec(Key masterVecKey, int[] domMap, int domMin, String[] domain, Key key, long[] espc) {
super(key, espc);
_masterVecKey = masterVecKey;
_domMap = domMap;
_domMin = domMin;
_domain = domain;
}

Expand All @@ -27,14 +30,15 @@ public TransfVec(Key masterVecKey, int[] domMap, String[] domain, Key key, long[

@Override public Chunk elem2BV(int cidx) {
Chunk c = masterVec().elem2BV(cidx);
return new TransfChunk(c, _domMap, this);
return new TransfChunk(c, _domMap, _domMin, this);
}

static class TransfChunk extends Chunk {
Chunk _c;
int[] _domMap;
public TransfChunk(Chunk c, int[] domMap, Vec vec) { _c = c; _domMap = domMap; _len = _c._len; _start = _c._start; _vec = vec; }
@Override protected long at8_impl(int idx) { return _domMap[(int)_c.at8_impl(idx)]; }
final Chunk _c;
final int[] _domMap;
final int _domMin;
public TransfChunk(Chunk c, int[] domMap, int domMin, Vec vec) { _c = c; _domMap = domMap; _domMin = domMin; _len = _c._len; _start = _c._start; _vec = vec; }
@Override protected long at8_impl(int idx) { return _domMap[(int)_c.at8_impl(idx)-_domMin]; }
@Override protected double atd_impl(int idx) { return _c.isNA0(idx) ? Double.NaN : at8_impl(idx); }
@Override protected boolean isNA_impl(int idx) {
if (_c.isNA_impl(idx)) return true;
Expand All @@ -50,4 +54,23 @@ static class TransfChunk extends Chunk {
@Override public AutoBuffer write(AutoBuffer bb) { throw new UnsupportedOperationException(); }
@Override public Chunk read(AutoBuffer bb) { throw new UnsupportedOperationException(); }
}

/** Compose this vector with given transformation. Always return a new vector */
public Vec compose(int[] transfMap) { return compose(this, transfMap, true); }

/**
* Compose given origVector with given transformation. Always returns a new vector.
* Original vector is kept if keepOrig is true.
* @param origVec
* @param transfMap
* @param keepOrig
* @return a new instance of {@link TransfVec} composing transformation of origVector and tranfsMap
*/
public static Vec compose(TransfVec origVec, int[] transfMap, boolean keepOrig) {
// Do a mapping from INT -> ENUM -> this vector ENUM
int[] domMap = Utils.compose(origVec._domMap, transfMap);
Vec result = origVec.makeTransf(domMap, origVec._domain);
if (!keepOrig) DKV.remove(origVec._key);
return result;
}
}
9 changes: 8 additions & 1 deletion src/main/java/water/fvec/Vec.java
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ public Vec makeCon( final double d ) {
public Vec makeTransf(final int[] domMap, final String[] domain) {
Futures fs = new Futures();
if( _espc == null ) throw H2O.unimpl();
Vec v0 = new TransfVec(this._key, domMap, domain, group().addVecs(1)[0],_espc);
Vec v0 = new TransfVec(this._key, domMap, (int) min(), domain, group().addVecs(1)[0],_espc);
DKV.put(v0._key,v0,fs);
fs.blockForPending();
return v0;
Expand Down Expand Up @@ -593,6 +593,7 @@ public int reserveKeys(final int n){
}
}

/** Collect numeric domain of given vector */
public static class CollectDomain extends MRTask2<CollectDomain> {
final int _nclass;
final int _ymin;
Expand All @@ -608,6 +609,12 @@ public static class CollectDomain extends MRTask2<CollectDomain> {
}
}
@Override public void reduce( CollectDomain that ) { Utils.or(_dom,that._dom); }

/** Returns exact numeric domain of given vector computed by this task.
* The domain is always sorted. Hence:
* domain()[0] - minimal domain value
* domain()[domain().length-1] - maximal domain value
*/
public int[] domain() {
int cnt = 0;
for (int i=0; i<_dom.length; i++) if (_dom[i]>0) cnt++;
Expand Down
10 changes: 7 additions & 3 deletions src/main/java/water/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -741,12 +741,16 @@ private static long attemptTimeParse_1( ValueString str ) {
}

/** Returns a mapping of given domain to values (0, ... max(dom)).
* Unused domain items has mapping to -1. */
* Unused domain items has mapping to -1.
* @precondition - dom is sorted dom[0] contains minimal value, dom[dom.length-1] represents max. value. */
public static int[] mapping(int[] dom) {
assert dom.length > 0 : "Empty domain!";
assert dom[0] <= dom[dom.length-1] : "Domain is not sorted";
int min = dom[0];
int max = dom[dom.length-1];
int[] result = new int[max+1];
int[] result = new int[(max-min)+1];
for (int i=0; i<result.length; i++) result[i] = -1; // not used fields
for (int i=0; i<dom.length; i++) result[dom[i]] = i;
for (int i=0; i<dom.length; i++) result[dom[i]-min] = i;
return result;
}
public static String[] toStringMap(int[] dom) {
Expand Down
40 changes: 28 additions & 12 deletions src/test/java/hex/drf/DRFModelAdaptTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,13 @@ private abstract class PrepData { abstract Vec prep(Frame fr); int needAdaptatio
* C - 2 D - 1 mapping should remap it into: D - 3
* D - 3
*/
@Test public void testModelAdapt() {
//@Ignore
@Test public void testModelAdapt1() {
testModelAdaptation(
"./smalldata/test/classifier/coldom_train.csv",
"./smalldata/test/classifier/coldom_test.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } });
"./smalldata/test/classifier/coldom_train_1.csv",
"./smalldata/test/classifier/coldom_test_1.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } },
true);
}

/**
Expand All @@ -39,21 +41,35 @@ private abstract class PrepData { abstract Vec prep(Frame fr); int needAdaptatio
* C - 2 X - 1 mapping should remap it into: X - NA
* D - 3
*/
@Test public void testModelAdapt2() {
//@Ignore
@Test public void testModelAdapt1_2() {
testModelAdaptation(
"./smalldata/test/classifier/coldom_train.csv",
"./smalldata/test/classifier/coldom_test2.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } });
"./smalldata/test/classifier/coldom_train_1.csv",
"./smalldata/test/classifier/coldom_test_1_2.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } },
true);
}

@Test public void testModelAdapt3() {
//@Ignore
@Test public void testModelAdapt2() {
testModelAdaptation(
"./smalldata/test/classifier/coldom_train_2.csv",
"./smalldata/test/classifier/coldom_test_2.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.find("R")]; }; @Override int needAdaptation(Frame fr) { return 0;} });
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.find("R")]; }; @Override int needAdaptation(Frame fr) { return 0;} },
true);
}

/** Test adaptation of numeric values in response column. */
//@Ignore
@Test public void testModelAdapt3() {
testModelAdaptation(
"./smalldata/test/classifier/coldom_train_3.csv",
"./smalldata/test/classifier/coldom_test_3.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } },
false);
}

void testModelAdaptation(String train, String test, PrepData dprep) {
void testModelAdaptation(String train, String test, PrepData dprep, boolean exactAdaptation) {
DRFModel model = null;
Frame frTest = null;
Frame frTrain = null;
Expand All @@ -70,7 +86,7 @@ void testModelAdaptation(String train, String test, PrepData dprep) {
frTest = parseFrame(testKey, test);
Assert.assertEquals("TEST CONF ERROR: The test dataset should contain 2*<number of input columns>+1!", 2*(frTrain.numCols()-1)+1, frTest.numCols());
// Adapt test dataset
frAdapted = model.adapt(frTest, true); // do not perform translation to enums
frAdapted = model.adapt(frTest, exactAdaptation); // do/do not perform translation to enums
Assert.assertEquals("Adapt method should return two frames", 2, frAdapted.length);
Assert.assertEquals("Test expects that all columns in test dataset has to be adapted", dprep.needAdaptation(frTrain), frAdapted[1].numCols());

Expand Down
8 changes: 4 additions & 4 deletions src/test/java/hex/gbm/GBMDomainTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ private abstract class PrepData { abstract Vec prep(Frame fr); }
*/
@Test public void testModelAdapt() {
runAndScoreGBM(
"./smalldata/test/classifier/coldom_train.csv",
"./smalldata/test/classifier/coldom_test.csv",
"./smalldata/test/classifier/coldom_train_1.csv",
"./smalldata/test/classifier/coldom_test_1.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } });
}

Expand All @@ -41,8 +41,8 @@ private abstract class PrepData { abstract Vec prep(Frame fr); }
*/
@Test public void testModelAdapt2() {
runAndScoreGBM(
"./smalldata/test/classifier/coldom_train.csv",
"./smalldata/test/classifier/coldom_test2.csv",
"./smalldata/test/classifier/coldom_train_1.csv",
"./smalldata/test/classifier/coldom_test_1_2.csv",
new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } });
}
// Adapt a trained model to a test dataset with different enums
Expand Down

0 comments on commit 8a87b22

Please sign in to comment.