From 8a87b22da1f663bec67cb49a249a6858e2d59834 Mon Sep 17 00:00:00 2001 From: mmalohlava Date: Wed, 27 Nov 2013 19:26:58 -0800 Subject: [PATCH] Fix of HEX-1351. The model adapt corrected again - it missed corner case when a column contained negative values which were transformed to enum. Furthermore, double call of makeTransf led to memory leak of a key. --- .../{coldom_test.csv => coldom_test_1.csv} | 0 .../{coldom_test2.csv => coldom_test_1_2.csv} | 0 smalldata/test/classifier/coldom_test_3.csv | 3 ++ .../{coldom_train.csv => coldom_train_1.csv} | 0 smalldata/test/classifier/coldom_train_3.csv | 13 ++++++ src/main/java/water/Model.java | 33 +++++++++------ src/main/java/water/fvec/TransfVec.java | 39 ++++++++++++++---- src/main/java/water/fvec/Vec.java | 9 ++++- src/main/java/water/util/Utils.java | 10 +++-- src/test/java/hex/drf/DRFModelAdaptTest.java | 40 +++++++++++++------ src/test/java/hex/gbm/GBMDomainTest.java | 8 ++-- 11 files changed, 115 insertions(+), 40 deletions(-) rename smalldata/test/classifier/{coldom_test.csv => coldom_test_1.csv} (100%) rename smalldata/test/classifier/{coldom_test2.csv => coldom_test_1_2.csv} (100%) create mode 100644 smalldata/test/classifier/coldom_test_3.csv rename smalldata/test/classifier/{coldom_train.csv => coldom_train_1.csv} (100%) create mode 100644 smalldata/test/classifier/coldom_train_3.csv diff --git a/smalldata/test/classifier/coldom_test.csv b/smalldata/test/classifier/coldom_test_1.csv similarity index 100% rename from smalldata/test/classifier/coldom_test.csv rename to smalldata/test/classifier/coldom_test_1.csv diff --git a/smalldata/test/classifier/coldom_test2.csv b/smalldata/test/classifier/coldom_test_1_2.csv similarity index 100% rename from smalldata/test/classifier/coldom_test2.csv rename to smalldata/test/classifier/coldom_test_1_2.csv diff --git a/smalldata/test/classifier/coldom_test_3.csv b/smalldata/test/classifier/coldom_test_3.csv new file mode 100644 index 0000000000..65a11426f0 --- /dev/null +++ b/smalldata/test/classifier/coldom_test_3.csv @@ -0,0 +1,3 @@ +A,b,-1,0,1 +A,a,-1,0,0 +D,b,-1,3,1 diff --git a/smalldata/test/classifier/coldom_train.csv b/smalldata/test/classifier/coldom_train_1.csv similarity index 100% rename from smalldata/test/classifier/coldom_train.csv rename to smalldata/test/classifier/coldom_train_1.csv diff --git a/smalldata/test/classifier/coldom_train_3.csv b/smalldata/test/classifier/coldom_train_3.csv new file mode 100644 index 0000000000..f18c217536 --- /dev/null +++ b/smalldata/test/classifier/coldom_train_3.csv @@ -0,0 +1,13 @@ +A,a,-1 +A,b,1 +A,c,1 +A,d,1 +B,a,-1 +B,b,-1 +B,c,1 +A,d,1 +C,a,-1 +C,b,-1 +C,c,-1 +A,d,1 +D,a,-1 diff --git a/src/main/java/water/Model.java b/src/main/java/water/Model.java index 0220f03233..7c90c9a320 100644 --- a/src/main/java/water/Model.java +++ b/src/main/java/water/Model.java @@ -212,27 +212,36 @@ private int[][] adapt( String names[], String domains[][], boolean exact) { * frame which contains only vectors which where adapted (the purpose of the * second frame is to delete all adapted vectors with deletion of the * frame). */ - public Frame[] adapt( Frame fr, boolean exact) { - Frame vfr = new Frame(fr); + public Frame[] adapt( final Frame fr, boolean exact) { + Frame vfr = new Frame(fr); // To avoid modification of original frame fr int ridx = vfr.find(_names[_names.length-1]); - if(ridx != -1 && ridx != vfr._names.length-1){ // put response to the end + if(ridx != -1 && ridx != vfr._names.length-1){ // Unify frame - put response to the end String n = vfr._names[ridx]; vfr.add(n,vfr.remove(ridx)); } int n = ridx == -1?_names.length-1:_names.length; String [] names = Arrays.copyOf(_names, n); - vfr = vfr.subframe(names); - Vec [] frvecs = vfr.vecs(); + vfr = vfr.subframe(names); // select only supported columns, if column is missing Exception is thrown + Vec[] frvecs = vfr.vecs(); + boolean[] toEnum = new boolean[frvecs.length]; if(!exact) for(int i = 0; i < n;++i) - if(_domains[i] != null && !frvecs[i].isEnum()) + if(_domains[i] != null && !frvecs[i].isEnum()) {// if model expects domain but input frame does not have domain => switch vector to enum frvecs[i] = frvecs[i].toEnum(); + toEnum[i] = true; + } int map[][] = adapt(names,vfr.domains(),exact); - ArrayList avecs = new ArrayList(); - ArrayList anames = new ArrayList(); - for( int c=0; c avecs = new ArrayList(); // adapted vectors + ArrayList anames = new ArrayList(); // names for adapted vector + + for( int c=0; c ENUM -> this vector ENUM + int[] domMap = Utils.compose(origVec._domMap, transfMap); + Vec result = origVec.makeTransf(domMap, origVec._domain); + if (!keepOrig) DKV.remove(origVec._key); + return result; + } } diff --git a/src/main/java/water/fvec/Vec.java b/src/main/java/water/fvec/Vec.java index 7bda857650..026f1abf7a 100644 --- a/src/main/java/water/fvec/Vec.java +++ b/src/main/java/water/fvec/Vec.java @@ -135,7 +135,7 @@ public Vec makeCon( final double d ) { public Vec makeTransf(final int[] domMap, final String[] domain) { Futures fs = new Futures(); if( _espc == null ) throw H2O.unimpl(); - Vec v0 = new TransfVec(this._key, domMap, domain, group().addVecs(1)[0],_espc); + Vec v0 = new TransfVec(this._key, domMap, (int) min(), domain, group().addVecs(1)[0],_espc); DKV.put(v0._key,v0,fs); fs.blockForPending(); return v0; @@ -593,6 +593,7 @@ public int reserveKeys(final int n){ } } + /** Collect numeric domain of given vector */ public static class CollectDomain extends MRTask2 { final int _nclass; final int _ymin; @@ -608,6 +609,12 @@ public static class CollectDomain extends MRTask2 { } } @Override public void reduce( CollectDomain that ) { Utils.or(_dom,that._dom); } + + /** Returns exact numeric domain of given vector computed by this task. + * The domain is always sorted. Hence: + * domain()[0] - minimal domain value + * domain()[domain().length-1] - maximal domain value + */ public int[] domain() { int cnt = 0; for (int i=0; i<_dom.length; i++) if (_dom[i]>0) cnt++; diff --git a/src/main/java/water/util/Utils.java b/src/main/java/water/util/Utils.java index f7f20aa92a..aba08200e1 100644 --- a/src/main/java/water/util/Utils.java +++ b/src/main/java/water/util/Utils.java @@ -741,12 +741,16 @@ private static long attemptTimeParse_1( ValueString str ) { } /** Returns a mapping of given domain to values (0, ... max(dom)). - * Unused domain items has mapping to -1. */ + * Unused domain items has mapping to -1. + * @precondition - dom is sorted dom[0] contains minimal value, dom[dom.length-1] represents max. value. */ public static int[] mapping(int[] dom) { + assert dom.length > 0 : "Empty domain!"; + assert dom[0] <= dom[dom.length-1] : "Domain is not sorted"; + int min = dom[0]; int max = dom[dom.length-1]; - int[] result = new int[max+1]; + int[] result = new int[(max-min)+1]; for (int i=0; i+1!", 2*(frTrain.numCols()-1)+1, frTest.numCols()); // Adapt test dataset - frAdapted = model.adapt(frTest, true); // do not perform translation to enums + frAdapted = model.adapt(frTest, exactAdaptation); // do/do not perform translation to enums Assert.assertEquals("Adapt method should return two frames", 2, frAdapted.length); Assert.assertEquals("Test expects that all columns in test dataset has to be adapted", dprep.needAdaptation(frTrain), frAdapted[1].numCols()); diff --git a/src/test/java/hex/gbm/GBMDomainTest.java b/src/test/java/hex/gbm/GBMDomainTest.java index ba903a0cfa..46c24a61d7 100644 --- a/src/test/java/hex/gbm/GBMDomainTest.java +++ b/src/test/java/hex/gbm/GBMDomainTest.java @@ -26,8 +26,8 @@ private abstract class PrepData { abstract Vec prep(Frame fr); } */ @Test public void testModelAdapt() { runAndScoreGBM( - "./smalldata/test/classifier/coldom_train.csv", - "./smalldata/test/classifier/coldom_test.csv", + "./smalldata/test/classifier/coldom_train_1.csv", + "./smalldata/test/classifier/coldom_test_1.csv", new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } }); } @@ -41,8 +41,8 @@ private abstract class PrepData { abstract Vec prep(Frame fr); } */ @Test public void testModelAdapt2() { runAndScoreGBM( - "./smalldata/test/classifier/coldom_train.csv", - "./smalldata/test/classifier/coldom_test2.csv", + "./smalldata/test/classifier/coldom_train_1.csv", + "./smalldata/test/classifier/coldom_test_1_2.csv", new PrepData() { @Override Vec prep(Frame fr) { return fr.vecs()[fr.numCols()-1]; } }); } // Adapt a trained model to a test dataset with different enums