Skip to content

Commit 38549e0

Browse files
author
anqi
committed
Update runif commands with seed in Python tests. Change R package so production build 99999 includes reduced-size h2o.jar.
1 parent 8a5e25b commit 38549e0

9 files changed

+79
-34
lines changed

R/Makefile

+10-10
Original file line numberDiff line numberDiff line change
@@ -47,18 +47,18 @@ else
4747
cp -p h2o_package.pdf ../target/R
4848
endif
4949
rm -rf h2o-package/inst/java
50-
# mkdir -p h2o-package/inst/java/tmp
51-
# cp -f ../target/h2o.jar h2o-package/inst/java/tmp/h2o_full.jar
52-
# (cd h2o-package/inst/java/tmp && jar xf h2o_full.jar)
53-
# (cd h2o-package/inst/java/tmp && rm -fr hadoop/0.* hadoop/1.* hadoop/cdh[35]* hadoop/cdh4_yarn)
54-
# (cd h2o-package/inst/java/tmp && rm -f h2o_full.jar)
55-
# (cd h2o-package/inst/java/tmp && cp META-INF/MANIFEST.MF ..)
56-
# (cd h2o-package/inst/java/tmp && rm -fr META-INF)
57-
# (cd h2o-package/inst/java/tmp && jar cfm ../h2o.jar ../MANIFEST.MF *)
58-
# rm -rf h2o-package/inst/java/tmp
5950
mkdir -p h2o-package/inst/java
6051
ifeq ($(BUILD_NUMBER),99999)
61-
cp -f ../target/h2o.jar h2o-package/inst/java
52+
# cp -f ../target/h2o.jar h2o-package/inst/java
53+
mkdir -p h2o-package/inst/java/tmp
54+
cp -f ../target/h2o.jar h2o-package/inst/java/tmp/h2o_full.jar
55+
(cd h2o-package/inst/java/tmp && jar xf h2o_full.jar)
56+
(cd h2o-package/inst/java/tmp && rm -fr hadoop/0.* hadoop/1.* hadoop/cdh[35]* hadoop/cdh4_yarn)
57+
(cd h2o-package/inst/java/tmp && rm -f h2o_full.jar)
58+
(cd h2o-package/inst/java/tmp && cp META-INF/MANIFEST.MF ..)
59+
(cd h2o-package/inst/java/tmp && rm -fr META-INF)
60+
(cd h2o-package/inst/java/tmp && jar cfm ../h2o.jar ../MANIFEST.MF *)
61+
rm -rf h2o-package/inst/java/tmp
6262
endif
6363
R CMD build h2o-package
6464

py/h2o_cmd.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ def createTestTrain(srcKey, trainDstKey, testDstKey, trainPercent,
566566
boundary = (trainPercent + 0.0)/100
567567

568568
execExpr = ""
569-
execExpr += "cct.hex=runif(%s);" % srcKey
569+
execExpr += "cct.hex=runif(%s,-1);" % srcKey
570570
execExpr += "%s=%s[cct.hex<=%s,];" % (trainDstKey, srcKey, boundary)
571571
if changeToBinomial:
572572
execExpr += "%s[,%s]=%s[,%s]==%s;" % (trainDstKey, outputCol+1, trainDstKey, outputCol+1, outputClass)

py/testdir_single_jvm_fvec/test_exec2_big_function_phrases.py

-1
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,6 @@
7474
"rhex[nrow(rhex),]",
7575
"rhex[nrow(rhex)-1,ncol(rhex)-1]",
7676
"rhex-rhex",
77-
7877
"runif(rhex[,1], -1)",
7978
"sum(1,2)",
8079
"sum(1,2,3)",

py/testdir_single_jvm_fvec/test_exec2_filter_slice.py

-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
exprList = [
66
# "rTest=randomFilter(<keyX>,58101,12345)",
7-
87
"a=runif(c.hex[,1], -1); rTrain=<keyX>[a<0.8,]",
98
# doesn't work yet
109
# "r2=c.hex[1:100,]",

py/testdir_single_jvm_fvec/test_exec2_function_phrases.py

-1
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@
7878
"rhex[nrow(rhex),]",
7979
"rhex[nrow(rhex)-1,ncol(rhex)-1]",
8080
"rhex-rhex",
81-
8281
"runif(rhex[,1], -1)",
8382
"sum(1,2)",
8483
"sum(1,2,3)",

py/testdir_single_jvm_fvec/test_exec2_operators.py

-1
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,6 @@
8888

8989
'b.hex=runif(r3.hex[,1], -1)',
9090
'b.hex=runif(r3.hex[1,], -1)',
91-
9291
# 'r.1[,1]=r.1[,1] + 1.3',
9392
# 'r<n>.hex=min(r.1,1+2)',
9493
# 'r<n>.hex=r2.hex + 1',

py/testdir_single_jvm_fvec/test_exec2_operators4.py

-1
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,6 @@
171171

172172
# doesn't work
173173
"mean=function(x){apply(x,1,sum)/nrow(x)};mean(r.hex)",
174-
175174
]
176175

177176

py/testdir_single_jvm_fvec/test_exec2_unary.py

-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
exprListFull = [
2424
'r1.hex=apply(r.hex,2,function(x){ifelse(is.na(x),0,x)})',
2525
'cct.hex=runif(r.hex, -1);rTrain=r.hex[cct.hex<=0.9,];rTest=r.hex[cct.hex>0.9,]',
26-
2726
# 'r<n>[,0] = r0[,0] * r<n-1>[,0]',
2827
# 'r<n>[0,] = r1[0,] + r<n-1>[0,]',
2928
# 'r<n> = r1 + r<n-1>',
@@ -34,7 +33,6 @@
3433
# 'r1.hex[1,]=r1.hex[1,]==1.0',
3534
'b.hex=runif(r3.hex[,1], -1)',
3635
'b.hex=runif(r3.hex[1,], -1)',
37-
3836
# 'r1.hex[,1]=r1.hex[,1] + 1.3',
3937
# 'r<n>.hex=min(r1.hex,1+2)',
4038
# 'r<n>.hex=r2.hex + 1',

src/main/java/hex/KMeans2.java

+68-16
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
import water.api.DocGen;
88
import water.api.Progress2;
99
import water.api.Request;
10-
import water.api.Request.API;
11-
import water.api.Request.Default;
1210
import water.fvec.Chunk;
1311
import water.fvec.Frame;
1412
import water.fvec.NewChunk;
@@ -60,12 +58,13 @@ public KMeans2() {
6058
if( sourceArg != null )
6159
sourceKey = Key.make(sourceArg);
6260

63-
// Drop ignored cols and, if users asks for it, cols with too many NAs
61+
// Drop ignored cols and, if user asks for it, cols with too many NAs
6462
Frame fr = DataInfo.prepareFrame(source, ignored_cols, false, false, drop_na_cols);
6563
String[] names = fr.names();
6664
Vec[] vecs = fr.vecs();
6765
if(vecs == null || vecs.length == 0)
6866
throw new IllegalArgumentException("No columns selected. Check that selected columns have not been dropped due to too many NAs.");
67+
DataInfo dinfo = new DataInfo(fr, 0, false, normalize, false);
6968

7069
// Fill-in response based on K99
7170
String[] domain = new String[k];
@@ -84,7 +83,7 @@ public KMeans2() {
8483
means[i] = (float) vecs[i].mean();
8584
if( mults != null ) {
8685
double sigma = vecs[i].sigma();
87-
mults[i] = normalize(sigma) ? 1 / sigma : 1;
86+
mults[i] = normalize(sigma) ? 1.0 / sigma : 1.0;
8887
}
8988
}
9089

@@ -94,7 +93,8 @@ public KMeans2() {
9493
if( initialization == Initialization.None ) {
9594
// Initialize all clusters to random rows
9695
clusters = new double[k][vecs.length];
97-
for (double[] cluster : clusters) randomRow(vecs, rand, cluster, means, mults);
96+
for (double[] cluster : clusters)
97+
randomRow(vecs, rand, cluster, means, mults);
9898
} else {
9999
// Initialize first cluster to random row
100100
clusters = new double[1][];
@@ -136,7 +136,10 @@ public KMeans2() {
136136
task._clusters = clusters;
137137
task._means = means;
138138
task._mults = mults;
139+
task._ncats = dinfo._cats;
140+
task._nnums = dinfo._nums;
139141
task.doAll(vecs);
142+
140143
model.centers = clusters = normalize ? denormalize(task._cMeans, vecs) : task._cMeans;
141144
model.between_cluster_variances = task._betwnSqrs;
142145
double[] variances = new double[task._cSqrs.length];
@@ -343,6 +346,7 @@ public static class KMeans2Model extends Model implements Progress {
343346
// Normalization caches
344347
private transient double[][] _normClust;
345348
private transient double[] _means, _mults;
349+
private transient int _ncats, _nnums;
346350

347351
public KMeans2Model(KMeans2 params, Key selfKey, Key dataKey, String names[], String domains[][]) {
348352
super(selfKey, dataKey, names, domains);
@@ -380,7 +384,8 @@ public KMeans2Model(KMeans2 params, Key selfKey, Key dataKey, String names[], St
380384
}
381385
data(tmp, chunks, rowInChunk, _means, _mults);
382386
Arrays.fill(preds, 0);
383-
int cluster = closest(cs, tmp, new ClusterDist())._cluster;
387+
// int cluster = closest(cs, tmp, new ClusterDist())._cluster;
388+
int cluster = closest(cs, tmp, _ncats, new ClusterDist())._cluster;
384389
preds[0] = cluster; // prediction in preds[0]
385390
preds[1+cluster] = 1; // class distribution
386391
return preds;
@@ -401,13 +406,15 @@ public class Clusters extends MRTask2<Clusters> {
401406
// IN
402407
double[][] _clusters; // Cluster centers
403408
double[] _means, _mults; // Normalization
409+
int _ncats, _nnums;
404410

405411
@Override public void map(Chunk[] cs, NewChunk ncs) {
406412
double[] values = new double[_clusters[0].length];
407413
ClusterDist cd = new ClusterDist();
408414
for (int row = 0; row < cs[0]._len; row++) {
409415
data(values, cs, row, _means, _mults);
410-
closest(_clusters, values, cd);
416+
// closest(_clusters, values, cd);
417+
closest(_clusters, values, _ncats, cd);
411418
int clu = cd._cluster;
412419
// ncs[0].addNum(clu);
413420
ncs.addEnum(clu);
@@ -478,6 +485,7 @@ public static class Lloyds extends MRTask2<Lloyds> {
478485
// IN
479486
double[][] _clusters;
480487
double[] _means, _mults; // Normalization
488+
int _ncats, _nnums;
481489

482490
// OUT
483491
double[][] _cMeans, _cSqrs; // Means and sum of squares for each cluster
@@ -499,7 +507,8 @@ public static class Lloyds extends MRTask2<Lloyds> {
499507
int[] clusters = new int[cs[0]._len];
500508
for( int row = 0; row < cs[0]._len; row++ ) {
501509
data(values, cs, row, _means, _mults);
502-
closest(_clusters, values, cd);
510+
// closest(_clusters, values, cd);
511+
closest(_clusters, values, _ncats, cd);
503512
int clu = clusters[row] = cd._cluster;
504513
_sqr += cd._dist;
505514
if( clu == -1 )
@@ -556,10 +565,6 @@ private static final class ClusterDist {
556565
double _dist;
557566
}
558567

559-
private static ClusterDist closest(double[][] clusters, double[] point, ClusterDist cd) {
560-
return closest(clusters, point, cd, clusters.length);
561-
}
562-
563568
private static double minSqr(double[][] clusters, double[] point, ClusterDist cd) {
564569
return closest(clusters, point, cd, clusters.length)._dist;
565570
}
@@ -568,14 +573,43 @@ private static double minSqr(double[][] clusters, double[] point, ClusterDist cd
568573
return closest(clusters, point, cd, count)._dist;
569574
}
570575

571-
/** Return both nearest of N cluster/centroids, and the square-distance. */
576+
private static ClusterDist closest(double[][] clusters, double[] point, ClusterDist cd) {
577+
return closest(clusters, point, cd, clusters.length);
578+
}
579+
580+
private static ClusterDist closest(double[][] clusters, double[] point, int ncats, ClusterDist cd) {
581+
return closest(clusters, point, ncats, cd, clusters.length);
582+
}
583+
572584
private static ClusterDist closest(double[][] clusters, double[] point, ClusterDist cd, int count) {
585+
return closest(clusters, point, 0, cd, count);
586+
}
587+
588+
private static ClusterDist closest(double[][] clusters, double[] point, int ncats, ClusterDist cd, int count) {
589+
return closest(clusters, point, ncats, cd, count, 1);
590+
}
591+
592+
/** Return both nearest of N cluster/centroids, and the square-distance. */
593+
private static ClusterDist closest(double[][] clusters, double[] point, int ncats, ClusterDist cd, int count, double dist) {
573594
int min = -1;
574595
double minSqr = Double.MAX_VALUE;
575596
for( int cluster = 0; cluster < count; cluster++ ) {
576597
double sqr = 0; // Sum of dimensional distances
577598
int pts = point.length; // Count of valid points
578-
for( int column = 0; column < clusters[cluster].length; column++ ) {
599+
600+
// Expand categoricals into binary indicator cols
601+
for(int column = 0; column < ncats; column++) {
602+
double d = point[column];
603+
if(Double.isNaN(d))
604+
pts--;
605+
else {
606+
// TODO: What is the distance between unequal categoricals?
607+
if(d != clusters[cluster][column])
608+
sqr += 2 * dist * dist;
609+
}
610+
}
611+
612+
for( int column = ncats; column < clusters[cluster].length; column++ ) {
579613
double d = point[column];
580614
if( Double.isNaN(d) ) { // Bad data?
581615
pts--; // Do not count
@@ -686,14 +720,16 @@ private static double[][] denormalize(double[][] clusters, Vec[] vecs) {
686720
private static void data(double[] values, Vec[] vecs, long row, double[] means, double[] mults) {
687721
for( int i = 0; i < values.length; i++ ) {
688722
double d = vecs[i].at(row);
689-
values[i] = data(d, i, means, mults);
723+
// values[i] = data(d, i, means, mults);
724+
values[i] = data(d, i, means, mults, vecs[i].cardinality());
690725
}
691726
}
692727

693728
private static void data(double[] values, Chunk[] chks, int row, double[] means, double[] mults) {
694729
for( int i = 0; i < values.length; i++ ) {
695730
double d = chks[i].at0(row);
696-
values[i] = data(d, i, means, mults);
731+
// values[i] = data(d, i, means, mults);
732+
values[i] = data(d, i, means, mults, chks[i]._vec.cardinality());
697733
}
698734
}
699735

@@ -709,4 +745,20 @@ private static double data(double d, int i, double[] means, double[] mults) {
709745
}
710746
return d;
711747
}
748+
749+
private static double data(double d, int i, double[] means, double[] mults, int cardinality) {
750+
if(cardinality == -1) {
751+
if( Double.isNaN(d) )
752+
d = means[i];
753+
if( mults != null ) {
754+
d -= means[i];
755+
d *= mults[i];
756+
}
757+
} else {
758+
// TODO: If NaN, then replace with majority class?
759+
if(Double.isNaN(d))
760+
d = Math.min(Math.round(means[i]), cardinality-1);
761+
}
762+
return d;
763+
}
712764
}

0 commit comments

Comments
 (0)