Skip to content

Commit

Permalink
add zero, na, factor cardinality counts to the summary objects and page
Browse files Browse the repository at this point in the history
  • Loading branch information
earlh committed Aug 4, 2013
1 parent 9f88277 commit 93ad42b
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 27 deletions.
26 changes: 26 additions & 0 deletions smalldata/test/test_percentiles_distns.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generate a dataset of single- and multi-mode columns for testing the
# histogram / percentile summaries, and write it as a gzipped CSV.
#
# NOTE(review): no set.seed() is called, so every run draws different data.
# SummaryTest asserts exact bin counts against the committed
# test_percentiles_distns.csv.gz, so regenerating the file will presumably
# require updating those expected counts -- confirm before re-running.

n <- 1000

# Discrete selector columns: zeroone is drawn from {0, 1}, zerotwo from {0, 1, 2}.
df <- data.frame(zeroone = sample(c(0, 1), size = n, replace = TRUE))
df$zerotwo <- sample(0:2, size = n, replace = TRUE)

# Columns drawn from a single gamma distribution each.
df$onemode_low0 <- rgamma(n = n, shape = 1, scale = 2)

df$onemode_low1 <- rgamma(n = n, shape = 2, scale = 2)

df$onemode_hi <- rgamma(n = n, shape = 200, scale = 0.1)

# Mixture of two gammas; zeroone selects which draw each row keeps.
df$twomode <- df$zeroone * rgamma(n = n, shape = 1, scale = 2) +
  (1 - df$zeroone) * rgamma(n = n, shape = 20, scale = 0.5)

# Mixture of three gammas; zerotwo selects which draw each row keeps.
df$threemode <- ifelse(
  df$zerotwo == 0,
  rgamma(n = n, shape = 1, scale = 2),
  ifelse(
    df$zerotwo == 1,
    rgamma(n = n, shape = 20, scale = 0.5),
    rgamma(n = n, shape = 40, scale = 0.5)
  )
)

# String versions of the selector columns. paste() uses its default
# separator, so the values look like "f 0", "f 1", ...
df$zerooneF <- paste('f', df$zeroone)
df$zerotwoF <- paste('f', df$zerotwo)

# The connection is named `out` rather than `file` so base::file() is not masked.
out <- gzfile('test_percentiles_distns.csv.gz', 'w')
write.csv(file = out, x = df, row.names = FALSE)
close(out)
Binary file added smalldata/test/test_percentiles_distns.csv.gz
Binary file not shown.
11 changes: 8 additions & 3 deletions src/main/java/hex/ColSummaryTask.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@ public final class ColSummaryTask extends RowTask<Summary>{
final int [] _cols;
/**
 * Creates a summary task over {@code ary} for the columns listed in {@code cols}.
 * The {@code cols} array is stored by reference, not copied.
 */
public ColSummaryTask(ValueArray ary, int[] cols) {
  super(ary);
  _cols = cols;
}
/** Builds a new {@link Summary} result object for this task's array and columns. */
@Override
public Summary newRes() {
  return new Summary(_ary, _cols);
}
@Override public void map(hex.RowTask.Row r, Summary t) {
for(int i = 0; i < _cols.length; ++i)
if(!r.isNA(_cols[i]))t._sums[i].add(r.getDCol(_cols[i]));

/**
 * Folds one row into {@code summary}. For each selected column, a non-NA value
 * is added to that column's summary; an NA only bumps the column's NA counter.
 *
 * @param r       the row being visited
 * @param summary the accumulator; {@code summary._sums[i]} corresponds to {@code _cols[i]}
 */
@Override
public void map(hex.RowTask.Row r, Summary summary) {
  for (int i = 0; i < _cols.length; ++i) {
    final int col = _cols[i];
    if (r.isNA(col)) {
      summary._sums[i]._n_na++;
      continue;
    }
    summary._sums[i].add(r.getDCol(col));
  }
}
@Override public Summary reduce(Summary left, Summary right) {
return left.add(right);
Expand Down
18 changes: 18 additions & 0 deletions src/main/java/hex/Summary.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ public final static class ColSummary extends Iced {
public static final double [] DEFAULT_PERCENTILES = {0.01,0.05,0.10,0.25,0.33,0.50,0.66,0.75,0.90,0.95,0.99};
final long [] _bins; // bins for histogram
long _n;
long _nzero;
long _n_na;
final double _start, _end, _binsz, _binszInv;
double [] _min; // min N elements
double [] _max; // max N elements
Expand Down Expand Up @@ -93,6 +95,13 @@ else if (a > 5*b/3)
}
/** Returns the {@code _percentiles} array itself; no defensive copy is made. */
public final double[] percentiles() {
  return _percentiles;
}

/**
 * Returns the cardinality of an enum column, taken as the number of histogram bins.
 *
 * @return {@code _bins.length}
 * @throws IllegalArgumentException if this column summary is not for an enum column
 */
public long getEnumCardinality() {
  if (!_enum) {
    throw new IllegalArgumentException("summary: non enums don't have enum cardinality");
  }
  return _bins.length;
}

private void computePercentiles(){
_percentileValues = new double [_percentiles.length];
if( _bins.length == 0 ) return;
Expand Down Expand Up @@ -122,6 +131,9 @@ void add(ColSummary other) {
assert Math.abs(_start - other._start) < 0.000001:"start - other._start = " + (_start - other._start);
assert Math.abs(_binszInv - other._binszInv) < 0.000000001;
_n += other._n;
_nzero += other._nzero;
_n_na += other._n_na;

for (int i = 0; i < _bins.length; i++)
_bins[i] += other._bins[i];
if(_min != null){
Expand Down Expand Up @@ -154,6 +166,8 @@ void add(ColSummary other) {

void add(double val) {
if(!_enum){
if (val == 0.)
_nzero++;
// first update min/max
if(val < _min[_min.length-1]){
int j = _min.length-1;
Expand Down Expand Up @@ -203,6 +217,8 @@ public JsonObject toJson(){
JsonObject res = new JsonObject();
res.addProperty("type", _enum?"enum":"number");
res.addProperty("name", _summary._ary._cols[_colId]._name);
if (_enum)
res.addProperty("enumCardinality", getEnumCardinality());
if(!_enum){
JsonArray min = new JsonArray();
for(double d:_min){
Expand All @@ -218,8 +234,10 @@ public JsonObject toJson(){
res.add("max", max);
res.addProperty("mean", _summary._ary._cols[_colId]._mean);
res.addProperty("sigma", _summary._ary._cols[_colId]._sigma);
res.addProperty("zeros", _summary._sums[ _colId ]._nzero);
}
res.addProperty("N", _n);
res.addProperty("na", _n_na);
JsonObject histo = new JsonObject();
histo.addProperty("bin_size", _binsz);
histo.addProperty("nbins", _bins.length);
Expand Down
76 changes: 54 additions & 22 deletions src/main/java/water/api/SummaryPage.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,42 +44,72 @@ public static String link(Key k, String s){
pageBldr.append("<div><a href='#col_" + cname + "'>" + cname + "</a></div>");
long N = o.get("N").getAsLong();
sb.append("<div class='table' id='col_" + cname + "' style='width:90%;heigth:90%;overflow-y:scroll;border-top-style:solid;'><div class='alert-success'><h4>Column: " + cname + "</h4></div>\n");
// !enum
if(o.has("min") && o.has("max")){
StringBuilder minRow = new StringBuilder("<tr><th>&mu;</th><td>" + Utils.p2d(o.get("mean").getAsDouble())+"</td><th style='border-left-style:solid; borde-left:1px;border-left-color:#ddd;'>min[5]</th>");
StringBuilder maxRow = new StringBuilder("<tr><th>&sigma;</th><td>" + Utils.p2d(o.get("sigma").getAsDouble()) + "</td><th style='border-left-style:solid; borde-left:1px;border-left-color:#ddd;'>max[5]</th>");
StringBuilder baseStats = new StringBuilder("<div style='width:100%;overflow:scroll;'><table class='table-bordered'>");
baseStats.append("<tr><th colspan='" + 100 + "' style='text-align:center;'>Base Stats</th></tr>");

baseStats.append("<th>&mu;</th><td>" + Utils.p2d(o.get("mean").getAsDouble())+"</td>");
baseStats.append("<th>&sigma;</th><td>" + Utils.p2d(o.get("sigma").getAsDouble()) + "</td>");

baseStats.append("<th>NAs</th> <td>" + o.get("na").getAsLong() + "</td>");
baseStats.append("<th>zeros</th>");
baseStats.append("<td>" + o.get("zeros").getAsLong() + "</td>");

StringBuilder minmax = new StringBuilder();
int min_count = 0;
Iterator<JsonElement> iter = o.get("min").getAsJsonArray().iterator();
int nCols = 3;
while(iter.hasNext()){
++nCols;
minRow.append("<td>" + Utils.p2d(iter.next().getAsDouble()) + "</td>");
min_count++;
minmax.append("<td>" + Utils.p2d(iter.next().getAsDouble()) + "</td>");
}
baseStats.append("<th>min[" + min_count + "]</th>");
baseStats.append(minmax.toString());

baseStats.append("<th>max[" + min_count + "]</th>");
iter = o.get("max").getAsJsonArray().iterator();
while(iter.hasNext())maxRow.append("<td>" + Utils.p2d(iter.next().getAsDouble()) + "</td>");
StringBuilder firstRow = new StringBuilder("<tr><th colspan='" + nCols + "' style='text-align:center;'>Base Stats</th>");
while(iter.hasNext()) baseStats.append("<td>" + Utils.p2d(iter.next().getAsDouble()) + "</td>");
baseStats.append("</tr> </table>");
baseStats.append("</div>");

sb.append( baseStats.toString());

StringBuilder threshold = new StringBuilder();
StringBuilder value = new StringBuilder();
if(o.has("percentiles")){
firstRow.append("<th colspan='12' style='text-align:center;border-left-style:solid; borde-left:1px;border-left-color:#ddd;'>Percentiles</th>");
JsonObject percentiles = o.get("percentiles").getAsJsonObject();
JsonArray thresholds = percentiles.get("thresholds").getAsJsonArray();
JsonArray values = percentiles.get("values").getAsJsonArray();
Iterator<JsonElement> tIter = thresholds.iterator();
Iterator<JsonElement> vIter = values.iterator();
minRow.append("<th style='border-left-style:solid; borde-left:1px;border-left-color:#ddd;'>Threshold</th>");
maxRow.append("<th style='border-left-style:solid; borde-left:1px;border-left-color:#ddd;'>Value</th>");

threshold.append("<tr><th>Threshold</th>");
value.append("<tr><th>Value</th>");
while(tIter.hasNext() && vIter.hasNext()){
minRow.append("<td>" + tIter.next().getAsString() + "</td>");
maxRow.append("<td>" + Utils.p2d(vIter.next().getAsDouble()) + "</td>");
threshold.append("<td>" + tIter.next().getAsString() + "</td>");
value.append("<td>" + Utils.p2d(vIter.next().getAsDouble()) + "</td>");
}
threshold.append("</tr>");
value.append("</tr>");

sb.append("<div style='width:100%;overflow:scroll;'><table class='table-bordered'>");
sb.append("<th colspan='12' style='text-align:center;'>Percentiles</th>");
sb.append(threshold.toString());
sb.append(value.toString());
sb.append("</table>");
sb.append("</div>");
}
firstRow.append("</tr>");
minRow.append("</tr>");
maxRow.append("</tr>");
sb.append("<div style='width:100%;overflow:scroll;'><table>");
sb.append(firstRow.toString());
sb.append(minRow.toString());
sb.append(maxRow.toString());

} else {
// this should be the _enum case, in which I want to report NA count
sb.append("<div style='width:100%;overflow:scroll;'><table class='table-bordered'>");
sb.append("<tr><th colspan='" + 4 + "' style='text-align:center;'>Base Stats</th></tr>");
// na row
sb.append("<tr><th>NAs</th> <td>" + o.get("na").getAsLong() + "</td>");
sb.append("<th>cardinality</th> <td>" + o.get("enumCardinality").getAsLong() + "</td></tr>");
sb.append("</table></div>");
}
sb.append("<h5>Histogram</h5>");
// sb.append("<h5>Histogram</h5>");
JsonObject histo = o.get("histogram").getAsJsonObject();
JsonArray bins = histo.get("bins").getAsJsonArray();
JsonArray names = histo.get("bin_names").getAsJsonArray();
Expand All @@ -90,7 +120,7 @@ public static String link(Key k, String s){
StringBuilder p = new StringBuilder("<tr>");
int i = 0;
while(bIter.hasNext() && nIter.hasNext() && i++ < MAX_HISTO_BINS_DISPLAYED){
n.append("<td>" + nIter.next().getAsString() + "</td>");
n.append("<th>" + nIter.next().getAsString() + "</th>");
long cnt = bIter.next().getAsLong();
b.append("<td>" + cnt + "</td>");
p.append(String.format("<td>%.1f%%</td>",(100.0*cnt/N)));
Expand All @@ -100,7 +130,9 @@ public static String link(Key k, String s){
n.append("</tr>\n");
b.append("</tr>\n");
p.append("</tr>\n");
sb.append("<div style='width:100%;overflow:scroll;'><table>" + n.toString() + b.toString() + p.toString() + "</table></div>");
sb.append("<div style='width:100%;overflow:scroll;'><table class='table-bordered'>");
sb.append("<thead> <th colspan=" + (MAX_HISTO_BINS_DISPLAYED + 1) + " style='text-align:center;'>Histogram </th> </thead>");
sb.append(n.toString() + b.toString() + p.toString() + "</table></div>");
sb.append("\n</div>\n");
}
sb.append("</div>");
Expand Down
98 changes: 96 additions & 2 deletions src/test/java/hex/SummaryTest.java
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
package hex;

import static org.junit.Assert.assertEquals;

import org.junit.BeforeClass;
import org.junit.Test;

import water.*;
import hex.ColSummaryTask;

public class SummaryTest extends TestUtil {

Expand All @@ -13,18 +14,111 @@ public class SummaryTest extends TestUtil {
// ==========================================================================
/**
 * Summarizes every column of the constant-column dataset and checks that the
 * first column lands in a single bin holding all rows, with no NAs and no zeros.
 */
@Test public void testConstSummary() {
  Key vkey = loadAndParseFile("con.hex","./smalldata/constantColumn.csv");
  try {
    ValueArray ary = UKV.get(vkey);
    int[] cols = new int[ary.numCols()];
    for( int i=0; i<cols.length; i++ ) cols[i]=i;
    Summary sum = new ColSummaryTask(ary,cols).invoke(vkey).result();
    // Exercise JSON rendering for every column; the result itself is not inspected.
    for( int i=0; i<cols.length; i++ ) {
      System.out.println("col "+i);
      sum._sums[i].toJson();
    }

    Summary.ColSummary csum = sum._sums[0];
    assertEquals(1,csum._bins.length);
    assertEquals(ary.length(),csum._bins[0]);
    assertEquals(0, csum._n_na);
    assertEquals(0, csum._nzero);
  } finally {
    // Remove the parsed key even when an assertion above fails, matching the
    // cleanup style of the other tests in this class.
    if (vkey != null)
      UKV.remove(vkey);
  }
}


/**
 * Summarizes the numeric columns {@code zeroone} and {@code zerotwo} of the
 * percentiles test dataset and checks their bin, NA and zero counts.
 *
 * NOTE(review): this method has no {@code @Test} annotation, so a JUnit 4 runner
 * will not execute it -- confirm whether that is intentional before enabling it.
 */
public void testNonConstSummary(){
  Key vkey = loadAndParseFile("enum_test.hex","./smalldata/test/test_percentiles_distns.csv.gz");
  try {
    ValueArray array = UKV.get(vkey);
    int[] cols = new int[ 2 ];

    // search for columns zeroone and zerotwo; scan every column of the
    // dataset, not just the first cols.length of them
    String[] colnames = array.colNames();
    for( int i=0; i<colnames.length; i++ ){
      if ( colnames[ i ].equals("zeroone") )
        cols[ 0 ] = i;
      else if ( colnames[ i ].equals( "zerotwo" ) )
        cols[ 1 ] = i;
    }

    Summary sum = new ColSummaryTask(array,cols).invoke(vkey).result();
    for( int i=0; i<cols.length; i++ ) {
      // toJson() reads through _summary, so it is wired up here first
      sum._sums[i]._summary = sum;
      sum._sums[i].toJson();
    }

    // column zeroone
    Summary.ColSummary csum = sum._sums[0];
    assertEquals(2, csum._bins.length);
    assertEquals(0, csum._n_na);
    assertEquals(520, csum._nzero);
    assertEquals(520, csum._bins[ 0 ]);
    assertEquals(480, csum._bins[ 1 ]);

    // column zerotwo
    csum = sum._sums[1];
    assertEquals(3, csum._bins.length);
    assertEquals(0, csum._n_na);
    assertEquals(334, csum._nzero);
    assertEquals(334, csum._bins[ 0 ]);
    assertEquals(329, csum._bins[ 1 ]);
    // third bin: the original re-checked _bins[1] against a different value
    assertEquals(337, csum._bins[ 2 ]);
  } finally {
    if (vkey != null)
      UKV.remove( vkey );
  }
}

/**
 * Summarizes the string columns {@code zerooneF} and {@code zerotwoF} of the
 * percentiles test dataset and checks their bin counts, enum cardinality and
 * NA counts.
 *
 * NOTE(review): this method has no {@code @Test} annotation, so a JUnit 4 runner
 * will not execute it -- confirm whether that is intentional before enabling it.
 */
public void testEnumSummary(){
  Key vkey = loadAndParseFile("enum_test.hex","./smalldata/test/test_percentiles_distns.csv.gz");
  try {
    ValueArray array = UKV.get(vkey);
    int[] cols = new int[ 2 ];

    // search for columns zerooneF and zerotwoF; scan every column of the
    // dataset, not just the first cols.length of them
    String[] colnames = array.colNames();
    for( int i=0; i<colnames.length; i++ ){
      if ( colnames[ i ].equals("zerooneF") )
        cols[ 0 ] = i;
      else if ( colnames[ i ].equals( "zerotwoF" ) )
        cols[ 1 ] = i;
    }

    Summary sum = new ColSummaryTask(array,cols).invoke(vkey).result();
    for( int i=0; i<cols.length; i++ ) {
      // toJson() reads through _summary, so it is wired up here first
      sum._sums[i]._summary = sum;
      sum._sums[i].toJson();
    }

    // column zerooneF
    Summary.ColSummary csum = sum._sums[0];
    assertEquals(2, csum._bins.length);
    assertEquals(2, csum.getEnumCardinality());
    assertEquals(0, csum._n_na);
    assertEquals(520, csum._bins[ 0 ]);
    assertEquals(480, csum._bins[ 1 ]);

    // column zerotwoF
    csum = sum._sums[1];
    assertEquals(3, csum._bins.length);
    assertEquals(3, csum.getEnumCardinality());
    assertEquals(0, csum._n_na);
    assertEquals(334, csum._bins[ 0 ]);
    assertEquals(329, csum._bins[ 1 ]);
    // third bin: the original re-checked _bins[1] against a different value
    assertEquals(337, csum._bins[ 2 ]);
  } finally {
    if (vkey != null)
      UKV.remove( vkey );
  }
}



}

0 comments on commit 93ad42b

Please sign in to comment.