Skip to content

Commit

Permalink
Fix categorical handling to always use bitsets. Was doing ordinal comparison (treating categoricals as numerical) for #factors < #bins.
Browse files Browse the repository at this point in the history

This is related to HEXDEV-319.
  • Loading branch information
arnocandel committed May 19, 2015
1 parent ed59791 commit 79a6989
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions src/main/java/hex/gbm/DHistogram.java
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ public void setMax( float max ) {
old = _maxIn;
}

private static int MAX_FACTOR_BINS=1024; // Allow more bins for factors
public DHistogram( String name, final int nbins, final byte isInt, final float min, final float maxEx, long nelems, int min_rows, boolean doGrpSplit ) {
assert nelems > 0;
assert nbins >= 1;
Expand All @@ -88,17 +89,15 @@ public DHistogram( String name, final int nbins, final byte isInt, final float m
// See if we can show there are fewer unique elements than nbins.
// Common for e.g. boolean columns, or near leaves.
int xbins = nbins;
float step;
if( isInt>0 && maxEx-min <= nbins ) {
if( isInt>0 && maxEx-min <= Math.max(nbins,(isInt==2?MAX_FACTOR_BINS:nbins)) ) {
assert ((long)min)==min; // No overflow
xbins = (char)((long)maxEx-(long)min); // Shrink bins
assert xbins > 1; // Caller ensures enough range to bother
step = 1.0f; // Fixed stepsize
_step = 1.0f; // Fixed stepsize
} else {
step = (maxEx-min)/nbins; // Step size for linear interpolation
assert step > 0;
_step = nbins/(maxEx-min);
assert _step > 0 && !Float.isInfinite(_step);
}
_step = 1.0f/step; // Use multiply instead of division during frequent binning math
_nbin = (char)xbins;
// Do not allocate the big arrays here; wait for scoreCols to pick which cols will be used.
}
Expand Down Expand Up @@ -174,7 +173,8 @@ void add( TDH dsh ) {
/**
 * Computes a bound that lies just above {@code maxIn}.
 *
 * <p>The increment is one ulp of {@code maxIn}; when {@code isInt > 0} the
 * increment is raised to at least 1. If adding the increment overflows to
 * infinity, {@code maxIn} itself is returned so the result never becomes
 * infinite because of the increment.
 *
 * @param maxIn the largest value seen
 * @param isInt values greater than 0 force an increment of at least 1
 *              (presumably marks integer/categorical columns; confirm against callers)
 * @return {@code maxIn} plus the increment, or {@code maxIn} if that sum is infinite
 */
static public float find_maxEx(float maxIn, int isInt ) {
  float ulp = Math.ulp(maxIn);
  if( isInt > 0 && 1 > ulp ) ulp = 1; // Step by at least 1 when isInt is set
  final float res = maxIn+ulp;
  // Near Float.MAX_VALUE the sum overflows; fall back to maxIn in that case.
  return Float.isInfinite(res) ? maxIn : res;
}

// Compute a "score" for a column; lower score "wins" (is a better split).
Expand Down

0 comments on commit 79a6989

Please sign in to comment.