Skip to content

Commit 524bef3

Browse files
committed
Add -many_cols (same as -chunk_bytes 24).
Rename -chunk_bits to -chunk_bytes.
1 parent 53c7872 commit 524bef3

File tree

3 files changed

+51
-20
lines changed

3 files changed

+51
-20
lines changed

hadoop/src/main/java/water/hadoop/h2odriver.java

+15-7
Original file line number | Diff line number | Diff line change
@@ -47,7 +47,8 @@ public class h2odriver extends Configured implements Tool {
4747
static int cloudFormationTimeoutSeconds = DEFAULT_CLOUD_FORMATION_TIMEOUT_SECONDS;
4848
static int nthreads = -1;
4949
static int basePort = -1;
50-
static int chunk_bits;
50+
static boolean manyCols = false;
51+
static int chunk_bytes;
5152
static int data_max_factor_levels;
5253
static boolean beta = false;
5354
static boolean enableRandomUdpDrop = false;
@@ -391,8 +392,9 @@ static void usage() {
391392
" -n | -nodes <number of H2O nodes (i.e. mappers) to create>\n" +
392393
" [-nthreads <maximum typical worker threads, i.e. cpus to use>]\n" +
393394
" [-baseport <starting HTTP port for H2O nodes; default is 54321>]\n" +
394-
" [-chunk_bits <bits per chunk (e.g., 22 for 4MB chunks)>]\n" +
395-
" [-data_max_factor_levels <max. number of factors per column (e.g., 65000)>]\n" +
395+
" [-many_cols] (improve handling of high-dimensional datasets, same as -chunk_bytes 24)\n" +
396+
" [-chunk_bytes <log (base 2) of chunk size in bytes (e.g., default is 22 for 4MB chunks)>]\n" +
397+
" [-data_max_factor_levels <max. number of factors per column (e.g., default is 65000)>]\n" +
396398
" [-ea]\n" +
397399
" [-verbose:gc]\n" +
398400
" [-XX:+PrintGCDetails]\n" +
@@ -547,9 +549,12 @@ else if (s.equals("-nthreads")) {
547549
i++; if (i >= args.length) { usage(); }
548550
nthreads = Integer.parseInt(args[i]);
549551
}
550-
else if (s.equals("-chunk_bits")) {
552+
else if (s.equals("-many_cols")) {
553+
manyCols = true;
554+
}
555+
else if (s.equals("-chunk_bytes")) {
551556
i++; if (i >= args.length) { usage(); }
552-
chunk_bits = Integer.parseInt(args[i]);
557+
chunk_bytes = Integer.parseInt(args[i]);
553558
}
554559
else if (s.equals("-data_max_factor_levels")) {
555560
i++; if (i >= args.length) { usage(); }
@@ -924,8 +929,11 @@ private int run2(String[] args) throws Exception {
924929
if (beta) {
925930
conf.set(h2omapper.H2O_BETA_KEY, "-beta");
926931
}
927-
if (chunk_bits > 0) {
928-
conf.set(h2omapper.H2O_CHUNKBITS_KEY, Integer.toString(chunk_bits));
932+
if (manyCols) {
933+
conf.set(h2omapper.H2O_MANYCOLS_KEY, "-many_cols");
934+
}
935+
if (chunk_bytes > 0) {
936+
conf.set(h2omapper.H2O_CHUNKBITS_KEY, Integer.toString(chunk_bytes));
929937
}
930938
if (data_max_factor_levels > 0) {
931939
conf.set(h2omapper.H2O_DATAMAXFACTORLEVELS_KEY, Integer.toString(data_max_factor_levels));

hadoop/src/main/java/water/hadoop/h2omapper.java

+13-6
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ public class h2omapper extends Mapper<Text, Text, Text, Text> {
2727
final static public String H2O_BETA_KEY = "h2o.beta";
2828
final static public String H2O_RANDOM_UDP_DROP_KEY = "h2o.random.udp.drop";
2929
final static public String H2O_NTHREADS_KEY = "h2o.nthreads";
30+
final static public String H2O_MANYCOLS_KEY = "h2o.many.cols";
3031
final static public String H2O_CHUNKBITS_KEY = "h2o.chunk.bits";
3132
final static public String H2O_DATAMAXFACTORLEVELS_KEY = "h2o.data.max.factor.levels";
3233
final static public String H2O_BASE_PORT_KEY = "h2o.baseport";
@@ -362,7 +363,8 @@ private int run2(Context context) throws IOException, InterruptedException {
362363
String driverIp = conf.get(H2O_DRIVER_IP_KEY);
363364
String driverPortString = conf.get(H2O_DRIVER_PORT_KEY);
364365
String network = conf.get(H2O_NETWORK_KEY);
365-
String chunkBitsString = conf.get(H2O_CHUNKBITS_KEY);
366+
String manyColsString = conf.get(H2O_MANYCOLS_KEY);
367+
String chunkBytesString = conf.get(H2O_CHUNKBITS_KEY);
366368
String dataMaxFactorLevelsString = conf.get(H2O_DATAMAXFACTORLEVELS_KEY);
367369
String nthreadsString = conf.get(H2O_NTHREADS_KEY);
368370
String basePortString = conf.get(H2O_BASE_PORT_KEY);
@@ -412,11 +414,16 @@ private int run2(Context context) throws IOException, InterruptedException {
412414
argsList.add(Integer.toString(dataMaxFactorLevels));
413415
}
414416
}
415-
if (chunkBitsString != null) {
416-
if (chunkBitsString.length() > 0) {
417-
argsList.add("-chunk_bits");
418-
int chunkBits = Integer.parseInt(chunkBitsString);
419-
argsList.add(Integer.toString(chunkBits));
417+
if (manyColsString != null) {
418+
if (manyColsString.length() > 0) {
419+
argsList.add("-many_cols");
420+
}
421+
}
422+
if (chunkBytesString != null) {
423+
if (chunkBytesString.length() > 0) {
424+
argsList.add("-chunk_bytes");
425+
int chunkBytes = Integer.parseInt(chunkBytesString);
426+
argsList.add(Integer.toString(chunkBytes));
420427
}
421428
}
422429
if (betaString != null) {

src/main/java/water/H2O.java

+23-7
Original file line number | Diff line number | Diff line change
@@ -715,7 +715,8 @@ public static class OptArgs extends Arguments.Opt {
715715
public String version = null;
716716
public String single_precision = null;
717717
public int data_max_factor_levels;
718-
public int chunk_bits;
718+
public String many_cols = null;
719+
public int chunk_bytes;
719720
public String beta = null;
720721
public String mem_watchdog = null; // For developer debugging
721722
public boolean md5skip = false;
@@ -768,9 +769,12 @@ public static void printHelp() {
768769
" from double to single precision to save memory of numerical data.\n" +
769770
" (The default is double precision.)\n" +
770771
"\n" +
771-
" -chunk_bits <integer>\n" +
772-
" The number of bits per chunk.\n" +
773-
" (The default is " + LOG_CHK + ", which is " + PrettyPrint.bytes(1<<LOG_CHK) + ".)\n" +
772+
" -many_cols\n" +
773+
" Enables improved handling of high-dimensional datasets. Same as -chunk_bytes 24.\n" +
774+
"\n" +
775+
" -chunk_bytes <integer>\n" +
776+
" Experimental option. Not in combination with -many_cols. The log (base 2) of chunk size in bytes.\n" +
777+
" (The default is " + LOG_CHK + ", which leads to a chunk size of " + PrettyPrint.bytes(1<<LOG_CHK) + ".)\n" +
774778
"\n" +
775779
" -data_max_factor_levels <integer>\n" +
776780
" The maximum number of factor levels for categorical columns.\n" +
@@ -925,9 +929,21 @@ public static void main( String[] args ) {
925929
Log.info("Max. number of factor levels per column: " + DATA_MAX_FACTOR_LEVELS);
926930
}
927931

928-
if (OPT_ARGS.chunk_bits != 0) {
929-
if (OPT_ARGS.chunk_bits > 0)
930-
LOG_CHK = OPT_ARGS.chunk_bits;
932+
if (OPT_ARGS.chunk_bytes != 0 || OPT_ARGS.many_cols != null) {
933+
if (OPT_ARGS.many_cols != null) {
934+
LOG_CHK = 24;
935+
if (OPT_ARGS.chunk_bytes > 0) {
936+
Log.warn("-chunk_bytes is ignored since -many_cols was set.");
937+
}
938+
} else if (OPT_ARGS.chunk_bytes > 0) {
939+
LOG_CHK = OPT_ARGS.chunk_bytes;
940+
if (OPT_ARGS.chunk_bytes < 22) {
941+
Log.warn("-chunk_bytes < 22 is not officially supported. Use at your own risk.");
942+
}
943+
if (OPT_ARGS.chunk_bytes > 24) {
944+
Log.warn("-chunk_bytes > 24 is not officially supported. Use at your own risk.");
945+
}
946+
}
931947
}
932948
Log.info("Chunk size: " + PrettyPrint.bytes(1<<LOG_CHK));
933949

0 commit comments

Comments
 (0)