Skip to content

Commit

Permalink
Add a non-modeling column type, the UUID
Browse files Browse the repository at this point in the history
New column type UUID.
Parse the Java fixed-format UUID.
Display in browser; export to CSV.
Remove as default-available modeling column from most algos.
Add a generic C16Chunk holding 2 longs, available as vec.at16l() and
vec.at16h() and their usual Chunk equivalents.
PrettyPrint.UUID(lo,hi) for printing.
Fixed bugs with all the various frame splitter utilities dropping the
"time" value (thus converting a time column into an integer column).
Copy the UUID flag through as needed.
Clean up numerous IntelliJ IDEA warnings
  • Loading branch information
cliffclick committed Jun 9, 2014
1 parent 7f7f0e7 commit 3ec9edd
Show file tree
Hide file tree
Showing 24 changed files with 441 additions and 127 deletions.
2 changes: 1 addition & 1 deletion prj.el
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
'(jde-run-option-debug nil)
'(jde-run-option-vm-args (quote ("-XX:+PrintGC")))
'(jde-compile-option-directory "./target/classes")
'(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "hex.gbm.GBMTest")))
'(jde-run-option-application-args (quote ("-beta" "-mainClass" "org.junit.runner.JUnitCore" "water.fvec.ParseTimeTest")))
'(jde-debugger (quote ("JDEbug")))
'(jde-compile-option-source (quote ("1.6")))
'(jde-compile-option-classpath (quote ("./target/classes" "./lib/javassist.jar" "./lib/hadoop/cdh4/hadoop-common.jar" "./lib/hadoop/cdh4/hadoop-auth.jar" "./lib/hadoop/cdh4/slf4j-api-1.6.1.jar" "./lib/hadoop/cdh4/slf4j-nop-1.6.1.jar" "./lib/hadoop/cdh4/hadoop-hdfs.jar" "./lib/hadoop/cdh4/protobuf-java-2.4.0a.jar" "./lib/apache/commons-codec-1.4.jar" "./lib/apache/commons-configuration-1.6.jar" "./lib/apache/commons-lang-2.4.jar" "./lib/apache/commons-logging-1.1.1.jar" "./lib/apache/httpclient-4.1.1.jar" "./lib/apache/httpcore-4.1.jar" "./lib/junit/junit-4.11.jar" "./lib/apache/guava-12.0.1.jar" "./lib/gson/gson-2.2.2.jar" "./lib/poi/poi-3.8-20120326.jar" "./lib/poi/poi-ooxml-3.8-20120326.jar" "./lib/poi/poi-ooxml-schemas-3.8-20120326.jar" "./lib/poi/dom4j-1.6.1.jar" "./lib/Jama/Jama.jar" "./lib/s3/aws-java-sdk-1.3.27.jar" "./lib/log4j/log4j-1.2.15.jar" "./lib/joda/joda-time-2.3.jar")))
Expand Down
36 changes: 20 additions & 16 deletions smalldata/test/test_uuid.csv
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
1,9ff4ed3a-6b00-4130-9aca-2ed897305fd1
2,ac1e1ca3-5ca8-438a-85a4-8175ed5bb7ec
3,6870f256-e145-4d75-adb0-99ccb77d5d3a
4,d8da52c1-d145-4dff-b3d1-127c6eb75d40
5,25ce1456-546d-4e35-bddc-d571b26581ea
6,2e1d193f-d1da-4664-8a2b-ffdfe0aa7be3
1000010407,89e68530-422e-43ba-bd00-aa3d8f2cfcaa
1000024046,4055a53b-411f-46f0-9d2e-cf03bc95c080
1000054511,49d14d8e-5c42-439d-b4a8-995e25b1602f
1000065922,4e31b8aa-4aa9-4e8b-be8f-5cc6323235b4
1000066478,2e1d193f-d1da-4664-8a2b-ffdfe0aa7be3
1000067268,25ce1456-546d-4e35-bddc-d571b26581ea
100007536,d8da52c1-d145-4dff-b3d1-127c6eb75d40
1000079839,6870f256-e145-4d75-adb0-99ccb77d5d3a
10000913,ac1e1ca3-5ca8-438a-85a4-8175ed5bb7ec
1000104538,9ff4ed3a-6b00-4130-9aca-2ed897305fd1
1,9ff4ed3a-6b00-4130-9aca-2ed897305fd1,1
2,ac1e1ca3-5ca8-438a-85a4-8175ed5bb7ec,1
3,6870f256-e145-4d75-adb0-99ccb77d5d3a,0
4,d8da52c1-d145-4dff-b3d1-127c6eb75d40,1
5,25ce1456-546d-4e35-bddc-d571b26581ea,0
6,2e1d193f-d1da-4664-8a2b-ffdfe0aa7be3,0
1000010407,89e68530-422e-43ba-bd00-aa3d8f2cfcaa,1
1000024046,4055a53b-411f-46f0-9d2e-cf03bc95c080,0
1000054511,49d14d8e-5c42-439d-b4a8-995e25b1602f,0
1000065922,4e31b8aa-4aa9-4e8b-be8f-5cc6323235b4,0
1000066478,2e1d193f-d1da-4664-8a2b-ffdfe0aa7be3,0
1000067268,25ce1456-546d-4e35-bddc-d571b26581ea,0
100007536,d8da52c1-d145-4dff-b3d1-127c6eb75d40,1
1000079839,6870f256-e145-4d75-adb0-99ccb77d5d3a,0
10000913,ac1e1ca3-5ca8-438a-85a4-8175ed5bb7ec,0
1000104538,9ff4ed3a-6b00-4130-9aca-2ed897305fd1,1
7,00000000-0000-0000-0000-000000000000,0
8,80000000-0000-0000-0000-000000000000,0
9,FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF,1

4 changes: 3 additions & 1 deletion src/main/java/hex/FrameExtractor.java
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,12 @@ protected Vec[][] makeTemplates() {
final int num = dataset.numCols(); // number of columns in input frame
final int nsplits = espcPerSplit.length; // number of splits
final String[][] domains = dataset.domains(); // domains
final boolean[] uuids = dataset.uuids();
final byte[] times = dataset.times();
Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all
for (int i=0; i<nsplits; i++) {
// vectors for j-th split
t[i] = new Vec(Vec.newKey(),espcPerSplit[i/*-th split*/]).makeZeros(num, domains);
t[i] = new Vec(Vec.newKey(),espcPerSplit[i/*-th split*/]).makeZeros(num, domains, uuids, times);
}
return t;
}
Expand Down
6 changes: 4 additions & 2 deletions src/main/java/hex/FrameSplitter.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,10 +128,12 @@ private Vec[][] makeTemplates(Frame dataset, float[] ratios) {
final int num = dataset.numCols(); // number of columns in input frame
final int nsplits = espcPerSplit.length; // number of splits
final String[][] domains = dataset.domains(); // domains
final boolean[] uuids = dataset.uuids();
final byte [] times = dataset.times();
Vec[][] t = new Vec[nsplits][/*num*/]; // resulting vectors for all
for (int i=0; i<nsplits; i++) {
// vectors for j-th split
t[i] = new Vec(Vec.newKey(),espcPerSplit[i/*-th split*/]).makeZeros(num, domains);
t[i] = new Vec(Vec.newKey(),espcPerSplit[i/*-th split*/]).makeZeros(num, domains, uuids, times);
}
return t;
}
Expand Down Expand Up @@ -191,7 +193,7 @@ public FrameSplitTask(H2OCountedCompleter completer, Vec[] srcVecs, float[] rati
int nrows = cs[0]._len;
// For each output chunk extract appropriate rows for partIdx-th part
for (int i=0; i<cs.length; i++) {
// WARNING: this implementation does not preserver co-location of chunks so we are forcing here network transfer!
// WARNING: this implementation does not preserve co-location of chunks so we are forcing here network transfer!
ChunkSplitter.extractChunkPart(_srcVecs[i].chunkForChunkIdx(cinidx), cs[i], startRow, nrows, _fs);
}
}
Expand Down
21 changes: 3 additions & 18 deletions src/main/java/hex/Summary2.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@
import water.api.Request.API;
import water.fvec.*;
import water.exec.Flow;
import water.parser.*;
import water.util.Utils;
import water.util.Log;

import java.util.Arrays;
import java.util.Random;

/**
* Summary of a column.
Expand Down Expand Up @@ -131,6 +129,7 @@ public BasicStat add(Chunk chk) {
for(int i = 0; i < chk._len; i++) {
double val;
if (chk.isNA0(i)) { _nas++; continue; }
if( chk._vec.isUUID() ) continue;
if (Double.isNaN(val = chk.at0(i))) { _nans++; continue; }
if (val == Double.POSITIVE_INFINITY) _pinfs++;
else if (val == Double.NEGATIVE_INFINITY) _ninfs++;
Expand Down Expand Up @@ -167,21 +166,6 @@ public BasicStat finishUp() {
return this;
}

/**
* @return number of filled elements, excluding NaN's as well.
*/
public long len1() {
return _len - _nas - _nans;
}
/**
* Returns whether the fill density is less than the given percent.
* @param pct target percent.
* @param nan if true then NaN is counted as missing.
* @return true if less than {@code pct} of rows are filled. */
public boolean isSparse(double pct, boolean nan) {
assert 0 < pct && pct <= 1;
return (double)(_len - _nas - (nan?_nans:0)) / _len < pct;
}
}

public static class PrePass extends MRTask2<PrePass> {
Expand Down Expand Up @@ -312,7 +296,7 @@ public void finishUp(Vec vec) {
public Summary2(Vec vec, String name, BasicStat stat0, int max_qbins) {
colname = name;
_stat0 = stat0;
_type = vec.isEnum()?2:vec.isInt()?1:0;
_type = vec.isEnum()?T_ENUM:vec.isInt()?T_INT:T_REAL;
_domain = vec.isEnum() ? vec.domain() : null;
_gprows = 0;
double sigma = Double.isNaN(vec.sigma()) ? 0 : vec.sigma();
Expand Down Expand Up @@ -469,6 +453,7 @@ public Summary2(Vec vec, String name, BasicStat stat0) {
}

public Summary2 add(Chunk chk) {
if( chk._vec.isUUID() ) return this;
for (int i = 0; i < chk._len; i++)
add(chk.at0(i));
return this;
Expand Down
27 changes: 13 additions & 14 deletions src/main/java/water/Job.java
Original file line number Diff line number Diff line change
Expand Up @@ -201,11 +201,6 @@ public boolean isCancelledOrCrashed() {
return state == JobState.CANCELLED || state == JobState.FAILED;
}

/** Returns true if the job was cancelled by the user.
* @return true if the job is in state {@link JobState#CANCELLED}.
*/
public boolean isCancelledXX() { return state == JobState.CANCELLED; }

/** Returns true if the job was terminated by unexpected exception.
* @return true, if the job was terminated by unexpected exception.
*/
Expand Down Expand Up @@ -273,13 +268,10 @@ public void remove() {
* @param jobkey job key
* @return returns a job with given job key or null if a job is not found.
*/
public static final Job findJob(final Key jobkey) {
Job job = UKV.get(jobkey);
return job;
}
public static Job findJob(final Key jobkey) { return UKV.get(jobkey); }

/** Finds a job with given dest key or returns null */
public static final Job findJobByDest(final Key destKey) {
public static Job findJobByDest(final Key destKey) {
Job job = null;
for( Job current : Job.all() ) {
if( current.dest().equals(destKey) ) {
Expand Down Expand Up @@ -372,7 +364,7 @@ public static void waitUntilJobEnded(Key jobkey, int pollingIntervalMillis) {
return;
}

try { Thread.sleep (pollingIntervalMillis); } catch (Exception xe) {}
try { Thread.sleep (pollingIntervalMillis); } catch (Exception ignore) {}
}
}

Expand All @@ -390,8 +382,7 @@ public static class ChunkProgress extends Iced implements Progress {
final long _count;
private final Status _status;
final String _error;
protected DException _ex;
public enum Status { Computing, Done, Cancelled, Error };
public enum Status { Computing, Done, Cancelled, Error }

public Status status() { return _status; }

Expand Down Expand Up @@ -587,6 +578,14 @@ else if(val.length>0) {
if (!isEmpty(ignored_cols_by_name)) { specified++; }
if (specified > 1) throw new IllegalArgumentException("Arguments 'cols', 'ignored_cols_by_name', and 'ignored_cols' are exclusive");

Vec[] vecs = source.vecs();
for( int i = 0; i < vecs.length; i++ )
if( vecs[i].isUUID() ) {
if( ignored_cols==null ) ignored_cols = new int[0];
ignored_cols = Arrays.copyOf(ignored_cols,ignored_cols.length+1);
ignored_cols[ignored_cols.length-1] = i;
}

// If the column are not specified, then select everything.
if (isEmpty(cols)) {
cols = new int[source.vecs().length];
Expand Down Expand Up @@ -884,7 +883,7 @@ final protected void genericCrossValidation(Frame[] splits, long[] offsets, int
protected String[] getVectorDomain(final Vec v) {
assert v==null || v.isInt() || v.isEnum() : "Cannot get vector domain!";
if (v==null) return null;
String[] r = null;
String[] r;
if (v.isEnum()) {
r = v.domain();
} else {
Expand Down
11 changes: 11 additions & 0 deletions src/main/java/water/PrettyPrint.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,4 +32,15 @@ public static String bytesPerSecond(long bytes) {
if( bytes < 0 ) return "N/A";
return bytes(bytes)+"/S";
}

/**
 * Formats a 128-bit value, supplied as two longs, in the 8-4-4-4-12
 * hexadecimal UUID layout using upper-case hex digits.
 *
 * @param lo the 64 bits printed first (the leading 8-4-4 digit groups)
 * @param hi the 64 bits printed last (the trailing 4-12 digit groups)
 * @return the formatted UUID text
 */
public static String UUID( long lo, long hi ) {
  final long group1 = lo >>> 32;                // upper 32 bits of lo
  final long group2 = (lo >>> 16) & 0xFFFFL;    // next 16 bits of lo
  final long group3 = lo & 0xFFFFL;             // lowest 16 bits of lo
  final long group4 = hi >>> 48;                // upper 16 bits of hi
  final long group5 = hi & 0xFFFFFFFFFFFFL;     // lowest 48 bits of hi
  return String.format("%08X-%04X-%04X-%04X-%012X", group1, group2, group3, group4, group5);
}

}
19 changes: 14 additions & 5 deletions src/main/java/water/api/Inspect2.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,13 @@ public class Inspect2 extends Request2 {

// An internal JSON-output-only class
static class ColSummary extends Iced {
public static enum ColType { Enum, Int, Real, Time };
public static enum ColType { Enum, Int, Real, Time, UUID };
static final int API_WEAVER=1; // This file has auto-gen'd doc & json fields
static public DocGen.FieldDoc[] DOC_FIELDS; // Initialized from Auto-Gen code.
public ColSummary( String name, Vec vec ) {
this.name = name;
this.type = vec.isEnum() ? ColType.Enum : vec.isInt() ? (vec.isTime() ? ColType.Time : ColType.Int) : ColType.Real;
this.type = vec.isEnum() ? ColType.Enum : vec.isUUID() ? ColType.UUID : (vec.isInt() ? (vec.isTime() ? ColType.Time : ColType.Int) : ColType.Real);
boolean numeric = !vec.isEnum() && !vec.isUUID();
this.min = vec.isEnum() ? Double.NaN : vec.min();
this.max = vec.isEnum() ? Double.NaN : vec.max();
this.mean = vec.isEnum() ? Double.NaN : vec.mean();
Expand Down Expand Up @@ -194,7 +195,10 @@ public static Response redirect(Request req, String src_key) {
sb.append("<tr class='warning'>");
sb.append("<td>").append("Mean").append("</td>");
for( int i=0; i<cols.length; i++ )
sb.append("<td>").append(cols[i].type == ColType.Enum ? NA : mean_dformat.format(cols[i].mean)).append("</td>");
sb.append("<td>").append((cols[i].type == ColType.Enum) ||
(cols[i].type == ColType.UUID)
? NA
: mean_dformat.format(cols[i].mean)).append("</td>");
sb.append("</tr>");

// Cardinality row is shown only if dataset contains enum-column
Expand Down Expand Up @@ -259,8 +263,13 @@ public static Response redirect(Request req, String src_key) {
}

// ---
// Return a well-formated string for this kind of Vec
public static String x0( Vec v, long row ) { return x1(v,row,v.at(row)); }
// Return a well-formatted string for the value at this row of the Vec.
// Non-UUID columns are formatted by x1 from the row's double value.  A
// missing UUID is also handed to x1 (as NaN); any other UUID is rendered
// from its two 64-bit halves as bold monospace HTML.
public static String x0( Vec v, long row ) {
  if( v.isUUID() ) {
    if( v.isNA(row) ) return x1(v,row,Double.NaN);
    String uuid = PrettyPrint.UUID(v.at16l(row),v.at16h(row));
    return "<b style=\"font-family:monospace;\">"+uuid+"</b>";
  }
  return x1(v,row,v.at(row));
}

// Format a row, OR the min/max
public static String x1( Vec v, long row, double d ) {
Expand Down
20 changes: 17 additions & 3 deletions src/main/java/water/api/RequestArguments.java
Original file line number Diff line number Diff line change
Expand Up @@ -2617,6 +2617,18 @@ protected Frame fr() {
/** A Class Vec/Column within a Frame. Limited to 1000 classes, just to prevent madness. */
public class FrameClassVec extends FrameKeyVec {
public FrameClassVec(String name, TypeaheadKey key ) { super(name, key); }
/** Returns the names of the frame's columns that pass every filter: min is
 *  strictly below max, at most 10% of the rows are NA, and the column is
 *  not a UUID. */
@Override protected String[] selectValues() {
  final Vec[] columns = fr().vecs();
  final String[] picked = new String[columns.length];
  int npicked = 0;
  for( int c = 0; c < columns.length; c++ ) {
    final Vec col = columns[c];
    if( !(col.min() < col.max()) ) continue;   // ignore constant columns
    final double naFraction = ((double)col.naCnt())/col.length();
    if( !(naFraction <= 0.1) ) continue;       // ignore columns with too many NAs
    if( col.isUUID() ) continue;               // No math on strings or UUIDs
    picked[npicked++] = fr()._names[c];
  }
  return Arrays.copyOf(picked, npicked);
}
@Override protected Vec defaultValue() {
Frame fr = fr();
return fr != null ? fr.vecs()[fr.vecs().length - 1] : null;
Expand Down Expand Up @@ -2649,7 +2661,8 @@ public void setResponse(FrameClassVec response) {
addPrerequisite(response);
}
public boolean shouldIgnore(int i, Frame fr ) {
return _response != null && _response.value() == fr.vecs()[i];
return (_response != null && _response.value() == fr.vecs()[i]) ||
fr.vecs()[i].isUUID();
}
public void checkLegality(Vec v) throws IllegalArgumentException { }
protected Comparator<Integer> colComp(final ValueArray ary){
Expand Down Expand Up @@ -2713,9 +2726,10 @@ protected Comparator<Integer> colComp(final ValueArray ary){
final Vec [] vecs = fr().vecs();
int [] res = new int[vecs.length];
int j = 0;
for(int i = 0; i < vecs.length; ++i){
for( int i = 0; i < vecs.length; ++i) {
if(!(vecs[i].min() < vecs[i].max()) ||
(_filterNAs && ((double)vecs[i].naCnt())/vecs[i].length() > 0.1))
(_filterNAs && ((double)vecs[i].naCnt())/vecs[i].length() > 0.1) ||
vecs[i].isUUID() ) // No math on strings or UUIDs
res[j++] = i; // ignore constant columns and columns with too many NAs
}
return Arrays.copyOf(res, j);
Expand Down
15 changes: 12 additions & 3 deletions src/main/java/water/fvec/AppendableVec.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ public class AppendableVec extends Vec {
public static final byte ENUM = 2;
public static final byte NUMBER = 4;
public static final byte TIME = 8;
public static final byte UUID =16;
byte [] _chunkTypes;
long _naCnt;
long _strCnt;
Expand Down Expand Up @@ -97,11 +98,12 @@ public Vec close(Futures fs) {
int nchunk = _espc.length;
while( nchunk > 0 && _espc[nchunk-1] == 0 ) nchunk--;
DKV.remove(chunkKey(nchunk)); // remove potential trailing key
boolean hasNumber = false, hasEnum = false, hasTime=false;
boolean hasNumber = false, hasEnum = false, hasTime=false, hasUUID=false;
for( int i = 0; i < nchunk; ++i ) {
if( (_chunkTypes[i] & TIME ) != 0 ) { hasNumber = true; hasTime=true; }
if( (_chunkTypes[i] & NUMBER) != 0 ) hasNumber = true;
if( (_chunkTypes[i] & ENUM ) != 0 ) hasEnum = true;
if( (_chunkTypes[i] & UUID ) != 0 ) hasUUID = true;
}
// number wins, we need to go through the enum chunks and declare them all
// NAs (chunk is considered enum iff it has only enums + possibly some nas)
Expand All @@ -110,6 +112,14 @@ public Vec close(Futures fs) {
if(_chunkTypes[i] == ENUM)
DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
}
// enum wins over UUID
if( hasUUID && hasEnum ) {
hasUUID=false;
for(int i = 0; i < nchunk; ++i)
if(_chunkTypes[i] == UUID)
DKV.put(chunkKey(i), new C0DChunk(Double.NaN, (int)_espc[i]),fs);
}

// Make sure time is consistent
int t = -1;
if( hasTime ) {
Expand Down Expand Up @@ -138,8 +148,7 @@ public Vec close(Futures fs) {
}
espc[nchunk]=x; // Total element count in last
// Replacement plain Vec for AppendableVec.
Vec vec = new Vec(_key, espc, _domain);
vec._time = (byte)t; // Time parse, if any
Vec vec = new Vec(_key, espc, _domain, hasUUID, (byte)t);
DKV.put(_key,vec,fs); // Inject the header
return vec;
}
Expand Down
Loading

0 comments on commit 3ec9edd

Please sign in to comment.