diff --git a/R/h2o-package/R/Classes.R b/R/h2o-package/R/Classes.R
index 36e7ac4120..e6f477adca 100644
--- a/R/h2o-package/R/Classes.R
+++ b/R/h2o-package/R/Classes.R
@@ -376,6 +376,32 @@ year.H2OParsedData <- h2o.year
 month <- function(x) UseMethod('month', x)
 month.H2OParsedData <- h2o.month
 
+# Convert a column of an H2OParsedData frame to dates on the H2O cluster.
+#
+# Builds the expression as.Date(<key>,"<format>") and hands it to
+# .h2o.__exec2; the returned dest_key is then passed to .h2o.exec2.
+#
+# x      : H2OParsedData object (its @key and @h2o slots are used).
+# format : a single strptime-style format string, e.g. "%d/%m/%Y".
+# ...    : unused.
+#
+# Returns the object produced by .h2o.exec2 with its logic slot set to FALSE.
+as.Date.H2OParsedData <- function(x, format, ...) {
+  if(missing(format) || !is.character(format) || length(format) != 1L || is.na(format))
+    stop("format must be a string")
+  # The format is sent inside a double-quoted literal, so an embedded double
+  # quote would terminate that literal early.
+  if(grepl('"', format, fixed = TRUE)) stop("format must not contain double quotes")
+
+  # Quote the *value* of format: deparse(substitute(format)) sent the caller's
+  # expression text instead (e.g. `fmt` for as.Date(x, fmt)).
+  expr <- paste0("as.Date(", x@key, ",\"", format, "\")")
+  res <- .h2o.__exec2(x@h2o, expr)
+  res <- .h2o.exec2(res$dest_key, h2o = x@h2o, res$dest_key)
+  res@logic <- FALSE
+  res
+}
+
 diff.H2OParsedData <- function(x, lag = 1, differences = 1, ...) {
   if(!is.numeric(lag)) stop("lag must be numeric")
   if(!is.numeric(differences)) stop("differences must be numeric")
diff --git a/R/tests/testdir_jira/runit_hex_1841_asdate_datemanipulation.R b/R/tests/testdir_jira/runit_hex_1841_asdate_datemanipulation.R
new file mode 100644
index 0000000000..95b15bedd0
--- /dev/null
+++ b/R/tests/testdir_jira/runit_hex_1841_asdate_datemanipulation.R
@@ -0,0 +1,123 @@
+#
+# date parsing and field extraction tests
+#
+
+
+setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
+source('../findNSourceUtils.R')
+
+
+
+datetest <- function(conn){
+  Log.info('uploading date testing dataset')
+  hdf <- h2o.importFile(conn, normalizePath(locate('smalldata/jira/v-11.csv')))
+  # df should be 10 columns: ds1:ds10
+
+  Log.info('data as loaded into h2o:')
+  Log.info(head(hdf))
+
+  # NB: only ds2:ds4 are recognized as dates by the parser on import;
+  # the remaining columns are not understood automatically:
+  #   ds1 holds integer days since epoch (left unconverted in this test);
+  #   ds5:ds10 use other strptime layouts and are converted with as.Date below
+  summary(hdf)
+
+  Log.info('adding date columns')
+  # columns the parser did not recognize as dates are converted explicitly
+  hdf$ds5 <- as.Date(hdf$ds5, "%d/%m/%y %H:%M")
+  hdf$ds6 <- as.Date(hdf$ds6, "%d/%m/%Y %H:%M:%S")
+  hdf$ds7 <- as.Date(hdf$ds7, "%m/%d/%y")
+  hdf$ds8 <- as.Date(hdf$ds8, "%m/%d/%Y")
+  hdf$ds9 <- as.Date(as.factor(hdf$ds9), "%Y%m%d")
+  hdf$ds10 <- as.Date(hdf$ds10, "%Y_%m_%d")
+
+  Log.info('extracting year and month from posix date objects')
+  hdf$year2 <- year(hdf$ds2)
+  hdf$year3 <- year(hdf$ds3)
+  hdf$year4 <- year(hdf$ds4)
+  hdf$year5 <- year(hdf$ds5)
+  hdf$year6 <- year(hdf$ds6)
+  hdf$year7 <- year(hdf$ds7)
+  hdf$year8 <- year(hdf$ds8)
+  hdf$year9 <- year(hdf$ds9)
+  hdf$year10 <- year(hdf$ds10)
+  hdf$mon2 <- month(hdf$ds2)
+  hdf$mon3 <- month(hdf$ds3)
+  hdf$mon4 <- month(hdf$ds4)
+  hdf$mon5 <- month(hdf$ds5)
+  hdf$mon6 <- month(hdf$ds6)
+  hdf$mon7 <- month(hdf$ds7)
+  hdf$mon8 <- month(hdf$ds8)
+  hdf$mon9 <- month(hdf$ds9)
+  hdf$mon10 <- month(hdf$ds10)
+  hdf$idx2 <- year(hdf$ds2) * 12 + month(hdf$ds2)
+  hdf$idx3 <- year(hdf$ds3) * 12 + month(hdf$ds3)
+  hdf$idx4 <- year(hdf$ds4) * 12 + month(hdf$ds4)
+  hdf$idx5 <- year(hdf$ds5) * 12 + month(hdf$ds5)
+  hdf$idx6 <- year(hdf$ds6) * 12 + month(hdf$ds6)
+  hdf$idx7 <- year(hdf$ds7) * 12 + month(hdf$ds7)
+  hdf$idx8 <- year(hdf$ds8) * 12 + month(hdf$ds8)
+  hdf$idx9 <- year(hdf$ds9) * 12 + month(hdf$ds9)
+  hdf$idx10 <- year(hdf$ds10) * 12 + month(hdf$ds10)
+
+  cc <- colnames(hdf)
+  nn <- c( paste('year', 2:10, sep=''), paste('month', 2:10, sep=''), paste('idx', 2:10, sep='') )
+  cc[ (length(cc) - length(nn) + 1):length(cc) ] <- nn
+  colnames(hdf) <- cc
+
+  Log.info('pulling year/month indices local')
+  ldf <- as.data.frame( hdf )
+
+  # build the truth using R internal date fns
+  rdf <- read.csv(locate('smalldata/jira/v-11.csv'))
+  rdf$days1 <- as.Date(rdf$ds1, origin='1970-01-01')
+  rdf$days2 <- as.Date(rdf$ds2, format='%Y-%m-%d')
+  rdf$days3 <- as.Date(rdf$ds3, format='%d-%b-%y')
+  rdf$days4 <- as.Date(rdf$ds4, format='%d-%B-%Y')
+  rdf$days5 <- as.Date(rdf$ds5, format='%d/%m/%y %H:%M')
+  rdf$days6 <- as.Date(rdf$ds6, format='%d/%m/%Y %H:%M:%S')
+  rdf$days7 <- as.Date(rdf$ds7, format='%m/%d/%y')
+  rdf$days8 <- as.Date(rdf$ds8, format='%m/%d/%Y')
+  rdf$days9 <- as.Date(as.factor(rdf$ds9), format='%Y%m%d')
+  rdf$days10 <- as.Date(rdf$ds10, format='%Y_%m_%d')
+
+  months <- data.frame(lapply(rdf[,11:20], function(x) as.POSIXlt(x)$mon))
+  years <- data.frame(lapply(rdf[,11:20], function(x) as.POSIXlt(x)$year))
+  idx <- 12*years + months
+
+  Log.info('testing correctness')
+  expect_that( ldf$year2, equals(years[,2]) )
+  expect_that( ldf$year3, equals(years[,3]) )
+  expect_that( ldf$year4, equals(years[,4]) )
+  expect_that( ldf$year5, equals(years[,5]) )
+  expect_that( ldf$year6, equals(years[,6]) )
+  expect_that( ldf$year7, equals(years[,7]) )
+  expect_that( ldf$year8, equals(years[,8]) )
+  expect_that( ldf$year9, equals(years[,9]) )
+  expect_that( ldf$year10, equals(years[,10]) )
+
+  expect_that( ldf$month2, equals(months[,2]) )
+  expect_that( ldf$month3, equals(months[,3]) )
+  expect_that( ldf$month4, equals(months[,4]) )
+  expect_that( ldf$month5, equals(months[,5]) )
+  expect_that( ldf$month6, equals(months[,6]) )
+  expect_that( ldf$month7, equals(months[,7]) )
+  expect_that( ldf$month8, equals(months[,8]) )
+  expect_that( ldf$month9, equals(months[,9]) )
+  expect_that( ldf$month10, equals(months[,10]) )
+
+  expect_that( ldf$idx2, equals(idx[,2]) )
+  expect_that( ldf$idx3, equals(idx[,3]) )
+  expect_that( ldf$idx4, equals(idx[,4]) )
+  expect_that( ldf$idx5, equals(idx[,5]) )
+  expect_that( ldf$idx6, equals(idx[,6]) )
+  expect_that( ldf$idx7, equals(idx[,7]) )
+  expect_that( ldf$idx8, equals(idx[,8]) )
+  expect_that( ldf$idx9, equals(idx[,9]) )
+  expect_that( ldf$idx10, equals(idx[,10]) )
+
+  testEnd()
+}
+
+
+doTest('date testing', datetest)
diff --git a/R/tests/testdir_jira/runit_v_11_datemanipulation.R b/R/tests/testdir_jira/runit_v_11_datemanipulation.R
deleted file mode 100644
index aeb543bf12..0000000000
--- a/R/tests/testdir_jira/runit_v_11_datemanipulation.R
+++ /dev/null
@@ -1,76 +0,0 @@
-#
-# date parsing and field extraction tests
-#
-
-
-setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f"))) -source('../findNSourceUtils.R') - - - -datetest <- function(conn){ - - Log.info('uploading date testing dataset') - hdf <- h2o.importFile(conn, normalizePath(locate('smalldata/jira/v-11.csv'))) - # df should be 5 columns: ds1:5 - - Log.info('data as loaded into h2o:') - Log.info(head(hdf)) - - # NB: columns 1,5 are currently unsupported as date types - # that is, h2o cannot understand: - # 1 integer days since epoch (or since any other date); - # 2 dates formatted as %d/%m/%y (in strptime format strings) - - Log.info('adding date columns') - # NB: h2o automagically recognizes and if it doesn't recognize, you're out of luck - - Log.info('extracting year and month from posix date objects') - hdf$year2 <- year(hdf$ds2) - hdf$year3 <- year(hdf$ds3) - hdf$year4 <- year(hdf$ds4) - hdf$mon2 <- month(hdf$ds2) - hdf$mon3 <- month(hdf$ds3) - hdf$mon4 <- month(hdf$ds4) - hdf$idx2 <- year(hdf$ds2) * 12 + month(hdf$ds2) - hdf$idx3 <- year(hdf$ds3) * 12 + month(hdf$ds3) - hdf$idx4 <- year(hdf$ds4) * 12 + month(hdf$ds4) - - cc <- colnames(hdf) - nn <- c( paste('year', 2:4, sep=''), paste('month', 2:4, sep=''), paste('idx', 2:4, sep='') ) - cc[ (length(cc) - length(nn) + 1):length(cc) ] <- nn - colnames(hdf) <- cc - - Log.info('pulling year/month indices local') - ldf <- as.data.frame( hdf ) - - # build the truth using R internal date fns - rdf <- read.csv(locate('smalldata/jira/v-11.csv')) - rdf$days1 <- as.Date(rdf$ds1, origin='1970-01-01') - rdf$days2 <- as.Date(rdf$ds2, format='%Y-%m-%d') - rdf$days3 <- as.Date(rdf$ds3, format='%d-%b-%y') - rdf$days4 <- as.Date(rdf$ds4, format='%d-%B-%Y') - rdf$days5 <- as.Date(rdf$ds5, format='%d/%m/%y') - - months <- data.frame(lapply(rdf[,6:10], function(x) as.POSIXlt(x)$mon)) - years <- data.frame(lapply(rdf[,6:10], function(x) as.POSIXlt(x)$year)) - idx <- 12*years + months - - Log.info('testing correctness') - expect_that( ldf$year2, equals(years[,2]) ) - 
expect_that( ldf$year3, equals(years[,3]) ) - expect_that( ldf$year4, equals(years[,4]) ) - - expect_that( ldf$month2, equals(months[,2]) ) - expect_that( ldf$month3, equals(months[,3]) ) - expect_that( ldf$month4, equals(months[,4]) ) - - expect_that( ldf$idx2, equals(idx[,2]) ) - expect_that( ldf$idx3, equals(idx[,3]) ) - expect_that( ldf$idx4, equals(idx[,4]) ) - - testEnd() -} - - -doTest('date testing', datetest) diff --git a/smalldata/jira/v-11.csv b/smalldata/jira/v-11.csv index 3a62240394..81d9ca17bf 100644 --- a/smalldata/jira/v-11.csv +++ b/smalldata/jira/v-11.csv @@ -1,4 +1,4 @@ -"ds1","ds2","ds3","ds4","ds5" -1,"1970-01-02","3-Jan-06","3-January-2006","3/01/06" -1500,"1974-02-09","15-Jul-09","15-July-2009","15/07/09" -15000,"2011-01-26","30-Sep-09","30-September-2009","30/09/09" +"ds1","ds2","ds3","ds4","ds5","ds6","ds7","ds8","ds9","ds10" +1,"1970-01-02","3-Jan-06","3-January-2006","3/01/06 13:30","3/01/2006 13:30:00","1/3/68","1/3/2068","19700102","1970_1_2" +1500,"1974-02-09","15-Jul-09","15-July-2009","15/07/09 1:01","15/07/2009 01:01:30","07/15/69","07/15/1969","19740209","1974_02_09" +15000,"2011-01-26","30-Sep-09","30-September-2009","30/09/09 23:00","30/09/2009 23:00:59","9/30/09","9/30/2009","20110126","2011_1_26" diff --git a/src/main/java/water/exec/AST.java b/src/main/java/water/exec/AST.java index 3defb3f3f3..bb32ead1af 100644 --- a/src/main/java/water/exec/AST.java +++ b/src/main/java/water/exec/AST.java @@ -42,7 +42,7 @@ static AST parseVal(Exec2 E, boolean EOS ) { if( (ast = ASTId .parse(E)) != null ) return ast; if( (ast = ASTNum .parse(E)) != null ) return ast; if( (ast = ASTOp .parse(E)) != null ) return ast; - if( E.peek('"',EOS) ) E.throwErr("The current Exec does not handle strings",E._x); + if( (ast = ASTStr .parse(E)) != null ) return ast; return null; } abstract void exec(Env env); @@ -225,7 +225,9 @@ static AST parse(Exec2 E, boolean EOS ) { AST rows=E.xpeek(',',(x=E._x),parseCXExpr(E, false)); if( rows != null && 
!rows._t.union(Type.dblary()) ) E.throwErr("Must be scalar or array",x); AST cols=E.xpeek(']',(x=E._x),parseCXExpr(E, false)); - if( cols != null && !cols._t.union(Type.dblary()) ) E.throwErr("Must be scalar or array",x); + if( cols != null && !cols._t.union(Type.dblary()) ) + if (cols._t.isStr()) E.throwErr("The current Exec does not handle strings",x); + else E.throwErr("Must be scalar or array",x); Type t = // Provable scalars will type as a scalar rows != null && rows.isPosConstant() && cols != null && cols.isPosConstant() ? Type.DBL : Type.ARY; @@ -383,6 +385,22 @@ static String parseNew(Exec2 E) { @Override public String toString() { return _id; } } +class ASTStr extends AST { + final String _str; + ASTStr(String str) { super(Type.STR); _str=str; } + // Parse a string, or throw a parse error + static ASTStr parse(Exec2 E) { + String str = E.isString(); + if (str != null) { + E._x += str.length()+2; //str + quotes + return new ASTStr(str); + } + return null; + } + @Override void exec(Env env) { env.push(_str); } + @Override public String toString() { return _str; } +} + // -------------------------------------------------------------------------- class ASTAssign extends AST { final AST _lhs; diff --git a/src/main/java/water/exec/ASTOp.java b/src/main/java/water/exec/ASTOp.java index 75c9666bfd..d7c98dda45 100644 --- a/src/main/java/water/exec/ASTOp.java +++ b/src/main/java/water/exec/ASTOp.java @@ -12,10 +12,9 @@ import org.joda.time.DateTime; import org.joda.time.MutableDateTime; +import org.joda.time.format.DateTimeFormatter; import water.*; import water.fvec.*; -import water.fvec.Vec.VectorGroup; -import water.util.Log; import water.util.Utils; /** Parse a generic R string and build an AST, in the context of an H2O Cloud @@ -133,6 +132,7 @@ public abstract class ASTOp extends AST { putPrefix(new ASTMinute()); putPrefix(new ASTSecond()); putPrefix(new ASTMillis()); + putPrefix(new ASTasDate()); // Time series operations putPrefix(new ASTDiff ()); @@ -712,6 
+712,44 @@ class ASTMinute extends ASTTimeOp { @Override String opStr(){return "minute";} @ class ASTSecond extends ASTTimeOp { @Override String opStr(){return "second";} @Override ASTOp make() {return new ASTSecond();} @Override long op(MutableDateTime dt) { return dt.getSecondOfMinute();}} class ASTMillis extends ASTTimeOp { @Override String opStr(){return "millis";} @Override ASTOp make() {return new ASTMillis();} @Override long op(MutableDateTime dt) { return dt.getMillisOfSecond();}} +class ASTasDate extends ASTOp { + ASTasDate() { super(new String[]{"as.Date", "x", "format"}, + new Type[]{Type.ARY, Type.ARY, Type.STR}, + OPF_PREFIX, + OPP_PREFIX,OPA_RIGHT); } + @Override String opStr() { return "as.Date"; } + @Override ASTOp make() {return new ASTasDate();} + @Override void apply(Env env, int argcnt, ASTApply apply) { + final String format = env.popStr(); + if (format.isEmpty()) throw new IllegalArgumentException("as.Date requires a non-empty format string"); + // check the format string more? 
+ + Frame fr = env.ary(-1); + + if( fr.vecs().length != 1 || !fr.vecs()[0].isEnum() ) + throw new IllegalArgumentException("as.Date requires a single column of factors"); + + Frame fr2 = new MRTask2() { + @Override public void map( Chunk chks[], NewChunk nchks[] ) { + //done on each node in lieu of rewriting DateTimeFormatter as Iced + DateTimeFormatter dtf = ParseTime.forStrptimePattern(format); + for( int i=0; i(); @@ -53,12 +55,14 @@ public class Env extends Iced { public boolean isAry() { return _ary[_sp-1] != null; } public boolean isFcn () { return _fcn[_sp-1] != null; } public boolean isDbl () { return !isAry() && !isFcn(); } + public boolean isStr () { return !isAry() && !isFcn() && _str[_sp-1] != null; } public boolean isFcn (int i) { return _fcn[_sp+i] != null; } public boolean isAry(int i) { return _ary[_sp+i] != null; } // Peek operators public Frame ary(int i) { Frame fr = _ary[_sp+i]; assert fr != null; return fr; } public ASTOp fcn(int i) { ASTOp op = _fcn[_sp+i]; assert op != null; return op; } public double dbl(int i) { double d = _d [_sp+i]; return d; } + public String str(int i) { String s = _str[_sp+i]; assert s != null; return s; } // Load the nth Id/variable from the named lexical scope, typed as a Frame public Frame frId(int d, int n) { @@ -77,10 +81,12 @@ void push( int slots ) { _ary= Arrays.copyOf(_ary,len<<1); _d = Arrays.copyOf(_d ,len<<1); _fcn= Arrays.copyOf(_fcn,len<<=1); + _str= Arrays.copyOf(_str,len<<1); } } void push( Frame fr ) { push(1); _ary[_sp-1] = addRef(fr); assert _ary[0]==null||check_refcnt(_ary[0].anyVec());} void push( double d ) { push(1); _d [_sp-1] = d ; } + void push( String st) { push(1); _str[_sp-1] = st ; } void push( ASTOp fcn) { push(1); _fcn[_sp-1] = addRef(fcn); } void push( Frame fr, String key ) { push(fr); _key[_sp-1]=key; } @@ -92,6 +98,7 @@ void push_slot( int d, int n ) { _ary[_sp-1] = addRef(_ary[idx]); _d [_sp-1] = _d [idx]; _fcn[_sp-1] = addRef(_fcn[idx]); + _str[_sp-1] = _str[idx]; assert 
_ary[0]==null || check_refcnt(_ary[0].anyVec()); } void push_slot( int d, int n, Env global ) { @@ -102,6 +109,7 @@ void push_slot( int d, int n, Env global ) { global._ary[gidx] = global.addRef(_ary[idx]); global._d [gidx] = _d [idx] ; global._fcn[gidx] = global.addRef(_fcn[idx]); + global._str[gidx] = _str[idx] ; assert _ary[0]==null || global.check_refcnt(_ary[0].anyVec()); } // Copy from TOS into a slot. Does NOT pop results. @@ -116,6 +124,7 @@ void tos_into_slot( int d, int n, String id ) { Frame fr = _ary[_sp-1]; _ary[idx] = fr==null ? null : addRef(new Frame(fr)); _d [idx] = _d [_sp-1] ; + _str[idx] = _str[_sp-1] ; _fcn[idx] = addRef(_fcn[_sp-1]); _key[idx] = d==0 && fr!=null ? id : null; // Temporary solution to add a UDF to global name space. Needs to fix in the future. @@ -130,6 +139,7 @@ void tos_into_slot( int idx, String id ) { _ary[idx] = fr==null ? null : addRef(new Frame(fr)); _d [idx] = _d [_sp-1] ; _fcn[idx] = addRef(_fcn[_sp-1]); + _str[idx] = _str[_sp-1] ; _key[idx] = fr!=null ? id : null; assert _ary[0]== null || check_refcnt(_ary[0].anyVec()); } @@ -143,6 +153,7 @@ void pop_into_stk( int x ) { _ary[_sp+x] = _ary[_sp-1]; // Copy without changing ref cnt _fcn[_sp+x] = _fcn[_sp-1]; _d [_sp+x] = _d [_sp-1]; + _str[_sp+x] = _str[_sp-1]; _sp--; x++; // Pop without changing ref cnt while( x++ < -1 ) pop(); } @@ -185,6 +196,7 @@ void popScope() { // Pop & return a Frame or Fcn; ref-cnt of all things remains unchanged. // Caller is responsible for tracking lifetime. 
public double popDbl() { assert isDbl(); return _d [--_sp]; } + public String popStr() { assert isStr(); return _str[--_sp]; } public ASTOp popFcn() { assert isFcn(); ASTOp op = _fcn[--_sp]; _fcn[_sp]=null; return op; } public Frame popAry() { assert isAry(); Frame fr = _ary[--_sp]; _ary[_sp]=null; assert allAlive(fr); return fr; } public Frame peekAry() { assert isAry(); Frame fr = _ary[_sp-1]; assert allAlive(fr); return fr; } @@ -230,6 +242,7 @@ private Env( Env e, boolean cntrefs ) { _ary= Arrays.copyOf(e._ary,_sp); _d = Arrays.copyOf(e._d ,_sp); _fcn= Arrays.copyOf(e._fcn,_sp); + _str = Arrays.copyOf(e._str,_sp); _tod= e._tod; _display = e._display.clone(); if( cntrefs ) { // If counting refs @@ -410,6 +423,7 @@ public String resultString( ) { public String toString(int i, boolean verbose_fcn) { if( _ary[i] != null ) return _ary[i]._key+":"+_ary[i].numRows()+"x"+_ary[i].numCols(); else if( _fcn[i] != null ) return _fcn[i].toString(verbose_fcn); + else if( _str[i] != null ) return _str[i]; return Double.toString(_d[i]); } @Override public String toString() { diff --git a/src/main/java/water/exec/Exec2.java b/src/main/java/water/exec/Exec2.java index 9b80754381..8405cb4d61 100644 --- a/src/main/java/water/exec/Exec2.java +++ b/src/main/java/water/exec/Exec2.java @@ -170,6 +170,7 @@ boolean peekEOS() { static boolean isLetter2(char c) { return c=='.' || c==':' || c=='\\' || isDigit(c) || isLetter(c); } + static boolean isQuote(char c) { return c=='"' || c=='\''; } // Return an ID string, or null if we get weird stuff or numbers. Valid IDs // include all the operators, except parens (function application) and assignment. 
@@ -222,6 +223,18 @@ String isID() { return _str.substring(_x-2,_x); } + String isString() { // returns string value without enclosing quotes + if( _x>=_buf.length ) return null; // No characters to parse + char c = _buf[_x]; + + if( isQuote(c) ) { + int x=_x+1; + while( x < _buf.length && _buf[x] != c )x++; + return _str.substring(_x+1,x); + } + return null; + } + // isID specifically does not parse "=" or "<-". This guy does. boolean isAssign(boolean EOS) { if( peek('<',EOS) ) { diff --git a/src/main/java/water/exec/Type.java b/src/main/java/water/exec/Type.java index 912e3a78d7..886f7ae58e 100644 --- a/src/main/java/water/exec/Type.java +++ b/src/main/java/water/exec/Type.java @@ -16,6 +16,7 @@ public class Type extends Iced { final static private int FCN0 = 4; // Return type in _ts[0], args in _ts[1...]; final static private int DBLARY0= 5; // Type is either DBL or ARY but not FCN final static private int ANYARY0= 6; // Type is ARY if any ts[] is an ARY, else DBL + final static private int STR0 = 7; final static private int VARARGS=32; // OR'd onto last type in a fcn, allows zero or more of this type int _t; // One of the above #s static private int UNIQUE; // Unique ID handy for debugging @@ -49,6 +50,7 @@ private boolean varargs_clean( int t, Type ts[] ) { // Make some base types static Type DBL = new Type(DBL0,null); static Type ARY = new Type(ARY0,null); + static Type STR = new Type(STR0, null); public static Type unbound() { return new Type(UNBOUND,new Type[1]); } public static Type fcn(Type[] ts) { return new Type(FCN0,ts); } public static Type varargs(Type t) { return new Type(t._t,t._ts,1f);} @@ -104,7 +106,7 @@ private Type findAnyAry() { t.union(fun); t=fun=t.find(); } else { if( t._t == FCN0 ) fun = t; - if( t._t != DBL0 && // Keep non-DBL + if( t._t != DBL0 && t._t != STR0 && // Keep non-DBL & non-STR !dupType(len,t) ) // But remove dups _ts[len++] = t; } @@ -181,8 +183,9 @@ else if( tta==ANYARY0 && ttb==DBL0 ) { // Force all to DBL boolean isAry() 
{ Type t=find(); return t._t==ARY0; } boolean isDbl() { Type t=find(); return t._t==DBL0; } boolean isFcn() { Type t=find(); return t._t==FCN0; } - boolean isNotFun() { Type t=find(); return t._t==DBL0 || t._t==ARY0 || t._t==DBLARY0; } + boolean isNotFun() { Type t=find(); return t._t==DBL0 || t._t==ARY0 || t._t==DBLARY0 || t._t==STR0; } boolean isDblAry() { Type t=find(); return t._t==DBL0 || t._t==ARY0; } + boolean isStr() { Type t=find(); return t._t==STR0; } // Return type of functions public Type ret() { Type t=find(); assert t._t == FCN0; return t._ts[0].find(); } @@ -195,6 +198,7 @@ else if( tta==ANYARY0 && ttb==DBL0 ) { // Force all to DBL case DBL0: s = "dbl"; break; case ARY0: s = "ary"; break; case DBLARY0: s = "dblary"; break; + case STR0: s = "str"; break; case ANYARY0: { s = "anyary{"; for( Type t : _ts ) s += t+","; diff --git a/src/main/java/water/fvec/ParseTime.java b/src/main/java/water/fvec/ParseTime.java index 16d9b12dd4..0db8ccabda 100644 --- a/src/main/java/water/fvec/ParseTime.java +++ b/src/main/java/water/fvec/ParseTime.java @@ -1,7 +1,10 @@ package water.fvec; import org.joda.time.DateTime; +import org.joda.time.format.DateTimeFormatter; +import org.joda.time.format.DateTimeFormatterBuilder; import water.parser.ValueString; +import water.util.Log; public abstract class ParseTime { // Deduce if we are looking at a Date/Time value, or not. @@ -218,4 +221,200 @@ public static long badUUID( ValueString str ) { str.setOff(-1); return Long.MIN_VALUE; } + + /** + * Factory to create a formatter from a strptime pattern string. + * This models the commonly supported features of strftime from POSIX + * (where it can). + *

+   * The format may contain locale specific output, and this will change as
+   * you change the locale of the formatter.
+   * Call DateTimeFormatter.withLocale(Locale) to switch the locale.
+   * For example:
+   * <pre>
+   * DateTimeFormat.forPattern(pattern).withLocale(Locale.FRANCE).print(dt);
+   * </pre>
+ * + * @param pattern pattern specification + * @return the formatter + * @throws IllegalArgumentException if the pattern is invalid + */ + public static DateTimeFormatter forStrptimePattern(String pattern) { + if (pattern == null || pattern.length() == 0) + throw new IllegalArgumentException("Empty date time pattern specification"); + + DateTimeFormatterBuilder builder = new DateTimeFormatterBuilder(); + parseToBuilder(builder, pattern); + DateTimeFormatter formatter = builder.toFormatter(); + + return formatter; + } + + //----------------------------------------------------------------------- + /** + * Parses the given pattern and appends the rules to the given + * DateTimeFormatterBuilder. See strptime man page for valid patterns. + * + * @param pattern pattern specification + * @throws IllegalArgumentException if the pattern is invalid + */ + private static void parseToBuilder(DateTimeFormatterBuilder builder, String pattern) { + int length = pattern.length(); + int[] indexRef = new int[1]; + + for (int i=0; i 1900s, 00-68 -> 2000s + builder.appendTwoDigitWeekyear(2019); + break; + case 'Y': + builder.appendYear(4,4); + break; + case 'z': + builder.appendTimeZoneOffset(null, "z", false, 2, 2); + break; + case 'Z': + builder.appendTimeZoneName(); + break; + default: // No match, ignore + builder.appendLiteral('\''); + builder.appendLiteral(token); + Log.warn(token + "is not acceptted as a parse token, treating as a literal"); + } + } else { + if (c == '\'') { + String sub = token.substring(1); + if (sub.length() > 0) { + // Create copy of sub since otherwise the temporary quoted + // string would still be referenced internally. + builder.appendLiteral(new String(sub)); + } + } else throw new IllegalArgumentException("Unexpected token encountered parsing format string:" + c); + } + } + } + /** + * Parses an individual token. 
+ * + * @param pattern the pattern string + * @param indexRef a single element array, where the input is the start + * location and the output is the location after parsing the token + * @return the parsed token + */ + private static String parseToken(String pattern, int[] indexRef) { + StringBuilder buf = new StringBuilder(); + + int i = indexRef[0]; + int length = pattern.length(); + + char c = pattern.charAt(i); + if (c == '%' && i + 1 < length && pattern.charAt(i+1) != '%') { + //Grab pattern tokens + c = pattern.charAt(++i); + //0 is ignored for input, and this ignores alternative religious eras + if ((c == '0' || c == 'E') && i + 1 >= length) c = pattern.charAt(++i); + buf.append('%'); + buf.append(c); + } else { // Grab all else as text + buf.append('\''); // mark literals with ' in first place + buf.append(c); + for (i++; i < length;i++) { + c = pattern.charAt(i); + if (c == '%' ) { // consume literal % otherwise break + if (i + 1 < length && pattern.charAt(i + 1) == '%') i++; + else { i--; break; } + } + buf.append(c); + } + } + + indexRef[0] = i; + return buf.toString(); + } } diff --git a/src/test/java/water/exec/Expr2Test.java b/src/test/java/water/exec/Expr2Test.java index bcf2442894..e99132a797 100644 --- a/src/test/java/water/exec/Expr2Test.java +++ b/src/test/java/water/exec/Expr2Test.java @@ -271,7 +271,7 @@ public class Expr2Test extends TestUtil { checkStr("x=3;3\n*\n-\nx",3); // Each of '3' and '*' and '-' and 'x' is a standalone statement // No strings, yet - checkStr("function(df) { min(df[,\"age\"]) }","The current Exec does not handle strings\nfunction(df) { min(df[,\"age\"]) }\n ^\n"); + checkStr("function(df) { min(df[,\"age\"]) }","The current Exec does not handle strings\nfunction(df) { min(df[,\"age\"]) }\n ^-----^\n"); // Cleanup testing temps checkStr("a=0;x=0;y=0",0); // Delete keys from global scope diff --git a/src/test/java/water/fvec/ParseTimeTest.java b/src/test/java/water/fvec/ParseTimeTest.java index bae328d707..f8b3804572 
100644 --- a/src/test/java/water/fvec/ParseTimeTest.java +++ b/src/test/java/water/fvec/ParseTimeTest.java @@ -48,9 +48,9 @@ public class ParseTimeTest extends TestUtil { @Test public void testTimeParse2() { double[][] exp = new double[][] { - d(1 , 115200000L, 1136275200000L, 1136275200000L, 1 ), - d(1500 , 129625200000L, 1247641200000L, 1247641200000L, 0 ), - d(15000 , 1296028800000L, 1254294000000L, 1254294000000L, 2 ), + d(1 , 115200000L, 1136275200000L, 1136275200000L, 1, 1, 1, 1, 19700102, 0), + d(1500 , 129625200000L, 1247641200000L, 1247641200000L, 0, 0, 0, 0, 19740209, 1), + d(15000 , 1296028800000L, 1254294000000L, 1254294000000L, 2, 2, 2, 2, 20110126, 2), }; ParserTest2.testParsed(TestUtil.parseFrame(null,"smalldata/jira/v-11.csv"),exp,exp.length); }