Skip to content

Commit

Permalink
HEX-1841 - Added as.Date functionality
Browse files Browse the repository at this point in the history
  • Loading branch information
bghill committed Aug 29, 2014
1 parent 02f345c commit 6af5d83
Show file tree
Hide file tree
Showing 12 changed files with 433 additions and 90 deletions.
10 changes: 10 additions & 0 deletions R/h2o-package/R/Classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,16 @@ year.H2OParsedData <- h2o.year
month <- function(x) UseMethod('month', x)
month.H2OParsedData <- h2o.month

as.Date.H2OParsedData <- function(x, format, ...) {
if(!is.character(format)) stop("format must be a string")

expr = paste("as.Date(", paste(x@key, deparse(substitute(format)), sep = ","), ")", sep = "")
res = .h2o.__exec2(x@h2o, expr)
res <- .h2o.exec2(res$dest_key, h2o = x@h2o, res$dest_key)
res@logic <- FALSE
return(res)
}

diff.H2OParsedData <- function(x, lag = 1, differences = 1, ...) {
if(!is.numeric(lag)) stop("lag must be numeric")
if(!is.numeric(differences)) stop("differences must be numeric")
Expand Down
123 changes: 123 additions & 0 deletions R/tests/testdir_jira/runit_hex_1841_asdate_datemanipulation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#
# date parsing and field extraction tests
#


setwd(normalizePath(dirname(R.utils::commandArgs(asValues=TRUE)$"f")))
source('../findNSourceUtils.R')



datetest <- function(conn){
Log.info('uploading date testing dataset')
hdf <- h2o.importFile(conn, normalizePath(locate('smalldata/jira/v-11.csv')))
# df should be 5 columns: ds1:5

Log.info('data as loaded into h2o:')
Log.info(head(hdf))

# NB: columns 1,5 are currently unsupported as date types
# that is, h2o cannot understand:
# 1 integer days since epoch (or since any other date);
# 2 dates formatted as %d/%m/%y (in strptime format strings)
summary(hdf)

Log.info('adding date columns')
# NB: h2o automagically recognizes and if it doesn't recognize, you're out of luck
hdf$ds5 <- as.Date(hdf$ds5, "%d/%m/%y %H:%M")
hdf$ds6 <- as.Date(hdf$ds6, "%d/%m/%Y %H:%M:%S")
hdf$ds7 <- as.Date(hdf$ds7, "%m/%d/%y")
hdf$ds8 <- as.Date(hdf$ds8, "%m/%d/%Y")
hdf$ds9 <- as.Date(as.factor(hdf$ds9), "%Y%m%d")
hdf$ds10 <- as.Date(hdf$ds10, "%Y_%m_%d")

Log.info('extracting year and month from posix date objects')
hdf$year2 <- year(hdf$ds2)
hdf$year3 <- year(hdf$ds3)
hdf$year4 <- year(hdf$ds4)
hdf$year5 <- year(hdf$ds5)
hdf$year6 <- year(hdf$ds6)
hdf$year7 <- year(hdf$ds7)
hdf$year8 <- year(hdf$ds8)
hdf$year9 <- year(hdf$ds9)
hdf$year10 <- year(hdf$ds10)
hdf$mon2 <- month(hdf$ds2)
hdf$mon3 <- month(hdf$ds3)
hdf$mon4 <- month(hdf$ds4)
hdf$mon5 <- month(hdf$ds5)
hdf$mon6 <- month(hdf$ds6)
hdf$mon7 <- month(hdf$ds7)
hdf$mon8 <- month(hdf$ds8)
hdf$mon9 <- month(hdf$ds9)
hdf$mon10 <- month(hdf$ds10)
hdf$idx2 <- year(hdf$ds2) * 12 + month(hdf$ds2)
hdf$idx3 <- year(hdf$ds3) * 12 + month(hdf$ds3)
hdf$idx4 <- year(hdf$ds4) * 12 + month(hdf$ds4)
hdf$idx5 <- year(hdf$ds5) * 12 + month(hdf$ds5)
hdf$idx6 <- year(hdf$ds6) * 12 + month(hdf$ds6)
hdf$idx7 <- year(hdf$ds7) * 12 + month(hdf$ds7)
hdf$idx8 <- year(hdf$ds8) * 12 + month(hdf$ds8)
hdf$idx9 <- year(hdf$ds9) * 12 + month(hdf$ds9)
hdf$idx10 <- year(hdf$ds10) * 12 + month(hdf$ds10)

cc <- colnames(hdf)
nn <- c( paste('year', 2:10, sep=''), paste('month', 2:10, sep=''), paste('idx', 2:10, sep='') )
cc[ (length(cc) - length(nn) + 1):length(cc) ] <- nn
colnames(hdf) <- cc

Log.info('pulling year/month indices local')
ldf <- as.data.frame( hdf )

# build the truth using R internal date fns
rdf <- read.csv(locate('smalldata/jira/v-11.csv'))
rdf$days1 <- as.Date(rdf$ds1, origin='1970-01-01')
rdf$days2 <- as.Date(rdf$ds2, format='%Y-%m-%d')
rdf$days3 <- as.Date(rdf$ds3, format='%d-%b-%y')
rdf$days4 <- as.Date(rdf$ds4, format='%d-%B-%Y')
rdf$days5 <- as.Date(rdf$ds5, format='%d/%m/%y %H:%M')
rdf$days6 <- as.Date(rdf$ds6, format='%d/%m/%Y %H:%M:%S')
rdf$days7 <- as.Date(rdf$ds7, format='%m/%d/%y')
rdf$days8 <- as.Date(rdf$ds8, format='%m/%d/%Y')
rdf$days9 <- as.Date(as.factor(rdf$ds9), format='%Y%m%d')
rdf$days10 <- as.Date(rdf$ds10, format='%Y_%m_%d')

months <- data.frame(lapply(rdf[,11:20], function(x) as.POSIXlt(x)$mon))
years <- data.frame(lapply(rdf[,11:20], function(x) as.POSIXlt(x)$year))
idx <- 12*years + months

Log.info('testing correctness')
expect_that( ldf$year2, equals(years[,2]) )
expect_that( ldf$year3, equals(years[,3]) )
expect_that( ldf$year4, equals(years[,4]) )
expect_that( ldf$year5, equals(years[,5]) )
expect_that( ldf$year6, equals(years[,6]) )
expect_that( ldf$year7, equals(years[,7]) )
expect_that( ldf$year8, equals(years[,8]) )
expect_that( ldf$year9, equals(years[,9]) )
expect_that( ldf$year10, equals(years[,10]) )

expect_that( ldf$month2, equals(months[,2]) )
expect_that( ldf$month3, equals(months[,3]) )
expect_that( ldf$month4, equals(months[,4]) )
expect_that( ldf$month5, equals(months[,5]) )
expect_that( ldf$month6, equals(months[,6]) )
expect_that( ldf$month7, equals(months[,7]) )
expect_that( ldf$month8, equals(months[,8]) )
expect_that( ldf$month9, equals(months[,9]) )
expect_that( ldf$month10, equals(months[,10]) )

expect_that( ldf$idx2, equals(idx[,2]) )
expect_that( ldf$idx3, equals(idx[,3]) )
expect_that( ldf$idx4, equals(idx[,4]) )
expect_that( ldf$idx5, equals(idx[,5]) )
expect_that( ldf$idx6, equals(idx[,6]) )
expect_that( ldf$idx7, equals(idx[,7]) )
expect_that( ldf$idx8, equals(idx[,8]) )
expect_that( ldf$idx9, equals(idx[,9]) )
expect_that( ldf$idx10, equals(idx[,10]) )

testEnd()
}


doTest('date testing', datetest)
76 changes: 0 additions & 76 deletions R/tests/testdir_jira/runit_v_11_datemanipulation.R

This file was deleted.

8 changes: 4 additions & 4 deletions smalldata/jira/v-11.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"ds1","ds2","ds3","ds4","ds5"
1,"1970-01-02","3-Jan-06","3-January-2006","3/01/06"
1500,"1974-02-09","15-Jul-09","15-July-2009","15/07/09"
15000,"2011-01-26","30-Sep-09","30-September-2009","30/09/09"
"ds1","ds2","ds3","ds4","ds5","ds6","ds7","ds8","ds9","ds10"
1,"1970-01-02","3-Jan-06","3-January-2006","3/01/06 13:30","3/01/2006 13:30:00","1/3/68","1/3/2068","19700102","1970_1_2"
1500,"1974-02-09","15-Jul-09","15-July-2009","15/07/09 1:01","15/07/2009 01:01:30","07/15/69","07/15/1969","19740209","1974_02_09"
15000,"2011-01-26","30-Sep-09","30-September-2009","30/09/09 23:00","30/09/2009 23:00:59","9/30/09","9/30/2009","20110126","2011_1_26"
22 changes: 20 additions & 2 deletions src/main/java/water/exec/AST.java
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ static AST parseVal(Exec2 E, boolean EOS ) {
if( (ast = ASTId .parse(E)) != null ) return ast;
if( (ast = ASTNum .parse(E)) != null ) return ast;
if( (ast = ASTOp .parse(E)) != null ) return ast;
if( E.peek('"',EOS) ) E.throwErr("The current Exec does not handle strings",E._x);
if( (ast = ASTStr .parse(E)) != null ) return ast;
return null;
}
abstract void exec(Env env);
Expand Down Expand Up @@ -225,7 +225,9 @@ static AST parse(Exec2 E, boolean EOS ) {
AST rows=E.xpeek(',',(x=E._x),parseCXExpr(E, false));
if( rows != null && !rows._t.union(Type.dblary()) ) E.throwErr("Must be scalar or array",x);
AST cols=E.xpeek(']',(x=E._x),parseCXExpr(E, false));
if( cols != null && !cols._t.union(Type.dblary()) ) E.throwErr("Must be scalar or array",x);
if( cols != null && !cols._t.union(Type.dblary()) )
if (cols._t.isStr()) E.throwErr("The current Exec does not handle strings",x);
else E.throwErr("Must be scalar or array",x);
Type t = // Provable scalars will type as a scalar
rows != null && rows.isPosConstant() &&
cols != null && cols.isPosConstant() ? Type.DBL : Type.ARY;
Expand Down Expand Up @@ -383,6 +385,22 @@ static String parseNew(Exec2 E) {
@Override public String toString() { return _id; }
}

class ASTStr extends AST {
final String _str;
ASTStr(String str) { super(Type.STR); _str=str; }
// Parse a string, or throw a parse error
static ASTStr parse(Exec2 E) {
String str = E.isString();
if (str != null) {
E._x += str.length()+2; //str + quotes
return new ASTStr(str);
}
return null;
}
@Override void exec(Env env) { env.push(_str); }
@Override public String toString() { return _str; }
}

// --------------------------------------------------------------------------
class ASTAssign extends AST {
final AST _lhs;
Expand Down
42 changes: 40 additions & 2 deletions src/main/java/water/exec/ASTOp.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,9 @@
import org.joda.time.DateTime;
import org.joda.time.MutableDateTime;

import org.joda.time.format.DateTimeFormatter;
import water.*;
import water.fvec.*;
import water.fvec.Vec.VectorGroup;
import water.util.Log;
import water.util.Utils;

/** Parse a generic R string and build an AST, in the context of an H2O Cloud
Expand Down Expand Up @@ -133,6 +132,7 @@ public abstract class ASTOp extends AST {
putPrefix(new ASTMinute());
putPrefix(new ASTSecond());
putPrefix(new ASTMillis());
putPrefix(new ASTasDate());

// Time series operations
putPrefix(new ASTDiff ());
Expand Down Expand Up @@ -712,6 +712,44 @@ class ASTMinute extends ASTTimeOp { @Override String opStr(){return "minute";} @
class ASTSecond extends ASTTimeOp { @Override String opStr(){return "second";} @Override ASTOp make() {return new ASTSecond();} @Override long op(MutableDateTime dt) { return dt.getSecondOfMinute();}}
class ASTMillis extends ASTTimeOp { @Override String opStr(){return "millis";} @Override ASTOp make() {return new ASTMillis();} @Override long op(MutableDateTime dt) { return dt.getMillisOfSecond();}}

class ASTasDate extends ASTOp {
ASTasDate() { super(new String[]{"as.Date", "x", "format"},
new Type[]{Type.ARY, Type.ARY, Type.STR},
OPF_PREFIX,
OPP_PREFIX,OPA_RIGHT); }
@Override String opStr() { return "as.Date"; }
@Override ASTOp make() {return new ASTasDate();}
@Override void apply(Env env, int argcnt, ASTApply apply) {
final String format = env.popStr();
if (format.isEmpty()) throw new IllegalArgumentException("as.Date requires a non-empty format string");
// check the format string more?

Frame fr = env.ary(-1);

if( fr.vecs().length != 1 || !fr.vecs()[0].isEnum() )
throw new IllegalArgumentException("as.Date requires a single column of factors");

Frame fr2 = new MRTask2() {
@Override public void map( Chunk chks[], NewChunk nchks[] ) {
//done on each node in lieu of rewriting DateTimeFormatter as Iced
DateTimeFormatter dtf = ParseTime.forStrptimePattern(format);
for( int i=0; i<nchks.length; i++ ) {
NewChunk n =nchks[i];
Chunk c = chks[i];
int rlen = c._len;
for( int r=0; r<rlen; r++ ) {
if (!c.isNA0(r)) {
String date = c._vec.domain((long)c.at0(r));
n.addNum(DateTime.parse(date, dtf).getMillis(), 0);
} else n.addNA();
}
}
}
}.doAll(fr.numCols(),fr).outputFrame(fr._names, null);
env.poppush(2, fr2, null);
}
}

// Finite backward difference for user-specified lag
// http://en.wikipedia.org/wiki/Finite_difference
class ASTDiff extends ASTOp {
Expand Down
Loading

0 comments on commit 6af5d83

Please sign in to comment.