Skip to content

Commit

Permalink
generating data having NA and order, related to #40 and #12
Browse files Browse the repository at this point in the history
  • Loading branch information
jangorecki committed Nov 1, 2018
1 parent 65a7216 commit 55779e1
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions groupby-datagen.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
# Rscript groupby-datagen.R 1e7 1e2 0 0 ## 1e7 rows, 1e2 K, 0% NAs, random order
# Rscript groupby-datagen.R 1e8 1e1 5 1 ## 1e8 rows, 10 K, 5% NAs, sorted order
args = commandArgs(TRUE)

require(data.table)
N=as.integer(args[1L]); K=as.integer(args[2L])
# Parse the command-line arguments:
#   N    - number of rows to generate
#   K    - number of groups
#   nas  - percentage of values to blank out as NA in each column, 0..100
#   sort - 0 to keep random row order, 1 to sort the data by the id columns
N=as.integer(args[1L]); K=as.integer(args[2L]); nas=as.integer(args[3L]); sort=as.integer(args[4L])
# stopifnot() aborts unless EVERY condition is TRUE, so the conditions must
# describe a valid call. A missing or non-numeric nas/sort parses to NA and is
# rejected here as well.
stopifnot(nas>=0L, nas<=100L, sort%in%c(0L,1L))
set.seed(108)
DT <- data.table(
id1 = sample(sprintf("id%03d",1:K), N, TRUE), # large groups (char)
Expand All @@ -14,6 +17,17 @@ DT <- data.table(
v2 = sample(5, N, TRUE), # int in range [1,5]
v3 = sample(round(runif(100,max=100),4), N, TRUE) # numeric e.g. 23.5749
)
# Optionally blank out `nas` percent of the values in every column.
# Each column draws its own independent set of row positions, so the NAs are
# not aligned across columns.
if (nas>0L) {
  na_count = as.integer(N*(nas/100))  # number of NA cells per column (truncated)
  for (j in names(DT)) {
    na_rows = sample(N, na_count, replace=FALSE)
    set(DT, i=na_rows, j=j, value=NA)  # in-place assignment by reference
  }
}
# Optionally sort the table by the six id columns (id1..id6) by keying on them.
if (sort==1L) {
  key_cols = paste0("id", 1:6)
  setkeyv(DT, key_cols)
}

pretty_sci = function(x) {
tmp<-strsplit(as.character(x), "+", fixed=TRUE)[[1L]]
if(length(tmp)==1L) {
Expand All @@ -22,6 +36,6 @@ pretty_sci = function(x) {
paste0(tmp[1L], as.character(as.integer(tmp[2L])))
}
}
fwrite(DT, sprintf("G1_%s_%s.csv", pretty_sci(N), pretty_sci(K)))
fwrite(DT, sprintf("G1_%s_%s_%s_%s.csv", pretty_sci(N), pretty_sci(K), nas, sort))

quit("no", status=0)

0 comments on commit 55779e1

Please sign in to comment.