forked from pandas-dev/pandas
-
Notifications
You must be signed in to change notification settings - Fork 0
/
bench_merge.R
161 lines (128 loc) · 4.36 KB
/
bench_merge.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
library(plyr)
library(data.table)
N <- 10000
indices = rep(NA, N)
indices2 = rep(NA, N)
for (i in 1:N) {
indices[i] <- paste(sample(letters, 10), collapse="")
indices2[i] <- paste(sample(letters, 10), collapse="")
}
left <- data.frame(key=rep(indices[1:8000], 10),
key2=rep(indices2[1:8000], 10),
value=rnorm(80000))
right <- data.frame(key=indices[2001:10000],
key2=indices2[2001:10000],
value2=rnorm(8000))
right2 <- data.frame(key=rep(right$key, 2),
key2=rep(right$key2, 2),
value2=rnorm(16000))
left.dt <- data.table(left, key=c("key", "key2"))
right.dt <- data.table(right, key=c("key", "key2"))
right2.dt <- data.table(right2, key=c("key", "key2"))
# left.dt2 <- data.table(left)
# right.dt2 <- data.table(right)
## left <- data.frame(key=rep(indices[1:1000], 10),
## key2=rep(indices2[1:1000], 10),
## value=rnorm(100000))
## right <- data.frame(key=indices[1:1000],
## key2=indices2[1:1000],
## value2=rnorm(10000))
timeit <- function(func, niter=10) {
timing = rep(NA, niter)
for (i in 1:niter) {
gc()
timing[i] <- system.time(func())[3]
}
mean(timing)
}
left.join <- function(sort=FALSE) {
result <- base::merge(left, right, all.x=TRUE, sort=sort)
}
right.join <- function(sort=FALSE) {
result <- base::merge(left, right, all.y=TRUE, sort=sort)
}
outer.join <- function(sort=FALSE) {
result <- base::merge(left, right, all=TRUE, sort=sort)
}
inner.join <- function(sort=FALSE) {
result <- base::merge(left, right, all=FALSE, sort=sort)
}
left.join.dt <- function(sort=FALSE) {
result <- right.dt[left.dt]
}
right.join.dt <- function(sort=FALSE) {
result <- left.dt[right.dt]
}
outer.join.dt <- function(sort=FALSE) {
result <- merge(left.dt, right.dt, all=TRUE, sort=sort)
}
inner.join.dt <- function(sort=FALSE) {
result <- merge(left.dt, right.dt, all=FALSE, sort=sort)
}
plyr.join <- function(type) {
result <- plyr::join(left, right, by=c("key", "key2"),
type=type, match="first")
}
sort.options <- c(FALSE, TRUE)
# many-to-one
results <- matrix(nrow=4, ncol=3)
colnames(results) <- c("base::merge", "plyr", "data.table")
rownames(results) <- c("inner", "outer", "left", "right")
base.functions <- c(inner.join, outer.join, left.join, right.join)
plyr.functions <- c(function() plyr.join("inner"),
function() plyr.join("full"),
function() plyr.join("left"),
function() plyr.join("right"))
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
for (i in 1:4) {
base.func <- base.functions[[i]]
plyr.func <- plyr.functions[[i]]
dt.func <- dt.functions[[i]]
results[i, 1] <- timeit(base.func)
results[i, 2] <- timeit(plyr.func)
results[i, 3] <- timeit(dt.func)
}
# many-to-many
left.join <- function(sort=FALSE) {
result <- base::merge(left, right2, all.x=TRUE, sort=sort)
}
right.join <- function(sort=FALSE) {
result <- base::merge(left, right2, all.y=TRUE, sort=sort)
}
outer.join <- function(sort=FALSE) {
result <- base::merge(left, right2, all=TRUE, sort=sort)
}
inner.join <- function(sort=FALSE) {
result <- base::merge(left, right2, all=FALSE, sort=sort)
}
left.join.dt <- function(sort=FALSE) {
result <- right2.dt[left.dt]
}
right.join.dt <- function(sort=FALSE) {
result <- left.dt[right2.dt]
}
outer.join.dt <- function(sort=FALSE) {
result <- merge(left.dt, right2.dt, all=TRUE, sort=sort)
}
inner.join.dt <- function(sort=FALSE) {
result <- merge(left.dt, right2.dt, all=FALSE, sort=sort)
}
sort.options <- c(FALSE, TRUE)
# many-to-one
results <- matrix(nrow=4, ncol=3)
colnames(results) <- c("base::merge", "plyr", "data.table")
rownames(results) <- c("inner", "outer", "left", "right")
base.functions <- c(inner.join, outer.join, left.join, right.join)
plyr.functions <- c(function() plyr.join("inner"),
function() plyr.join("full"),
function() plyr.join("left"),
function() plyr.join("right"))
dt.functions <- c(inner.join.dt, outer.join.dt, left.join.dt, right.join.dt)
for (i in 1:4) {
base.func <- base.functions[[i]]
plyr.func <- plyr.functions[[i]]
dt.func <- dt.functions[[i]]
results[i, 1] <- timeit(base.func)
results[i, 2] <- timeit(plyr.func)
results[i, 3] <- timeit(dt.func)
}