forked from tidyverse/dplyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjoin.r
180 lines (156 loc) · 5.89 KB
/
join.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#' Join two tbls together.
#'
#' These are generic functions that dispatch to individual tbl methods - see the
#' method documentation for details of individual data sources. \code{x} and
#' \code{y} should usually be from the same data source, but if \code{copy} is
#' \code{TRUE}, \code{y} will automatically be copied to the same source as
#' \code{x} - this may be an expensive operation.
#'
#' @section Join types:
#'
#' Currently dplyr supports four join types:
#'
#' \describe{
#' \item{\code{inner_join}}{return all rows from \code{x} where there are matching
#' values in \code{y}, and all columns from \code{x} and \code{y}. If there are multiple matches
#' between \code{x} and \code{y}, all combination of the matches are returned.}
#'
#' \item{\code{left_join}}{return all rows from \code{x}, and all columns from \code{x}
#' and \code{y}. Rows in \code{x} with no match in \code{y} will have \code{NA} values in the new
#' columns. If there are multiple matches between \code{x} and \code{y}, all combinations
#' of the matches are returned.}
#'
#' \item{\code{right_join}}{return all rows from \code{y}, and all columns from \code{x}
#' and y. Rows in \code{y} with no match in \code{x} will have \code{NA} values in the new
#' columns. If there are multiple matches between \code{x} and \code{y}, all combinations
#' of the matches are returned.}
#'
#' \item{\code{semi_join}}{return all rows from \code{x} where there are matching
#' values in \code{y}, keeping just columns from \code{x}.
#'
#' A semi join differs from an inner join because an inner join will return
#' one row of \code{x} for each matching row of \code{y}, where a semi
#' join will never duplicate rows of \code{x}.}
#'
#' \item{\code{anti_join}}{return all rows from \code{x} where there are not
#' matching values in \code{y}, keeping just columns from \code{x}.}
#'
#' \item{\code{full_join}}{return all rows and all columns from both \code{x} and \code{y}.
#' Where there are not matching values, returns \code{NA} for the one missing.}
#' }
#'
#' @section Grouping:
#'
#' Groups are ignored for the purpose of joining, but the result preserves
#' the grouping of \code{x}.
#'
#' @param x,y tbls to join
#' @param by a character vector of variables to join by. If \code{NULL}, the
#' default, \code{join} will do a natural join, using all variables with
#' common names across the two tables. A message lists the variables so
#' that you can check they're right (to suppress the message, simply
#' explicitly list the variables that you want to join).
#'
#' To join by different variables on x and y use a named vector.
#' For example, \code{by = c("a" = "b")} will match \code{x.a} to
#' \code{y.b}.
#' @param copy If \code{x} and \code{y} are not from the same data source,
#' and \code{copy} is \code{TRUE}, then \code{y} will be copied into the
#' same src as \code{x}. This allows you to join tables across srcs, but
#' it is a potentially expensive operation so you must opt into it.
#' @param suffix If there are non-joined duplicate variables in \code{x} and
#' \code{y}, these suffixes will be added to the output to diambiguate them.
#' @param ... other parameters passed onto methods
#' @name join
NULL
#' @rdname join
#' @export
inner_join <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) {
UseMethod("inner_join")
}
#' @rdname join
#' @export
left_join <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) {
UseMethod("left_join")
}
#' @rdname join
#' @export
right_join <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) {
UseMethod("right_join")
}
#' @rdname join
#' @export
full_join <- function(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...) {
UseMethod("full_join")
}
#' @rdname join
#' @export
semi_join <- function(x, y, by = NULL, copy = FALSE, ...) {
UseMethod("semi_join")
}
#' @rdname join
#' @export
anti_join <- function(x, y, by = NULL, copy = FALSE, ...) {
UseMethod("anti_join")
}
#' Extract out common by variables
#'
#' @export
#' @keywords internal
common_by <- function(by = NULL, x, y) UseMethod("common_by", by)
#' @export
common_by.character <- function(by, x, y) {
by <- common_by_from_vector(by)
common_by.list(by, x, y)
}
common_by_from_vector <- function(by) {
by <- by[!duplicated(by)]
by_x <- names(by) %||% by
by_y <- unname(by)
# If x partially named, assume unnamed are the same in both tables
by_x[by_x == ""] <- by_y[by_x == ""]
list(x = by_x, y = by_y)
}
#' @export
common_by.list <- function(by, x, y) {
x_vars <- tbl_vars(x)
if (!all(by$x %in% x_vars)) {
stop("Join column not found in lhs: ", paste(setdiff(by$x, x_vars), collapse = ", "), call. = FALSE)
}
y_vars <- tbl_vars(y)
if (!all(by$y %in% y_vars)) {
stop("Join column not found in rhs: ", paste(setdiff(by$y, y_vars), collapse = ", "), call. = FALSE)
}
by
}
#' @export
common_by.NULL <- function(by, x, y) {
by <- intersect(tbl_vars(x), tbl_vars(y))
if (length(by) == 0) {
stop("No common variables. Please specify `by` param.", call. = FALSE)
}
message("Joining, by = ", utils::capture.output(dput(by)))
list(
x = by,
y = by
)
}
# Returns NULL if variables don't need to be renamed
unique_names <- function(x_names, y_names, by, suffix = c(".x", ".y")) {
common <- setdiff(intersect(x_names, y_names), by$x[by$x == by$y])
if (length(common) == 0) return(NULL)
suffix <- check_suffix(suffix)
x_match <- match(common, x_names)
x_new <- x_names
x_new[x_match] <- paste0(x_names[x_match], suffix$x)
y_match <- match(common, y_names)
y_new <- y_names
y_new[y_match] <- paste0(y_names[y_match], suffix$y)
list(x = setNames(x_new, x_names), y = setNames(y_new, y_names))
}
check_suffix <- function(x) {
if (!is.character(x) || length(x) != 2) {
stop("`suffix` must be a character vector of length 2.", call. = FALSE)
}
list(x = x[1], y = x[2])
}