forked from apache/arrow
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dplyr-select.R
129 lines (106 loc) · 4.57 KB
/
dplyr-select.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# The following S3 methods are registered on load if dplyr is present
tbl_vars.arrow_dplyr_query <- function(x) names(x$selected_columns)
select.arrow_dplyr_query <- function(.data, ...) {
column_select(.data, enquos(...), op = "select")
}
select.Dataset <- select.ArrowTabular <- select.RecordBatchReader <- select.arrow_dplyr_query
rename.arrow_dplyr_query <- function(.data, ...) {
column_select(.data, enquos(...), op = "rename")
}
rename.Dataset <- rename.ArrowTabular <- rename.RecordBatchReader <- rename.arrow_dplyr_query
rename_with.arrow_dplyr_query <- function(.data, .fn, .cols = everything(), ...) {
.fn <- as_function(.fn)
old_names <- names(dplyr::select(.data, {{ .cols }}))
dplyr::rename(.data, !!set_names(old_names, .fn(old_names)))
}
rename_with.Dataset <- rename_with.ArrowTabular <- rename_with.RecordBatchReader <- rename_with.arrow_dplyr_query
relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL) {
# The code in this function is adapted from the code in dplyr::relocate.data.frame
# at https://github.com/tidyverse/dplyr/blob/master/R/relocate.R
# TODO: revisit this after https://github.com/tidyverse/dplyr/issues/5829
.data <- as_adq(.data)
# Assign the schema to the expressions
schema <- .data$.data$schema
walk(.data$selected_columns, ~ (.$schema <- schema))
# Create a mask for evaluating expressions in tidyselect helpers
mask <- new_environment(.cache$functions, parent = caller_env())
to_move <- eval_select(substitute(c(...)), .data$selected_columns, mask)
.before <- enquo(.before)
.after <- enquo(.after)
has_before <- !quo_is_null(.before)
has_after <- !quo_is_null(.after)
if (has_before && has_after) {
abort("Must supply only one of `.before` and `.after`.")
} else if (has_before) {
where <- min(unname(eval_select(quo_get_expr(.before), .data$selected_columns, mask)))
if (!where %in% to_move) {
to_move <- c(to_move, where)
}
} else if (has_after) {
where <- max(unname(eval_select(quo_get_expr(.after), .data$selected_columns, mask)))
if (!where %in% to_move) {
to_move <- c(where, to_move)
}
} else {
where <- 1L
if (!where %in% to_move) {
to_move <- c(to_move, where)
}
}
lhs <- setdiff(seq2(1, where - 1), to_move)
rhs <- setdiff(seq2(where + 1, length(.data$selected_columns)), to_move)
pos <- vec_unique(c(lhs, to_move, rhs))
new_names <- names(pos)
.data$selected_columns <- .data$selected_columns[pos]
if (!is.null(new_names)) {
names(.data$selected_columns)[new_names != ""] <- new_names[new_names != ""]
}
.data
}
relocate.Dataset <- relocate.ArrowTabular <- relocate.RecordBatchReader <- relocate.arrow_dplyr_query
column_select <- function(.data, select_expression, op = c("select", "rename")) {
op <- match.arg(op)
.data <- as_adq(.data)
sim_df <- as.data.frame(implicit_schema(.data))
old_names <- names(sim_df)
if (op == "select") {
out <- eval_select(expr(c(!!!select_expression)), sim_df)
# select only columns from `out`
subset <- out
} else if (op == "rename") {
out <- eval_rename(expr(c(!!!select_expression)), sim_df)
# select all columns as only renaming
subset <- set_names(seq_along(old_names), old_names)
names(subset)[out] <- names(out)
}
.data$selected_columns <- set_names(.data$selected_columns[subset], names(subset))
# check if names have updated
new_names <- old_names
new_names[out] <- names(out)
names_compared <- set_names(old_names, new_names)
renamed <- names_compared[old_names != new_names]
# Update names in group_by if changed in select() or rename()
if (length(renamed)) {
gbv <- .data$group_by_vars
renamed_groups <- gbv %in% renamed
gbv[renamed_groups] <- names(renamed)[match(gbv[renamed_groups], renamed)]
.data$group_by_vars <- gbv
}
.data
}