forked from tidyverse/dplyr
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdo.r
180 lines (167 loc) · 5.77 KB
/
do.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#' Do anything
#'
#' \Sexpr[results=rd, stage=render]{dplyr:::lifecycle("questioning")}
#'
#' `do()` is marked as questioning as of dplyr 0.8.0, and may be advantageously
#' replaced by [group_modify()].
#'
#' @description This is a general purpose complement to the specialised
#' manipulation functions [filter()], [select()], [mutate()],
#' [summarise()] and [arrange()]. You can use `do()`
#' to perform arbitrary computation, returning either a data frame or
#' arbitrary objects which will be stored in a list. This is particularly
#' useful when working with models: you can fit models per group with
#' `do()` and then flexibly extract components with either another
#' `do()` or `summarise()`.
#'
#' For an empty data frame, the expressions will be evaluated once, even in the
#' presence of a grouping. This makes sure that the format of the resulting
#' data frame is the same for both empty and non-empty input.
#'
#' @section Connection to plyr:
#'
#' If you're familiar with plyr, `do()` with named arguments is basically
#' equivalent to [plyr::dlply()], and `do()` with a single unnamed argument
#' is basically equivalent to [plyr::ldply()]. However, instead of storing
#' labels in a separate attribute, the result is always a data frame. This
#' means that `summarise()` applied to the result of `do()` can
#' act like `ldply()`.
#'
#' @inheritParams filter
#' @param .data a tbl
#' @param ... Expressions to apply to each group. If named, results will be
#' stored in a new column. If unnamed, should return a data frame. You can
#' use `.` to refer to the current group. You can not mix named and
#' unnamed arguments.
#' @return
#' `do()` always returns a data frame. The first columns in the data frame
#' will be the labels, the others will be computed from `...`. Named
#' arguments become list-columns, with one element for each group; unnamed
#' elements must be data frames and labels will be duplicated accordingly.
#'
#' Groups are preserved for a single unnamed input. This is different to
#' [summarise()] because `do()` generally does not reduce the
#' complexity of the data, it just expresses it in a special way. For
#' multiple named inputs, the output is grouped by row with
#' [rowwise()]. This allows other verbs to work in an intuitive
#' way.
#' @export
#' @examples
#' by_cyl <- group_by(mtcars, cyl)
#' do(by_cyl, head(., 2))
#'
#' models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
#' models
#'
#' summarise(models, rsq = summary(mod)$r.squared)
#' models %>% do(data.frame(coef = coef(.$mod)))
#' models %>% do(data.frame(
#' var = names(coef(.$mod)),
#' coef(summary(.$mod)))
#' )
#'
#' models <- by_cyl %>% do(
#' mod_linear = lm(mpg ~ disp, data = .),
#' mod_quad = lm(mpg ~ poly(disp, 2), data = .)
#' )
#' models
#' compare <- models %>% do(aov = anova(.$mod_linear, .$mod_quad))
#' # compare %>% summarise(p.value = aov$`Pr(>F)`)
#'
#' if (require("nycflights13")) {
#' # You can use it to do any arbitrary computation, like fitting a linear
#' # model. Let's explore how carrier departure delays vary over the time
#' carriers <- group_by(flights, carrier)
#' group_size(carriers)
#'
#' mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
#' mods %>% do(as.data.frame(coef(.$mod)))
#' mods %>% summarise(rsq = summary(mod)$r.squared)
#'
#' \dontrun{
#' # This longer example shows the progress bar in action
#' by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
#' library(mgcv)
#' by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
#' }
#' }
do <- function(.data, ...) {
UseMethod("do")
}
#' @export
do.default <- function(.data, ...) {
do_(.data, .dots = compat_as_lazy_dots(...))
}
#' @export
#' @rdname se-deprecated
do_ <- function(.data, ..., .dots = list()) {
signal_soft_deprecated(paste_line(
"do_() is deprecated. ",
"Please use group_map() instead"
))
UseMethod("do_")
}
#' @export
do.NULL <- function(.data, ...) {
NULL
}
#' @export
do_.NULL <- function(.data, ..., .dots = list()) {
NULL
}
# Helper functions -------------------------------------------------------------
env_bind_do_pronouns <- function(env, data) {
if (is_function(data)) {
bind <- env_bind_active
} else {
bind <- env_bind
}
# Use `:=` for `.` to avoid partial matching with `.env`
bind(env, "." := data, .data = data)
}
label_output_dataframe <- function(labels, out, groups, .drop) {
data_frame <- vapply(out[[1]], is.data.frame, logical(1))
if (any(!data_frame)) {
bad("Results {bad} must be data frames, not {first_bad_class}",
bad = fmt_comma(which(!data_frame)),
first_bad_class = fmt_classes(out[[1]][[which.min(data_frame)]])
)
}
rows <- vapply(out[[1]], nrow, numeric(1))
out <- bind_rows(out[[1]])
if (!is.null(labels)) {
# Remove any common columns from labels
labels <- labels[setdiff(names(labels), names(out))]
# Repeat each row to match data
labels <- labels[rep(seq_len(nrow(labels)), rows), , drop = FALSE]
rownames(labels) <- NULL
grouped_df(bind_cols(labels, out), groups, .drop)
} else {
rowwise(out)
}
}
label_output_list <- function(labels, out, groups) {
if (!is.null(labels)) {
labels[names(out)] <- out
rowwise(labels)
} else {
class(out) <- "data.frame"
attr(out, "row.names") <- .set_row_names(length(out[[1]]))
rowwise(out)
}
}
named_args <- function(args) {
# Arguments must either be all named or all unnamed.
named <- sum(names2(args) != "")
if (!(named == 0 || named == length(args))) {
abort("Arguments must either be all named or all unnamed")
}
if (named == 0 && length(args) > 1) {
bad("Can only supply one unnamed argument, not {length(args)}")
}
# Check for old syntax
if (named == 1 && names(args) == ".f") {
abort("do syntax changed in dplyr 0.2. Please see documentation for details")
}
named != 0
}