ARROW-10668: [R] Support for the .data pronoun

and tests for the `.env` pronoun. Closes apache#9051 from jonkeane/dot_data. Authored-by: Jonathan Keane <[email protected]>. Signed-off-by: Neal Richardson <[email protected]>.
zhixingheyi-tian · Dec 31, 2020 · c92256d · c92256d
1 parent dd5fe70
commit c92256d
Show file tree

Hide file tree

Showing 5 changed files with 71 additions and 2 deletions.
diff --git a/r/NEWS.md b/r/NEWS.md
@@ -32,6 +32,7 @@
 * Table columns can now be added, replaced, or removed by assigning `<-` with either `$` or `[[`
 * Column names of Tables and RecordBatches can be renamed by assigning `names()`
 * Large string types can now be written to Parquet files
+* The [pronouns `.data` and `.env`](https://rlang.r-lib.org/reference/tidyeval-data.html) are now fully supported in Arrow-dplyr pipelines.
 
 ## Bug fixes
 

diff --git a/r/R/dplyr.R b/r/R/dplyr.R
@@ -264,7 +264,10 @@ filter_mask <- function(.data) {
   env_bind(f_env, !!!lapply(func_names, comp_func))
   # Then add the column references
   # Renaming is handled automatically by the named list
-  env_bind(f_env, !!!lapply(.data$selected_columns, var_binder))
+  data_pronoun <- lapply(.data$selected_columns, var_binder)
+  env_bind(f_env, !!!data_pronoun)
+  # Then bind the data pronoun
+  env_bind(f_env, .data = data_pronoun)
   new_data_mask(f_env)
 }
 

diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
@@ -438,6 +438,35 @@ test_that("filter() with %in%", {
   )
 })
 
+test_that("filter() with .data", {
+  # All three expectations share one projection, itself written with `.data`.
+  ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
+  projected <- select(ds, .data$int, .data$part)
+
+  # Equality on the `int` column plus equality on the partitioning field.
+  expect_equivalent(
+    projected %>% filter(.data$int == 3, .data$part == 1) %>% collect(),
+    tibble(int = df1$int[3], part = 1)
+  )
+
+  # `%in%` on the `int` column plus equality on the partitioning field.
+  expect_equivalent(
+    projected %>%
+      filter(.data$int %in% c(6, 4, 3, 103, 107), .data$part == 1) %>%
+      collect(),
+    tibble(int = df1$int[c(3, 4, 6)], part = 1)
+  )
+
+  # Same filter, with the partition value read through the `.env` pronoun.
+  chr <- 1
+  expect_equivalent(
+    projected %>%
+      filter(.data$int %in% c(6, 4, 3, 103, 107), .data$part == .env$chr) %>%
+      collect(),
+    tibble(int = df1$int[c(3, 4, 6)], part = 1)
+  )
+})
+
 test_that("filter() on timestamp columns", {
   ds <- open_dataset(dataset_dir, partitioning = schema(part = uint8()))
   expect_equivalent(

diff --git a/r/tests/testthat/test-dplyr.R b/r/tests/testthat/test-dplyr.R
@@ -232,6 +232,49 @@ test_that("Filtering with a function that doesn't have an Array/expr method stil
   )
 })
 
+test_that("filter() with .data pronoun", {
+  # A column referenced through `.data` inside a comparison
+  expect_dplyr_equal(
+    input %>%
+      filter(.data$dbl > 4) %>%
+      select(.data$chr, .data$int, .data$lgl) %>%
+      collect(),
+    tbl
+  )
+
+  # A column referenced through `.data` inside a function call
+  expect_dplyr_equal(
+    input %>%
+      filter(is.na(.data$lgl)) %>%
+      select(.data$chr, .data$int, .data$lgl) %>%
+      collect(),
+    tbl
+  )
+
+  # and the .env pronoun too! The local below shares its name with the `chr`
+  # column that `select()` pulls via `.data$chr`.
+  chr <- 4
+  expect_dplyr_equal(
+    input %>%
+      filter(.data$dbl > .env$chr) %>%
+      select(.data$chr, .data$int, .data$lgl) %>%
+      collect(),
+    tbl
+  )
+
+  # but there is an error if we don't override the masking with `.env`
+  # NOTE(review): starts from `input` and passes `tbl`, matching the calls
+  # above; assumes expect_dplyr_error() takes the same (expr, tbl) pair as
+  # expect_dplyr_equal() -- confirm against the test helper.
+  expect_dplyr_error(
+    input %>%
+      filter(.data$dbl > chr) %>%
+      select(.data$chr, .data$int, .data$lgl) %>%
+      collect(),
+    tbl
+  )
+})
+
 test_that("summarize", {
   expect_dplyr_equal(
     input %>%