Skip to content

Commit

Permalink
group_by() does not create an arbitrary NA group when grouping by f…
Browse files Browse the repository at this point in the history
…actors with `drop = TRUE`.

closes tidyverse#4460
  • Loading branch information
romainfrancois committed Jul 17, 2019
1 parent c6a1f61 commit b82aff2
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 6 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# dplyr (development version)

* `group_by()` does not create an arbitrary NA group when grouping by factors with `drop = TRUE` (#4460).

* `rbind_all()` and `rbind_list()` have been removed (@bjungbogati, #4433).

* `dr_dplyr()` has been removed as it is no longer needed (#4433, @smwindecker).
Expand Down
39 changes: 33 additions & 6 deletions src/group_indices.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -191,12 +191,17 @@ class FactorSlicer : public Slicer {
f(data[depth]),
nlevels(Rf_length(Rf_getAttrib(f, symbols::levels))),

levels(nlevels + 1),
indices(nlevels + 1),
slicers(nlevels + 1),
slicers(),
slicer_size(0),
has_implicit_na(false),
drop(drop_)
{
for (int i = 0; i < nlevels; i++) {
levels[i] = i + 1;
}
levels[nlevels] = NA_INTEGER;
train(index_range);
}

Expand All @@ -214,7 +219,7 @@ class FactorSlicer : public Slicer {
groups_range.add(idx);

// fill the groups at these indices
std::fill_n(INTEGER(x) + idx.start, idx.size, i + 1);
std::fill_n(INTEGER(x) + idx.start, idx.size, levels[i]);
}

if (has_implicit_na) {
Expand Down Expand Up @@ -244,12 +249,34 @@ class FactorSlicer : public Slicer {
}
if (!has_implicit_na) {
indices.pop_back();
slicers.pop_back();
levels.pop_back();
}
// ---- for each level, train child slicers

int n = nlevels + has_implicit_na;

// ---- drop unused levels
if (drop) {
n = 0;
std::vector<int>::iterator levels_it = levels.begin();
std::vector< std::vector<int> >::iterator indices_it = indices.begin();

for (; levels_it != levels.end();) {
if (indices_it->size() == 0) {
indices_it = indices.erase(indices_it);
levels_it = levels.erase(levels_it);
} else {
++n;
++indices_it;
++levels_it;
}
}
nlevels = n - has_implicit_na;
}

slicers.reserve(n);
for (int i = 0; i < n; i++) {
slicers[i] = slicer(indices[i], depth + 1, data, visitors, drop);
slicers.push_back(slicer(indices[i], depth + 1, data, visitors, drop));
slicer_size += slicers[i]->size();
}

Expand All @@ -268,7 +295,6 @@ class FactorSlicer : public Slicer {
} else {
indices[value - 1].push_back(idx);
}

}
}

Expand All @@ -280,6 +306,7 @@ class FactorSlicer : public Slicer {
Factor f;
int nlevels;

std::vector<int> levels;
std::vector< std::vector<int> > indices;
std::vector< boost::shared_ptr<Slicer> > slicers;
int slicer_size;
Expand Down Expand Up @@ -421,7 +448,7 @@ boost::shared_ptr<Slicer> slicer(const std::vector<int>& index_range, int depth,
return boost::shared_ptr<Slicer>(new LeafSlicer(index_range));
} else {
SEXP x = data[depth];
if (Rf_isFactor(x) && !drop) {
if (Rf_isFactor(x)) {
return boost::shared_ptr<Slicer>(new FactorSlicer(depth, index_range, data, visitors, drop));
} else {
return boost::shared_ptr<Slicer>(new VectorSlicer(depth, index_range, data, visitors, drop));
Expand Down
9 changes: 9 additions & 0 deletions tests/testthat/test-group-by.r
Original file line number Diff line number Diff line change
Expand Up @@ -527,3 +527,12 @@ test_that("group_by() puts NA groups last in STRSXP (#4227)", {
expect_identical(res$x, c("apple", "banana", NA_character_))
expect_identical(res$.rows, list(1L, 3L, 2L))
})

test_that("group_by() does not create arbitrary NA groups for factors when drop = TRUE (#4460)", {
res <- expect_warning(group_data(group_by(iris, Species)[0, ]), NA)
expect_equal(nrow(res), 0L)

res <- expect_warning(group_data(group_by(iris[0, ], Species)), NA)
expect_equal(nrow(res), 0L)
})

0 comments on commit b82aff2

Please sign in to comment.