add slice_n() function

beikejiangb · Apr 8, 2020 · 2784244 · 2784244
1 parent 391ba15
commit 2784244
Show file tree

Hide file tree

Showing 3 changed files with 134 additions and 11 deletions.
diff --git a/adv_dplyr.Rmd b/adv_dplyr.Rmd
@@ -589,8 +589,18 @@ iris %>%
 
 
 
-## arcoss函数
-强大的`arcoss()`函数，估计要替代以上`scope`函数
+## across函数
+
+
+数据框中向量的方向，事实上可以看做有两个方向，横着看是row-vector，竖着看是col-vector。
+- colwise:  `group_by() %>% summarise/mutate  +  across()`
+- rowwise:  `rowwise()/nest_by() %>%  summarise/mutate + c_across()`
+
+- https://dplyr.tidyverse.org/dev/articles/rowwise.html
+- https://dplyr.tidyverse.org/dev/articles/colwise.html
+
+比如
+
 ```{r, eval = FALSE}
 iris %>%
   group_by(Species) %>%
@@ -602,3 +612,60 @@ iris %>%
 ```
 
 
+
+### across函数替代scope函数
+
+强大的`across()`函数，替代以上`scope`函数(_if, _at, 和 _all函数), 同时`slice_max()`和`slice_min()`将替代`top_n()`函数，`slice_sample()`将替代`sample_n()`和`sample_frac()`函数。
+
+```{r, eval = FALSE}
+# Scoped verbs (_if / _at / _all) next to their across() equivalents.
+# Predicate functions are wrapped in where(), and extra arguments such as
+# na.rm go into a lambda rather than through across()'s `...`.
+df %>% mutate_if(is.numeric, mean, na.rm = TRUE)
+# ->
+df %>% mutate(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)))
+
+df %>% mutate_at(vars(x, starts_with("y")), mean, na.rm = TRUE)
+# ->
+df %>% mutate(across(c(x, starts_with("y")), ~ mean(.x, na.rm = TRUE)))
+
+df %>% mutate_all(mean, na.rm = TRUE)
+# ->
+df %>% mutate(across(everything(), ~ mean(.x, na.rm = TRUE)))
+```
+
+
+
+### 更方便的colwise操作
+
+```{r, eval = FALSE}
+# multiple: scale every column by the factor stored under its name in `mult`
+df <- tibble(
+  x = 1:3,
+  y = 3:5,
+  z = 5:7
+)
+mult <- list(x = 1, y = 10, z = 100)
+
+# cur_column() returns the name of the column across() is currently handling
+df %>%
+  mutate(
+    across(
+      all_of(names(mult)),
+      function(values) values * mult[[cur_column()]]
+    )
+  )
+
+
+
+# weights: add a weighted copy of each column, named "<column>.wt"
+df <- tibble(x = 1:3, y = 3:5, z = 5:7)
+df
+weights <- list(x = 0.2, y = 0.3, z = 0.5)
+
+df %>% mutate(
+  across(
+    all_of(names(weights)),
+    # look up the weight that belongs to the column being processed
+    list(wt = ~ .x * weights[[cur_column()]]),
+    # released dplyr spells the glue placeholders {.col} and {.fn}
+    .names = "{.col}.{.fn}"
+  )
+)
+
+
+
+# cutoffs: recode each column to 1/0 depending on whether the value exceeds
+# the threshold stored under the column's name in `cutoffs`
+df <- tibble(x = 1:3, y = 3:5, z = 5:7)
+df
+
+cutoffs <- list(x = 2, y = 3, z = 7)
+
+df %>%
+  mutate(
+    across(
+      all_of(names(cutoffs)),
+      function(values) {
+        threshold <- cutoffs[[cur_column()]]
+        if_else(values > threshold, 1, 0)
+      }
+    )
+  )
+```
+
+
diff --git a/eda_covid2019.Rmd b/eda_covid2019.Rmd
@@ -6,6 +6,8 @@ library(tidyverse)
 library(lubridate)
 library(maps)
 library(viridis)
+library(ggrepel)
+library(paletteer)
 library(shadowtext)
 library(showtext)
 showtext_auto()
@@ -315,6 +317,10 @@ d2 <- d1 %>%
 d2
 ```
 
+```{block, type="danger"}
+大家都谈过恋爱，也有可能失恋。每个人失恋的时间各不相同，若把失恋的当天作为第 0 天，就可以比较失恋若干天后每个人的精神波动情况。参照《失恋33天》
+```
+
 
 
 ```{r}
@@ -588,21 +594,68 @@ d2a %>%
 
 
 
-### 用因子来弄
+### 比较tidy的方法
 
 对数据框d2a增加两列属性(有无标签，有无颜色)，然后手动改颜色
 
+
 ```{r}
-d4 <- d2a %>%
-  mutate(country_label = case_when(
-    country_region %in% highlight ~ country_region,
-    country_region %in% gray ~ "gray",
-    TRUE ~ NA_character_
-  ))
+# Countries to highlight: keep each country's row(s) with its largest
+# days_since_100, then take the ten largest of those values.
+# slice_max() supersedes top_n(); like top_n() it keeps ties, and it returns
+# the rows in descending order, so a separate arrange(desc()) is not needed.
+highlight_country <- d2a %>%
+  group_by(country_region) %>%
+  filter(days_since_100 == max(days_since_100)) %>%
+  ungroup() %>%
+  slice_max(days_since_100, n = 10) %>%
+  pull(country_region)
+
+highlight_country
+```
+
 
-d4
+借鉴了[Kieran Healy大神的配色方案](https://github.com/kjhealy/covid)：
+```{r}
+## Colors
+# One darkened category20_d3 colour per highlighted country, then gray for
+# everything else. seq_along() selects nothing when highlight_country is
+# empty, whereas 1:length() would expand to c(1, 0) and still pick a colour.
+cgroup_cols <- c(
+  prismatic::clr_darken(
+    paletteer_d("ggsci::category20_d3"), 0.2
+  )[seq_along(highlight_country)],
+  "gray70"
+)
+scales::show_col(cgroup_cols)
+```
+
+
+
+```{r}
+# Line chart of `cases` against `days_since_100` on a log10 y axis, one line
+# per country. Only countries whose largest days_since_100 exceeds 9 are
+# drawn; countries in highlight_country get their own colour and a label at
+# the end of the line, all other countries share the "ZZOTHER" colour group.
+d2a %>%
+  group_by(country_region) %>%
+  filter(max(days_since_100) > 9) %>%
+  mutate(
+    # label only the last observation of each highlighted country
+    end_label = if_else(
+      days_since_100 == max(days_since_100) &
+        country_region %in% highlight_country,
+      country_region,
+      NA_character_
+    ),
+    # "ZZOTHER" is presumably chosen to sort after every country name so that
+    # it receives the last (gray) entry of cgroup_cols -- confirm
+    cgroup = case_when(
+      country_region %in% highlight_country ~ country_region,
+      TRUE ~ "ZZOTHER"
+    )
+  ) %>%
+  ggplot(aes(
+    x = days_since_100, y = cases,
+    color = cgroup, label = end_label,
+    group = country_region
+  )) +
+  # NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` for lines; `size` is
+  # kept here so the chunk also runs on older ggplot2 versions
+  geom_line(size = 0.8) +
+  geom_text_repel(
+    nudge_x = 1.1,
+    nudge_y = 0.1,
+    segment.color = NA
+  ) +
+  # "none" is the supported way to drop a legend; FALSE is deprecated
+  guides(color = "none") +
+  scale_color_manual(values = cgroup_cols) +
+  scale_y_continuous(
+    labels = scales::comma_format(accuracy = 1),
+    breaks = 10^seq(2, 8),
+    trans = "log10"
+  ) +
+  # NOTE(review): the axis labels and title speak of deaths, while the plotted
+  # column is `cases` and the subtitle speaks of cases -- confirm which is
+  # meant and align the texts
+  labs(
+    x = "Days Since 100 Confirmed Death",
+    y = "Cumulative Number of Deaths (log10 scale)",
+    title = "Cumulative Number of Reported Deaths from COVID-19, Selected Countries",
+    subtitle = "Cumulative number of cases, by Number of days since 100th case",
+    caption = "data source from @www.ft.com"
+  )
 ```
-感觉这样还是有点折腾。
+
+感觉这样是最好的方案。
+
 
 
 

diff --git a/eda_olympics.Rmd b/eda_olympics.Rmd
@@ -248,3 +248,6 @@ tb4 %>%
   ))
 ```
 
+## 课后作业
+
+- 探索数据，建立身高体重的线性模型
-Original file line number
+Diff line change
@@ Expand Up / @@ -248,3 +248,6 @@ tb4 %>% @@
       ))
     ```
+    ## 课后作业
+    - 探索数据，建立身高体重的线性模型