vector-function vs mutate

lin-dake · Oct 15, 2021 · 9febb2b · 9febb2b
1 parent 506f3e2
commit 9febb2b
Show file tree

Hide file tree

Showing 23 changed files with 147 additions and 35 deletions.
diff --git a/baseR_subsetting.Rmd b/baseR_subsetting.Rmd
@@ -266,8 +266,6 @@ tb[, "x"]
 ## 延伸阅读
 
 - 如何获取`matrix(1:9, nrow = 3)`上对角元? 对角元？
-- 对数据框，思考`df["x"]`， `df[["x"]]`， `df$x`三者的区别?
-- 如果`x`是一个矩阵，请问 `x[] <- 0` 和`x <- 0` 有什么区别？
 
 ```{r subsetting-34, include=FALSE}
 m <- matrix(1:9, nrow = 3)
@@ -285,6 +283,35 @@ upper.tri(m, diag = FALSE)
 m[upper.tri(m, diag = FALSE)]
 ```
 
+- 对数据框，思考`df["x"]`， `df[["x"]]`， `df$x`三者的区别?
+
+- 如果`x`是一个矩阵，请问 `x[] <- 0` 和`x <- 0` 有什么区别？
+
+- 不添加参数na.rm = TRUE的前提下，用sum()计算向量x的元素之和
+
+```{r, eval=FALSE}
+x <- c(3, 5, NA, 2, NA)
+```
+
+提示：
+
+- 使用is.na(x) 检查向量元素是否为缺失值，并保存为新的对象x_missing
+- 将所有缺失值赋值为0
+- 然后sum() 计算
+
+```{r, include=FALSE}
+x <- c(3, 5, NA, 2, NA)
+x_missing <- is.na(x)
+x_missing
+
+x[x_missing] <- 0
+x
+sum(x)
+```
+
+
+
+
 
 
 ```{r subsetting-37, include=FALSE}

diff --git a/bayesian_categorical.Rmd b/bayesian_categorical.Rmd
@@ -149,3 +149,8 @@ m2 <- stan(model_code = stan_program, data = stan_data)
 m2
 ```
 
+
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_glm.Rmd b/bayesian_glm.Rmd
@@ -388,3 +388,7 @@ dagify(D ~ G,
   scale_y_continuous(NULL, breaks = NULL)
 ```
 
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_hierarchical.Rmd b/bayesian_hierarchical.Rmd
@@ -16,6 +16,7 @@ options(mc.cores = parallel::detectCores())
 
 ```{r}
 radon <- readr::read_rds(here::here('demo_data', "radon.rds")) 
+head(radon)
 ```
 
 
@@ -712,3 +713,8 @@ summary(fit_slope_partial, c("gamma_a", "gamma_b"))$summary
 rstan::traceplot(fit_slope_partial, pars = c("sigma"))
 ```
 
+
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_inference.Rmd b/bayesian_inference.Rmd
@@ -47,7 +47,7 @@ $$
 ```{r}
 library(tidyverse)
 d <- readr::read_rds(here::here('demo_data', "height_weight.rds")) 
-d
+head(d)
 ```
 
 
@@ -297,3 +297,7 @@ d_grid_samples %>%
 - 《Doing Bayesian Data Analysis: A Tutorial with R, JAGS, and Stan》  (2nd Edition) John Kruschke, 2014
 - 《Bayesian Models for Astrophysical Data: Using R, JAGS, Python, and Stan》， Joseph M. Hilbe, Cambridge University Press, 2017
 
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_intro_stan.Rmd b/bayesian_intro_stan.Rmd
@@ -6,7 +6,7 @@
 
 
 
-# 配置环境
+## 配置环境
 
 - 安装 [Rtools4.0](https://cran.r-project.org/bin/windows/Rtools/)到`C`盘
 
@@ -32,3 +32,7 @@ install.packages(c("tidybayes", "bayesplot"))
 <https://github.com/stan-dev/rstan/wiki/RStan-Getting-Started>
 
 
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_lm.Rmd b/bayesian_lm.Rmd
@@ -261,7 +261,7 @@ fit_normal %>%
 
 
 
-# #小结
+## 小结
 
 ```{r, out.width = '80%', fig.align = 'center', echo = FALSE}
 knitr::include_graphics(here::here("images", "from_model_to_code.jpg"))
@@ -279,12 +279,12 @@ beta   ~ normal(20, 5);
 ```
 
 - 修改stan代码，尝试推断上一章的身高分布
-```{r}
+```{r, eval=FALSE}
 d <- readr::read_rds(here::here('demo_data', "height_weight.rds")) 
 ```
 
 
-```{r}
+```{r, eval=FALSE}
 stan_program <- "
 data {
   int N;
@@ -319,11 +319,6 @@ fit <- stan(model_code = stan_program, data = stan_data,
 
 
 
-```{r}
-traceplot(fit)
-```
-
-
-```{r}
-fit
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
 ```
diff --git a/bayesian_t_test.Rmd b/bayesian_t_test.Rmd
@@ -217,7 +217,7 @@ stan_best_normal %>%
 > 标准正态分布是t分布的极限分布
 
 ```{r}
-for (nu in 1:50) {
+for (nu in c(1, seq(5, 50, by = 10))) {
  p <- tibble(x = seq(-5, 5, by=0.1)) %>% 
     ggplot(aes(x)) + 
     stat_function(fun = dnorm, color = 'gray') + 
@@ -310,7 +310,8 @@ stan_best_student %>%
 
 ```{r}
 stan_best_student %>%
-  as.data.frame()
+  as.data.frame() %>% 
+  head()
 ```
 
 
@@ -406,3 +407,9 @@ stan_linear %>%
   tidybayes::gather_draws(beta) %>%
   tidybayes::mean_hdi(.width = 0.89)
 ```
+
+
+
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/bayesian_vaccine_effectiveness.Rmd b/bayesian_vaccine_effectiveness.Rmd
@@ -1,4 +1,4 @@
-# 新冠疫苗有效率的计算 {#bayesian-vaccine-effectiveness}
+# 贝叶斯分析案例-新冠疫苗有效率的计算 {#bayesian-vaccine-effectiveness}
 
 
 ## 引言
@@ -74,7 +74,7 @@ $$
 
 具体Stan代码如下
 
-```{r eda-vaccine-effectiveness-4, cache=TRUE, results=FALSE, eval=FALSE}
+```{r eda-vaccine-effectiveness-4}
 stan_program <- "
 data {
   int<lower=1> event_c;        // num events, control
@@ -111,10 +111,10 @@ mod_vaccine <- stan(model_code = stan_program, data = stan_data)
 ```
 
 
-```{r include=FALSE}
+```{r, eval=FALSE, include=FALSE}
 # 运行stan代码，导致渲染bookdown报错，不知道为什么，先用这个笨办法凑合吧
 # mod_vaccine %>% saveRDS(here::here("stan","mod_vaccine.rds"))
-mod_vaccine <- readRDS(here::here("stan","mod_vaccine.rds"))
+# mod_vaccine <- readRDS(here::here("stan","mod_vaccine.rds"))
 
 ```
 

diff --git a/bayesian_workflow.Rmd b/bayesian_workflow.Rmd
@@ -411,3 +411,6 @@ y_i &\sim \operatorname{Normal}(\mu_i, \sigma) \\
 \end{align}
 $$
 
+```{r, echo = F, message = F, warning = F, results = "hide"}
+pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+```
diff --git a/eda_nobel.Rmd b/eda_nobel.Rmd
@@ -630,7 +630,7 @@ df4
 
 
 
-```{r eda-nobel-49, fig.width= 12, fig.height= 6}
+```{r eda-nobel-49, fig.width= 6, fig.height= 3}
 library(gganimate)
 df4 %>%
   mutate(prize_year = as.integer(prize_year)) %>%

diff --git a/eda_rowwise.Rmd b/eda_rowwise.Rmd
@@ -1,6 +1,6 @@
 # tidyverse中行方向的操作 {#eda-rowwise}
 
-dplyr 1.0 推出之后，数据框**行方向**的操作得到完美解决，因此本章的内容已经过时，大家可以跳出本章，直接阅读第\@ref(colwise) 章。（留着本章，主要是让自己时常回顾下之前的探索。让自己最难忘的，或许就是曾经的痛点吧）
+dplyr 1.0 推出之后，数据框**行方向**的操作得到完美解决，因此本章的内容已经过时，大家可以跳过本章，直接阅读第\@ref(tidyverse-colwise) 章。（留着本章，主要是让自己时常回顾下之前的探索。让自己最难忘的，或许就是曾经的痛点吧）
 
 
 ```{r rowwise-1, message = FALSE, warning = FALSE}
@@ -20,7 +20,7 @@ df
 
 ## rowwise函数
 
-dplyr提供了rowwise函数，但大神说不推荐
+dplyr提供了rowwise()函数
 ```{r rowwise-3, eval=FALSE}
 df %>%
   rowwise() %>%
@@ -218,6 +218,7 @@ df %>% mutate(mean = rowMeans(across(is.numeric & -id)))
 
 
 ## 用lay方案
+
 [lay包](https://github.com/romainfrancois/lay)解决方案
 ```{r rowwise-23, eval = FALSE}
 library(lay)

diff --git a/images/mutate-calc-square1.png b/images/mutate-calc-square1.png
diff --git a/images/mutate-calc-square2.png b/images/mutate-calc-square2.png
diff --git a/images/mutate-calc-square3.png b/images/mutate-calc-square3.png
diff --git a/images/mutate-function.png b/images/mutate-function.png
diff --git a/index.Rmd b/index.Rmd
@@ -174,7 +174,7 @@ knitr::include_graphics("images/rbook1.png")
 
 
 
-4、**关于课程目标**
+3、**关于课程目标**
 
 - 课程目标: 熟悉数据科学流程，掌握统计编程技能，能运用探索性分析方法，解决基本的实际应用问题，做到学以致用，**不是 learning R，而是 learning with R**
 
@@ -191,7 +191,7 @@ knitr::include_graphics("images/rbook1.png")
 
 
 
-5、**关于如何提问**
+4、**关于如何提问**
 
 有的同学，这样一上来就问：**老师，我的代码怎么运行不出来呢？**或者图省事，干脆手机拍个照片一发。
 

diff --git a/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf b/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf
@@ -5,7 +5,7 @@ account: wangminjie
 server: bookdown.org
 hostUrl: https://bookdown.org/__api__
 appId: 3039
-bundleId: 58198
+bundleId: 58528
 url: https://bookdown.org/wangminjie/R4DS/
-when: 1633922029.47714
-lastSyncTime: 1633922029.47714
+when: 1634285026.70162
+lastSyncTime: 1634285026.70162
diff --git a/tidystats_poisson_regression.Rmd b/tidystats_poisson_regression.Rmd
@@ -633,6 +633,7 @@ dx %>%
 ```
 
 ### 更复杂的模型
+
 以后再说
 ```{r poisson-regression-33, eval=FALSE}
 glm(number_of_fish ~ 1 + (1 | pollution_level),
@@ -645,11 +646,11 @@ glm(number_of_fish ~ 1 + (1 | pollution_level),
 
 ## 小结
 
-```{r poisson-regression-34}
+```{r poisson-regression-34, out.width = '100%', echo = FALSE}
 knitr::include_graphics(path = "images/One_Picture.png")
 ```
 
-第 \@ref(logistic-regression) 章接着讲广义线性模型中的logistic回归模型。
+第 \@ref(tidystats-logistic-regression) 章接着讲广义线性模型中的logistic回归模型。
 
 
 <!-- ## 贝叶斯泊松回归 -->

diff --git a/tidyverse_dplyr.Rmd b/tidyverse_dplyr.Rmd
@@ -122,12 +122,56 @@ knitr::include_graphics("images/pipe2.png")
 ```
 
 
-
 ```{r}
 # using `%>%`
 df %>% mutate(extra = reward) 
 ```
-是不是很赞。现在有个问题，此时 `df` 有没发生变化？是否包含额外的奖励分呢？
+
+是不是很赞。
+
+
+```{r out.width = '65%', echo = FALSE}
+knitr::include_graphics("images/mutate-function.png")
+```
+
+
+## 向量函数与mutate()
+
+`mutate()`函数的本质还是第 \@ref(baseR-operators) 章介绍的向量函数和向量化操作，只不过是换作在数据框中完成，这样更能形成“数据框进、数据框出”的思维，方便快捷地构思并完成统计任务^[https://dcl-prog.stanford.edu/data-structure-basics.html ]。
+
+比如，我们想计算每位同学分数的平方，然后构建数据框新的一列，我们可以用第 \@ref(baseR-functions) 章函数的方法，自定义`calc_square()`函数
+
+```{r}
+calc_square <- function(x) {
+  x^2
+}
+
+df %>% 
+  mutate(new_col = calc_square(score))
+```
+
+
+在`mutate()`中引用数据框的某一**列名**，实际上是引用了列名对应的**整个向量**, 所以，这里我们传递`score`到`calc_square()`，就是把整个`score`向量传递给`calc_square()`.
+
+```{r out.width = '85%', echo = FALSE}
+knitr::include_graphics("images/mutate-calc-square1.png")
+```
+
+算术运算符（这里是平方）是向量化的，因此`calc_square()`会对输入的`score`向量，返回一个等长的向量。
+
+```{r out.width = '90%', echo = FALSE}
+knitr::include_graphics("images/mutate-calc-square2.png")
+```
+
+`mutate()` 拿到这个新的向量后，就在原有数据框中添加新的一列`new_col`
+
+```{r out.width = '65%', echo = FALSE}
+knitr::include_graphics("images/mutate-calc-square3.png")
+```
+
+## 保存为新的数据框
+
+现在有个问题，此时 `df` 有没发生变化？是否包含额外的奖励分呢？
 事实上，此时`df`并没有发生改变，还是原来的状态。如果需要保存计算结果，就需要把计算的结果重新赋值给新的对象，当然，也可以赋值给`df`本身，这样`df`存储的数据就**更新**为计算后的结果。
 
 
@@ -489,6 +533,14 @@ Carol同学的信息没有显示？ Dave 同学显示了但没有考试成绩？
 df1 %>% full_join(df2, by = "name")
 ```
 
+
+## 内联结`inner_join()`
+
+只保留name条目相同的记录
+```{r, message=FALSE}
+df1 %>% inner_join(df2, by = "name")
+```
+
 ## 筛选联结
 
 筛选联结，有两个`semi_join(x, y)`和`anti_join(x, y)`，函数不改变数据框`x`的变量的数量，主要影响的是`x`的观测，也就是说会剔除一些行，其功能类似`filter()`
@@ -497,13 +549,15 @@ df1 %>% full_join(df2, by = "name")
 ```{r}
 df1 %>% semi_join(df2, by = "name")
 ```
+
 可以看作对`df1`做筛选
 ```{r}
 df1 %>% filter(
   name %in% df2$name
 )
 ```
 
+
 - 反联结`anti_join(x, y)`，丢弃name与df2的name相一致的所有行
 ```{r}
 df1 %>% anti_join(df2, by = "name")

diff --git a/tidyverse_ggplot2_academic.Rmd b/tidyverse_ggplot2_academic.Rmd
@@ -1,4 +1,4 @@
-# 科研数据可视化 {#tidyverse-ggplot2-academic}
+# ggplot2之科研数据可视化 {#tidyverse-ggplot2-academic}
 
 
 ```{r ggplot2-academic-1, echo = FALSE, message = FALSE}
-Original file line number
+Diff line change
@@ Expand Up / @@ -149,3 +149,8 @@ m2 <- stan(model_code = stan_program, data = stan_data) @@
     m2
     ```
+    ```{r, echo = F, message = F, warning = F, results = "hide"}
+    pacman::p_unload(pacman::p_loaded(), character.only = TRUE)
+    ```