diff --git a/_bookdown.yml b/_bookdown.yml index a398696..a394302 100644 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -50,6 +50,7 @@ rmd_files: [ "broom.Rmd", "tidystats.Rmd", "tests_as_linear.Rmd", + "infer.Rmd", "lmm.Rmd", "poisson_regression.Rmd", "logistic_regression.Rmd", @@ -65,6 +66,7 @@ rmd_files: [ "eda_height.Rmd", "eda_caribou.Rmd", "eda_penguins.Rmd", + "eda_career_decision.Rmd", "rvest.Rmd", "tidygraph.Rmd", "tidytext.Rmd", diff --git a/demo_data/career-decision.xlsx b/demo_data/career-decision.xlsx new file mode 100644 index 0000000..c1f3018 Binary files /dev/null and b/demo_data/career-decision.xlsx differ diff --git a/demo_data/gss.rds b/demo_data/gss.rds new file mode 100644 index 0000000..83098d0 Binary files /dev/null and b/demo_data/gss.rds differ diff --git a/eda_career_decision.Rmd b/eda_career_decision.Rmd new file mode 100644 index 0000000..ef41647 --- /dev/null +++ b/eda_career_decision.Rmd @@ -0,0 +1,411 @@ +# 探索性数据分析-大学生职业决策 {#eda-career-decision} + +## 预备知识 + +```{r, message=FALSE, warning=FALSE} +library(tidyverse) + +example <- + tibble::tribble( + ~name, ~english, ~chinese, ~math, ~sport, ~psy, ~edu, + "A", 133, 100, 102, 56, 89, 89, + "B", 120, 120, 86, 88, 45, 75, + "C", 98, 109, 114, 87, NA, 84, + "D", 120, 78, 106, 68, 86, 69, + "E", 110, 99, 134, 98, 75, 70, + "F", NA, 132, 130, NA, 68, 88 + ) + +example +``` + +### 缺失值检查 + +我们需要判断每一列的缺失值 + +```{r} +example %>% + summarise( + na_in_english = sum(is.na(english)), + na_in_chinese = sum(is.na(chinese)), + na_in_math = sum(is.na(math)), + na_in_sport = sum(is.na(sport)), + na_in_psy = sum(is.na(math)), # tpyo here + na_in_edu = sum(is.na(edu)) + ) +``` + +我们发现,这种写法比较笨,而且容易出错,比如`na_in_psy = sum(is.na(math))` 就写错了。那么有没有`既偷懒又安全`的方法呢?有的。但代价是需要学会`across()`函数,大家可以在Console中输入`?dplyr::across`查看帮助文档,或者看第 \@ref(colwise) 章。 + +```{r} +example %>% + summarise( + across(everything(), mean) + ) + + +example %>% + summarise( + across(everything(), function(x) sum(is.na(x)) ) + ) +``` + + +### 数据预处理 + +- 直接**丢弃**缺失值所在的行 + +```{r} +example %>% drop_na() +``` + + + +- 用**均值**代替缺失值 + +```{r eval=FALSE, include=FALSE} +example %>% + mutate( + english_new = if_else(is.na(english), mean(english, na.rm = T), english) + ) +``` + + + +```{r} +d <- example %>% + mutate( + across(where(is.numeric), ~ if_else(is.na(.), mean(., na.rm = T), .)) + ) +d +``` + + + + +- 计算总分/均值 + +```{r} +d %>% + rowwise() %>% + mutate( + total = sum(c_across(-name)) + ) + +d %>% + rowwise() %>% + mutate( + mean = mean(c_across(-name)) + ) +``` + + + + +- **数据标准化**处理 + +```{r} +standard <- function(x) { + (x - mean(x)) / sd(x) +} + +``` + + + + +```{r} +d %>% + mutate( + across(where(is.numeric), standard) + ) +``` + + + + +## 开始 + +### 文件管理中需要注意的地方 + +感谢康钦虹同学提供的数据,但这里有几点需要注意的地方: + +| 事项 | 问题 | 解决办法 | +|---------- |--------------------------- |-----------------------------------------------| +| 文件名 | excel的文件名是中文 | 用英文,比如 `data.xlsx` | +| 列名 | 列名中有-号,大小写不统一 | 规范列名,或用`janitor::clean_names()`偷懒 | +| 预处理 | 直接在原始数据中新增 | 不要在原始数据上改动,统计工作可以在R里实现 | +| 文件管理 | 没有层级 | 新建`data`文件夹装数据,与`code.Rmd`并列 | + + + + +```{r, message=FALSE, warning=FALSE} +data <- readxl::read_excel("demo_data/career-decision.xlsx", skip = 1) %>% + janitor::clean_names() + +#glimpse(data) +``` + + + +```{r} +d <- data %>% select(1:61) +#glimpse(d) +``` + + + + + +### 缺失值检查 + +```{r} +d %>% + summarise( + across(everything(), ~sum(is.na(.))) + ) +``` + +没有缺失值,挺好 + + +### 数据预处理 + +采用利克特式 5 点计分... 
(这方面你们懂得比我多) + +```{r} +d <- d %>% + rowwise() %>% + mutate( + environment_exploration = sum(c_across(z1:z5)), + self_exploration = sum(c_across(z6:z9)), + objective_system_exploration = sum(c_across(z10:z15)), + info_quantity_exploration = sum(c_across(z16:z18)), + + self_evaluation = sum(c_across(j1:j6)), + information_collection = sum(c_across(j7:j15)), + target_select = sum(c_across(j16:j24)), + formulate = sum(c_across(j25:j32)), + problem_solving = sum(c_across(j33:j39)), + + career_exploration = sum(c_across(z1:z18)), + career_decision_making = sum(c_across(j1:j39)) + ) %>% + select(-starts_with("z"), -starts_with("j")) %>% + ungroup() %>% + mutate(pid = 1:n(), .before = sex) %>% + mutate( + across(c(pid, sex, majoy, grade, from), as_factor) + ) + +#glimpse(d) +``` + +### 标准化 + + +```{r} +standard <- function(x) { + (x - mean(x)) / sd(x) +} + +d <- d %>% + mutate( + across(where(is.numeric), standard) + ) +d +``` + + +## 探索 + +### 想探索的问题 + +- 不同性别(或者年级,生源地,专业)下,各指标分值的差异性 +- 两个变量的相关分析和回归分析 +- 更多(欢迎大家提出了喔) + + +### 男生女生在职业探索上有所不同? + +以性别为例。因为性别变量是男女,仅仅2组,所以检查男女**在各自指标上的均值差异**,可以用T检验。 + +```{r} +d %>% + group_by(sex) %>% + summarise( + across(where(is.numeric), mean) +) +``` + +你可以给这个图颜色弄得更好看点? +```{r, fig.width=4, fig.height=3.5, fig.align="center"} +library(ggridges) +d %>% + ggplot(aes(x = career_exploration, y = sex, fill = sex)) + + geom_density_ridges() +``` + + + + +```{r} +t_test_eq <- t.test(career_exploration ~ sex, data = d, var.equal = TRUE) %>% + broom::tidy() +t_test_eq +``` + + + +```{r} +t_test_uneq <- t.test(career_exploration ~ sex, data = d, var.equal = FALSE) %>% + broom::tidy() +t_test_uneq +``` + + + +当然,也可以用第 \@ref(infer) 章介绍的统计推断的方法 + + +```{r} +library(infer) + +obs_diff <- d %>% + specify(formula = career_exploration ~ sex) %>% + calculate("diff in means", order = c("1", "2")) +obs_diff +``` + + + + +```{r} +null_dist <- d %>% + specify(formula = career_exploration ~ sex) %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate(stat = "diff in means", order = c("1", "2")) +null_dist +``` + + + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_diff, direction = "two_sided") +``` + + + +```{r} +null_dist %>% + get_p_value(obs_stat = obs_diff, direction = "two_sided") %>% + #get_p_value(obs_stat = obs_diff, direction = "less") %>% + mutate(p_value_clean = scales::pvalue(p_value)) +``` + + + + + +也可以用tidyverse的方法一次性的搞定**所有指标** + +```{r} +d %>% + pivot_longer( + cols = -c(pid, sex, majoy, grade, from), + names_to = "index", + values_to = "value" + ) %>% + group_by(index) %>% + summarise( + broom::tidy( t.test(value ~ sex, data = cur_data())) + ) %>% + select(index, estimate, statistic, p.value) %>% + arrange(p.value) +``` + + + + + +### 来自不同地方的学生在职业探索上有所不同? 
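先按生源地做个简单汇总,看看每类各有多少人、各指标的均值大致在什么水平(`from` 的具体编码以数据里的取值为准,这里只是描述性地看一眼):

```{r}
d %>%
  group_by(from) %>%
  summarise(
    n = n(),
    across(where(is.numeric), mean)
  )
```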
+ +以生源地为例。因为生源地有3类,所以可以使用方差分析。 + +```{r} +aov(career_exploration ~ from, data = d) %>% + TukeyHSD(which = "from") %>% + broom::tidy() +``` + + + +```{r, eval=FALSE, include=FALSE} +lm(career_exploration ~ from, data = d) %>% + broom::tidy() +``` + + + +```{r, fig.width=4, fig.height=3.5, fig.align="center"} +library(ggridges) +d %>% + ggplot(aes(x = career_exploration, y = from, fill = from)) + + geom_density_ridges() +``` + + + +也可以一次性的搞定**所有指标** + +```{r} +d %>% + pivot_longer( + cols = -c(pid, sex, majoy, grade, from), + names_to = "index", + values_to = "value" + ) %>% + group_by(index) %>% + summarise( + broom::tidy( aov(value ~ from, data = cur_data())) + ) %>% + select(index, term, statistic, p.value) %>% + filter(term != "Residuals") %>% + arrange(p.value) +``` + + +### 职业探索和决策之间有关联? + +可以用第 \@ref(lm) 章线性模型来探索 + +```{r} +lm(career_decision_making ~ career_exploration, data = d) +``` + + + + +不要因为我讲课讲的很垃圾,就错过了R的美,瑕不掩瑜啦。要相信自己,你们是川师研究生中最聪明的。 + + +```{r echo=FALSE, fig.align='center', out.width='90%'} +knitr::include_graphics("images/support.jpg") +``` + + + + +```{r, echo = F} +# remove the objects +# rm(list=ls()) +rm(d, data, example, null_dist, obs_diff, standard, t_test_eq, t_test_uneq) +``` + +```{r, echo = F, message = F, warning = F, results = "hide"} +pacman::p_unload(pacman::p_loaded(), character.only = TRUE) +``` diff --git a/exercises/01_stats.R b/exercises/01_stats.R new file mode 100644 index 0000000..6c02fac --- /dev/null +++ b/exercises/01_stats.R @@ -0,0 +1,46 @@ +# 计算 +1 + 5 + + + +# 生成序列 +1:100 + + + +# 把序列装到变量里 +x <- 1:100 + + + +# 求和 +sum(x) + + + +# 求均值 +mean(x) + + + +# 求方差 +sd(x) + + +# 生成正态分布的随机数 +rnorm(20, mean = 0, sd = 1) + + + +# T检验 +t.test(extra ~ group, data = sleep) + + +# 方差分析 +aov(mpg ~ wt, data = mtcars) + + + +# 线性模型 +lm(mpg ~ wt, data = mtcars) + diff --git a/exercises/02_visual.R b/exercises/02_visual.R new file mode 100644 index 0000000..0261c1d --- /dev/null +++ b/exercises/02_visual.R @@ -0,0 +1,4 @@ +library(ggplot2) + +ggplot(mpg, aes(x = displ, y = hwy)) + + geom_point(aes(colour = class)) diff --git a/exercises/3_eda.R b/exercises/03_eda.R similarity index 100% rename from exercises/3_eda.R rename to exercises/03_eda.R diff --git a/exercises/4_reproducible.Rmd b/exercises/04_reproducible.Rmd similarity index 61% rename from exercises/4_reproducible.Rmd rename to exercises/04_reproducible.Rmd index bac7710..a034336 100644 --- a/exercises/4_reproducible.Rmd +++ b/exercises/04_reproducible.Rmd @@ -8,7 +8,7 @@ output: extra_dependencies: ctex: UTF8 number_sections: yes - toc: yes + #toc: yes df_print: kable classoptions: "hyperref, 12pt, a4paper" --- @@ -23,31 +23,28 @@ knitr::opts_chunk$set(echo = TRUE, ``` - # 引言 -这是一份关于新冠肺炎的探索性分析报告。 - +新型冠状病毒疫情在多国蔓延,一些国家的病例确诊数量明显增多,各国防疫力度继续加强。本章通过分析疫情数据,了解疫情发展,祝愿人类早日会战胜病毒! 
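下面的分析用到 covdata 宏包,它不在 CRAN 上。如果尚未安装,可以先从 GitHub 安装(仓库地址按其主页 https://kjhealy.github.io/covdata/ 推断,写法仅供参考):

```{r, eval=FALSE}
# covdata 不在 CRAN 上,这里假设其 GitHub 仓库为 kjhealy/covdata
remotes::install_github("kjhealy/covdata")
```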
# 导入数据 -首先,我们加载需要的宏包,其中tidyverse用于数据探索、covdata用于获取数据 +首先,我们加载宏包tidyverse -```{r} -# Load libraries +```{r, warning=FALSE, message=FALSE} library(tidyverse) library(covdata) ``` 论文的数据来源 -[https://kjhealy.github.io/covdata/](https://kjhealy.github.io/covdata/),我们选取部分数据看看 +[https://kjhealy.github.io/covdata/](https://kjhealy.github.io/covdata/),我们选取最近数据看看 ```{r, echo = FALSE} covnat %>% - tail(10) + tail(8) ``` @@ -69,19 +66,43 @@ covnat %>% -# 可视化 +# 数据探索 + +找出**累积确诊病例**最多的几个国家 + +```{r} +covnat %>% + ungroup() %>% + filter(date == max(date)) %>% + slice_max(cu_cases, n = 8) +``` + -为了更好的呈现数据,我们将筛选出美国确诊病例数据,并可视化 +全球**确诊病例**和**死亡病例** + +```{r} +covnat %>% + ungroup() %>% + summarise( + total_cases = sum(cases), + total_death = sum(deaths), + ) +``` + + + +美国疫情变化情况 ```{r, fig.showtext = TRUE} -covdata::covnat %>% - dplyr::filter(iso3 == "USA") %>% - dplyr::filter(cu_cases > 0) %>% - +covnat %>% + filter(iso3 == "USA") %>% + filter(cu_cases > 0) %>% + ungroup() %>% ggplot(aes(x = date, y = cases)) + - geom_path() + + geom_line() + + scale_x_date(name = NULL, breaks = "month") + labs(title = "美国新冠肺炎累积确诊病例", - subtitle = "数据来源https://kjhealy.github.io/covdata/") + subtitle = "数据来源https://kjhealy.github.io/covdata/") ``` diff --git a/exercises/05_R_vs_python.Rmd b/exercises/05_R_vs_python.Rmd new file mode 100644 index 0000000..09da2ba --- /dev/null +++ b/exercises/05_R_vs_python.Rmd @@ -0,0 +1,38 @@ +--- +title: "R_vs_python" +output: html_document +--- + + + +```{r} +library(reticulate) +``` + + +```{python} +[1,2,3] *2 +``` + + + +```{r} +c(1, 2, 3) *2 +``` + + + + + +```{python} +a = [1,2,3] +a[0] +``` + + + + +```{r} +x <- c(1, 2, 3) +x[1] +``` \ No newline at end of file diff --git a/exercises/1_stats.R b/exercises/1_stats.R deleted file mode 100644 index 5f96ca8..0000000 --- a/exercises/1_stats.R +++ /dev/null @@ -1,58 +0,0 @@ -# Numeric Functions -1 + 5 -1:100 -abs(-3.14) -sqrt(3.14) -floor(3.14) -round(3.14) -cos(3.14) -log(3.14) -exp(3.14) - -seq(1, 10, 2) -rep(1:3, 2) - - - - - -# Character Functions -substr("abcdef", 2, 4) -grep("a", c("alice", "bob", "claro")) -strsplit("a.b.c", "\\.") -toupper("Alice") -tolower("Alice") - - - - -# Statistical Functions -x <- 1:10 -sum(x) -min(x) -mean(x) -sd(x) -var(x) -median(x) -quantile(x, probs = 0.75) -range(x) -scale(x, center = TRUE, scale = TRUE) - - - -# Statistical Probability Functions -rnorm(20, mean = 0, sd = 1) -dnorm(0.5, mean = 0, sd = 1) -rpois(100, lambda = 10) -dpois(2, lambda = 10) - - - - - -# Regression Modeling -lm(mpg ~ wt, data = mtcars) -aov(mpg ~ wt, data = mtcars) -t.test(extra ~ group, data = sleep) - - diff --git a/exercises/2_visual.R b/exercises/2_visual.R deleted file mode 100644 index fe098e0..0000000 --- a/exercises/2_visual.R +++ /dev/null @@ -1,14 +0,0 @@ -library(ggplot2) - -ggplot(midwest, aes(x = area, y = poptotal)) + - geom_point(aes(color = state, size = popdensity)) + - geom_smooth(method = "loess", se = F) + - xlim(c(0, 0.1)) + - ylim(c(0, 500000)) + - labs( - subtitle = "Area Vs Population", - y = "Population", - x = "Area", - title = "Scatterplot", - caption = "Source: midwest" - ) diff --git a/images/BBC-style-Plot.png b/images/BBC-style-Plot.png new file mode 100644 index 0000000..140762a Binary files /dev/null and b/images/BBC-style-Plot.png differ diff --git a/images/Resampling.jpg b/images/Resampling.jpg new file mode 100644 index 0000000..0f7fa85 Binary files /dev/null and b/images/Resampling.jpg differ diff --git a/images/downey.png b/images/downey.png new file mode 100644 index 0000000..2044228 
Binary files /dev/null and b/images/downey.png differ diff --git a/images/engine_dashboard.png b/images/engine_dashboard.png new file mode 100644 index 0000000..20d34f3 Binary files /dev/null and b/images/engine_dashboard.png differ diff --git a/images/fail_to_reject_you.png b/images/fail_to_reject_you.png new file mode 100644 index 0000000..b9e8edc Binary files /dev/null and b/images/fail_to_reject_you.png differ diff --git a/images/imdb.png b/images/imdb.png new file mode 100644 index 0000000..7741210 Binary files /dev/null and b/images/imdb.png differ diff --git a/images/infer-ht-diagram.png b/images/infer-ht-diagram.png new file mode 100644 index 0000000..b940396 Binary files /dev/null and b/images/infer-ht-diagram.png differ diff --git a/images/infer_workflow.jpeg b/images/infer_workflow.jpeg new file mode 100644 index 0000000..eece5f8 Binary files /dev/null and b/images/infer_workflow.jpeg differ diff --git a/images/nature_editorial.png b/images/nature_editorial.png new file mode 100644 index 0000000..772431f Binary files /dev/null and b/images/nature_editorial.png differ diff --git a/images/support.jpg b/images/support.jpg new file mode 100644 index 0000000..a70004d Binary files /dev/null and b/images/support.jpg differ diff --git a/images/time_effort.gif b/images/time_effort.gif new file mode 100644 index 0000000..3e82eb1 Binary files /dev/null and b/images/time_effort.gif differ diff --git a/index.Rmd b/index.Rmd index de6790f..ea48c2d 100644 --- a/index.Rmd +++ b/index.Rmd @@ -98,6 +98,7 @@ knitr::include_graphics("images/rbook1.png") - 第 \@ref(broom) 章介绍模型输出结果的规整 - 第 \@ref(tidystats) 章介绍方差分析 - 第 \@ref(tests-as-linear) 章介绍统计检验与线性模型的等价性 + - 第 \@ref(infer) 章介绍统计推断 - 第 \@ref(lmm) 章介绍多层线性模型 - 第 \@ref(poisson-regression) 章介绍广义线性模型中的泊松回归 - 第 \@ref(logistic-regression) 章介绍logistic回归模型 @@ -111,6 +112,7 @@ knitr::include_graphics("images/rbook1.png") - 第 \@ref(eda-height) 章介绍探索性数据分析-身高体重 - 第 \@ref(eda-caribou) 章介绍探索性数据分析-驯鹿迁移 - 第 \@ref(eda-penguins) 章介绍探索性数据分析-企鹅的故事 + - 第 \@ref(eda-career-decision) 章介绍探索性数据分析-大学生职业决策 - 第 \@ref(rvest) 章介绍网页爬虫 - 第 \@ref(tidygraph) 章介绍社会网络分析 - 第 \@ref(tidytext) 章介绍文本挖掘 diff --git a/infer.Rmd b/infer.Rmd new file mode 100644 index 0000000..eeef516 --- /dev/null +++ b/infer.Rmd @@ -0,0 +1,414 @@ +# 统计推断 {#infer} + +Statistical Inference: A Tidy Approach + +## 案例1:你会给爱情片还是动作片高分? 
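本章用 infer 宏包做基于模拟的统计推断,案例1的数据来自 ggplot2movies 宏包。两个宏包都在 CRAN 上,如果尚未安装,可以先安装:

```{r, eval=FALSE}
install.packages(c("infer", "ggplot2movies"))
```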
+ +```{r out.width = '65%', fig.align='left', echo = FALSE} +knitr::include_graphics("images/imdb.png") +``` + +这是一个关于电影评分的数据集[^1], + +[^1]: + +```{r} +library(tidyverse) +d <- ggplot2movies::movies +d +``` + +数据集包含58788 行 和 24 变量 + +| variable | description | +|:------------|:-----------------| +| title | 电影名 | +| year | 发行年份 | +| budget | 预算金额 | +| length | 电影时长 | +| rating | 平均得分 | +| votes | 投票人数 | +| r1-10 | 各分段投票人占比 | +| mpaa | MPAA 分级 | +| action | 动作片 | +| animation | 动画片 | +| comedy | 喜剧片 | +| drama | 戏剧 | +| documentary | 纪录片 | +| romance | 爱情片 | +| short | 短片 | + +```{r, eval=FALSE, include=FALSE} +d %>% + rowwise() %>% + mutate( + t = sum(c_across(starts_with("r"))) + ) +``` + +我们想看下爱情片与动作片(不是爱情动作片)的平均得分是否显著不同。 + +- 首先我们简单的整理下数据,主要是剔除既是爱情片又是动作片的电影 + +```{r} +movies_genre_sample <- d %>% + select(title, year, rating, Action, Romance) %>% + filter(!(Action == 1 & Romance == 1)) %>% # 既是爱情片又是动作片的,删去 + mutate(genre = case_when( + Action == 1 ~ "Action", + Romance == 1 ~ "Romance", + TRUE ~ "Neither" + )) %>% + filter(genre != "Neither") %>% + select(-Action, -Romance) %>% + group_by(genre) %>% + slice_sample(n = 34) %>% # 每种题材的电影只选取了34个 + ungroup() + +movies_genre_sample +``` + +- 先看下图形 + +```{r message=FALSE, warning=FALSE} +movies_genre_sample %>% + ggplot(aes(x = genre, y = rating)) + + geom_boxplot() + + geom_jitter() +``` + +- 看下两种题材电影评分的分布 + +```{r} +movies_genre_sample %>% + ggplot(mapping = aes(x = rating)) + + geom_histogram(binwidth = 1, color = "white") + + facet_grid(vars(genre)) +``` + +- 统计两种题材电影评分的均值 + +```{r} +summary_ratings <- movies_genre_sample %>% + group_by(genre) %>% + summarize( + mean = mean(rating), + std_dev = sd(rating), + n = n() + ) +summary_ratings +``` + +### 传统的基于频率方法的t检验 + +假设: + +- 零假设: + + - $H_0: \mu_{1} - \mu_{2} = 0$ + +- 备选假设: + + - $H_A: \mu_{1} - \mu_{2} \neq 0$ + +两种可能的结论: + +- 拒绝 $H_0$ +- 不能拒绝 $H_0$ + +```{r} +t_test_eq <- t.test(rating ~ genre, + data = movies_genre_sample, + var.equal = TRUE +) %>% + broom::tidy() +t_test_eq +``` + +```{r} +t_test_uneq <- t.test(rating ~ genre, + data = movies_genre_sample, + var.equal = FALSE +) %>% + broom::tidy() +t_test_uneq +``` + +### infer:基于模拟的检验 + +所有的假设检验都符合这个框架[^2]: + +[^2]: + +```{r out.width = '85%', fig.align='left', echo = FALSE, fig.cap = "Hypothesis Testing Framework"} +knitr::include_graphics("images/downey.png") +``` + +- 实际观察的差别 + +```{r} +library(infer) + +obs_diff <- movies_genre_sample %>% + specify(formula = rating ~ genre) %>% + calculate( + stat = "diff in means", + order = c("Romance", "Action") + ) +obs_diff +``` + +- 模拟 + +```{r} +null_dist <- movies_genre_sample %>% + specify(formula = rating ~ genre) %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate( + stat = "diff in means", + order = c("Romance", "Action") + ) +head(null_dist) +``` + +- 可视化 + +```{r} +null_dist %>% + visualize() +``` + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_diff, direction = "both") +# shade_p_value(bins = 100, obs_stat = obs_diff, direction = "both") +``` + +- 计算p值 + +```{r} +pvalue <- null_dist %>% + get_pvalue(obs_stat = obs_diff, direction = "two_sided") + +pvalue +``` + +- 结论 + +在构建的虚拟($\Delta = 0$)的平行世界里,出现实际观察值(`r obs_diff$stat`)的概率很小,这里是(`r pvalue$p_value`)。 如果以(p\< 0.05)为标准,那我们有足够的证据证明,H0不成立,即爱情电影和动作电影的评分均值存在**显著差异**,具体来说,动作电影的平均评分要比爱情电影低些。 + +## 案例2: 航天事业的预算有党派门户之见? + +美国国家航空航天局的预算是否存在党派门户之见? 
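这类问题本质上是在检验两个分类变量(党派、对预算的态度)是否独立。在看真实数据之前,先用一个完全虚构的小列联表感受一下卡方检验的用法(数字纯属示意,与真实数据无关):

```{r}
# 一个虚构的 2 x 3 列联表:行是党派,列是对预算的态度(数字纯属示意)
toy <- matrix(
  c(30, 50, 20,
    25, 45, 30),
  nrow = 2, byrow = TRUE,
  dimnames = list(
    party = c("甲党", "乙党"),
    NASA  = c("TOO LITTLE", "ABOUT RIGHT", "TOO MUCH")
  )
)
toy
chisq.test(toy)
```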
+ +```{r} +gss <- read_rds("./demo_data/gss.rds") + +gss %>% + select(NASA, party) %>% + count(NASA, party) %>% + head(8) +``` + +```{r} +gss %>% + ggplot(aes(x = party, fill = NASA)) + + geom_bar() +``` + +假设: + +- 零假设 $H_0$: + + - 不同党派对预算的态度的构成比(TOO LITTLE, ABOUT RIGHT, TOO MUCH) 没有区别 + +- 备选假设 $H_a$: + + - 不同党派对预算的态度的构成比(TOO LITTLE, ABOUT RIGHT, TOO MUCH) 存在区别 + +两种可能的结论: + +- 拒绝 $H_0$ +- 不能拒绝 $H_0$ + +### 传统的方法 + +```{r} +chisq.test(gss$party, gss$NASA) +``` + +或者 + +```{r} +gss %>% + chisq_test(NASA ~ party) %>% + dplyr::select(p_value) %>% + dplyr::pull() +``` + +### infer:Simulation-based tests + +```{r} +obs_stat <- gss %>% + specify(NASA ~ party) %>% + calculate(stat = "Chisq") +obs_stat +``` + +```{r} +null_dist <- gss %>% + specify(NASA ~ party) %>% # (1) + hypothesize(null = "independence") %>% # (2) + generate(reps = 5000, type = "permute") %>% # (3) + calculate(stat = "Chisq") # (4) +null_dist +``` + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_stat, method = "both", direction = "right") +``` + +```{r} +null_dist %>% + get_pvalue(obs_stat = obs_stat, direction = "greater") +``` + +看到 `p_value > 0.05`,不能拒绝 $H_0$,我们没有足够的证据证明党派之间有显著差异 + +```{r out.width = '50%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/fail_to_reject_you.png") +``` + +## 案例3:原住民中的女学生多? + +案例 `quine` 数据集有 146 行 5 列,包含学生的生源、文化、性别和学习成效,具体说明如下 + +- Eth: 民族背景:原住民与否 (是"A"; 否 "N") +- Sex: 性别 +- Age: 年龄组 ("F0", "F1," "F2" or "F3") +- Lrn: 学习者状态(平均水平 "AL", 学习缓慢 "SL") +- Days:一年中缺勤天数 + + +```{r} +td <- MASS::quine %>% + as_tibble() %>% + mutate( + across(c(Sex, Eth), as_factor) + ) +td +``` + + + +从民族背景有两组(A, N)来看,性别为 F 的占比 是否有区别? + +```{r} +td %>% count(Eth, Sex) +``` + +### 传统方法 + +```{r} +prop.test(table(td$Eth, td$Sex), correct = FALSE) +``` + +### 基于模拟的方法 + +```{r} + +obs_diff <- td %>% + specify(Sex ~ Eth, success = "F") %>% # #被解释变量 sex中F的占比 + calculate( + stat = "diff in props", + order = c("A", "N") # 解释变量中两组A,N + ) + +obs_diff +``` + +```{r} +null_distribution <- td %>% + specify(Sex ~ Eth, success = "F") %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate(stat = "diff in props", order = c("A", "N")) +``` + +```{r} +null_distribution %>% + visualize() +``` + +```{r} +pvalue <- null_distribution %>% + get_pvalue(obs_stat = obs_diff, direction = "both") + +pvalue +``` + +```{r} +null_distribution %>% + get_ci(level = 0.95, type = "percentile") +``` + + + +## 宏包`infer` + +我比较喜欢[infer](https://github.com/tidymodels/infer)宏包的[设计思想](http://allendowney.blogspot.com/2016/06/there-is-still-only-one-test.html),它把统计推断分成了四个步骤 + +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/infer-ht-diagram.png") +``` + + +下图可以更好的帮助我们理解infer的工作流程 +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/infer_workflow.jpeg") +``` + +- `specify()` 指定解释变量和被解释变量 (`y ~ x`) + +- `hypothesize()` 指定**零假设** (比如, `independence`= `y` 和 `x` 彼此独立) + +- `generate()` 从基于零假设的平行世界中抽样: + + - 指定每次重抽样的类型,通俗点讲就是数据洗牌,重抽样`type = "bootstrap"` (有放回的),对应的零假设往往是null = "point" ; 重抽样`type = "permuting"` (无放回的),对应的零假设往往是null = "independence", 指的是y和x之间彼此独立的,因此抽样后会重新排列,也就说原先 value1-group1 可能变成了value1-group2,(因为我们假定他们是独立的啊,这种操作,也不会影响y和x的关系) + - 指定多少组 (`reps = 1000`) + +- `calculate()` 计算每组(`reps`)的统计值 (`stat = "diff in props"`) +- `visualize()` 可视化,对比零假设的分布与实际观察值. 
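把这几个动词串起来,除了前面案例里的假设检验,也可以做区间估计。下面是一个最小的 bootstrap 置信区间示例(数据用 R 自带的 mtcars,仅作演示):

```{r}
library(infer)

# 对 mpg 的均值做 bootstrap 重抽样,构造 95% 百分位置信区间
boot_dist <- mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

boot_dist %>%
  get_ci(level = 0.95, type = "percentile")
```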
+ + +下面是我自己对重抽样的理解 +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/Resampling.jpg") +``` + +## 更多 + +更多统计推断的内容可参考 + +- +- +- +- + + + +```{r, echo = F} +# remove the objects +# rm(list=ls()) +rm(d, gss, movies_genre_sample, null_dist, null_distribution, obs_diff, obs_stat, pvalue, summary_ratings, t_test_eq, t_test_uneq, td) +``` + + + +```{r, echo = F, message = F, warning = F, results = "hide"} +pacman::p_unload(pacman::p_loaded(), character.only = TRUE) +``` diff --git a/intro_R.Rmd b/intro_R.Rmd index 477f8f6..2d7aa35 100644 --- a/intro_R.Rmd +++ b/intro_R.Rmd @@ -20,8 +20,9 @@ R 软件是一个自由、开源软件平台,具有统计分析、可视化和 ## 安装 RStudio 安装完R, 还需要安装RStudio。有同学可能要问 R 与 RStudio 是什么关系呢?打个比方吧,R 就像汽车的发动机, RStudio 就是汽车的仪表盘。但我更觉得 R 是有趣的灵魂,而 Rstudio 是好看的皮囊。 -```{r out.width = '50%', echo = FALSE} -knitr::include_graphics(c("images/engine.jpg", "images/dashboard.jpg")) +```{r out.width = '100%', echo = FALSE} +#knitr::include_graphics(c("images/engine.jpg", "images/dashboard.jpg")) +knitr::include_graphics("images/engine_dashboard.png") ``` diff --git a/intro_ds.Rmd b/intro_ds.Rmd index ac396de..dad073b 100644 --- a/intro_ds.Rmd +++ b/intro_ds.Rmd @@ -122,10 +122,10 @@ knitr::include_graphics("images/tidyverse.png") | 序号 | 内容 | 代码演示 | |:---|:---|:---| -| 1 | 统计 | `r xfun::embed_file('./exercises/1_stats.R')` | -| 2 | 可视化 | `r xfun::embed_file('./exercises/2_visual.R')` | -| 3 | 探索性分析 | `r xfun::embed_file('./exercises/3_eda.R')` | -| 4 | 可重复性报告 | `r xfun::embed_file('./exercises/4_reproducible.Rmd')` | +| 1 | 统计 | `r xfun::embed_file('./exercises/01_stats.R')` | +| 2 | 可视化 | `r xfun::embed_file('./exercises/02_visual.R')` | +| 3 | 探索性分析 | `r xfun::embed_file('./exercises/03_eda.R')` | +| 4 | 可重复性报告 | `r xfun::embed_file('./exercises/04_reproducible.Rmd')` | 看了这些代码,可能第一眼感觉是这样的 @@ -156,8 +156,9 @@ knitr::include_graphics("images/social_science.jpg") ### 社会科学需要可视化 -```{r echo=FALSE, out.width = '50%'} -knitr::include_graphics("images/R4art.png") + +```{r echo=FALSE, out.width = '100%'} +knitr::include_graphics("images/BBC-style-Plot.png") ``` 我们不是学美术的,但要可视化 @@ -166,11 +167,19 @@ knitr::include_graphics("images/R4art.png") ### 社会科学需要编程 -```{r echo=FALSE, out.width = '80%'} +```{r eval=FALSE, include=FALSE} knitr::include_graphics("images/Coding-Lab.jpg") ``` -我们不是学计算机的,但需要编程(小学生已经开始学编程了) +为什么不能用excel做数据分析?画个图说明下 + +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/R_Excel.png", dpi = 150) +``` + +对于数据量不大,或者复杂程度不高的需求来说,excel很方便也很直观。但随着数据量或复杂程度不断增大,excel解决起来难度系数就陡增,或者无法搞定,这就需要借助编程完成。也就说,掌握了编程技能,对于简单的问题和复杂的问题,难度系数是差不多了。 + + @@ -178,7 +187,8 @@ knitr::include_graphics("images/Coding-Lab.jpg") ### 社会科学需要可重复性 ```{r echo=FALSE, out.width = '60%'} -knitr::include_graphics("images/latex-tweak.gif") +#knitr::include_graphics("images/latex-tweak.gif") +knitr::include_graphics("images/nature_editorial.png") ``` @@ -219,15 +229,6 @@ knitr::include_graphics("images/Languages02.jpg", dpi = 90) ### 一见钟情,还是相见恨晚? -为什么不能用excel做数据分析?画个图说明下 - -```{r out.width = '70%', fig.align='center', echo = FALSE} -knitr::include_graphics("images/R_Excel.png", dpi = 150) -``` - - -一见钟情,还是相见恨晚? 
- ```{r echo=FALSE, fig.align='center', out.width = '70%'} knitr::include_graphics("images/meme.png") ``` diff --git a/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf b/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf index bcb499a..1a2615f 100644 --- a/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf +++ b/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf @@ -5,6 +5,6 @@ account: wangminjie server: bookdown.org hostUrl: https://bookdown.org/__api__ appId: 3039 -bundleId: 30087 +bundleId: 30364 url: https://bookdown.org/wangminjie/R4DS/ -when: 1604026394.63435 +when: 1604734320.75596