diff --git a/_bookdown.yml b/_bookdown.yml index a398696..a394302 100644 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -50,6 +50,7 @@ rmd_files: [ "broom.Rmd", "tidystats.Rmd", "tests_as_linear.Rmd", + "infer.Rmd", "lmm.Rmd", "poisson_regression.Rmd", "logistic_regression.Rmd", @@ -65,6 +66,7 @@ rmd_files: [ "eda_height.Rmd", "eda_caribou.Rmd", "eda_penguins.Rmd", + "eda_career_decision.Rmd", "rvest.Rmd", "tidygraph.Rmd", "tidytext.Rmd", diff --git a/demo_data/career-decision.xlsx b/demo_data/career-decision.xlsx new file mode 100644 index 0000000..c1f3018 Binary files /dev/null and b/demo_data/career-decision.xlsx differ diff --git a/demo_data/gss.rds b/demo_data/gss.rds new file mode 100644 index 0000000..83098d0 Binary files /dev/null and b/demo_data/gss.rds differ diff --git a/eda_career_decision.Rmd b/eda_career_decision.Rmd new file mode 100644 index 0000000..ef41647 --- /dev/null +++ b/eda_career_decision.Rmd @@ -0,0 +1,411 @@ +# 探索性数据分析-大学生职业决策 {#eda-career-decision} + +## 预备知识 + +```{r, message=FALSE, warning=FALSE} +library(tidyverse) + +example <- + tibble::tribble( + ~name, ~english, ~chinese, ~math, ~sport, ~psy, ~edu, + "A", 133, 100, 102, 56, 89, 89, + "B", 120, 120, 86, 88, 45, 75, + "C", 98, 109, 114, 87, NA, 84, + "D", 120, 78, 106, 68, 86, 69, + "E", 110, 99, 134, 98, 75, 70, + "F", NA, 132, 130, NA, 68, 88 + ) + +example +``` + +### 缺失值检查 + +我们需要判断每一列的缺失值 + +```{r} +example %>% + summarise( + na_in_english = sum(is.na(english)), + na_in_chinese = sum(is.na(chinese)), + na_in_math = sum(is.na(math)), + na_in_sport = sum(is.na(sport)), + na_in_psy = sum(is.na(math)), # tpyo here + na_in_edu = sum(is.na(edu)) + ) +``` + +我们发现,这种写法比较笨,而且容易出错,比如`na_in_psy = sum(is.na(math))` 就写错了。那么有没有`既偷懒又安全`的方法呢?有的。但代价是需要学会`across()`函数,大家可以在Console中输入`?dplyr::across`查看帮助文档,或者看第 \@ref(colwise) 章。 + +```{r} +example %>% + summarise( + across(everything(), mean) + ) + + +example %>% + summarise( + across(everything(), function(x) sum(is.na(x)) ) + ) +``` + + +### 数据预处理 + +- 直接**丢弃**缺失值所在的行 + +```{r} +example %>% drop_na() +``` + + + +- 用**均值**代替缺失值 + +```{r eval=FALSE, include=FALSE} +example %>% + mutate( + english_new = if_else(is.na(english), mean(english, na.rm = T), english) + ) +``` + + + +```{r} +d <- example %>% + mutate( + across(where(is.numeric), ~ if_else(is.na(.), mean(., na.rm = T), .)) + ) +d +``` + + + + +- 计算总分/均值 + +```{r} +d %>% + rowwise() %>% + mutate( + total = sum(c_across(-name)) + ) + +d %>% + rowwise() %>% + mutate( + mean = mean(c_across(-name)) + ) +``` + + + + +- **数据标准化**处理 + +```{r} +standard <- function(x) { + (x - mean(x)) / sd(x) +} + +``` + + + + +```{r} +d %>% + mutate( + across(where(is.numeric), standard) + ) +``` + + + + +## 开始 + +### 文件管理中需要注意的地方 + +感谢康钦虹同学提供的数据,但这里有几点需要注意的地方: + +| 事项 | 问题 | 解决办法 | +|---------- |--------------------------- |-----------------------------------------------| +| 文件名 | excel的文件名是中文 | 用英文,比如 `data.xlsx` | +| 列名 | 列名中有-号,大小写不统一 | 规范列名,或用`janitor::clean_names()`偷懒 | +| 预处理 | 直接在原始数据中新增 | 不要在原始数据上改动,统计工作可以在R里实现 | +| 文件管理 | 没有层级 | 新建`data`文件夹装数据,与`code.Rmd`并列 | + + + + +```{r, message=FALSE, warning=FALSE} +data <- readxl::read_excel("demo_data/career-decision.xlsx", skip = 1) %>% + janitor::clean_names() + +#glimpse(data) +``` + + + +```{r} +d <- data %>% select(1:61) +#glimpse(d) +``` + + + + + +### 缺失值检查 + +```{r} +d %>% + summarise( + across(everything(), ~sum(is.na(.))) + ) +``` + +没有缺失值,挺好 + + +### 数据预处理 + +采用利克特式 5 点计分... 
(这方面你们懂得比我多) + +```{r} +d <- d %>% + rowwise() %>% + mutate( + environment_exploration = sum(c_across(z1:z5)), + self_exploration = sum(c_across(z6:z9)), + objective_system_exploration = sum(c_across(z10:z15)), + info_quantity_exploration = sum(c_across(z16:z18)), + + self_evaluation = sum(c_across(j1:j6)), + information_collection = sum(c_across(j7:j15)), + target_select = sum(c_across(j16:j24)), + formulate = sum(c_across(j25:j32)), + problem_solving = sum(c_across(j33:j39)), + + career_exploration = sum(c_across(z1:z18)), + career_decision_making = sum(c_across(j1:j39)) + ) %>% + select(-starts_with("z"), -starts_with("j")) %>% + ungroup() %>% + mutate(pid = 1:n(), .before = sex) %>% + mutate( + across(c(pid, sex, majoy, grade, from), as_factor) + ) + +#glimpse(d) +``` + +### 标准化 + + +```{r} +standard <- function(x) { + (x - mean(x)) / sd(x) +} + +d <- d %>% + mutate( + across(where(is.numeric), standard) + ) +d +``` + + +## 探索 + +### 想探索的问题 + +- 不同性别(或者年级,生源地,专业)下,各指标分值的差异性 +- 两个变量的相关分析和回归分析 +- 更多(欢迎大家提出了喔) + + +### 男生女生在职业探索上有所不同? + +以性别为例。因为性别变量是男女,仅仅2组,所以检查男女**在各自指标上的均值差异**,可以用T检验。 + +```{r} +d %>% + group_by(sex) %>% + summarise( + across(where(is.numeric), mean) +) +``` + +你可以给这个图颜色弄得更好看点? +```{r, fig.width=4, fig.height=3.5, fig.align="center"} +library(ggridges) +d %>% + ggplot(aes(x = career_exploration, y = sex, fill = sex)) + + geom_density_ridges() +``` + + + + +```{r} +t_test_eq <- t.test(career_exploration ~ sex, data = d, var.equal = TRUE) %>% + broom::tidy() +t_test_eq +``` + + + +```{r} +t_test_uneq <- t.test(career_exploration ~ sex, data = d, var.equal = FALSE) %>% + broom::tidy() +t_test_uneq +``` + + + +当然,也可以用第 \@ref(infer) 章介绍的统计推断的方法 + + +```{r} +library(infer) + +obs_diff <- d %>% + specify(formula = career_exploration ~ sex) %>% + calculate("diff in means", order = c("1", "2")) +obs_diff +``` + + + + +```{r} +null_dist <- d %>% + specify(formula = career_exploration ~ sex) %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate(stat = "diff in means", order = c("1", "2")) +null_dist +``` + + + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_diff, direction = "two_sided") +``` + + + +```{r} +null_dist %>% + get_p_value(obs_stat = obs_diff, direction = "two_sided") %>% + #get_p_value(obs_stat = obs_diff, direction = "less") %>% + mutate(p_value_clean = scales::pvalue(p_value)) +``` + + + + + +也可以用tidyverse的方法一次性的搞定**所有指标** + +```{r} +d %>% + pivot_longer( + cols = -c(pid, sex, majoy, grade, from), + names_to = "index", + values_to = "value" + ) %>% + group_by(index) %>% + summarise( + broom::tidy( t.test(value ~ sex, data = cur_data())) + ) %>% + select(index, estimate, statistic, p.value) %>% + arrange(p.value) +``` + + + + + +### 来自不同地方的学生在职业探索上有所不同? 
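先按生源地做个简单汇总,看看每类各有多少人、各指标的均值大致在什么水平(`from` 的具体编码以数据里的取值为准,这里只是描述性地看一眼):

```{r}
d %>%
  group_by(from) %>%
  summarise(
    n = n(),
    across(where(is.numeric), mean)
  )
```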
+ +以生源地为例。因为生源地有3类,所以可以使用方差分析。 + +```{r} +aov(career_exploration ~ from, data = d) %>% + TukeyHSD(which = "from") %>% + broom::tidy() +``` + + + +```{r, eval=FALSE, include=FALSE} +lm(career_exploration ~ from, data = d) %>% + broom::tidy() +``` + + + +```{r, fig.width=4, fig.height=3.5, fig.align="center"} +library(ggridges) +d %>% + ggplot(aes(x = career_exploration, y = from, fill = from)) + + geom_density_ridges() +``` + + + +也可以一次性的搞定**所有指标** + +```{r} +d %>% + pivot_longer( + cols = -c(pid, sex, majoy, grade, from), + names_to = "index", + values_to = "value" + ) %>% + group_by(index) %>% + summarise( + broom::tidy( aov(value ~ from, data = cur_data())) + ) %>% + select(index, term, statistic, p.value) %>% + filter(term != "Residuals") %>% + arrange(p.value) +``` + + +### 职业探索和决策之间有关联? + +可以用第 \@ref(lm) 章线性模型来探索 + +```{r} +lm(career_decision_making ~ career_exploration, data = d) +``` + + + + +不要因为我讲课讲的很垃圾,就错过了R的美,瑕不掩瑜啦。要相信自己,你们是川师研究生中最聪明的。 + + +```{r echo=FALSE, fig.align='center', out.width='90%'} +knitr::include_graphics("images/support.jpg") +``` + + + + +```{r, echo = F} +# remove the objects +# rm(list=ls()) +rm(d, data, example, null_dist, obs_diff, standard, t_test_eq, t_test_uneq) +``` + +```{r, echo = F, message = F, warning = F, results = "hide"} +pacman::p_unload(pacman::p_loaded(), character.only = TRUE) +``` diff --git a/exercises/01_stats.R b/exercises/01_stats.R new file mode 100644 index 0000000..6c02fac --- /dev/null +++ b/exercises/01_stats.R @@ -0,0 +1,46 @@ +# 计算 +1 + 5 + + + +# 生成序列 +1:100 + + + +# 把序列装到变量里 +x <- 1:100 + + + +# 求和 +sum(x) + + + +# 求均值 +mean(x) + + + +# 求方差 +sd(x) + + +# 生成正态分布的随机数 +rnorm(20, mean = 0, sd = 1) + + + +# T检验 +t.test(extra ~ group, data = sleep) + + +# 方差分析 +aov(mpg ~ wt, data = mtcars) + + + +# 线性模型 +lm(mpg ~ wt, data = mtcars) + diff --git a/exercises/02_visual.R b/exercises/02_visual.R new file mode 100644 index 0000000..0261c1d --- /dev/null +++ b/exercises/02_visual.R @@ -0,0 +1,4 @@ +library(ggplot2) + +ggplot(mpg, aes(x = displ, y = hwy)) + + geom_point(aes(colour = class)) diff --git a/exercises/3_eda.R b/exercises/03_eda.R similarity index 100% rename from exercises/3_eda.R rename to exercises/03_eda.R diff --git a/exercises/4_reproducible.Rmd b/exercises/04_reproducible.Rmd similarity index 61% rename from exercises/4_reproducible.Rmd rename to exercises/04_reproducible.Rmd index bac7710..a034336 100644 --- a/exercises/4_reproducible.Rmd +++ b/exercises/04_reproducible.Rmd @@ -8,7 +8,7 @@ output: extra_dependencies: ctex: UTF8 number_sections: yes - toc: yes + #toc: yes df_print: kable classoptions: "hyperref, 12pt, a4paper" --- @@ -23,31 +23,28 @@ knitr::opts_chunk$set(echo = TRUE, ``` - # 引言 -这是一份关于新冠肺炎的探索性分析报告。 - +新型冠状病毒疫情在多国蔓延,一些国家的病例确诊数量明显增多,各国防疫力度继续加强。本章通过分析疫情数据,了解疫情发展,祝愿人类早日会战胜病毒! 
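下面的分析用到 covdata 宏包,它不在 CRAN 上。如果尚未安装,可以先从 GitHub 安装(仓库地址按其主页 https://kjhealy.github.io/covdata/ 推断,写法仅供参考):

```{r, eval=FALSE}
# covdata 不在 CRAN 上,这里假设其 GitHub 仓库为 kjhealy/covdata
remotes::install_github("kjhealy/covdata")
```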
# 导入数据 -首先,我们加载需要的宏包,其中tidyverse用于数据探索、covdata用于获取数据 +首先,我们加载宏包tidyverse -```{r} -# Load libraries +```{r, warning=FALSE, message=FALSE} library(tidyverse) library(covdata) ``` 论文的数据来源 -[https://kjhealy.github.io/covdata/](https://kjhealy.github.io/covdata/),我们选取部分数据看看 +[https://kjhealy.github.io/covdata/](https://kjhealy.github.io/covdata/),我们选取最近数据看看 ```{r, echo = FALSE} covnat %>% - tail(10) + tail(8) ``` @@ -69,19 +66,43 @@ covnat %>% -# 可视化 +# 数据探索 + +找出**累积确诊病例**最多的几个国家 + +```{r} +covnat %>% + ungroup() %>% + filter(date == max(date)) %>% + slice_max(cu_cases, n = 8) +``` + -为了更好的呈现数据,我们将筛选出美国确诊病例数据,并可视化 +全球**确诊病例**和**死亡病例** + +```{r} +covnat %>% + ungroup() %>% + summarise( + total_cases = sum(cases), + total_death = sum(deaths), + ) +``` + + + +美国疫情变化情况 ```{r, fig.showtext = TRUE} -covdata::covnat %>% - dplyr::filter(iso3 == "USA") %>% - dplyr::filter(cu_cases > 0) %>% - +covnat %>% + filter(iso3 == "USA") %>% + filter(cu_cases > 0) %>% + ungroup() %>% ggplot(aes(x = date, y = cases)) + - geom_path() + + geom_line() + + scale_x_date(name = NULL, breaks = "month") + labs(title = "美国新冠肺炎累积确诊病例", - subtitle = "数据来源https://kjhealy.github.io/covdata/") + subtitle = "数据来源https://kjhealy.github.io/covdata/") ``` diff --git a/exercises/05_R_vs_python.Rmd b/exercises/05_R_vs_python.Rmd new file mode 100644 index 0000000..09da2ba --- /dev/null +++ b/exercises/05_R_vs_python.Rmd @@ -0,0 +1,38 @@ +--- +title: "R_vs_python" +output: html_document +--- + + + +```{r} +library(reticulate) +``` + + +```{python} +[1,2,3] *2 +``` + + + +```{r} +c(1, 2, 3) *2 +``` + + + + + +```{python} +a = [1,2,3] +a[0] +``` + + + + +```{r} +x <- c(1, 2, 3) +x[1] +``` \ No newline at end of file diff --git a/exercises/1_stats.R b/exercises/1_stats.R deleted file mode 100644 index 5f96ca8..0000000 --- a/exercises/1_stats.R +++ /dev/null @@ -1,58 +0,0 @@ -# Numeric Functions -1 + 5 -1:100 -abs(-3.14) -sqrt(3.14) -floor(3.14) -round(3.14) -cos(3.14) -log(3.14) -exp(3.14) - -seq(1, 10, 2) -rep(1:3, 2) - - - - - -# Character Functions -substr("abcdef", 2, 4) -grep("a", c("alice", "bob", "claro")) -strsplit("a.b.c", "\\.") -toupper("Alice") -tolower("Alice") - - - - -# Statistical Functions -x <- 1:10 -sum(x) -min(x) -mean(x) -sd(x) -var(x) -median(x) -quantile(x, probs = 0.75) -range(x) -scale(x, center = TRUE, scale = TRUE) - - - -# Statistical Probability Functions -rnorm(20, mean = 0, sd = 1) -dnorm(0.5, mean = 0, sd = 1) -rpois(100, lambda = 10) -dpois(2, lambda = 10) - - - - - -# Regression Modeling -lm(mpg ~ wt, data = mtcars) -aov(mpg ~ wt, data = mtcars) -t.test(extra ~ group, data = sleep) - - diff --git a/exercises/2_visual.R b/exercises/2_visual.R deleted file mode 100644 index fe098e0..0000000 --- a/exercises/2_visual.R +++ /dev/null @@ -1,14 +0,0 @@ -library(ggplot2) - -ggplot(midwest, aes(x = area, y = poptotal)) + - geom_point(aes(color = state, size = popdensity)) + - geom_smooth(method = "loess", se = F) + - xlim(c(0, 0.1)) + - ylim(c(0, 500000)) + - labs( - subtitle = "Area Vs Population", - y = "Population", - x = "Area", - title = "Scatterplot", - caption = "Source: midwest" - ) diff --git a/images/BBC-style-Plot.png b/images/BBC-style-Plot.png new file mode 100644 index 0000000..140762a Binary files /dev/null and b/images/BBC-style-Plot.png differ diff --git a/images/Resampling.jpg b/images/Resampling.jpg new file mode 100644 index 0000000..0f7fa85 Binary files /dev/null and b/images/Resampling.jpg differ diff --git a/images/downey.png b/images/downey.png new file mode 100644 index 0000000..2044228 
Binary files /dev/null and b/images/downey.png differ diff --git a/images/engine_dashboard.png b/images/engine_dashboard.png new file mode 100644 index 0000000..20d34f3 Binary files /dev/null and b/images/engine_dashboard.png differ diff --git a/images/fail_to_reject_you.png b/images/fail_to_reject_you.png new file mode 100644 index 0000000..b9e8edc Binary files /dev/null and b/images/fail_to_reject_you.png differ diff --git a/images/imdb.png b/images/imdb.png new file mode 100644 index 0000000..7741210 Binary files /dev/null and b/images/imdb.png differ diff --git a/images/infer-ht-diagram.png b/images/infer-ht-diagram.png new file mode 100644 index 0000000..b940396 Binary files /dev/null and b/images/infer-ht-diagram.png differ diff --git a/images/infer_workflow.jpeg b/images/infer_workflow.jpeg new file mode 100644 index 0000000..eece5f8 Binary files /dev/null and b/images/infer_workflow.jpeg differ diff --git a/images/nature_editorial.png b/images/nature_editorial.png new file mode 100644 index 0000000..772431f Binary files /dev/null and b/images/nature_editorial.png differ diff --git a/images/support.jpg b/images/support.jpg new file mode 100644 index 0000000..a70004d Binary files /dev/null and b/images/support.jpg differ diff --git a/images/time_effort.gif b/images/time_effort.gif new file mode 100644 index 0000000..3e82eb1 Binary files /dev/null and b/images/time_effort.gif differ diff --git a/index.Rmd b/index.Rmd index de6790f..ea48c2d 100644 --- a/index.Rmd +++ b/index.Rmd @@ -98,6 +98,7 @@ knitr::include_graphics("images/rbook1.png") - 第 \@ref(broom) 章介绍模型输出结果的规整 - 第 \@ref(tidystats) 章介绍方差分析 - 第 \@ref(tests-as-linear) 章介绍统计检验与线性模型的等价性 + - 第 \@ref(infer) 章介绍统计推断 - 第 \@ref(lmm) 章介绍多层线性模型 - 第 \@ref(poisson-regression) 章介绍广义线性模型中的泊松回归 - 第 \@ref(logistic-regression) 章介绍logistic回归模型 @@ -111,6 +112,7 @@ knitr::include_graphics("images/rbook1.png") - 第 \@ref(eda-height) 章介绍探索性数据分析-身高体重 - 第 \@ref(eda-caribou) 章介绍探索性数据分析-驯鹿迁移 - 第 \@ref(eda-penguins) 章介绍探索性数据分析-企鹅的故事 + - 第 \@ref(eda-career-decision) 章介绍探索性数据分析-大学生职业决策 - 第 \@ref(rvest) 章介绍网页爬虫 - 第 \@ref(tidygraph) 章介绍社会网络分析 - 第 \@ref(tidytext) 章介绍文本挖掘 diff --git a/infer.Rmd b/infer.Rmd new file mode 100644 index 0000000..eeef516 --- /dev/null +++ b/infer.Rmd @@ -0,0 +1,414 @@ +# 统计推断 {#infer} + +Statistical Inference: A Tidy Approach + +## 案例1:你会给爱情片还是动作片高分? 
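本章用 infer 宏包做基于模拟的统计推断,案例1的数据来自 ggplot2movies 宏包。两个宏包都在 CRAN 上,如果尚未安装,可以先安装:

```{r, eval=FALSE}
install.packages(c("infer", "ggplot2movies"))
```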
+ +```{r out.width = '65%', fig.align='left', echo = FALSE} +knitr::include_graphics("images/imdb.png") +``` + +这是一个关于电影评分的数据集[^1], + +[^1]: + +```{r} +library(tidyverse) +d <- ggplot2movies::movies +d +``` + +数据集包含58788 行 和 24 变量 + +| variable | description | +|:------------|:-----------------| +| title | 电影名 | +| year | 发行年份 | +| budget | 预算金额 | +| length | 电影时长 | +| rating | 平均得分 | +| votes | 投票人数 | +| r1-10 | 各分段投票人占比 | +| mpaa | MPAA 分级 | +| action | 动作片 | +| animation | 动画片 | +| comedy | 喜剧片 | +| drama | 戏剧 | +| documentary | 纪录片 | +| romance | 爱情片 | +| short | 短片 | + +```{r, eval=FALSE, include=FALSE} +d %>% + rowwise() %>% + mutate( + t = sum(c_across(starts_with("r"))) + ) +``` + +我们想看下爱情片与动作片(不是爱情动作片)的平均得分是否显著不同。 + +- 首先我们简单的整理下数据,主要是剔除既是爱情片又是动作片的电影 + +```{r} +movies_genre_sample <- d %>% + select(title, year, rating, Action, Romance) %>% + filter(!(Action == 1 & Romance == 1)) %>% # 既是爱情片又是动作片的,删去 + mutate(genre = case_when( + Action == 1 ~ "Action", + Romance == 1 ~ "Romance", + TRUE ~ "Neither" + )) %>% + filter(genre != "Neither") %>% + select(-Action, -Romance) %>% + group_by(genre) %>% + slice_sample(n = 34) %>% # 每种题材的电影只选取了34个 + ungroup() + +movies_genre_sample +``` + +- 先看下图形 + +```{r message=FALSE, warning=FALSE} +movies_genre_sample %>% + ggplot(aes(x = genre, y = rating)) + + geom_boxplot() + + geom_jitter() +``` + +- 看下两种题材电影评分的分布 + +```{r} +movies_genre_sample %>% + ggplot(mapping = aes(x = rating)) + + geom_histogram(binwidth = 1, color = "white") + + facet_grid(vars(genre)) +``` + +- 统计两种题材电影评分的均值 + +```{r} +summary_ratings <- movies_genre_sample %>% + group_by(genre) %>% + summarize( + mean = mean(rating), + std_dev = sd(rating), + n = n() + ) +summary_ratings +``` + +### 传统的基于频率方法的t检验 + +假设: + +- 零假设: + + - $H_0: \mu_{1} - \mu_{2} = 0$ + +- 备选假设: + + - $H_A: \mu_{1} - \mu_{2} \neq 0$ + +两种可能的结论: + +- 拒绝 $H_0$ +- 不能拒绝 $H_0$ + +```{r} +t_test_eq <- t.test(rating ~ genre, + data = movies_genre_sample, + var.equal = TRUE +) %>% + broom::tidy() +t_test_eq +``` + +```{r} +t_test_uneq <- t.test(rating ~ genre, + data = movies_genre_sample, + var.equal = FALSE +) %>% + broom::tidy() +t_test_uneq +``` + +### infer:基于模拟的检验 + +所有的假设检验都符合这个框架[^2]: + +[^2]: + +```{r out.width = '85%', fig.align='left', echo = FALSE, fig.cap = "Hypothesis Testing Framework"} +knitr::include_graphics("images/downey.png") +``` + +- 实际观察的差别 + +```{r} +library(infer) + +obs_diff <- movies_genre_sample %>% + specify(formula = rating ~ genre) %>% + calculate( + stat = "diff in means", + order = c("Romance", "Action") + ) +obs_diff +``` + +- 模拟 + +```{r} +null_dist <- movies_genre_sample %>% + specify(formula = rating ~ genre) %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate( + stat = "diff in means", + order = c("Romance", "Action") + ) +head(null_dist) +``` + +- 可视化 + +```{r} +null_dist %>% + visualize() +``` + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_diff, direction = "both") +# shade_p_value(bins = 100, obs_stat = obs_diff, direction = "both") +``` + +- 计算p值 + +```{r} +pvalue <- null_dist %>% + get_pvalue(obs_stat = obs_diff, direction = "two_sided") + +pvalue +``` + +- 结论 + +在构建的虚拟($\Delta = 0$)的平行世界里,出现实际观察值(`r obs_diff$stat`)的概率很小,这里是(`r pvalue$p_value`)。 如果以(p\< 0.05)为标准,那我们有足够的证据证明,H0不成立,即爱情电影和动作电影的评分均值存在**显著差异**,具体来说,动作电影的平均评分要比爱情电影低些。 + +## 案例2: 航天事业的预算有党派门户之见? + +美国国家航空航天局的预算是否存在党派门户之见? 
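这类问题本质上是在检验两个分类变量(党派、对预算的态度)是否独立。在看真实数据之前,先用一个完全虚构的小列联表感受一下卡方检验的用法(数字纯属示意,与真实数据无关):

```{r}
# 一个虚构的 2 x 3 列联表:行是党派,列是对预算的态度(数字纯属示意)
toy <- matrix(
  c(30, 50, 20,
    25, 45, 30),
  nrow = 2, byrow = TRUE,
  dimnames = list(
    party = c("甲党", "乙党"),
    NASA  = c("TOO LITTLE", "ABOUT RIGHT", "TOO MUCH")
  )
)
toy
chisq.test(toy)
```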
+ +```{r} +gss <- read_rds("./demo_data/gss.rds") + +gss %>% + select(NASA, party) %>% + count(NASA, party) %>% + head(8) +``` + +```{r} +gss %>% + ggplot(aes(x = party, fill = NASA)) + + geom_bar() +``` + +假设: + +- 零假设 $H_0$: + + - 不同党派对预算的态度的构成比(TOO LITTLE, ABOUT RIGHT, TOO MUCH) 没有区别 + +- 备选假设 $H_a$: + + - 不同党派对预算的态度的构成比(TOO LITTLE, ABOUT RIGHT, TOO MUCH) 存在区别 + +两种可能的结论: + +- 拒绝 $H_0$ +- 不能拒绝 $H_0$ + +### 传统的方法 + +```{r} +chisq.test(gss$party, gss$NASA) +``` + +或者 + +```{r} +gss %>% + chisq_test(NASA ~ party) %>% + dplyr::select(p_value) %>% + dplyr::pull() +``` + +### infer:Simulation-based tests + +```{r} +obs_stat <- gss %>% + specify(NASA ~ party) %>% + calculate(stat = "Chisq") +obs_stat +``` + +```{r} +null_dist <- gss %>% + specify(NASA ~ party) %>% # (1) + hypothesize(null = "independence") %>% # (2) + generate(reps = 5000, type = "permute") %>% # (3) + calculate(stat = "Chisq") # (4) +null_dist +``` + +```{r} +null_dist %>% + visualize() + + shade_p_value(obs_stat = obs_stat, method = "both", direction = "right") +``` + +```{r} +null_dist %>% + get_pvalue(obs_stat = obs_stat, direction = "greater") +``` + +看到 `p_value > 0.05`,不能拒绝 $H_0$,我们没有足够的证据证明党派之间有显著差异 + +```{r out.width = '50%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/fail_to_reject_you.png") +``` + +## 案例3:原住民中的女学生多? + +案例 `quine` 数据集有 146 行 5 列,包含学生的生源、文化、性别和学习成效,具体说明如下 + +- Eth: 民族背景:原住民与否 (是"A"; 否 "N") +- Sex: 性别 +- Age: 年龄组 ("F0", "F1," "F2" or "F3") +- Lrn: 学习者状态(平均水平 "AL", 学习缓慢 "SL") +- Days:一年中缺勤天数 + + +```{r} +td <- MASS::quine %>% + as_tibble() %>% + mutate( + across(c(Sex, Eth), as_factor) + ) +td +``` + + + +从民族背景有两组(A, N)来看,性别为 F 的占比 是否有区别? + +```{r} +td %>% count(Eth, Sex) +``` + +### 传统方法 + +```{r} +prop.test(table(td$Eth, td$Sex), correct = FALSE) +``` + +### 基于模拟的方法 + +```{r} + +obs_diff <- td %>% + specify(Sex ~ Eth, success = "F") %>% # #被解释变量 sex中F的占比 + calculate( + stat = "diff in props", + order = c("A", "N") # 解释变量中两组A,N + ) + +obs_diff +``` + +```{r} +null_distribution <- td %>% + specify(Sex ~ Eth, success = "F") %>% + hypothesize(null = "independence") %>% + generate(reps = 5000, type = "permute") %>% + calculate(stat = "diff in props", order = c("A", "N")) +``` + +```{r} +null_distribution %>% + visualize() +``` + +```{r} +pvalue <- null_distribution %>% + get_pvalue(obs_stat = obs_diff, direction = "both") + +pvalue +``` + +```{r} +null_distribution %>% + get_ci(level = 0.95, type = "percentile") +``` + + + +## 宏包`infer` + +我比较喜欢[infer](https://github.com/tidymodels/infer)宏包的[设计思想](http://allendowney.blogspot.com/2016/06/there-is-still-only-one-test.html),它把统计推断分成了四个步骤 + +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/infer-ht-diagram.png") +``` + + +下图可以更好的帮助我们理解infer的工作流程 +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/infer_workflow.jpeg") +``` + +- `specify()` 指定解释变量和被解释变量 (`y ~ x`) + +- `hypothesize()` 指定**零假设** (比如, `independence`= `y` 和 `x` 彼此独立) + +- `generate()` 从基于零假设的平行世界中抽样: + + - 指定每次重抽样的类型,通俗点讲就是数据洗牌,重抽样`type = "bootstrap"` (有放回的),对应的零假设往往是null = "point" ; 重抽样`type = "permuting"` (无放回的),对应的零假设往往是null = "independence", 指的是y和x之间彼此独立的,因此抽样后会重新排列,也就说原先 value1-group1 可能变成了value1-group2,(因为我们假定他们是独立的啊,这种操作,也不会影响y和x的关系) + - 指定多少组 (`reps = 1000`) + +- `calculate()` 计算每组(`reps`)的统计值 (`stat = "diff in props"`) +- `visualize()` 可视化,对比零假设的分布与实际观察值. 
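把这几个动词串起来,除了前面案例里的假设检验,也可以做区间估计。下面是一个最小的 bootstrap 置信区间示例(数据用 R 自带的 mtcars,仅作演示):

```{r}
library(infer)

# 对 mpg 的均值做 bootstrap 重抽样,构造 95% 百分位置信区间
boot_dist <- mtcars %>%
  specify(response = mpg) %>%
  generate(reps = 1000, type = "bootstrap") %>%
  calculate(stat = "mean")

boot_dist %>%
  get_ci(level = 0.95, type = "percentile")
```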
+ + +下面是我自己对重抽样的理解 +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/Resampling.jpg") +``` + +## 更多 + +更多统计推断的内容可参考 + +- +- +- +- + + + +```{r, echo = F} +# remove the objects +# rm(list=ls()) +rm(d, gss, movies_genre_sample, null_dist, null_distribution, obs_diff, obs_stat, pvalue, summary_ratings, t_test_eq, t_test_uneq, td) +``` + + + +```{r, echo = F, message = F, warning = F, results = "hide"} +pacman::p_unload(pacman::p_loaded(), character.only = TRUE) +``` diff --git a/intro_R.Rmd b/intro_R.Rmd index 477f8f6..2d7aa35 100644 --- a/intro_R.Rmd +++ b/intro_R.Rmd @@ -20,8 +20,9 @@ R 软件是一个自由、开源软件平台,具有统计分析、可视化和 ## 安装 RStudio 安装完R, 还需要安装RStudio。有同学可能要问 R 与 RStudio 是什么关系呢?打个比方吧,R 就像汽车的发动机, RStudio 就是汽车的仪表盘。但我更觉得 R 是有趣的灵魂,而 Rstudio 是好看的皮囊。 -```{r out.width = '50%', echo = FALSE} -knitr::include_graphics(c("images/engine.jpg", "images/dashboard.jpg")) +```{r out.width = '100%', echo = FALSE} +#knitr::include_graphics(c("images/engine.jpg", "images/dashboard.jpg")) +knitr::include_graphics("images/engine_dashboard.png") ``` diff --git a/intro_ds.Rmd b/intro_ds.Rmd index ac396de..dad073b 100644 --- a/intro_ds.Rmd +++ b/intro_ds.Rmd @@ -122,10 +122,10 @@ knitr::include_graphics("images/tidyverse.png") | 序号 | 内容 | 代码演示 | |:---|:---|:---| -| 1 | 统计 | `r xfun::embed_file('./exercises/1_stats.R')` | -| 2 | 可视化 | `r xfun::embed_file('./exercises/2_visual.R')` | -| 3 | 探索性分析 | `r xfun::embed_file('./exercises/3_eda.R')` | -| 4 | 可重复性报告 | `r xfun::embed_file('./exercises/4_reproducible.Rmd')` | +| 1 | 统计 | `r xfun::embed_file('./exercises/01_stats.R')` | +| 2 | 可视化 | `r xfun::embed_file('./exercises/02_visual.R')` | +| 3 | 探索性分析 | `r xfun::embed_file('./exercises/03_eda.R')` | +| 4 | 可重复性报告 | `r xfun::embed_file('./exercises/04_reproducible.Rmd')` | 看了这些代码,可能第一眼感觉是这样的 @@ -156,8 +156,9 @@ knitr::include_graphics("images/social_science.jpg") ### 社会科学需要可视化 -```{r echo=FALSE, out.width = '50%'} -knitr::include_graphics("images/R4art.png") + +```{r echo=FALSE, out.width = '100%'} +knitr::include_graphics("images/BBC-style-Plot.png") ``` 我们不是学美术的,但要可视化 @@ -166,11 +167,19 @@ knitr::include_graphics("images/R4art.png") ### 社会科学需要编程 -```{r echo=FALSE, out.width = '80%'} +```{r eval=FALSE, include=FALSE} knitr::include_graphics("images/Coding-Lab.jpg") ``` -我们不是学计算机的,但需要编程(小学生已经开始学编程了) +为什么不能用excel做数据分析?画个图说明下 + +```{r out.width = '70%', fig.align='center', echo = FALSE} +knitr::include_graphics("images/R_Excel.png", dpi = 150) +``` + +对于数据量不大,或者复杂程度不高的需求来说,excel很方便也很直观。但随着数据量或复杂程度不断增大,excel解决起来难度系数就陡增,或者无法搞定,这就需要借助编程完成。也就说,掌握了编程技能,对于简单的问题和复杂的问题,难度系数是差不多了。 + + @@ -178,7 +187,8 @@ knitr::include_graphics("images/Coding-Lab.jpg") ### 社会科学需要可重复性 ```{r echo=FALSE, out.width = '60%'} -knitr::include_graphics("images/latex-tweak.gif") +#knitr::include_graphics("images/latex-tweak.gif") +knitr::include_graphics("images/nature_editorial.png") ``` @@ -219,15 +229,6 @@ knitr::include_graphics("images/Languages02.jpg", dpi = 90) ### 一见钟情,还是相见恨晚? -为什么不能用excel做数据分析?画个图说明下 - -```{r out.width = '70%', fig.align='center', echo = FALSE} -knitr::include_graphics("images/R_Excel.png", dpi = 150) -``` - - -一见钟情,还是相见恨晚? 
- ```{r echo=FALSE, fig.align='center', out.width = '70%'} knitr::include_graphics("images/meme.png") ``` diff --git a/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf b/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf index bcb499a..1a2615f 100644 --- a/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf +++ b/rsconnect/documents/index.Rmd/bookdown.org/wangminjie/R4DS.dcf @@ -5,6 +5,6 @@ account: wangminjie server: bookdown.org hostUrl: https://bookdown.org/__api__ appId: 3039 -bundleId: 30087 +bundleId: 30364 url: https://bookdown.org/wangminjie/R4DS/ -when: 1604026394.63435 +when: 1604734320.75596