diff --git a/_book/R4DS_files/figure-html/broom-10-1.png b/_book/R4DS_files/figure-html/broom-10-1.png deleted file mode 100644 index 06fe839..0000000 Binary files a/_book/R4DS_files/figure-html/broom-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-12-1.png b/_book/R4DS_files/figure-html/broom-12-1.png deleted file mode 100644 index d49d11e..0000000 Binary files a/_book/R4DS_files/figure-html/broom-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-14-1.png b/_book/R4DS_files/figure-html/broom-14-1.png deleted file mode 100644 index f07277a..0000000 Binary files a/_book/R4DS_files/figure-html/broom-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-19-1.png b/_book/R4DS_files/figure-html/broom-19-1.png deleted file mode 100644 index 7595697..0000000 Binary files a/_book/R4DS_files/figure-html/broom-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-19-2-1.png b/_book/R4DS_files/figure-html/broom-19-2-1.png deleted file mode 100644 index 5f7c33d..0000000 Binary files a/_book/R4DS_files/figure-html/broom-19-2-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-2-1.png b/_book/R4DS_files/figure-html/broom-2-1.png deleted file mode 100644 index d652b1a..0000000 Binary files a/_book/R4DS_files/figure-html/broom-2-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/broom-3-1.png b/_book/R4DS_files/figure-html/broom-3-1.png deleted file mode 100644 index 1b84513..0000000 Binary files a/_book/R4DS_files/figure-html/broom-3-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-14-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-14-1.png deleted file mode 100644 index 6d9a5e5..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-15-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-15-1.png deleted file mode 100644 index f96fa14..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-18-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-18-1.png deleted file mode 100644 index 37c46ef..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-19-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-19-1.png deleted file mode 100644 index b668f79..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-20-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-20-1.png deleted file mode 100644 index 9fc6c9a..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-21-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-21-1.png deleted file mode 100644 index 3b34034..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-22-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-22-1.png deleted file mode 100644 index 11ab8fc..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-ames-houseprice-23-1.png b/_book/R4DS_files/figure-html/eda-ames-houseprice-23-1.png deleted file mode 100644 index 948a5df..0000000 Binary files a/_book/R4DS_files/figure-html/eda-ames-houseprice-23-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-anscombe-18-1.png b/_book/R4DS_files/figure-html/eda-anscombe-18-1.png deleted file mode 100644 index 7af4c85..0000000 Binary files a/_book/R4DS_files/figure-html/eda-anscombe-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-career-decision-16-1.png b/_book/R4DS_files/figure-html/eda-career-decision-16-1.png deleted file mode 100644 index a0ba655..0000000 Binary files a/_book/R4DS_files/figure-html/eda-career-decision-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-career-decision-21-1.png b/_book/R4DS_files/figure-html/eda-career-decision-21-1.png deleted file mode 100644 index b949cb8..0000000 Binary files a/_book/R4DS_files/figure-html/eda-career-decision-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-career-decision-26-1.png b/_book/R4DS_files/figure-html/eda-career-decision-26-1.png deleted file mode 100644 index d189d88..0000000 Binary files a/_book/R4DS_files/figure-html/eda-career-decision-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-10-1.png b/_book/R4DS_files/figure-html/eda-caribou-10-1.png deleted file mode 100644 index 8400fa9..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-13-1.png b/_book/R4DS_files/figure-html/eda-caribou-13-1.png deleted file mode 100644 index 9b6759f..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-14-1.png b/_book/R4DS_files/figure-html/eda-caribou-14-1.png deleted file mode 100644 index 7b11451..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-15-1.png b/_book/R4DS_files/figure-html/eda-caribou-15-1.png deleted file mode 100644 index 4e2406a..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-17-1.png b/_book/R4DS_files/figure-html/eda-caribou-17-1.png deleted file mode 100644 index 6c36833..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-7-1.png b/_book/R4DS_files/figure-html/eda-caribou-7-1.png deleted file mode 100644 index 4712c70..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-8-1.png b/_book/R4DS_files/figure-html/eda-caribou-8-1.png deleted file mode 100644 index b4174a5..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-caribou-9-1.png b/_book/R4DS_files/figure-html/eda-caribou-9-1.png deleted file mode 100644 index 27ea1d0..0000000 Binary files a/_book/R4DS_files/figure-html/eda-caribou-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-18-1.png b/_book/R4DS_files/figure-html/eda-covid2019-18-1.png deleted file mode 100644 index e59b092..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-21-1.png b/_book/R4DS_files/figure-html/eda-covid2019-21-1.png deleted file mode 100644 index 9a1e08c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-24-1.png b/_book/R4DS_files/figure-html/eda-covid2019-24-1.png deleted file mode 100644 index d58f27f..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-25-1.png b/_book/R4DS_files/figure-html/eda-covid2019-25-1.png deleted file mode 100644 index 4b78112..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-30-1.png b/_book/R4DS_files/figure-html/eda-covid2019-30-1.png deleted file mode 100644 index 05ce84c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-34-1.png b/_book/R4DS_files/figure-html/eda-covid2019-34-1.png deleted file mode 100644 index f3212ab..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-34-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-36-1.png b/_book/R4DS_files/figure-html/eda-covid2019-36-1.png deleted file mode 100644 index dffe1f6..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-38-1.png b/_book/R4DS_files/figure-html/eda-covid2019-38-1.png deleted file mode 100644 index 0db3c93..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-38-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-39-1.png b/_book/R4DS_files/figure-html/eda-covid2019-39-1.png deleted file mode 100644 index 000464e..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-39-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-41-1.png b/_book/R4DS_files/figure-html/eda-covid2019-41-1.png deleted file mode 100644 index f7d6d51..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-41-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-covid2019-44-1.png b/_book/R4DS_files/figure-html/eda-covid2019-44-1.png deleted file mode 100644 index cf7f4c9..0000000 Binary files a/_book/R4DS_files/figure-html/eda-covid2019-44-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-11-1.png b/_book/R4DS_files/figure-html/eda-height-11-1.png deleted file mode 100644 index 7edec67..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-12-1.png b/_book/R4DS_files/figure-html/eda-height-12-1.png deleted file mode 100644 index d27844f..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-13-1.png b/_book/R4DS_files/figure-html/eda-height-13-1.png deleted file mode 100644 index 34c954c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-14-1.png b/_book/R4DS_files/figure-html/eda-height-14-1.png deleted file mode 100644 index 4810b97..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-15-1.png b/_book/R4DS_files/figure-html/eda-height-15-1.png deleted file mode 100644 index 52525b7..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-19-1.png b/_book/R4DS_files/figure-html/eda-height-19-1.png deleted file mode 100644 index e8a807d..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-4-1.png b/_book/R4DS_files/figure-html/eda-height-4-1.png deleted file mode 100644 index c8ce47d..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-height-5-1.png b/_book/R4DS_files/figure-html/eda-height-5-1.png deleted file mode 100644 index 46bfaea..0000000 Binary files a/_book/R4DS_files/figure-html/eda-height-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-10-1.png b/_book/R4DS_files/figure-html/eda-nobel-10-1.png deleted file mode 100644 index 17fa990..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-11-1.png b/_book/R4DS_files/figure-html/eda-nobel-11-1.png deleted file mode 100644 index 59b6594..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-13-1.png b/_book/R4DS_files/figure-html/eda-nobel-13-1.png deleted file mode 100644 index ed8050c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-24-1.png b/_book/R4DS_files/figure-html/eda-nobel-24-1.png deleted file mode 100644 index 6ed3123..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-26-1.png b/_book/R4DS_files/figure-html/eda-nobel-26-1.png deleted file mode 100644 index ed4233c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-27-1.png b/_book/R4DS_files/figure-html/eda-nobel-27-1.png deleted file mode 100644 index 421f30f..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-27-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-28-1.png b/_book/R4DS_files/figure-html/eda-nobel-28-1.png deleted file mode 100644 index 16646c2..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-28-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-29-1.png b/_book/R4DS_files/figure-html/eda-nobel-29-1.png deleted file mode 100644 index 397222c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-29-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-1.png b/_book/R4DS_files/figure-html/eda-nobel-30-1.png deleted file mode 100644 index 118ff4d..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-2.png b/_book/R4DS_files/figure-html/eda-nobel-30-2.png deleted file mode 100644 index 3b035dd..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-2.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-3.png b/_book/R4DS_files/figure-html/eda-nobel-30-3.png deleted file mode 100644 index 7ffcbd1..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-3.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-4.png b/_book/R4DS_files/figure-html/eda-nobel-30-4.png deleted file mode 100644 index e29a1c4..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-4.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-5.png b/_book/R4DS_files/figure-html/eda-nobel-30-5.png deleted file mode 100644 index 984885e..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-5.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-30-6.png b/_book/R4DS_files/figure-html/eda-nobel-30-6.png deleted file mode 100644 index 83bbe78..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-30-6.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-33-1.png b/_book/R4DS_files/figure-html/eda-nobel-33-1.png deleted file mode 100644 index 0c81716..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-33-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-34-1.png b/_book/R4DS_files/figure-html/eda-nobel-34-1.png deleted file mode 100644 index caac565..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-34-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-35-1.png b/_book/R4DS_files/figure-html/eda-nobel-35-1.png deleted file mode 100644 index ef42bd0..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-35-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-36-1.png b/_book/R4DS_files/figure-html/eda-nobel-36-1.png deleted file mode 100644 index 46a0560..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-42-1.png b/_book/R4DS_files/figure-html/eda-nobel-42-1.png deleted file mode 100644 index cd2fb9a..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-42-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-45-1.png b/_book/R4DS_files/figure-html/eda-nobel-45-1.png deleted file mode 100644 index 86524c9..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-45-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-46-1.png b/_book/R4DS_files/figure-html/eda-nobel-46-1.png deleted file mode 100644 index 42862d4..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-46-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-49-1.gif b/_book/R4DS_files/figure-html/eda-nobel-49-1.gif deleted file mode 100644 index 8e867ad..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-49-1.gif and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-52-1.png b/_book/R4DS_files/figure-html/eda-nobel-52-1.png deleted file mode 100644 index ad918fe..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-52-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-nobel-9-1.png b/_book/R4DS_files/figure-html/eda-nobel-9-1.png deleted file mode 100644 index 018ab57..0000000 Binary files a/_book/R4DS_files/figure-html/eda-nobel-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-olympics-15-1.png b/_book/R4DS_files/figure-html/eda-olympics-15-1.png deleted file mode 100644 index 82b1483..0000000 Binary files a/_book/R4DS_files/figure-html/eda-olympics-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-olympics-17-1.png b/_book/R4DS_files/figure-html/eda-olympics-17-1.png deleted file mode 100644 index 2208c79..0000000 Binary files a/_book/R4DS_files/figure-html/eda-olympics-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-olympics-4-1.png b/_book/R4DS_files/figure-html/eda-olympics-4-1.png deleted file mode 100644 index 0c30c94..0000000 Binary files a/_book/R4DS_files/figure-html/eda-olympics-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-olympics-7-1.png b/_book/R4DS_files/figure-html/eda-olympics-7-1.png deleted file mode 100644 index 0340368..0000000 Binary files a/_book/R4DS_files/figure-html/eda-olympics-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-11-1.png b/_book/R4DS_files/figure-html/eda-penguins-11-1.png deleted file mode 100644 index b2a53d1..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-12-1.png b/_book/R4DS_files/figure-html/eda-penguins-12-1.png deleted file mode 100644 index ff1749b..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-13-1.png b/_book/R4DS_files/figure-html/eda-penguins-13-1.png deleted file mode 100644 index 1a26eaf..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-14-1.png b/_book/R4DS_files/figure-html/eda-penguins-14-1.png deleted file mode 100644 index 1733e2a..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-15-1.png b/_book/R4DS_files/figure-html/eda-penguins-15-1.png deleted file mode 100644 index a251747..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-16-1.png b/_book/R4DS_files/figure-html/eda-penguins-16-1.png deleted file mode 100644 index 213c9cf..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-17-1.png b/_book/R4DS_files/figure-html/eda-penguins-17-1.png deleted file mode 100644 index 9420a99..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-18-1.png b/_book/R4DS_files/figure-html/eda-penguins-18-1.png deleted file mode 100644 index 13a25ae..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-19-1.png b/_book/R4DS_files/figure-html/eda-penguins-19-1.png deleted file mode 100644 index bc429c8..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-20-1.png b/_book/R4DS_files/figure-html/eda-penguins-20-1.png deleted file mode 100644 index 5cca3c9..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-21-1.png b/_book/R4DS_files/figure-html/eda-penguins-21-1.png deleted file mode 100644 index 93f17fa..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-22-1.png b/_book/R4DS_files/figure-html/eda-penguins-22-1.png deleted file mode 100644 index 7059056..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-23-1.png b/_book/R4DS_files/figure-html/eda-penguins-23-1.png deleted file mode 100644 index ee34008..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-23-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-24-1.png b/_book/R4DS_files/figure-html/eda-penguins-24-1.png deleted file mode 100644 index d4111f0..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-25-1.png b/_book/R4DS_files/figure-html/eda-penguins-25-1.png deleted file mode 100644 index d94a94d..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-26-1.png b/_book/R4DS_files/figure-html/eda-penguins-26-1.png deleted file mode 100644 index ced7e39..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-27-1.png b/_book/R4DS_files/figure-html/eda-penguins-27-1.png deleted file mode 100644 index 5e77c45..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-27-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-29-1.png b/_book/R4DS_files/figure-html/eda-penguins-29-1.png deleted file mode 100644 index 5e4694c..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-29-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-37-1.png b/_book/R4DS_files/figure-html/eda-penguins-37-1.png deleted file mode 100644 index 9f70c46..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-37-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-penguins-42-1.png b/_book/R4DS_files/figure-html/eda-penguins-42-1.png deleted file mode 100644 index a6e6d07..0000000 Binary files a/_book/R4DS_files/figure-html/eda-penguins-42-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-14-1.png b/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-14-1.png deleted file mode 100644 index c9f64ac..0000000 Binary files a/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-8-1.png b/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-8-1.png deleted file mode 100644 index 0bb957f..0000000 Binary files a/_book/R4DS_files/figure-html/eda-vaccine-effectiveness-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-13-1.png b/_book/R4DS_files/figure-html/forcats-13-1.png deleted file mode 100644 index 580c701..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-14-1.png b/_book/R4DS_files/figure-html/forcats-14-1.png deleted file mode 100644 index a0d10d0..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-15-1.png b/_book/R4DS_files/figure-html/forcats-15-1.png deleted file mode 100644 index f038cc8..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-16-1.png b/_book/R4DS_files/figure-html/forcats-16-1.png deleted file mode 100644 index 5eba1ce..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-17-1.png b/_book/R4DS_files/figure-html/forcats-17-1.png deleted file mode 100644 index 580c701..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-18-1.png b/_book/R4DS_files/figure-html/forcats-18-1.png deleted file mode 100644 index c5f13bc..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/forcats-19-1.png b/_book/R4DS_files/figure-html/forcats-19-1.png deleted file mode 100644 index 5dc6422..0000000 Binary files a/_book/R4DS_files/figure-html/forcats-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-15-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-15-1.png deleted file mode 100644 index 6249a9e..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-16-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-16-1.png deleted file mode 100644 index 72612e6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-21-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-21-1.png deleted file mode 100644 index 1c8f313..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-27-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-27-1.png deleted file mode 100644 index 6249a9e..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-27-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-28-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-28-1.png deleted file mode 100644 index c5eb512..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-28-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-29-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-29-1.png deleted file mode 100644 index d090d98..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-29-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-3-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-3-1.png deleted file mode 100644 index a7f2358..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-3-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-3-2.png b/_book/R4DS_files/figure-html/ggplot2-aes2-3-2.png deleted file mode 100644 index ca968ad..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-3-2.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-30-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-30-1.png deleted file mode 100644 index e18eaa5..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-31-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-31-1.png deleted file mode 100644 index 72612e6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-31-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-32-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-32-1.png deleted file mode 100644 index 72612e6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-32-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-36-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-36-1.png deleted file mode 100644 index 4a65961..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-37-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-37-1.png deleted file mode 100644 index 816363d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-37-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-38-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-38-1.png deleted file mode 100644 index 9af216b..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-38-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-aes2-9-1.png b/_book/R4DS_files/figure-html/ggplot2-aes2-9-1.png deleted file mode 100644 index 7ddab06..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-aes2-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-colors-4-1.png b/_book/R4DS_files/figure-html/ggplot2-colors-4-1.png deleted file mode 100644 index e7a35df..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-colors-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-colors-5-1.png b/_book/R4DS_files/figure-html/ggplot2-colors-5-1.png deleted file mode 100644 index f16f5a9..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-colors-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-colors-6-1.png b/_book/R4DS_files/figure-html/ggplot2-colors-6-1.png deleted file mode 100644 index 4a3a884..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-colors-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-colors-7-1.png b/_book/R4DS_files/figure-html/ggplot2-colors-7-1.png deleted file mode 100644 index 4d8fb2f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-colors-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-11-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-11-1.png deleted file mode 100644 index 6587b2a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-12-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-12-1.png deleted file mode 100644 index 643bc8d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-13-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-13-1.png deleted file mode 100644 index 1a9d10d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-14-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-14-1.png deleted file mode 100644 index 3324fe7..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-15-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-15-1.png deleted file mode 100644 index dbd9fb6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-17-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-17-1.png deleted file mode 100644 index e264cc4..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-18-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-18-1.png deleted file mode 100644 index f481de9..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-19-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-19-1.png deleted file mode 100644 index a12416f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-21-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-21-1.png deleted file mode 100644 index 0cb9023..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-23-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-23-1.png deleted file mode 100644 index 825ae53..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-23-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-24-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-24-1.png deleted file mode 100644 index 1021849..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-25-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-25-1.png deleted file mode 100644 index 6cc896b..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-26-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-26-1.png deleted file mode 100644 index 3f71789..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-27-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-27-1.png deleted file mode 100644 index bbbef35..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-27-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-29-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-29-1.png deleted file mode 100644 index 9e0c593..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-29-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-30-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-30-1.png deleted file mode 100644 index a74ad61..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-31-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-31-1.png deleted file mode 100644 index 3b22e9f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-31-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-32-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-32-1.png deleted file mode 100644 index e132e93..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-32-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-34-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-34-1.png deleted file mode 100644 index cf86fe1..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-34-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-35-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-35-1.png deleted file mode 100644 index ef4e54d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-35-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-36-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-36-1.png deleted file mode 100644 index 38dbd6f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-4-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-4-1.png deleted file mode 100644 index 9ce1656..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-6-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-6-1.png deleted file mode 100644 index f16a905..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-7-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-7-1.png deleted file mode 100644 index 3809ba0..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-customize-8-1.png b/_book/R4DS_files/figure-html/ggplot2-customize-8-1.png deleted file mode 100644 index 25f7c2d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-customize-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-14-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-14-1.png deleted file mode 100644 index 317b666..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-15-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-15-1.png deleted file mode 100644 index ed1a40c..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-16-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-16-1.png deleted file mode 100644 index 3db588f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-17-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-17-1.png deleted file mode 100644 index 317b666..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-19-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-19-1.png deleted file mode 100644 index 4b717ea..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-20-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-20-1.png deleted file mode 100644 index cc59781..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-21-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-21-1.png deleted file mode 100644 index b812ee8..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-22-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-22-1.png deleted file mode 100644 index 2dfa772..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-23-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-23-1.png deleted file mode 100644 index 6adce2a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-23-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-24-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-24-1.png deleted file mode 100644 index f62baf9..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-25-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-25-1.png deleted file mode 100644 index 7ed8ddc..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-26-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-26-1.png deleted file mode 100644 index 880d94a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-27-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-27-1.png deleted file mode 100644 index 3b4c4aa..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-27-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-28-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-28-1.png deleted file mode 100644 index 880d94a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-28-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-28-2.png b/_book/R4DS_files/figure-html/ggplot2-geom-28-2.png deleted file mode 100644 index 3631df7..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-28-2.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-29-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-29-1.png deleted file mode 100644 index d77ee43..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-29-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-30-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-30-1.png deleted file mode 100644 index 4bd80d7..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-31-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-31-1.png deleted file mode 100644 index d22c505..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-31-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-32-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-32-1.png deleted file mode 100644 index 42fc9f5..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-32-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-33-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-33-1.png deleted file mode 100644 index 931788d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-33-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-34-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-34-1.png deleted file mode 100644 index feb3af8..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-34-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-35-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-35-1.png deleted file mode 100644 index bd938b2..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-35-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-36-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-36-1.png deleted file mode 100644 index 7bec8b4..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-37-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-37-1.png deleted file mode 100644 index 5444520..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-37-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-38-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-38-1.png deleted file mode 100644 index 1f53ff3..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-38-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-39-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-39-1.png deleted file mode 100644 index a62efb6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-39-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-40-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-40-1.png deleted file mode 100644 index 0e4e838..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-40-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-41-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-41-1.png deleted file mode 100644 index 7263fde..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-41-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-42-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-42-1.png deleted file mode 100644 index 374a9ef..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-42-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-43-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-43-1.png deleted file mode 100644 index d9c3c55..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-43-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-44-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-44-1.png deleted file mode 100644 index ea8071a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-44-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-45-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-45-1.png deleted file mode 100644 index 57224d0..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-45-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-46-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-46-1.png deleted file mode 100644 index 777cb86..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-46-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-47-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-47-1.png deleted file mode 100644 index 0d8f20f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-47-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-48-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-48-1.png deleted file mode 100644 index fce73c5..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-48-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-49-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-49-1.png deleted file mode 100644 index 44f287a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-49-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-5-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-5-1.png deleted file mode 100644 index 97b8798..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-50-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-50-1.png deleted file mode 100644 index 4c1b673..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-50-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-51-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-51-1.png deleted file mode 100644 index cc85037..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-51-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-52-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-52-1.png deleted file mode 100644 index 3bee4f4..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-52-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-53-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-53-1.png deleted file mode 100644 index 7bcd3dc..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-53-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-54-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-54-1.png deleted file mode 100644 index 764e3ae..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-54-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-55-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-55-1.png deleted file mode 100644 index 142f7fa..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-55-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-56-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-56-1.png deleted file mode 100644 index c755105..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-56-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-57-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-57-1.png deleted file mode 100644 index bb6c63d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-57-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-58-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-58-1.png deleted file mode 100644 index daca301..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-58-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-59-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-59-1.png deleted file mode 100644 index da73277..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-59-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-60-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-60-1.png deleted file mode 100644 index 4fe7902..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-60-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-62-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-62-1.png deleted file mode 100644 index f5bfe8a..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-62-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-63-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-63-1.png deleted file mode 100644 index 64c2e37..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-63-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-64-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-64-1.png deleted file mode 100644 index 7d178d9..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-64-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-65-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-65-1.png deleted file mode 100644 index 8ee778f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-65-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-66-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-66-1.png deleted file mode 100644 index a919cd3..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-66-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-68-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-68-1.png deleted file mode 100644 index b7b4688..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-68-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-70-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-70-1.png deleted file mode 100644 index 0b6e804..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-70-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-71-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-71-1.png deleted file mode 100644 index fe84d7b..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-71-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-72-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-72-1.png deleted file mode 100644 index 3ec3f74..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-72-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-73-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-73-1.png deleted file mode 100644 index a1bd9e5..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-73-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-74-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-74-1.png deleted file mode 100644 index 756b2f0..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-74-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-75-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-75-1.png deleted file mode 100644 index 2465589..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-75-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-76-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-76-1.png deleted file mode 100644 index 0fe2121..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-76-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-77-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-77-1.png deleted file mode 100644 index bb6c63d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-77-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-78-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-78-1.png deleted file mode 100644 index 31ca5ba..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-78-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-79-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-79-1.png deleted file mode 100644 index bb6c63d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-79-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-80-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-80-1.png deleted file mode 100644 index 7182989..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-80-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-81-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-81-1.png deleted file mode 100644 index 146415f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-81-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-82-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-82-1.png deleted file mode 100644 index 7863f33..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-82-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-83-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-83-1.png deleted file mode 100644 index f7c9036..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-83-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-84-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-84-1.png deleted file mode 100644 index f928780..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-84-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-85-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-85-1.png deleted file mode 100644 index db08aba..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-85-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-86-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-86-1.png deleted file mode 100644 index e6e155b..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-86-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-geom-87-1.png b/_book/R4DS_files/figure-html/ggplot2-geom-87-1.png deleted file mode 100644 index c9a8006..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-geom-87-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-1-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-1-1.png deleted file mode 100644 index aab8de3..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-1-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-3-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-3-1.png deleted file mode 100644 index 0b90d08..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-3-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-4-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-4-1.png deleted file mode 100644 index 426acb2..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-5-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-5-1.png deleted file mode 100644 index babebc8..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-6-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-6-1.png deleted file mode 100644 index 5893b89..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-7-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-7-1.png deleted file mode 100644 index 98da3da..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-guides-8-1.png b/_book/R4DS_files/figure-html/ggplot2-guides-8-1.png deleted file mode 100644 index 8f1f974..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-guides-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-1-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-1-1.png deleted file mode 100644 index 71008ff..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-1-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-10-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-10-1.png deleted file mode 100644 index a264422..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-11-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-11-1.png deleted file mode 100644 index ca471b1..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-12-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-12-1.png deleted file mode 100644 index 7b0274c..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-13-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-13-1.png deleted file mode 100644 index 4211db1..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-14-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-14-1.png deleted file mode 100644 index 1168873..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-15-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-15-1.png deleted file mode 100644 index 4605545..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-2-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-2-1.png deleted file mode 100644 index 71008ff..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-2-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-3-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-3-1.png deleted file mode 100644 index 743e7fc..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-3-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-scales-9-1.png b/_book/R4DS_files/figure-html/ggplot2-scales-9-1.png deleted file mode 100644 index 6db95c8..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-scales-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-1-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-1-1.png deleted file mode 100644 index b469ba0..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-1-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-11-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-11-1.png deleted file mode 100644 index 5619f55..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-13-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-13-1.png deleted file mode 100644 index 00927f4..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-14-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-14-1.png deleted file mode 100644 index ae48c94..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-20-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-20-1.png deleted file mode 100644 index ae48c94..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-21-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-21-1.png deleted file mode 100644 index 614f358..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-24-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-24-1.png deleted file mode 100644 index 800b07b..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-26-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-26-1.png deleted file mode 100644 index 9aae8da..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-26-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-5-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-5-1.png deleted file mode 100644 index 172481d..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-8-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-8-1.png deleted file mode 100644 index d39230e..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-stat-layer-9-1.png b/_book/R4DS_files/figure-html/ggplot2-stat-layer-9-1.png deleted file mode 100644 index 05bda72..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-stat-layer-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-10-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-10-1.png deleted file mode 100644 index 9079f70..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-11-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-11-1.png deleted file mode 100644 index 4cce604..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-12-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-12-1.png deleted file mode 100644 index b95a78f..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-13-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-13-1.png deleted file mode 100644 index 24fa4b6..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-14-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-14-1.png deleted file mode 100644 index 524b3ee..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-15-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-15-1.png deleted file mode 100644 index 6e2e513..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-18-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-18-1.png deleted file mode 100644 index 0daf2fd..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-18-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-6-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-6-1.png deleted file mode 100644 index ddd7b96..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-7-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-7-1.png deleted file mode 100644 index afcf196..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-8-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-8-1.png deleted file mode 100644 index 76f28b8..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ggplot2-theme-9-1.png b/_book/R4DS_files/figure-html/ggplot2-theme-9-1.png deleted file mode 100644 index b59c1e1..0000000 Binary files a/_book/R4DS_files/figure-html/ggplot2-theme-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-13-1.png b/_book/R4DS_files/figure-html/infer-13-1.png deleted file mode 100644 index 410132a..0000000 Binary files a/_book/R4DS_files/figure-html/infer-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-14-1.png b/_book/R4DS_files/figure-html/infer-14-1.png deleted file mode 100644 index cba3988..0000000 Binary files a/_book/R4DS_files/figure-html/infer-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-17-1.png b/_book/R4DS_files/figure-html/infer-17-1.png deleted file mode 100644 index 1620f6f..0000000 Binary files a/_book/R4DS_files/figure-html/infer-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-22-1.png b/_book/R4DS_files/figure-html/infer-22-1.png deleted file mode 100644 index c5046b0..0000000 Binary files a/_book/R4DS_files/figure-html/infer-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-30-1.png b/_book/R4DS_files/figure-html/infer-30-1.png deleted file mode 100644 index c37ee3c..0000000 Binary files a/_book/R4DS_files/figure-html/infer-30-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-5-1.png b/_book/R4DS_files/figure-html/infer-5-1.png deleted file mode 100644 index 10e0f3d..0000000 Binary files a/_book/R4DS_files/figure-html/infer-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/infer-6-1.png b/_book/R4DS_files/figure-html/infer-6-1.png deleted file mode 100644 index ccf9b27..0000000 Binary files a/_book/R4DS_files/figure-html/infer-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lazyman-17-1.png b/_book/R4DS_files/figure-html/lazyman-17-1.png deleted file mode 100644 index 864f300..0000000 Binary files a/_book/R4DS_files/figure-html/lazyman-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lazyman-19-1.png b/_book/R4DS_files/figure-html/lazyman-19-1.png deleted file mode 100644 index d2976c7..0000000 Binary files a/_book/R4DS_files/figure-html/lazyman-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lazyman-gganonymize-1.png b/_book/R4DS_files/figure-html/lazyman-gganonymize-1.png deleted file mode 100644 index 854e53b..0000000 Binary files a/_book/R4DS_files/figure-html/lazyman-gganonymize-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-all-points-1.png b/_book/R4DS_files/figure-html/lincoln-temp-all-points-1.png deleted file mode 100644 index 36a8d63..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-all-points-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-boxplots-1.png b/_book/R4DS_files/figure-html/lincoln-temp-boxplots-1.png deleted file mode 100644 index 1a36ff3..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-boxplots-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-jittered-1.png b/_book/R4DS_files/figure-html/lincoln-temp-jittered-1.png deleted file mode 100644 index be39aef..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-jittered-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-points-errorbars-1.png b/_book/R4DS_files/figure-html/lincoln-temp-points-errorbars-1.png deleted file mode 100644 index e5c4165..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-points-errorbars-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-sina-1.png b/_book/R4DS_files/figure-html/lincoln-temp-sina-1.png deleted file mode 100644 index c327f80..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-sina-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lincoln-temp-violins-1.png b/_book/R4DS_files/figure-html/lincoln-temp-violins-1.png deleted file mode 100644 index 4fb45f4..0000000 Binary files a/_book/R4DS_files/figure-html/lincoln-temp-violins-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-13-1.png b/_book/R4DS_files/figure-html/lm-13-1.png deleted file mode 100644 index 1d61dc9..0000000 Binary files a/_book/R4DS_files/figure-html/lm-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-20-1.png b/_book/R4DS_files/figure-html/lm-20-1.png deleted file mode 100644 index f3d0e3a..0000000 Binary files a/_book/R4DS_files/figure-html/lm-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-31-1.png b/_book/R4DS_files/figure-html/lm-31-1.png deleted file mode 100644 index 75eedbb..0000000 Binary files a/_book/R4DS_files/figure-html/lm-31-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-39-1.png b/_book/R4DS_files/figure-html/lm-39-1.png deleted file mode 100644 index 0ca8091..0000000 Binary files a/_book/R4DS_files/figure-html/lm-39-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-44-1.png b/_book/R4DS_files/figure-html/lm-44-1.png deleted file mode 100644 index 08b9571..0000000 Binary files a/_book/R4DS_files/figure-html/lm-44-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-45-1.png b/_book/R4DS_files/figure-html/lm-45-1.png deleted file mode 100644 index 9b2050e..0000000 Binary files a/_book/R4DS_files/figure-html/lm-45-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-50-1.png b/_book/R4DS_files/figure-html/lm-50-1.png deleted file mode 100644 index 1cc5493..0000000 Binary files a/_book/R4DS_files/figure-html/lm-50-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-51-1.png b/_book/R4DS_files/figure-html/lm-51-1.png deleted file mode 100644 index b7e82d9..0000000 Binary files a/_book/R4DS_files/figure-html/lm-51-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lm-9-1.png b/_book/R4DS_files/figure-html/lm-9-1.png deleted file mode 100644 index 24ff2a7..0000000 Binary files a/_book/R4DS_files/figure-html/lm-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lmm-12-1.png b/_book/R4DS_files/figure-html/lmm-12-1.png deleted file mode 100644 index 129e9d9..0000000 Binary files a/_book/R4DS_files/figure-html/lmm-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lmm-15-1.png b/_book/R4DS_files/figure-html/lmm-15-1.png deleted file mode 100644 index f28e718..0000000 Binary files a/_book/R4DS_files/figure-html/lmm-15-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lmm-25-1.png b/_book/R4DS_files/figure-html/lmm-25-1.png deleted file mode 100644 index fb5f412..0000000 Binary files a/_book/R4DS_files/figure-html/lmm-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lmm-6-1.png b/_book/R4DS_files/figure-html/lmm-6-1.png deleted file mode 100644 index f0b8971..0000000 Binary files a/_book/R4DS_files/figure-html/lmm-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/lmm-9-1.png b/_book/R4DS_files/figure-html/lmm-9-1.png deleted file mode 100644 index f6b640b..0000000 Binary files a/_book/R4DS_files/figure-html/lmm-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/logistic-regression-12-1.png b/_book/R4DS_files/figure-html/logistic-regression-12-1.png deleted file mode 100644 index 4d6fd19..0000000 Binary files a/_book/R4DS_files/figure-html/logistic-regression-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/logistic-regression-6-1.png b/_book/R4DS_files/figure-html/logistic-regression-6-1.png deleted file mode 100644 index 993ef52..0000000 Binary files a/_book/R4DS_files/figure-html/logistic-regression-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/logistic-regression-9-1.png b/_book/R4DS_files/figure-html/logistic-regression-9-1.png deleted file mode 100644 index 4638f07..0000000 Binary files a/_book/R4DS_files/figure-html/logistic-regression-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/ordinal-12-1.png b/_book/R4DS_files/figure-html/ordinal-12-1.png deleted file mode 100644 index b13d107..0000000 Binary files a/_book/R4DS_files/figure-html/ordinal-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-16-1.png b/_book/R4DS_files/figure-html/poisson-regression-16-1.png deleted file mode 100644 index 1d991ba..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-16-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-19-1.png b/_book/R4DS_files/figure-html/poisson-regression-19-1.png deleted file mode 100644 index 976a746..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-22-1.png b/_book/R4DS_files/figure-html/poisson-regression-22-1.png deleted file mode 100644 index 9d28553..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-4-1.png b/_book/R4DS_files/figure-html/poisson-regression-4-1.png deleted file mode 100644 index a4ff199..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-6-1.png b/_book/R4DS_files/figure-html/poisson-regression-6-1.png deleted file mode 100644 index edcf88f..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/poisson-regression-7-1.png b/_book/R4DS_files/figure-html/poisson-regression-7-1.png deleted file mode 100644 index 56da214..0000000 Binary files a/_book/R4DS_files/figure-html/poisson-regression-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-12-1.png b/_book/R4DS_files/figure-html/sampling-12-1.png deleted file mode 100644 index 84f70f4..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-13-1.png b/_book/R4DS_files/figure-html/sampling-13-1.png deleted file mode 100644 index a6e6eec..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-14-1.png b/_book/R4DS_files/figure-html/sampling-14-1.png deleted file mode 100644 index 258baa9..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-14-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-21-1.png b/_book/R4DS_files/figure-html/sampling-21-1.png deleted file mode 100644 index dce1f75..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-21-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-23-1.png b/_book/R4DS_files/figure-html/sampling-23-1.png deleted file mode 100644 index 3eaa94f..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-23-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-25-1.png b/_book/R4DS_files/figure-html/sampling-25-1.png deleted file mode 100644 index 36d57c7..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-25-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-28-1.png b/_book/R4DS_files/figure-html/sampling-28-1.png deleted file mode 100644 index 3370ebe..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-28-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-35-1.png b/_book/R4DS_files/figure-html/sampling-35-1.png deleted file mode 100644 index 393f7df..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-35-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-36-1.png b/_book/R4DS_files/figure-html/sampling-36-1.png deleted file mode 100644 index 1f3b573..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-4-1.png b/_book/R4DS_files/figure-html/sampling-4-1.png deleted file mode 100644 index 371077f..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-4-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-40-1.png b/_book/R4DS_files/figure-html/sampling-40-1.png deleted file mode 100644 index a5d2f90..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-40-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-43-1.png b/_book/R4DS_files/figure-html/sampling-43-1.png deleted file mode 100644 index 32d1d60..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-43-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-5-1.png b/_book/R4DS_files/figure-html/sampling-5-1.png deleted file mode 100644 index 12b9374..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-7-1.png b/_book/R4DS_files/figure-html/sampling-7-1.png deleted file mode 100644 index f160bce..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-7-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/sampling-8-1.png b/_book/R4DS_files/figure-html/sampling-8-1.png deleted file mode 100644 index 3d73f50..0000000 Binary files a/_book/R4DS_files/figure-html/sampling-8-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/temp-ridgeline-1.png b/_book/R4DS_files/figure-html/temp-ridgeline-1.png deleted file mode 100644 index e520eeb..0000000 Binary files a/_book/R4DS_files/figure-html/temp-ridgeline-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/temp-ridgeline-colorbar-1.png b/_book/R4DS_files/figure-html/temp-ridgeline-colorbar-1.png deleted file mode 100644 index 58fd720..0000000 Binary files a/_book/R4DS_files/figure-html/temp-ridgeline-colorbar-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tests-as-linear-42-1.png b/_book/R4DS_files/figure-html/tests-as-linear-42-1.png deleted file mode 100644 index 686cb7e..0000000 Binary files a/_book/R4DS_files/figure-html/tests-as-linear-42-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tests-as-linear-5-1.png b/_book/R4DS_files/figure-html/tests-as-linear-5-1.png deleted file mode 100644 index cb95a92..0000000 Binary files a/_book/R4DS_files/figure-html/tests-as-linear-5-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-12-1.png b/_book/R4DS_files/figure-html/tidygraph-12-1.png deleted file mode 100644 index 33fb157..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-19-1.png b/_book/R4DS_files/figure-html/tidygraph-19-1.png deleted file mode 100644 index e672d31..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-19-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-22-1.png b/_book/R4DS_files/figure-html/tidygraph-22-1.png deleted file mode 100644 index 1325460..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-22-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-24-1.png b/_book/R4DS_files/figure-html/tidygraph-24-1.png deleted file mode 100644 index a4f306b..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-24-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-28-1.png b/_book/R4DS_files/figure-html/tidygraph-28-1.png deleted file mode 100644 index b3963f2..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-28-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidygraph-9-1.png b/_book/R4DS_files/figure-html/tidygraph-9-1.png deleted file mode 100644 index d72e4df..0000000 Binary files a/_book/R4DS_files/figure-html/tidygraph-9-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidymodels-17-1.png b/_book/R4DS_files/figure-html/tidymodels-17-1.png deleted file mode 100644 index 4be493c..0000000 Binary files a/_book/R4DS_files/figure-html/tidymodels-17-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyr-12-1.png b/_book/R4DS_files/figure-html/tidyr-12-1.png deleted file mode 100644 index e686ce7..0000000 Binary files a/_book/R4DS_files/figure-html/tidyr-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidystats-6-1.png b/_book/R4DS_files/figure-html/tidystats-6-1.png deleted file mode 100644 index 5e2f6be..0000000 Binary files a/_book/R4DS_files/figure-html/tidystats-6-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-32-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-32-1.png deleted file mode 100644 index f00709b..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-32-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-33-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-33-1.png deleted file mode 100644 index ada490b..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-33-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-36-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-36-1.png deleted file mode 100644 index 9189fde..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-36-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-37-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-37-1.png deleted file mode 100644 index 3ec535a..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-37-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-39-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-39-1.png deleted file mode 100644 index c76f844..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-39-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-40-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-40-1.png deleted file mode 100644 index 5ff9950..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-40-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-41-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-41-1.png deleted file mode 100644 index 44f287a..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-41-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-42-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-42-1.png deleted file mode 100644 index cc85037..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-42-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-45-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-45-1.png deleted file mode 100644 index 84acb1b..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-45-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-46-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-46-1.png deleted file mode 100644 index 320db36..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-46-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-60-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-60-1.png deleted file mode 100644 index 91753fb..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-60-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-tips-62-1.png b/_book/R4DS_files/figure-html/tidyverse-tips-62-1.png deleted file mode 100644 index 21ef4ad..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-tips-62-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-workflow-11-1.png b/_book/R4DS_files/figure-html/tidyverse-workflow-11-1.png deleted file mode 100644 index a354de1..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-workflow-11-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-workflow-12-1.png b/_book/R4DS_files/figure-html/tidyverse-workflow-12-1.png deleted file mode 100644 index 1e4453c..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-workflow-12-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/tidyverse-workflow-13-1.png b/_book/R4DS_files/figure-html/tidyverse-workflow-13-1.png deleted file mode 100644 index 07c0499..0000000 Binary files a/_book/R4DS_files/figure-html/tidyverse-workflow-13-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/unnamed-chunk-10-1.png b/_book/R4DS_files/figure-html/unnamed-chunk-10-1.png deleted file mode 100644 index 0d4fb53..0000000 Binary files a/_book/R4DS_files/figure-html/unnamed-chunk-10-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/unnamed-chunk-20-1.png b/_book/R4DS_files/figure-html/unnamed-chunk-20-1.png deleted file mode 100644 index 022bec8..0000000 Binary files a/_book/R4DS_files/figure-html/unnamed-chunk-20-1.png and /dev/null differ diff --git a/_book/R4DS_files/figure-html/unnamed-chunk-9-1.png b/_book/R4DS_files/figure-html/unnamed-chunk-9-1.png deleted file mode 100644 index 8e7c101..0000000 Binary files a/_book/R4DS_files/figure-html/unnamed-chunk-9-1.png and /dev/null differ diff --git a/_book/author.html b/_book/author.html deleted file mode 100644 index 4c0720b..0000000 --- a/_book/author.html +++ /dev/null @@ -1,1281 +0,0 @@ - - -
- - - -library(tidyverse)
-library(tidybayes)
-library(rstan)
-library(brms)
-rstan_options(auto_write = TRUE)
-options(mc.cores = parallel::detectCores())
之前我们讲了线性模型和混合线性模型,今天我们往前一步,应该说是一大步。因为这一步迈向了贝叶斯分析,与频率学派的分析有本质的区别,这种区别类似经典物理和量子物理的区别。
-事实上,贝叶斯在生活中应用很广泛,我们自觉和不自觉中都在使用贝叶斯分析。
-参数是假设,数据是证据。对于参数 \(\theta\) 和数据 \(D\),贝叶斯公式可以写为
-\[ -\underbrace{p(\theta|D)}_\text{posterior} \; = \; \underbrace{p(D|\theta)}_\text{likelihood} \;\; \underbrace{p(\theta)}_\text{prior} \;. -\]
-第一张图: 在看到数据之前,我们预先认为参数,应该在某个范围且服从某种分布
第二张图: 曲线与数据匹配得怎么样? 相似性概率
第三张图: 看到数据之后,可能的曲线
观察到数据点后,我们认为服从线性模型,这个线性模型不是一条直线,而是很多条,有些线的可能性大,有些线的可能性低,但都是有可能的。那么,综合这些有可能的线,(截距和斜率)构成了一种分布,即后验概率分布。
-因为我们是R语言课,我们跳过很多理论推导。事实上,我在学习贝叶斯数据分析的时候,也是先从代码操作人手,然后理解贝叶斯推断相关理论,有时候更直观更容易理解。当然,我不是说我的方法一定正确,只是供大家的一个选项。我会用到brms和stan,但我个人更喜欢stan.
-从最简单的线性模式开始 -\[ -y_n = \alpha + \beta x_n + \epsilon_n \quad \text{where}\quad -\epsilon_n \sim \operatorname{normal}(0,\sigma). -\]
-等价于
-\[ -y_n - (\alpha + \beta X_n) \sim \operatorname{normal}(0,\sigma), -\]
-进一步等价
-\[ -y_n \sim \operatorname{normal}(\alpha + \beta X_n, \, \sigma). -\]
-<- "
- stan_program data {
- int<lower=0> N;
- vector[N] x;
- vector[N] y;
-}
-parameters {
- real alpha;
- real beta;
- real<lower=0> sigma;
-}
-model {
- y ~ normal(alpha + beta * x, sigma);
-}
-"
dplyr 1.0版本增加了across()
函数,这个函数集中体现了dplyr宏包的强大和简约,今天我用企鹅数据,来领略它的美。
library(tidyverse)
-library(palmerpenguins)
- penguins
## # A tibble: 344 x 8
-## species island bill_length_mm bill_depth_mm
-## <fct> <fct> <dbl> <dbl>
-## 1 Adelie Torge~ 39.1 18.7
-## 2 Adelie Torge~ 39.5 17.4
-## 3 Adelie Torge~ 40.3 18
-## 4 Adelie Torge~ NA NA
-## 5 Adelie Torge~ 36.7 19.3
-## 6 Adelie Torge~ 39.3 20.6
-## 7 Adelie Torge~ 38.9 17.8
-## 8 Adelie Torge~ 39.2 19.6
-## 9 Adelie Torge~ 34.1 18.1
-## 10 Adelie Torge~ 42 20.2
-## # ... with 334 more rows, and 4 more variables:
-## # flipper_length_mm <int>, body_mass_g <int>,
-## # sex <fct>, year <int>
-看到数据框里有很多缺失值,需要统计每一列缺失值的数量,按照常规的写法
-%>%
- penguins summarise(
- na_in_species = sum(is.na(species)),
- na_in_island = sum(is.na(island)),
- na_in_length = sum(is.na(bill_length_mm)),
- na_in_depth = sum(is.na(bill_depth_mm)),
- na_in_flipper = sum(is.na(flipper_length_mm)),
- na_in_body = sum(is.na(body_mass_g)),
- na_in_sex = sum(is.na(sex)),
- na_in_year = sum(is.na(year))
- )
## # A tibble: 1 x 8
-## na_in_species na_in_island na_in_length na_in_depth
-## <int> <int> <int> <int>
-## 1 0 0 2 2
-## # ... with 4 more variables: na_in_flipper <int>,
-## # na_in_body <int>, na_in_sex <int>,
-## # na_in_year <int>
-幸亏数据框的列数不够多,只有8列,如果数据框有几百列,那就成体力活了,同时代码复制粘贴也容易出错。想偷懒,我们自然想到用summarise_all()
,
%>%
- penguins summarise_all(
- ~ sum(is.na(.))
- )
## # A tibble: 1 x 8
-## species island bill_length_mm bill_depth_mm
-## <int> <int> <int> <int>
-## 1 0 0 2 2
-## # ... with 4 more variables: flipper_length_mm <int>,
-## # body_mass_g <int>, sex <int>, year <int>
-挺好。接着探索,我们想先按企鹅类型分组,然后统计出各体征数据的均值,这个好说,直接写代码
-%>%
- penguins group_by(species) %>%
- summarise(
- mean_length = mean(bill_length_mm, na.rm = TRUE),
- mean_depth = mean(bill_depth_mm, na.rm = TRUE),
- mean_flipper = mean(flipper_length_mm, na.rm = TRUE),
- mean_body = mean(body_mass_g, na.rm = TRUE)
- )
## # A tibble: 3 x 5
-## species mean_length mean_depth mean_flipper mean_body
-## <fct> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 18.3 190. 3701.
-## 2 Chinst~ 48.8 18.4 196. 3733.
-## 3 Gentoo 47.5 15.0 217. 5076.
-或者用summarise_if()
偷懒
<- penguins %>%
- d1 group_by(species) %>%
- summarise_if(is.numeric, mean, na.rm = TRUE)
- d1
## # A tibble: 3 x 6
-## species bill_length_mm bill_depth_mm flipper_length_~
-## <fct> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 18.3 190.
-## 2 Chinst~ 48.8 18.4 196.
-## 3 Gentoo 47.5 15.0 217.
-## # ... with 2 more variables: body_mass_g <dbl>,
-## # year <dbl>
-方法不错,从语义上还算很好理解。 但多了一列year
, 我想在summarise_if()
中用 is.numeric & !year
去掉year
,却没成功。人类的欲望是无穷的,我们还需要统计每组下企鹅的个数,然后合并到一起。因此,我们再接再厉
<- penguins %>%
- d2 group_by(species) %>%
- summarise(
- n = n()
-
- ) d2
## # A tibble: 3 x 2
-## species n
-## <fct> <int>
-## 1 Adelie 152
-## 2 Chinstrap 68
-## 3 Gentoo 124
-最后合并
-%>% left_join(d2, by = "species") d1
## # A tibble: 3 x 7
-## species bill_length_mm bill_depth_mm flipper_length_~
-## <fct> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 18.3 190.
-## 2 Chinst~ 48.8 18.4 196.
-## 3 Gentoo 47.5 15.0 217.
-## # ... with 3 more variables: body_mass_g <dbl>,
-## # year <dbl>, n <int>
-结果应该没问题,然鹅,总让人感觉怪怪的,过程有点折腾,希望不这么麻烦。
-across()
的出现,让这一切变得简单和清晰,上面三步完成的动作,一步搞定
%>%
- penguins group_by(species) %>%
- summarise(
- across(where(is.numeric) & !year, mean, na.rm = TRUE),
- n = n()
- )
## # A tibble: 3 x 6
-## species bill_length_mm bill_depth_mm flipper_length_~
-## <fct> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 18.3 190.
-## 2 Chinst~ 48.8 18.4 196.
-## 3 Gentoo 47.5 15.0 217.
-## # ... with 2 more variables: body_mass_g <dbl>,
-## # n <int>
-是不是很强大。大爱Hadley Wickham !!!
-across()
函数,它有三个主要的参数:
across(.cols = , .fns = , .names = )
第一个参数.cols = ,选取我们要需要的若干列,选取多列的语法与select()
的语法一致,选择方法非常丰富和人性化
:
,变量在位置上是连续的,可以使用类似 1:3
或者species:island
!
,变量名前加!,意思是求这个变量的补集,等价于去掉这个变量,比如!species
&
与 |
,两组变量集的交集和并集,比如 is.numeric & !year
, 就是选取数值类型变量,但不包括year
; 再比如 is.numeric | is.factor
就是选取数值型变量和因子型变量c()
,选取变量的组合,比如c(a, b, x)
everything()
: 选取所有的变量last_col()
: 选取最后一列,也就说倒数第一列,也可以last_col(offset = 1L)
就是倒数第二列starts_with()
: 指定一组变量名的前缀,也就把选取具有这一前缀的变量,starts_with("bill_")
ends_with()
: 指定一组变量名的后缀,也就选取具有这一后缀的变量,ends_with("_mm")
contains()
: 指定变量名含有特定的字符串,也就是选取含有指定字符串的变量,ends_with("length")
matches()
: 同上,字符串可以是正则表达式all_of()
: 选取字符串向量对应的变量名,比如all_of(c("species", "sex", "year"))
,当然前提是,数据框中要有这些变量,否则会报错。any_of()
: 同all_of()
,只不过数据框中没有字符串向量对应的变量,也不会报错,比如数据框中没有people这一列,代码any_of(c("species", "sex", "year", "people"))
也正常运行,挺人性化的where(is.numeric), where(is.factor), where(is.character), where(is.date)
第二个参数.fns =
,我们要执行的函数(或者多个函数),函数的语法有三种形式可选:
mean
.~ mean(.x, na.rm = TRUE)
list(mean = mean, n_miss = ~ sum(is.na(.x))
第三个参数.names =
, 如果.fns
是单个函数就默认保留原来数据列的名称,即"{.col}"
;如果.fns
是多个函数,就在数据列的列名后面跟上函数名,比如"{.col}_{.fn}"
;当然,我们也可以简单调整列名和函数之间的顺序或者增加一个标识的字符串,比如弄成"{.fn}_{.col}"
,"{.col}_{.fn}_aa"
下面通过一些小案例,继续呈现across()
函数的功能
就是本章开始的需求
-%>%
- penguins summarise(
- na_in_species = sum(is.na(species)),
- na_in_island = sum(is.na(island)),
- na_in_length = sum(is.na(bill_length_mm)),
- na_in_depth = sum(is.na(bill_depth_mm)),
- na_in_flipper = sum(is.na(flipper_length_mm)),
- na_in_body = sum(is.na(body_mass_g)),
- na_in_sex = sum(is.na(sex)),
- na_in_year = sum(is.na(year))
- )
# using across()
-%>%
- penguins summarise(
- across(everything(), function(x) sum(is.na(x)))
- )
## # A tibble: 1 x 8
-## species island bill_length_mm bill_depth_mm
-## <int> <int> <int> <int>
-## 1 0 0 2 2
-## # ... with 4 more variables: flipper_length_mm <int>,
-## # body_mass_g <int>, sex <int>, year <int>
-# or
-%>%
- penguins summarise(
- across(everything(), ~ sum(is.na(.)))
- )
## # A tibble: 1 x 8
-## species island bill_length_mm bill_depth_mm
-## <int> <int> <int> <int>
-## 1 0 0 2 2
-## # ... with 4 more variables: flipper_length_mm <int>,
-## # body_mass_g <int>, sex <int>, year <int>
-%>%
- penguins summarise(
- distinct_species = n_distinct(species),
- distinct_island = n_distinct(island),
- distinct_sex = n_distinct(sex)
- )
## # A tibble: 1 x 3
-## distinct_species distinct_island distinct_sex
-## <int> <int> <int>
-## 1 3 3 3
-# using across()
-%>%
- penguins summarise(
- across(c(species, island, sex), n_distinct)
- )
## # A tibble: 1 x 3
-## species island sex
-## <int> <int> <int>
-## 1 3 3 3
-%>%
- penguins group_by(species) %>%
- summarise(
- length_mean = mean(bill_length_mm, na.rm = TRUE),
- length_sd = sd(bill_length_mm, na.rm = TRUE),
- depth_mean = mean(bill_depth_mm, na.rm = TRUE),
- depth_sd = sd(bill_depth_mm, na.rm = TRUE),
- flipper_mean = mean(flipper_length_mm, na.rm = TRUE),
- flipper_sd = sd(flipper_length_mm, na.rm = TRUE),
- n = n()
- )
## # A tibble: 3 x 8
-## species length_mean length_sd depth_mean depth_sd
-## <fct> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 2.66 18.3 1.22
-## 2 Chinst~ 48.8 3.34 18.4 1.14
-## 3 Gentoo 47.5 3.08 15.0 0.981
-## # ... with 3 more variables: flipper_mean <dbl>,
-## # flipper_sd <dbl>, n <int>
-# using across()
-%>%
- penguins group_by(species) %>%
- summarise(
- across(ends_with("_mm"), list(mean = mean, sd = sd), na.rm = TRUE),
- n = n()
- )
## # A tibble: 3 x 8
-## species bill_length_mm_~ bill_length_mm_~
-## <fct> <dbl> <dbl>
-## 1 Adelie 38.8 2.66
-## 2 Chinst~ 48.8 3.34
-## 3 Gentoo 47.5 3.08
-## # ... with 5 more variables: bill_depth_mm_mean <dbl>,
-## # bill_depth_mm_sd <dbl>,
-## # flipper_length_mm_mean <dbl>,
-## # flipper_length_mm_sd <dbl>, n <int>
-事实上,这里是across()
与summarise()
的强大结合起来
%>%
- penguins group_by(species, island) %>%
- summarise(
- prob = c(.25, .75),
- length = quantile(bill_length_mm, prob, na.rm = TRUE),
- depth = quantile(bill_depth_mm, prob, na.rm = TRUE),
- flipper = quantile(flipper_length_mm, prob, na.rm = TRUE)
- )
## # A tibble: 10 x 6
-## # Groups: species, island [5]
-## species island prob length depth flipper
-## <fct> <fct> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie Biscoe 0.25 37.7 17.6 185.
-## 2 Adelie Biscoe 0.75 40.7 19.0 193
-## 3 Adelie Dream 0.25 36.8 17.5 185
-## 4 Adelie Dream 0.75 40.4 18.8 193
-## 5 Adelie Torgersen 0.25 36.7 17.4 187
-## 6 Adelie Torgersen 0.75 41.1 19.2 195
-## 7 Chinstrap Dream 0.25 46.3 17.5 191
-## 8 Chinstrap Dream 0.75 51.1 19.4 201
-## 9 Gentoo Biscoe 0.25 45.3 14.2 212
-## 10 Gentoo Biscoe 0.75 49.6 15.7 221
-# using across()
-%>%
- penguins group_by(species, island) %>%
- summarise(
- prob = c(.25, .75),
- across(
- c(bill_length_mm, bill_depth_mm, flipper_length_mm),
- ~ quantile(., prob, na.rm = TRUE)
-
- ) )
## # A tibble: 10 x 6
-## # Groups: species, island [5]
-## species island prob bill_length_mm bill_depth_mm
-## <fct> <fct> <dbl> <dbl> <dbl>
-## 1 Adelie Biscoe 0.25 37.7 17.6
-## 2 Adelie Biscoe 0.75 40.7 19.0
-## 3 Adelie Dream 0.25 36.8 17.5
-## 4 Adelie Dream 0.75 40.4 18.8
-## 5 Adelie Torge~ 0.25 36.7 17.4
-## 6 Adelie Torge~ 0.75 41.1 19.2
-## 7 Chinst~ Dream 0.25 46.3 17.5
-## 8 Chinst~ Dream 0.75 51.1 19.4
-## 9 Gentoo Biscoe 0.25 45.3 14.2
-## 10 Gentoo Biscoe 0.75 49.6 15.7
-## # ... with 1 more variable: flipper_length_mm <dbl>
-# or
-%>%
- penguins group_by(species, island) %>%
- summarise(
- prob = c(.25, .75),
- across(where(is.numeric) & !year, ~ quantile(., prob, na.rm = TRUE))
- )
## # A tibble: 10 x 7
-## # Groups: species, island [5]
-## species island prob bill_length_mm bill_depth_mm
-## <fct> <fct> <dbl> <dbl> <dbl>
-## 1 Adelie Biscoe 0.375 37.7 17.6
-## 2 Adelie Biscoe 0.625 40.7 19.0
-## 3 Adelie Dream 0.375 36.8 17.5
-## 4 Adelie Dream 0.625 40.4 18.8
-## 5 Adelie Torge~ 0.375 36.7 17.4
-## 6 Adelie Torge~ 0.625 41.1 19.2
-## 7 Chinst~ Dream 0.375 46.3 17.5
-## 8 Chinst~ Dream 0.625 51.1 19.4
-## 9 Gentoo Biscoe 0.375 45.3 14.2
-## 10 Gentoo Biscoe 0.625 49.6 15.7
-## # ... with 2 more variables: flipper_length_mm <dbl>,
-## # body_mass_g <dbl>
-# using across()
-%>%
- penguins group_by(species) %>%
- summarise(
- n = n(),
- across(starts_with("bill_"), mean, na.rm = TRUE),
- Area = mean(bill_length_mm * bill_depth_mm, na.rm = TRUE),
- across(ends_with("_g"), mean, na.rm = TRUE),
- )
## # A tibble: 3 x 6
-## species n bill_length_mm bill_depth_mm Area
-## <fct> <int> <dbl> <dbl> <dbl>
-## 1 Adelie 152 38.8 18.3 712.
-## 2 Chinst~ 68 48.8 18.4 900.
-## 3 Gentoo 124 47.5 15.0 712.
-## # ... with 1 more variable: body_mass_g <dbl>
-<- function(x) {
- std - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
- (x
- }
-# using across()
-%>%
- penguins summarise(
- across(where(is.numeric), std),
- across(where(is.character), as.factor)
- )
## # A tibble: 344 x 5
-## bill_length_mm bill_depth_mm flipper_length_~
-## <dbl> <dbl> <dbl>
-## 1 -0.883 0.784 -1.42
-## 2 -0.810 0.126 -1.06
-## 3 -0.663 0.430 -0.421
-## 4 NA NA NA
-## 5 -1.32 1.09 -0.563
-## 6 -0.847 1.75 -0.776
-## 7 -0.920 0.329 -1.42
-## 8 -0.865 1.24 -0.421
-## 9 -1.80 0.480 -0.563
-## 10 -0.352 1.54 -0.776
-## # ... with 334 more rows, and 2 more variables:
-## # body_mass_g <dbl>, year <dbl>
-# using across() and purrr style
-%>%
- penguins drop_na() %>%
- summarise(
- across(starts_with("bill_"), ~ (.x - mean(.x)) / sd(.x))
- )
## # A tibble: 333 x 2
-## bill_length_mm bill_depth_mm
-## <dbl> <dbl>
-## 1 -0.895 0.780
-## 2 -0.822 0.119
-## 3 -0.675 0.424
-## 4 -1.33 1.08
-## 5 -0.858 1.74
-## 6 -0.931 0.323
-## 7 -0.876 1.24
-## 8 -0.529 0.221
-## 9 -0.986 2.05
-## 10 -1.72 2.00
-## # ... with 323 more rows
-# using across()
-%>%
- penguins drop_na() %>%
- mutate(
- across(where(is.numeric), log),
- across(where(is.character), as.factor)
- )
## # A tibble: 333 x 8
-## species island bill_length_mm bill_depth_mm
-## <fct> <fct> <dbl> <dbl>
-## 1 Adelie Torge~ 3.67 2.93
-## 2 Adelie Torge~ 3.68 2.86
-## 3 Adelie Torge~ 3.70 2.89
-## 4 Adelie Torge~ 3.60 2.96
-## 5 Adelie Torge~ 3.67 3.03
-## 6 Adelie Torge~ 3.66 2.88
-## 7 Adelie Torge~ 3.67 2.98
-## 8 Adelie Torge~ 3.72 2.87
-## 9 Adelie Torge~ 3.65 3.05
-## 10 Adelie Torge~ 3.54 3.05
-## # ... with 323 more rows, and 4 more variables:
-## # flipper_length_mm <dbl>, body_mass_g <dbl>,
-## # sex <fct>, year <dbl>
-# using across()
-%>%
- penguins drop_na() %>%
- mutate(
- across(where(is.numeric), .fns = list(log = log), .names = "{.fn}_{.col}"),
- across(where(is.character), as.factor)
- )
## # A tibble: 333 x 13
-## species island bill_length_mm bill_depth_mm
-## <fct> <fct> <dbl> <dbl>
-## 1 Adelie Torge~ 39.1 18.7
-## 2 Adelie Torge~ 39.5 17.4
-## 3 Adelie Torge~ 40.3 18
-## 4 Adelie Torge~ 36.7 19.3
-## 5 Adelie Torge~ 39.3 20.6
-## 6 Adelie Torge~ 38.9 17.8
-## 7 Adelie Torge~ 39.2 19.6
-## 8 Adelie Torge~ 41.1 17.6
-## 9 Adelie Torge~ 38.6 21.2
-## 10 Adelie Torge~ 34.6 21.1
-## # ... with 323 more rows, and 9 more variables:
-## # flipper_length_mm <int>, body_mass_g <int>,
-## # sex <fct>, year <int>, log_bill_length_mm <dbl>,
-## # log_bill_depth_mm <dbl>,
-## # log_flipper_length_mm <dbl>,
-## # log_body_mass_g <dbl>, log_year <dbl>
-cur_data()
配合使用%>%
- penguins group_by(species) %>%
- summarise(
- ::tidy(lm(bill_length_mm ~ bill_depth_mm, data = cur_data()))
- broom )
## # A tibble: 6 x 6
-## # Groups: species [3]
-## species term estimate std.error statistic p.value
-## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie (Inte~ 23.1 3.03 7.60 3.01e-12
-## 2 Adelie bill_~ 0.857 0.165 5.19 6.67e- 7
-## 3 Chinstr~ (Inte~ 13.4 5.06 2.66 9.92e- 3
-## 4 Chinstr~ bill_~ 1.92 0.274 7.01 1.53e- 9
-## 5 Gentoo (Inte~ 17.2 3.28 5.25 6.60e- 7
-## 6 Gentoo bill_~ 2.02 0.219 9.24 1.02e-15
-%>%
- penguins group_by(species) %>%
- summarise(
- ::tidy(lm(bill_length_mm ~ ., data = cur_data() %>% select(is.numeric)))
- broom )
## # A tibble: 15 x 6
-## # Groups: species [3]
-## species term estimate std.error statistic p.value
-## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie (Inte~ -2.75e+2 5.09e+2 -0.539 5.90e-1
-## 2 Adelie bill_~ 2.70e-1 1.92e-1 1.40 1.63e-1
-## 3 Adelie flipp~ 2.51e-2 3.50e-2 0.717 4.74e-1
-## 4 Adelie body_~ 2.62e-3 5.25e-4 4.98 1.74e-6
-## 5 Adelie year 1.47e-1 2.55e-1 0.576 5.66e-1
-## 6 Chinstr~ (Inte~ -4.20e+2 8.24e+2 -0.509 6.12e-1
-## 7 Chinstr~ bill_~ 1.58e+0 3.76e-1 4.20 8.62e-5
-## 8 Chinstr~ flipp~ 1.67e-2 6.82e-2 0.244 8.08e-1
-## 9 Chinstr~ body_~ 1.43e-3 1.15e-3 1.24 2.19e-1
-## 10 Chinstr~ year 2.15e-1 4.12e-1 0.520 6.05e-1
-## 11 Gentoo (Inte~ -6.25e+2 5.10e+2 -1.23 2.23e-1
-## 12 Gentoo bill_~ 5.89e-1 3.15e-1 1.87 6.40e-2
-## 13 Gentoo flipp~ 1.32e-1 4.58e-2 2.89 4.59e-3
-## 14 Gentoo body_~ 2.04e-3 6.07e-4 3.36 1.05e-3
-## 15 Gentoo year 3.11e-1 2.55e-1 1.22 2.24e-1
-%>%
- penguins group_by(species) %>%
- summarise(
- ::tidy(lm(bill_length_mm ~ .,
- broomdata = cur_data() %>% transmute(across(is.numeric))
-
- )) )
## # A tibble: 15 x 6
-## # Groups: species [3]
-## species term estimate std.error statistic p.value
-## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie (Inte~ -2.75e+2 5.09e+2 -0.539 5.90e-1
-## 2 Adelie bill_~ 2.70e-1 1.92e-1 1.40 1.63e-1
-## 3 Adelie flipp~ 2.51e-2 3.50e-2 0.717 4.74e-1
-## 4 Adelie body_~ 2.62e-3 5.25e-4 4.98 1.74e-6
-## 5 Adelie year 1.47e-1 2.55e-1 0.576 5.66e-1
-## 6 Chinstr~ (Inte~ -4.20e+2 8.24e+2 -0.509 6.12e-1
-## 7 Chinstr~ bill_~ 1.58e+0 3.76e-1 4.20 8.62e-5
-## 8 Chinstr~ flipp~ 1.67e-2 6.82e-2 0.244 8.08e-1
-## 9 Chinstr~ body_~ 1.43e-3 1.15e-3 1.24 2.19e-1
-## 10 Chinstr~ year 2.15e-1 4.12e-1 0.520 6.05e-1
-## 11 Gentoo (Inte~ -6.25e+2 5.10e+2 -1.23 2.23e-1
-## 12 Gentoo bill_~ 5.89e-1 3.15e-1 1.87 6.40e-2
-## 13 Gentoo flipp~ 1.32e-1 4.58e-2 2.89 4.59e-3
-## 14 Gentoo body_~ 2.04e-3 6.07e-4 3.36 1.05e-3
-## 15 Gentoo year 3.11e-1 2.55e-1 1.22 2.24e-1
-%>%
- penguins group_by(species) %>%
- summarise(
- ::tidy(lm(bill_length_mm ~ ., data = across(is.numeric)))
- broom )
## # A tibble: 15 x 6
-## # Groups: species [3]
-## species term estimate std.error statistic p.value
-## <fct> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Adelie (Inte~ -2.75e+2 5.09e+2 -0.539 5.90e-1
-## 2 Adelie bill_~ 2.70e-1 1.92e-1 1.40 1.63e-1
-## 3 Adelie flipp~ 2.51e-2 3.50e-2 0.717 4.74e-1
-## 4 Adelie body_~ 2.62e-3 5.25e-4 4.98 1.74e-6
-## 5 Adelie year 1.47e-1 2.55e-1 0.576 5.66e-1
-## 6 Chinstr~ (Inte~ -4.20e+2 8.24e+2 -0.509 6.12e-1
-## 7 Chinstr~ bill_~ 1.58e+0 3.76e-1 4.20 8.62e-5
-## 8 Chinstr~ flipp~ 1.67e-2 6.82e-2 0.244 8.08e-1
-## 9 Chinstr~ body_~ 1.43e-3 1.15e-3 1.24 2.19e-1
-## 10 Chinstr~ year 2.15e-1 4.12e-1 0.520 6.05e-1
-## 11 Gentoo (Inte~ -6.25e+2 5.10e+2 -1.23 2.23e-1
-## 12 Gentoo bill_~ 5.89e-1 3.15e-1 1.87 6.40e-2
-## 13 Gentoo flipp~ 1.32e-1 4.58e-2 2.89 4.59e-3
-## 14 Gentoo body_~ 2.04e-3 6.07e-4 3.36 1.05e-3
-## 15 Gentoo year 3.11e-1 2.55e-1 1.22 2.24e-1
-cur_column()
配合使用# 每一列乘以各自的系数
-<- tibble(x = 1:3, y = 3:5, z = 5:7)
- df <- list(x = 1, y = 10, z = 100)
- mult
-%>%
- df mutate(across(all_of(names(mult)), ~ .x * mult[[cur_column()]]))
## # A tibble: 3 x 3
-## x y z
-## <dbl> <dbl> <dbl>
-## 1 1 30 500
-## 2 2 40 600
-## 3 3 50 700
-# 每一列乘以各自的权重
-<- tibble(x = 1:3, y = 3:5, z = 5:7)
- df <- list(x = 0.2, y = 0.3, z = 0.5)
- weights
-%>%
- df mutate(
- across(all_of(names(weights)),
- list(wt = ~ .x * weights[[cur_column()]]),
- .names = "{col}.{fn}"
-
- ) )
## # A tibble: 3 x 6
-## x y z x.wt y.wt z.wt
-## <int> <int> <int> <dbl> <dbl> <dbl>
-## 1 1 3 5 0.2 0.900 2.5
-## 2 2 4 6 0.4 1.2 3
-## 3 3 5 7 0.6 1.5 3.5
-# 每一列有各自的阈值,如果在阈值之上为1,否则为 0
-<- tibble(x = 1:3, y = 3:5, z = 5:7)
- df <- list(x = 2, y = 3, z = 7)
- cutoffs
-%>% mutate(
- df across(all_of(names(cutoffs)), ~ if_else(.x > cutoffs[[cur_column()]], 1, 0))
- )
## # A tibble: 3 x 3
-## x y z
-## <dbl> <dbl> <dbl>
-## 1 0 0 0
-## 2 0 1 0
-## 3 1 1 0
-c_across()
配合也挺默契在一行中的占比
-<- tibble(x = 1:3, y = 3:5, z = 5:7)
- df
-%>%
- df rowwise() %>%
- mutate(total = sum(c_across(x:z))) %>%
- ungroup() %>%
- mutate(across(x:z, ~ . / total))
## # A tibble: 3 x 4
-## x y z total
-## <dbl> <dbl> <dbl> <int>
-## 1 0.111 0.333 0.556 9
-## 2 0.167 0.333 0.5 12
-## 3 0.2 0.333 0.467 15
-看一行中哪个最大,最大的变为1,其余的变为0
-<- function(vec) {
- replace_col_max if (!is.vector(vec)) {
- stop("input of replace_col_max must be vector.")
-
- }
-if_else(vec == max(vec), 1L, 0L)
-
- }
-
-%>%
- df rowwise() %>%
- mutate(
- new = list(replace_col_max(c_across(everything())))
- %>%
- ) unnest_wider(new, names_sep = "_")
## # A tibble: 3 x 6
-## x y z new_1 new_2 new_3
-## <int> <int> <int> <int> <int> <int>
-## 1 1 3 5 0 0 1
-## 2 2 4 6 0 0 1
-## 3 3 5 7 0 0 1
-我们看到了,across()
函数在summarise()/mutate()/transmute()/condense()
中使用,它能实现以下几个功能:
left_join()
还是用第 13 章的gapminder
案例
library(tidyverse)
-library(gapminder)
- gapminder
## # A tibble: 1,704 x 6
-## country continent year lifeExp pop gdpPercap
-## <fct> <fct> <int> <dbl> <int> <dbl>
-## 1 Afghanist~ Asia 1952 28.8 8.43e6 779.
-## 2 Afghanist~ Asia 1957 30.3 9.24e6 821.
-## 3 Afghanist~ Asia 1962 32.0 1.03e7 853.
-## 4 Afghanist~ Asia 1967 34.0 1.15e7 836.
-## 5 Afghanist~ Asia 1972 36.1 1.31e7 740.
-## 6 Afghanist~ Asia 1977 38.4 1.49e7 786.
-## 7 Afghanist~ Asia 1982 39.9 1.29e7 978.
-## 8 Afghanist~ Asia 1987 40.8 1.39e7 852.
-## 9 Afghanist~ Asia 1992 41.7 1.63e7 649.
-## 10 Afghanist~ Asia 1997 41.8 2.22e7 635.
-## # ... with 1,694 more rows
-画个简单的图
-%>%
- gapminder ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
- geom_point(alpha = 0.2)
我们想用不同的模型拟合log(gdpPercap)
与lifeExp
的关联
library(colorspace)
-
-<- colorspace::qualitative_hcl(4, palette = "dark 2")
- model_colors # model_colors <- c("darkorange", "purple", "cyan4")
-
-ggplot(
-data = gapminder,
- mapping = aes(x = log(gdpPercap), y = lifeExp)
- +
- ) geom_point(alpha = 0.2) +
- geom_smooth(
- method = "lm",
- aes(color = "OLS", fill = "OLS") # one
- +
- ) geom_smooth(
- method = "lm", formula = y ~ splines::bs(x, df = 3),
- aes(color = "Cubic Spline", fill = "Cubic Spline") # two
- +
- ) geom_smooth(
- method = "loess",
- aes(color = "LOESS", fill = "LOESS") # three
- +
- ) scale_color_manual(name = "Models", values = model_colors) +
- scale_fill_manual(name = "Models", values = model_colors) +
- theme(legend.position = "top")
还是回到我们今天的主题。我们建立一个简单的线性模型
-<- lm(
- out formula = lifeExp ~ gdpPercap + pop + continent,
- data = gapminder
-
- ) out
##
-## Call:
-## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
-##
-## Coefficients:
-## (Intercept) gdpPercap
-## 4.78e+01 4.50e-04
-## pop continentAmericas
-## 6.57e-09 1.35e+01
-## continentAsia continentEurope
-## 8.19e+00 1.75e+01
-## continentOceania
-## 1.81e+01
-str(out)
summary(out)
##
-## Call:
-## lm(formula = lifeExp ~ gdpPercap + pop + continent, data = gapminder)
-##
-## Residuals:
-## Min 1Q Median 3Q Max
-## -49.16 -4.49 0.30 5.11 25.17
-##
-## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 4.78e+01 3.40e-01 140.82 <2e-16
-## gdpPercap 4.50e-04 2.35e-05 19.16 <2e-16
-## pop 6.57e-09 1.98e-09 3.33 9e-04
-## continentAmericas 1.35e+01 6.00e-01 22.46 <2e-16
-## continentAsia 8.19e+00 5.71e-01 14.34 <2e-16
-## continentEurope 1.75e+01 6.25e-01 27.97 <2e-16
-## continentOceania 1.81e+01 1.78e+00 10.15 <2e-16
-##
-## (Intercept) ***
-## gdpPercap ***
-## pop ***
-## continentAmericas ***
-## continentAsia ***
-## continentEurope ***
-## continentOceania ***
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Residual standard error: 8.37 on 1697 degrees of freedom
-## Multiple R-squared: 0.582, Adjusted R-squared: 0.581
-## F-statistic: 394 on 6 and 1697 DF, p-value: <2e-16
-模型的输出结果是一个复杂的list,图 29.1给出了out
的结构
-我们发现out
对象包含了很多元素,比如系数、残差、模型残差自由度等等,用读取列表的方法可以直接读取
$coefficients
- out$residuals
- out$fitted.values out
事实上,前面使用的suammary()
函数只是选取和打印了out
对象的一小部分信息,同时这些信息的结构不适合用dplyr
操作和ggplot2
画图。
为规整模型结果,这里我们推荐用David Robinson 开发的broom
宏包。
library(broom)
broom
宏包将常用的100多种模型的输出结果规整成数据框
-tibble()
的格式,在模型比较和可视化中就可以方便使用dplyr
函数了。
-broom
提供了三个主要的函数:
tidy()
提取模型输出结果的主要信息,比如 coefficients
和 t-statistics
glance()
把模型视为一个整体,提取如 F-statistic
,model deviance
或者 r-squared
等信息augment()
模型输出的信息添加到建模用的数据集中,比如fitted values
和 residuals
tidy(out)
## # A tibble: 7 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 4.78e+1 3.40e-1 141. 0.
-## 2 gdpPercap 4.50e-4 2.35e-5 19.2 3.24e- 74
-## 3 pop 6.57e-9 1.98e-9 3.33 9.01e- 4
-## 4 continentAm~ 1.35e+1 6.00e-1 22.5 5.19e- 98
-## 5 continentAs~ 8.19e+0 5.71e-1 14.3 4.06e- 44
-## 6 continentEu~ 1.75e+1 6.25e-1 28.0 6.34e-142
-## 7 continentOc~ 1.81e+1 1.78e+0 10.1 1.59e- 23
-%>%
- out tidy() %>%
- ggplot(mapping = aes(
- x = term,
- y = estimate
- +
- )) geom_point() +
- coord_flip()
可以很方便的获取系数的置信区间
-%>%
- out tidy(conf.int = TRUE)
## # A tibble: 7 x 7
-## term estimate std.error statistic p.value conf.low
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 (Int~ 4.78e+1 3.40e-1 141. 0. 4.71e+1
-## 2 gdpP~ 4.50e-4 2.35e-5 19.2 3.24e- 74 4.03e-4
-## 3 pop 6.57e-9 1.98e-9 3.33 9.01e- 4 2.70e-9
-## 4 cont~ 1.35e+1 6.00e-1 22.5 5.19e- 98 1.23e+1
-## 5 cont~ 8.19e+0 5.71e-1 14.3 4.06e- 44 7.07e+0
-## 6 cont~ 1.75e+1 6.25e-1 28.0 6.34e-142 1.62e+1
-## 7 cont~ 1.81e+1 1.78e+0 10.1 1.59e- 23 1.46e+1
-## # ... with 1 more variable: conf.high <dbl>
-%>%
- out tidy(conf.int = TRUE) %>%
- filter(!term %in% c("(Intercept)")) %>%
- ggplot(aes(
- x = reorder(term, estimate),
- y = estimate, ymin = conf.low, ymax = conf.high
- +
- )) geom_pointrange() +
- coord_flip() +
- labs(x = "", y = "OLS Estimate")
augment()
会返回一个数据框,这个数据框是在原始数据框的基础上,增加了模型的拟合值(.fitted
), 拟合值的标准误(.se.fit
), 残差(.resid
)等列。
augment(out)
## # A tibble: 1,704 x 10
-## lifeExp gdpPercap pop continent .fitted .resid
-## <dbl> <dbl> <int> <fct> <dbl> <dbl>
-## 1 28.8 779. 8.43e6 Asia 56.4 -27.6
-## 2 30.3 821. 9.24e6 Asia 56.4 -26.1
-## 3 32.0 853. 1.03e7 Asia 56.5 -24.5
-## 4 34.0 836. 1.15e7 Asia 56.5 -22.4
-## 5 36.1 740. 1.31e7 Asia 56.4 -20.3
-## 6 38.4 786. 1.49e7 Asia 56.5 -18.0
-## 7 39.9 978. 1.29e7 Asia 56.5 -16.7
-## 8 40.8 852. 1.39e7 Asia 56.5 -15.7
-## 9 41.7 649. 1.63e7 Asia 56.4 -14.7
-## 10 41.8 635. 2.22e7 Asia 56.4 -14.7
-## # ... with 1,694 more rows, and 4 more variables:
-## # .std.resid <dbl>, .hat <dbl>, .sigma <dbl>,
-## # .cooksd <dbl>
-%>%
- out augment() %>%
- ggplot(mapping = aes(x = lifeExp, y = .fitted)) +
- geom_point()
glance()
函数也会返回数据框,但这个数据框只有一行,内容实际上是summary()
输出结果的最底下一行。
glance(out)
## # A tibble: 1 x 12
-## r.squared adj.r.squared sigma statistic p.value
-## <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 0.582 0.581 8.37 394. 3.94e-317
-## # ... with 7 more variables: df <dbl>, logLik <dbl>,
-## # AIC <dbl>, BIC <dbl>, deviance <dbl>,
-## # df.residual <int>, nobs <int>
-broom的三个主要函数在分组统计建模时,格外方便。
-<-
- penguins ::penguins %>%
- palmerpenguinsdrop_na()
%>%
- penguins group_nest(species) %>%
- mutate(model = purrr::map(data, ~ lm(bill_depth_mm ~ bill_length_mm, data = .))) %>%
- mutate(glance = purrr::map(model, ~ broom::glance(.))) %>%
- ::unnest(glance) tidyr
## # A tibble: 3 x 15
-## species data model r.squared adj.r.squared sigma
-## <fct> <list<tb> <lis> <dbl> <dbl> <dbl>
-## 1 Adelie [146 x 7] <lm> 0.149 0.143 1.13
-## 2 Chinst~ [68 x 7] <lm> 0.427 0.418 0.866
-## 3 Gentoo [119 x 7] <lm> 0.428 0.423 0.749
-## # ... with 9 more variables: statistic <dbl>,
-## # p.value <dbl>, df <dbl>, logLik <dbl>, AIC <dbl>,
-## # BIC <dbl>, deviance <dbl>, df.residual <int>,
-## # nobs <int>
-<- function(df) {
- fit_ols lm(body_mass_g ~ bill_depth_mm + bill_length_mm, data = df)
-
- }
-
-<- penguins %>%
- out_tidy group_nest(species) %>%
- mutate(model = purrr::map(data, fit_ols)) %>%
- mutate(tidy = purrr::map(model, ~ broom::tidy(.))) %>%
- ::unnest(tidy) %>%
- tidyr::filter(!term %in% "(Intercept)")
- dplyr
- out_tidy
## # A tibble: 6 x 8
-## species data model term estimate std.error
-## <fct> <list<tb> <lis> <chr> <dbl> <dbl>
-## 1 Adelie [146 x 7] <lm> bill~ 164. 25.1
-## 2 Adelie [146 x 7] <lm> bill~ 64.8 11.5
-## 3 Chinst~ [68 x 7] <lm> bill~ 159. 43.3
-## 4 Chinst~ [68 x 7] <lm> bill~ 23.8 14.7
-## 5 Gentoo [119 x 7] <lm> bill~ 255. 40.0
-## 6 Gentoo [119 x 7] <lm> bill~ 54.7 12.7
-## # ... with 2 more variables: statistic <dbl>,
-## # p.value <dbl>
-%>%
- out_tidy ggplot(aes(
- x = species, y = estimate,
- ymin = estimate - 2 * std.error,
- ymax = estimate + 2 * std.error,
- color = term
- +
- )) geom_pointrange(position = position_dodge(width = 0.25)) +
- theme(legend.position = "top") +
- labs(x = NULL, y = "Estimate", color = "系数")
假定数据是
-<- tibble(
- df x = runif(30, 2, 10),
- y = -2*x + rnorm(30, 0, 5)
-
- ) df
## # A tibble: 30 x 2
-## x y
-## <dbl> <dbl>
-## 1 8.59 -19.3
-## 2 7.95 -8.92
-## 3 8.06 -16.3
-## 4 2.82 -3.84
-## 5 2.17 -4.88
-## 6 5.19 -5.14
-## 7 7.94 -16.1
-## 8 8.01 -28.1
-## 9 4.10 -3.55
-## 10 9.62 -12.2
-## # ... with 20 more rows
-用broom::augment()
和ggplot2做出类似的残差图
dplyr宏包是数据科学tidyverse集合的核心部件之一,Hadley Wickham大神说将会在5月15日发布dplyr 1.0版本,欢呼。
-为迎接新时代的到来,我在线上同大家一起分享dplyr 1.0版本新的特点和功能,看看都为我们带来哪些惊喜?
-New dplyr - 8 things to know:
-tidyselect
relocate()
summarise()
across()
cur_data()
, cur_group()
and cur_column()
rowwise()
grammarnest_by()
library(dplyr, warn.conflicts = FALSE)
-library(tidyr)
mutate()
-select()
-filter()
-group_by()
-summarise()
-arrange()
-rename()
-left_join()
在dplyr 1.0之前,summarise()
会把统计结果整理成一行一列的数据框,现在可以根据函数返回的结果,可以有多种形式:
min(x), n(), or sum(is.na(y))
quantile()
<- tibble(
- df grp = rep(c("a", "b"), each = 5),
- x = c(rnorm(5, -0.25, 1), rnorm(5, 0, 1.5)),
- y = c(rnorm(5, 0.25, 1), rnorm(5, 0, 0.5))
-
- ) df
## # A tibble: 10 x 3
-## grp x y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-%>%
- df group_by(grp) %>%
- summarise(rng = mean(x))
## # A tibble: 2 x 2
-## grp rng
-## <chr> <dbl>
-## 1 a -0.124
-## 2 b -0.117
-当统计函数返回多个值的时候,比如range()
返回是最小值和最大值,summarise()
很贴心地将结果整理成多行,这样符合tidy的格式。
%>%
- df group_by(grp) %>%
- summarise(rng = range(x))
## # A tibble: 4 x 2
-## # Groups: grp [2]
-## grp rng
-## <chr> <dbl>
-## 1 a -1.38
-## 2 a 0.903
-## 3 b -2.47
-## 4 b 1.55
-类似的还有quantile()
函数,也是返回多个值
%>%
- df group_by(grp) %>%
- summarise(
- rng = quantile(x, probs = c(0.05, 0.5, 0.95))
- )
## # A tibble: 6 x 2
-## # Groups: grp [2]
-## grp rng
-## <chr> <dbl>
-## 1 a -1.23
-## 2 a -0.270
-## 3 a 0.881
-## 4 b -2.05
-## 5 b -0.374
-## 6 b 1.46
-%>%
- df group_by(grp) %>%
- summarise(
- x = quantile(x, c(0.25, 0.5, 0.75)),
- q = c(0.25, 0.5, 0.75)
- )
## # A tibble: 6 x 3
-## # Groups: grp [2]
-## grp x q
-## <chr> <dbl> <dbl>
-## 1 a -0.665 0.25
-## 2 a -0.270 0.5
-## 3 a 0.791 0.75
-## 4 b -0.400 0.25
-## 5 b -0.374 0.5
-## 6 b 1.10 0.75
-summarise()
可以输出数据框,比如
<- function(x, probs) {
- my_quantile tibble(x = quantile(x, probs), probs = probs)
-
- }%>%
- mtcars group_by(cyl) %>%
- summarise(my_quantile(disp, c(0.25, 0.75)))
## # A tibble: 6 x 3
-## # Groups: cyl [3]
-## cyl x probs
-## <dbl> <dbl> <dbl>
-## 1 4 78.8 0.25
-## 2 4 121. 0.75
-## 3 6 160 0.25
-## 4 6 196. 0.75
-## 5 8 302. 0.25
-## 6 8 390 0.75
-再比如:
-dplyr 1.0 之前是需要group_modify()
来实现数据框进,数据框出
%>%
- mtcars group_by(cyl) %>%
- group_modify(
- ~ broom::tidy(lm(mpg ~ wt, data = .))
- )
## # A tibble: 6 x 6
-## # Groups: cyl [3]
-## cyl term estimate std.error statistic p.value
-## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 4 (Interce~ 39.6 4.35 9.10 7.77e-6
-## 2 4 wt -5.65 1.85 -3.05 1.37e-2
-## 3 6 (Interce~ 28.4 4.18 6.79 1.05e-3
-## 4 6 wt -2.78 1.33 -2.08 9.18e-2
-## 5 8 (Interce~ 23.9 3.01 7.94 4.05e-6
-## 6 8 wt -2.19 0.739 -2.97 1.18e-2
-dplyr 1.0 之后,有了新的方案
-%>%
- mtcars group_by(cyl) %>%
- summarise(
- ::tidy(lm(mpg ~ wt))
- broom )
## # A tibble: 6 x 6
-## # Groups: cyl [3]
-## cyl term estimate std.error statistic p.value
-## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 4 (Interce~ 39.6 4.35 9.10 7.77e-6
-## 2 4 wt -5.65 1.85 -3.05 1.37e-2
-## 3 6 (Interce~ 28.4 4.18 6.79 1.05e-3
-## 4 6 wt -2.78 1.33 -2.08 9.18e-2
-## 5 8 (Interce~ 23.9 3.01 7.94 4.05e-6
-## 6 8 wt -2.19 0.739 -2.97 1.18e-2
-当 group_by()
与summarise()
配合使用的时候,summarise()
默认会抵消掉最近一次的分组信息,比如下面按照cyl
和vs
分组,但summarise()
后,就只剩下cyl
的分组信息了。
%>%
- mtcars group_by(cyl, vs) %>%
- summarise(cyl_n = n())
## # A tibble: 5 x 3
-## # Groups: cyl [3]
-## cyl vs cyl_n
-## <dbl> <dbl> <int>
-## 1 4 0 1
-## 2 4 1 10
-## 3 6 0 3
-## 4 6 1 4
-## 5 8 0 14
-%>%
- mtcars group_by(cyl, vs) %>%
- summarise(cyl_n = n()) %>%
- group_vars()
## [1] "cyl"
-如果想保留vs的分组信息,就需要设置.groups = keep
参数
%>%
- mtcars group_by(cyl, vs) %>%
- summarise(cyl_n = n(), .groups = "keep") %>%
- group_vars()
## [1] "cyl" "vs"
-当然summarise()
可以控制输出的更多形式
%>%
- mtcars group_by(cyl, vs) %>%
- summarise(cyl_n = n(), .groups = "drop") %>%
- group_vars()
## character(0)
-%>%
- mtcars group_by(cyl, vs) %>%
- summarise(cyl_n = n(), .groups = "rowwise") %>%
- group_vars()
## [1] "cyl" "vs"
-%>% select(1, 3) df
## # A tibble: 10 x 2
-## grp y
-## <chr> <dbl>
-## 1 a -0.387
-## 2 a -0.839
-## 3 a 0.0371
-## 4 a -0.144
-## 5 a 0.148
-## 6 b 0.143
-## 7 b 0.0986
-## 8 b -1.11
-## 9 b -0.670
-## 10 b -0.440
-%>% select(2:3) df
## # A tibble: 10 x 2
-## x y
-## <dbl> <dbl>
-## 1 -0.665 -0.387
-## 2 -0.270 -0.839
-## 3 0.791 0.0371
-## 4 -1.38 -0.144
-## 5 0.903 0.148
-## 6 1.55 0.143
-## 7 1.10 0.0986
-## 8 -0.400 -1.11
-## 9 -2.47 -0.670
-## 10 -0.374 -0.440
-%>% select(grp, x, y) df
## # A tibble: 10 x 3
-## grp x y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-%>% select(x:y) df
## # A tibble: 10 x 2
-## x y
-## <dbl> <dbl>
-## 1 -0.665 -0.387
-## 2 -0.270 -0.839
-## 3 0.791 0.0371
-## 4 -1.38 -0.144
-## 5 0.903 0.148
-## 6 1.55 0.143
-## 7 1.10 0.0986
-## 8 -0.400 -1.11
-## 9 -2.47 -0.670
-## 10 -0.374 -0.440
-%>% select(starts_with("x")) df
## # A tibble: 10 x 1
-## x
-## <dbl>
-## 1 -0.665
-## 2 -0.270
-## 3 0.791
-## 4 -1.38
-## 5 0.903
-## 6 1.55
-## 7 1.10
-## 8 -0.400
-## 9 -2.47
-## 10 -0.374
-%>% select(ends_with("p")) df
## # A tibble: 10 x 1
-## grp
-## <chr>
-## 1 a
-## 2 a
-## 3 a
-## 4 a
-## 5 a
-## 6 b
-## 7 b
-## 8 b
-## 9 b
-## 10 b
-%>% select(contains("x")) df
## # A tibble: 10 x 1
-## x
-## <dbl>
-## 1 -0.665
-## 2 -0.270
-## 3 0.791
-## 4 -1.38
-## 5 0.903
-## 6 1.55
-## 7 1.10
-## 8 -0.400
-## 9 -2.47
-## 10 -0.374
-%>% select(matches("x")) df
## # A tibble: 10 x 1
-## x
-## <dbl>
-## 1 -0.665
-## 2 -0.270
-## 3 0.791
-## 4 -1.38
-## 5 0.903
-## 6 1.55
-## 7 1.10
-## 8 -0.400
-## 9 -2.47
-## 10 -0.374
-%>% select(where(is.character)) df
## # A tibble: 10 x 1
-## grp
-## <chr>
-## 1 a
-## 2 a
-## 3 a
-## 4 a
-## 5 a
-## 6 b
-## 7 b
-## 8 b
-## 9 b
-## 10 b
-%>% select(where(is.numeric)) df
## # A tibble: 10 x 2
-## x y
-## <dbl> <dbl>
-## 1 -0.665 -0.387
-## 2 -0.270 -0.839
-## 3 0.791 0.0371
-## 4 -1.38 -0.144
-## 5 0.903 0.148
-## 6 1.55 0.143
-## 7 1.10 0.0986
-## 8 -0.400 -1.11
-## 9 -2.47 -0.670
-## 10 -0.374 -0.440
-%>% select(!where(is.character)) df
## # A tibble: 10 x 2
-## x y
-## <dbl> <dbl>
-## 1 -0.665 -0.387
-## 2 -0.270 -0.839
-## 3 0.791 0.0371
-## 4 -1.38 -0.144
-## 5 0.903 0.148
-## 6 1.55 0.143
-## 7 1.10 0.0986
-## 8 -0.400 -1.11
-## 9 -2.47 -0.670
-## 10 -0.374 -0.440
-%>% select(where(is.numeric) & starts_with("x")) df
## # A tibble: 10 x 1
-## x
-## <dbl>
-## 1 -0.665
-## 2 -0.270
-## 3 0.791
-## 4 -1.38
-## 5 0.903
-## 6 1.55
-## 7 1.10
-## 8 -0.400
-## 9 -2.47
-## 10 -0.374
-%>% select(starts_with("g") | ends_with("y")) df
## # A tibble: 10 x 2
-## grp y
-## <chr> <dbl>
-## 1 a -0.387
-## 2 a -0.839
-## 3 a 0.0371
-## 4 a -0.144
-## 5 a 0.148
-## 6 b 0.143
-## 7 b 0.0986
-## 8 b -1.11
-## 9 b -0.670
-## 10 b -0.440
-# 注意any_of和all_of的区别
-
-<- c("x", "y", "z")
- vars %>% select(all_of(vars))
- df %>% select(any_of(vars)) df
%>% rename(group = grp) df
## # A tibble: 10 x 3
-## group x y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-%>% rename_with(toupper) df
## # A tibble: 10 x 3
-## GRP X Y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-%>% rename_with(toupper, is.numeric) df
## # A tibble: 10 x 3
-## grp X Y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-%>% rename_with(toupper, starts_with("x")) df
## # A tibble: 10 x 3
-## grp X y
-## <chr> <dbl> <dbl>
-## 1 a -0.665 -0.387
-## 2 a -0.270 -0.839
-## 3 a 0.791 0.0371
-## 4 a -1.38 -0.144
-## 5 a 0.903 0.148
-## 6 b 1.55 0.143
-## 7 b 1.10 0.0986
-## 8 b -0.400 -1.11
-## 9 b -2.47 -0.670
-## 10 b -0.374 -0.440
-我们前面一章讲过arrange()
排序,这是行方向的排序, 比如按照x变量绝对值的大小从高到低排序。
%>% arrange(desc(abs(x))) df
## # A tibble: 10 x 3
-## grp x y
-## <chr> <dbl> <dbl>
-## 1 b -2.47 -0.670
-## 2 b 1.55 0.143
-## 3 a -1.38 -0.144
-## 4 b 1.10 0.0986
-## 5 a 0.903 0.148
-## 6 a 0.791 0.0371
-## 7 a -0.665 -0.387
-## 8 b -0.400 -1.11
-## 9 b -0.374 -0.440
-## 10 a -0.270 -0.839
-我们现在想调整列的位置,比如,这里调整数据框三列的位置,让grp
列放在x
列的后面
%>% select(x, grp, y) df
## # A tibble: 10 x 3
-## x grp y
-## <dbl> <chr> <dbl>
-## 1 -0.665 a -0.387
-## 2 -0.270 a -0.839
-## 3 0.791 a 0.0371
-## 4 -1.38 a -0.144
-## 5 0.903 a 0.148
-## 6 1.55 b 0.143
-## 7 1.10 b 0.0986
-## 8 -0.400 b -1.11
-## 9 -2.47 b -0.670
-## 10 -0.374 b -0.440
-如果列变量很多的时候,上面的方法就不太好用,因此推荐大家使用relocate()
%>% relocate(grp, .after = y) df
## # A tibble: 10 x 3
-## x y grp
-## <dbl> <dbl> <chr>
-## 1 -0.665 -0.387 a
-## 2 -0.270 -0.839 a
-## 3 0.791 0.0371 a
-## 4 -1.38 -0.144 a
-## 5 0.903 0.148 a
-## 6 1.55 0.143 b
-## 7 1.10 0.0986 b
-## 8 -0.400 -1.11 b
-## 9 -2.47 -0.670 b
-## 10 -0.374 -0.440 b
-%>% relocate(x, .before = grp) df
## # A tibble: 10 x 3
-## x grp y
-## <dbl> <chr> <dbl>
-## 1 -0.665 a -0.387
-## 2 -0.270 a -0.839
-## 3 0.791 a 0.0371
-## 4 -1.38 a -0.144
-## 5 0.903 a 0.148
-## 6 1.55 b 0.143
-## 7 1.10 b 0.0986
-## 8 -0.400 b -1.11
-## 9 -2.47 b -0.670
-## 10 -0.374 b -0.440
-还有
-%>% relocate(grp, .after = last_col()) df
## # A tibble: 10 x 3
-## x y grp
-## <dbl> <dbl> <chr>
-## 1 -0.665 -0.387 a
-## 2 -0.270 -0.839 a
-## 3 0.791 0.0371 a
-## 4 -1.38 -0.144 a
-## 5 0.903 0.148 a
-## 6 1.55 0.143 b
-## 7 1.10 0.0986 b
-## 8 -0.400 -1.11 b
-## 9 -2.47 -0.670 b
-## 10 -0.374 -0.440 b
-我们必须为这个函数点赞。大爱Hadley Wickham !!!
-我们经常需要对数据框的多列执行相同的操作。比如
-<- iris %>% as_tibble()
- iris iris
## # A tibble: 150 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 5.1 3.5 1.4 0.2
-## 2 4.9 3 1.4 0.2
-## 3 4.7 3.2 1.3 0.2
-## 4 4.6 3.1 1.5 0.2
-## 5 5 3.6 1.4 0.2
-## 6 5.4 3.9 1.7 0.4
-## 7 4.6 3.4 1.4 0.3
-## 8 5 3.4 1.5 0.2
-## 9 4.4 2.9 1.4 0.2
-## 10 4.9 3.1 1.5 0.1
-## # ... with 140 more rows, and 1 more variable:
-## # Species <fct>
-%>%
- iris group_by(Species) %>%
- summarise(
- mean_Sepal_Length = mean(Sepal.Length),
- mean_Sepal_Width = mean(Sepal.Width),
- mean_Petal_Length = mean(Petal.Length),
- mean_Petal_Width = mean(Petal.Width)
- )
## # A tibble: 3 x 5
-## Species mean_Sepal_Leng~ mean_Sepal_Width
-## <fct> <dbl> <dbl>
-## 1 setosa 5.01 3.43
-## 2 versic~ 5.94 2.77
-## 3 virgin~ 6.59 2.97
-## # ... with 2 more variables: mean_Petal_Length <dbl>,
-## # mean_Petal_Width <dbl>
-dplyr 1.0之后,使用across()
函数异常简练
%>%
- iris group_by(Species) %>%
- summarise(
- across(everything(), mean)
- )
## # A tibble: 3 x 5
-## Species Sepal.Length Sepal.Width Petal.Length
-## <fct> <dbl> <dbl> <dbl>
-## 1 setosa 5.01 3.43 1.46
-## 2 versic~ 5.94 2.77 4.26
-## 3 virgin~ 6.59 2.97 5.55
-## # ... with 1 more variable: Petal.Width <dbl>
-或者更科学的
-%>%
- iris group_by(Species) %>%
- summarise(
- across(is.numeric, mean)
- )
## # A tibble: 3 x 5
-## Species Sepal.Length Sepal.Width Petal.Length
-## <fct> <dbl> <dbl> <dbl>
-## 1 setosa 5.01 3.43 1.46
-## 2 versic~ 5.94 2.77 4.26
-## 3 virgin~ 6.59 2.97 5.55
-## # ... with 1 more variable: Petal.Width <dbl>
-可以看到,以往是一列一列的处理,现在对多列同时操作,这主要得益于across()
函数,它有两个主要的参数:
across(.cols = , .fns = )
select()
的语法一致再看看这个案例
-<- function(x) {
- std - mean(x)) / sd(x)
- (x
- }
-%>%
- iris group_by(Species) %>%
- summarise(
- across(starts_with("Sepal"), std)
- )
## # A tibble: 150 x 3
-## # Groups: Species [3]
-## Species Sepal.Length Sepal.Width
-## <fct> <dbl> <dbl>
-## 1 setosa 0.267 0.190
-## 2 setosa -0.301 -1.13
-## 3 setosa -0.868 -0.601
-## 4 setosa -1.15 -0.865
-## 5 setosa -0.0170 0.454
-## 6 setosa 1.12 1.25
-## 7 setosa -1.15 -0.0739
-## 8 setosa -0.0170 -0.0739
-## 9 setosa -1.72 -1.39
-## 10 setosa -0.301 -0.865
-## # ... with 140 more rows
-# purrr style
-%>%
- iris group_by(Species) %>%
- summarise(
- across(starts_with("Sepal"), ~ (.x - mean(.x)) / sd(.x))
- )
## # A tibble: 150 x 3
-## # Groups: Species [3]
-## Species Sepal.Length Sepal.Width
-## <fct> <dbl> <dbl>
-## 1 setosa 0.267 0.190
-## 2 setosa -0.301 -1.13
-## 3 setosa -0.868 -0.601
-## 4 setosa -1.15 -0.865
-## 5 setosa -0.0170 0.454
-## 6 setosa 1.12 1.25
-## 7 setosa -1.15 -0.0739
-## 8 setosa -0.0170 -0.0739
-## 9 setosa -1.72 -1.39
-## 10 setosa -0.301 -0.865
-## # ... with 140 more rows
-%>%
- iris group_by(Species) %>%
- summarise(
- across(starts_with("Petal"), list(min = min, max = max))
- # across(starts_with("Petal"), list(min = min, max = max), .names = "{fn}_{col}")
- )
## # A tibble: 3 x 5
-## Species Petal.Length_min Petal.Length_max
-## <fct> <dbl> <dbl>
-## 1 setosa 1 1.9
-## 2 versic~ 3 5.1
-## 3 virgin~ 4.5 6.9
-## # ... with 2 more variables: Petal.Width_min <dbl>,
-## # Petal.Width_max <dbl>
-%>%
- iris group_by(Species) %>%
- summarise(
- across(starts_with("Sepal"), mean),
- Area = mean(Petal.Length * Petal.Width),
- across(c(Petal.Width), min),
- n = n()
- )
## # A tibble: 3 x 6
-## Species Sepal.Length Sepal.Width Area Petal.Width
-## <fct> <dbl> <dbl> <dbl> <dbl>
-## 1 setosa 5.01 3.43 0.366 0.1
-## 2 versic~ 5.94 2.77 5.72 1
-## 3 virgin~ 6.59 2.97 11.3 1.4
-## # ... with 1 more variable: n <int>
-除了在summarise()
里可以使用外,在其它函数也是可以使用的
%>% mutate(across(is.numeric, mean)) iris
## # A tibble: 150 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 5.84 3.06 3.76 1.20
-## 2 5.84 3.06 3.76 1.20
-## 3 5.84 3.06 3.76 1.20
-## 4 5.84 3.06 3.76 1.20
-## 5 5.84 3.06 3.76 1.20
-## 6 5.84 3.06 3.76 1.20
-## 7 5.84 3.06 3.76 1.20
-## 8 5.84 3.06 3.76 1.20
-## 9 5.84 3.06 3.76 1.20
-## 10 5.84 3.06 3.76 1.20
-## # ... with 140 more rows, and 1 more variable:
-## # Species <fct>
-%>% mutate(across(starts_with("Sepal"), mean)) iris
## # A tibble: 150 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 5.84 3.06 1.4 0.2
-## 2 5.84 3.06 1.4 0.2
-## 3 5.84 3.06 1.3 0.2
-## 4 5.84 3.06 1.5 0.2
-## 5 5.84 3.06 1.4 0.2
-## 6 5.84 3.06 1.7 0.4
-## 7 5.84 3.06 1.4 0.3
-## 8 5.84 3.06 1.5 0.2
-## 9 5.84 3.06 1.4 0.2
-## 10 5.84 3.06 1.5 0.1
-## # ... with 140 more rows, and 1 more variable:
-## # Species <fct>
-%>% mutate(across(is.numeric, std)) # std function has defined before iris
## # A tibble: 150 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 -0.898 1.02 -1.34 -1.31
-## 2 -1.14 -0.132 -1.34 -1.31
-## 3 -1.38 0.327 -1.39 -1.31
-## 4 -1.50 0.0979 -1.28 -1.31
-## 5 -1.02 1.25 -1.34 -1.31
-## 6 -0.535 1.93 -1.17 -1.05
-## 7 -1.50 0.786 -1.34 -1.18
-## 8 -1.02 0.786 -1.28 -1.31
-## 9 -1.74 -0.361 -1.34 -1.31
-## 10 -1.14 0.0979 -1.28 -1.44
-## # ... with 140 more rows, and 1 more variable:
-## # Species <fct>
-%>% mutate(
- iris across(is.numeric, ~ .x / 2),
- across(is.factor, stringr::str_to_upper)
- )
## # A tibble: 150 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 2.55 1.75 0.7 0.1
-## 2 2.45 1.5 0.7 0.1
-## 3 2.35 1.6 0.65 0.1
-## 4 2.3 1.55 0.75 0.1
-## 5 2.5 1.8 0.7 0.1
-## 6 2.7 1.95 0.85 0.2
-## 7 2.3 1.7 0.7 0.15
-## 8 2.5 1.7 0.75 0.1
-## 9 2.2 1.45 0.7 0.1
-## 10 2.45 1.55 0.75 0.05
-## # ... with 140 more rows, and 1 more variable:
-## # Species <chr>
-n()
, 返回当前分组的多少行cur_data()
, 返回当前分组的数据内容(不包含分组变量)cur_group()
, 返回当前分组的分组变量(一行一列的数据框)across(cur_column())
, 返回当前列的列名这些函数返回当前分组的信息,因此只能在特定函数内部使用,比如summarise()
and mutate()
<- tibble(
- df g = sample(rep(letters[1:3], 1:3)),
- x = runif(6),
- y = runif(6)
-
- ) df
## # A tibble: 6 x 3
-## g x y
-## <chr> <dbl> <dbl>
-## 1 b 0.603 0.415
-## 2 b 0.443 0.936
-## 3 c 0.0727 0.301
-## 4 c 0.749 0.888
-## 5 c 0.591 0.273
-## 6 a 0.278 0.259
-%>%
- df group_by(g) %>%
- summarise(
- n = n()
- )
## # A tibble: 3 x 2
-## g n
-## <chr> <int>
-## 1 a 1
-## 2 b 2
-## 3 c 3
-%>%
- df group_by(g) %>%
- summarise(
- data = list(cur_group())
- )
## # A tibble: 3 x 2
-## g data
-## <chr> <list>
-## 1 a <tibble [1 x 1]>
-## 2 b <tibble [1 x 1]>
-## 3 c <tibble [1 x 1]>
-%>%
- df group_by(g) %>%
- summarise(
- data = list(cur_data())
- )
## # A tibble: 3 x 2
-## g data
-## <chr> <list>
-## 1 a <tibble [1 x 2]>
-## 2 b <tibble [2 x 2]>
-## 3 c <tibble [3 x 2]>
-%>%
- mtcars group_by(cyl) %>%
- summarise(
- ::tidy(lm(mpg ~ wt, data = cur_data()))
- broom )
## # A tibble: 6 x 6
-## # Groups: cyl [3]
-## cyl term estimate std.error statistic p.value
-## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 4 (Interce~ 39.6 4.35 9.10 7.77e-6
-## 2 4 wt -5.65 1.85 -3.05 1.37e-2
-## 3 6 (Interce~ 28.4 4.18 6.79 1.05e-3
-## 4 6 wt -2.78 1.33 -2.08 9.18e-2
-## 5 8 (Interce~ 23.9 3.01 7.94 4.05e-6
-## 6 8 wt -2.19 0.739 -2.97 1.18e-2
-%>%
- df group_by(g) %>%
- mutate(across(everything(), ~ paste(cur_column(), round(.x, 2))))
## # A tibble: 6 x 3
-## # Groups: g [3]
-## g x y
-## <chr> <chr> <chr>
-## 1 b x 0.6 y 0.42
-## 2 b x 0.44 y 0.94
-## 3 c x 0.07 y 0.3
-## 4 c x 0.75 y 0.89
-## 5 c x 0.59 y 0.27
-## 6 a x 0.28 y 0.26
-<- c(x = 0.2, y = 0.8)
- wt
-%>%
- df mutate(
- across(c(x, y), ~ .x * wt[cur_column()])
- )
## # A tibble: 6 x 3
-## g x y
-## <chr> <dbl> <dbl>
-## 1 b 0.121 0.332
-## 2 b 0.0885 0.749
-## 3 c 0.0145 0.241
-## 4 c 0.150 0.711
-## 5 c 0.118 0.219
-## 6 a 0.0555 0.207
-数据框中向量de方向,事实上可以看做有两个方向,横着看是row-vector,竖着看是col-vector。 -
-tidyverse遵循的tidy原则,一列表示一个变量,一行表示一次观察。 -这种数据的存储格式,对ggplot2很方便,但对行方向的操作或者运算不同友好。比如
-<- tibble(id = letters[1:6], w = 10:15, x = 20:25, y = 30:35, z = 40:45)
- df df
## # A tibble: 6 x 5
-## id w x y z
-## <chr> <int> <int> <int> <int>
-## 1 a 10 20 30 40
-## 2 b 11 21 31 41
-## 3 c 12 22 32 42
-## 4 d 13 23 33 43
-## 5 e 14 24 34 44
-## 6 f 15 25 35 45
-计算每行的均值,
-%>% mutate(avg = mean(c(w, x, y, z))) df
## # A tibble: 6 x 6
-## id w x y z avg
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 27.5
-## 2 b 11 21 31 41 27.5
-## 3 c 12 22 32 42 27.5
-## 4 d 13 23 33 43 27.5
-## 5 e 14 24 34 44 27.5
-## 6 f 15 25 35 45 27.5
-好像不对?为什么呢?
-%>%
- df pivot_longer(
- cols = -id,
- names_to = "variable",
- values_to = "value"
- %>%
- ) group_by(id) %>%
- summarize(
- r_mean = mean(value)
- )
## # A tibble: 6 x 2
-## id r_mean
-## <chr> <dbl>
-## 1 a 25
-## 2 b 26
-## 3 c 27
-## 4 d 28
-## 5 e 29
-## 6 f 30
-如果保留原始数据,就还需要再left_join()
一次,虽然思路清晰,但还是挺周转的。
purrr
宏包的pmap_dbl
函数library(purrr)
-%>%
- df mutate(r_mean = pmap_dbl(select_if(., is.numeric), lift_vd(mean)))
## # A tibble: 6 x 6
-## id w x y z r_mean
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 25
-## 2 b 11 21 31 41 26
-## 3 c 12 22 32 42 27
-## 4 d 13 23 33 43 28
-## 5 e 14 24 34 44 29
-## 6 f 15 25 35 45 30
-但需要学习新的语法,代价也很高。
-rowwise()
%>%
- df rowwise() %>%
- mutate(avg = mean(c(w, x, y, z)))
## # A tibble: 6 x 6
-## # Rowwise:
-## id w x y z avg
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 25
-## 2 b 11 21 31 41 26
-## 3 c 12 22 32 42 27
-## 4 d 13 23 33 43 28
-## 5 e 14 24 34 44 29
-## 6 f 15 25 35 45 30
-变量名要是很多的话,又变了体力活了,怎么才能变的轻巧一点呢?
-rowwise() + c_across()
,现在dplyr 1.0终于给出了一个很好的解决方案%>%
- df rowwise() %>%
- mutate(
- avg = mean(c_across(w:z))
- )
## # A tibble: 6 x 6
-## # Rowwise:
-## id w x y z avg
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 25
-## 2 b 11 21 31 41 26
-## 3 c 12 22 32 42 27
-## 4 d 13 23 33 43 28
-## 5 e 14 24 34 44 29
-## 6 f 15 25 35 45 30
-这个很好的解决方案中,rowwise()
工作原理类似与group_by()
,是按每一行进行分组,然后按行(行方向)统计
%>%
- df rowwise(id) %>%
- mutate(total = mean(c_across(w:z)))
## # A tibble: 6 x 6
-## # Rowwise: id
-## id w x y z total
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 25
-## 2 b 11 21 31 41 26
-## 3 c 12 22 32 42 27
-## 4 d 13 23 33 43 28
-## 5 e 14 24 34 44 29
-## 6 f 15 25 35 45 30
-%>%
- df rowwise(id) %>%
- mutate(mean = mean(c_across(is.numeric)))
## # A tibble: 6 x 6
-## # Rowwise: id
-## id w x y z mean
-## <chr> <int> <int> <int> <int> <dbl>
-## 1 a 10 20 30 40 25
-## 2 b 11 21 31 41 26
-## 3 c 12 22 32 42 27
-## 4 d 13 23 33 43 28
-## 5 e 14 24 34 44 29
-## 6 f 15 25 35 45 30
-%>%
- df rowwise(id) %>%
- summarise(
- m = mean(c_across(is.numeric))
- )
## # A tibble: 6 x 2
-## # Groups: id [6]
-## id m
-## <chr> <dbl>
-## 1 a 25
-## 2 b 26
-## 3 c 27
-## 4 d 28
-## 5 e 29
-## 6 f 30
-因此,我们可以总结成下面这张图
- -rowwise()
不仅仅用于计算行方向均值这样的简单统计,而是当处理列表列时,方才显示出rowwise()
与purrr::map
一样的强大。那么,什么是列表列?
-列表列指的是数据框的一列是一个列表, 比如
<- tibble(
- tb x = list(1, 2:3, 4:6)
- )
如果想显示列表中每个元素的长度,用purrr包,可以这样写
-%>% mutate(l = purrr::map_int(x, length)) tb
## # A tibble: 3 x 2
-## x l
-## <list> <int>
-## 1 <dbl [1]> 1
-## 2 <int [2]> 2
-## 3 <int [3]> 3
-如果从行方向的角度理解,其实很简练
-%>%
- tb rowwise() %>%
- mutate(l = length(x))
## # A tibble: 3 x 2
-## # Rowwise:
-## x l
-## <list> <int>
-## 1 <dbl [1]> 1
-## 2 <int [2]> 2
-## 3 <int [3]> 3
-<- mtcars %>% as_tibble()
- mtcars mtcars
## # A tibble: 32 x 11
-## mpg cyl disp hp drat wt qsec vs
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 21 6 160 110 3.9 2.62 16.5 0
-## 2 21 6 160 110 3.9 2.88 17.0 0
-## 3 22.8 4 108 93 3.85 2.32 18.6 1
-## 4 21.4 6 258 110 3.08 3.22 19.4 1
-## 5 18.7 8 360 175 3.15 3.44 17.0 0
-## 6 18.1 6 225 105 2.76 3.46 20.2 1
-## 7 14.3 8 360 245 3.21 3.57 15.8 0
-## 8 24.4 4 147. 62 3.69 3.19 20 1
-## 9 22.8 4 141. 95 3.92 3.15 22.9 1
-## 10 19.2 6 168. 123 3.92 3.44 18.3 1
-## # ... with 22 more rows, and 3 more variables:
-## # am <dbl>, gear <dbl>, carb <dbl>
-以cyl分组,计算每组中mpg ~ wt
的线性模型的系数.
%>%
- mtcars group_by(cyl) %>%
- nest()
## # A tibble: 3 x 2
-## # Groups: cyl [3]
-## cyl data
-## <dbl> <list>
-## 1 6 <tibble [7 x 10]>
-## 2 4 <tibble [11 x 10]>
-## 3 8 <tibble [14 x 10]>
-分组建模后,形成列表列,此时列表中的每个元素对应一个模型,我们需要依次提取每次模型的系数,列方向的做法是,借用purrr::map
完成列表中每个模型的迭代,
%>%
- mtcars group_by(cyl) %>%
- nest() %>%
- mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
- mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
- unnest(result)
## # A tibble: 6 x 8
-## # Groups: cyl [3]
-## cyl data model term estimate std.error statistic
-## <dbl> <lis> <lis> <chr> <dbl> <dbl> <dbl>
-## 1 6 <tib~ <lm> (Int~ 28.4 4.18 6.79
-## 2 6 <tib~ <lm> wt -2.78 1.33 -2.08
-## 3 4 <tib~ <lm> (Int~ 39.6 4.35 9.10
-## 4 4 <tib~ <lm> wt -5.65 1.85 -3.05
-## 5 8 <tib~ <lm> (Int~ 23.9 3.01 7.94
-## 6 8 <tib~ <lm> wt -2.19 0.739 -2.97
-## # ... with 1 more variable: p.value <dbl>
-用purrr::map
实现列表元素一个一个的依次迭代,从数据框的角度来看(数据框是列表的一种特殊形式),因此实质上就是一行一行的处理。所以,尽管purrr很强大,但需要一定学习成本,从解决问题的路径上也比较周折。
事实上,分组建模后,形成列表列,这种存储格式,天然地符合行处理的范式,因此一开始就使用行方向分组(这里nest_by()
类似于 group_by()
)
%>%
- mtcars nest_by(cyl) %>%
- mutate(model = list(lm(mpg ~ wt, data = data))) %>%
- summarise(broom::tidy(model))
## # A tibble: 6 x 6
-## # Groups: cyl [3]
-## cyl term estimate std.error statistic p.value
-## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 4 (Interce~ 39.6 4.35 9.10 7.77e-6
-## 2 4 wt -5.65 1.85 -3.05 1.37e-2
-## 3 6 (Interce~ 28.4 4.18 6.79 1.05e-3
-## 4 6 wt -2.78 1.33 -2.08 9.18e-2
-## 5 8 (Interce~ 23.9 3.01 7.94 4.05e-6
-## 6 8 wt -2.19 0.739 -2.97 1.18e-2
-# or
-%>%
- mtcars nest_by(cyl) %>%
- summarise(
- ::tidy(lm(mpg ~ wt, data = data))
- broom )
## # A tibble: 6 x 6
-## # Groups: cyl [3]
-## cyl term estimate std.error statistic p.value
-## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 4 (Interce~ 39.6 4.35 9.10 7.77e-6
-## 2 4 wt -5.65 1.85 -3.05 1.37e-2
-## 3 6 (Interce~ 28.4 4.18 6.79 1.05e-3
-## 4 6 wt -2.78 1.33 -2.08 9.18e-2
-## 5 8 (Interce~ 23.9 3.01 7.94 4.05e-6
-## 6 8 wt -2.19 0.739 -2.97 1.18e-2
-至此,tidyverse框架下,实现分组统计中的数据框进,数据框输出
, 现在有四种方法了
%>%
- mtcars group_nest(cyl) %>%
- mutate(model = purrr::map(data, ~ lm(mpg ~ wt, data = .))) %>%
- mutate(result = purrr::map(model, ~ broom::tidy(.))) %>%
- ::unnest(result)
- tidyr
-
-%>%
- mtcars group_by(cyl) %>%
- group_modify(
- ~ broom::tidy(lm(mpg ~ wt, data = .))
-
- )
-
-%>%
- mtcars nest_by(cyl) %>%
- summarise(
- ::tidy(lm(mpg ~ wt, data = data))
- broom
- )
-
-%>%
- mtcars group_by(cyl) %>%
- summarise(
- ::tidy(lm(mpg ~ wt, data = cur_data()))
- broom
- )
-# or
-%>%
- mtcars group_by(cyl) %>%
- summarise(broom::tidy(lm(mpg ~ wt)))
本章介绍tidyverse的语法中经常遇到.
, 不同的场景,含义不同。因此很有必要弄清楚各自的含义。
library(tidyverse)
.
各自代表什么意思呢?read_csv("./data/wages.csv") %>%
-mutate(letter = str_extract(race, "(?<=h)(.)")) %>%
- select(., -letter) %>%
- mutate_at(vars(race), ~ as.factor(.)) %>%
- mutate_at(vars(sex), ~ if_else(. == "male", 1, 0)) %>%
- filter_if(~ is.numeric(.), all_vars(. != 0)) %>%
- split(.$sex) %>%
- map(~ lm(earn ~ ., data = .)) %>%
- map_dfr(~ broom::tidy(.), .id = "sex")
回答之前,我们先介绍一些相关知识点
-管道符号%>%
主要功能是传递参数。
y %>% f()
is equivalent to f(y)
y %>% f(x, .)
is equivalent to f(x, y)
z %>% f(x, y, arg = .)
is equivalent to f(x, y, arg = z)
我们经常这样写
-%>%
- mtcars select(cyl, disp, hp) %>%
- head(2)
## # A tibble: 2 x 3
-## cyl disp hp
-## <dbl> <dbl> <dbl>
-## 1 6 160 110
-## 2 6 160 110
-实际上,这里是有占位符的
-%>%
- mtcars select(., cyl, disp, hp) %>%
- head(., 2)
## # A tibble: 2 x 3
-## cyl disp hp
-## <dbl> <dbl> <dbl>
-## 1 6 160 110
-## 2 6 160 110
-.
出现在函数.f
的位置上, 就是 purrr 风格的Lambda函数~ fun(.)
,
%>%
- mtcars select_at(vars(contains("ar")), ~ toupper(.)) %>%
- head(3)
## # A tibble: 3 x 2
-## GEAR CARB
-## <dbl> <dbl>
-## 1 4 4
-## 2 4 4
-## 3 4 1
-有时候程序员会将~toupper(.)
简写成 toupper
%>%
- mtcars select_at(vars(contains("ar")), toupper) %>%
- head(3)
## # A tibble: 3 x 2
-## GEAR CARB
-## <dbl> <dbl>
-## 1 4 4
-## 2 4 4
-## 3 4 1
-<- "the fattest cat." words
%>% str_replace_all("t.", "-") words
## [1] "-e fa-es-ca-"
-%>% str_replace_all("t\\.", "-") words
## [1] "the fattest ca-"
-<- . %>% mean(na.rm = T)
- mean_rm
-c(1, 2, 3, NA) %>% mean_rm()
## [1] 2
-等价于
-# is equivalent to
-c(1, 2, 3, NA) %>% mean(., na.rm = T)
## [1] 2
-%>% subset(1:nrow(.) %% 30 == 0) iris
## # A tibble: 5 x 5
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 4.7 3.2 1.6 0.2
-## 2 5.2 2.7 3.9 1.4
-## 3 5.5 2.5 4 1.3
-## 4 6 2.2 5 1.5
-## 5 5.9 3 5.1 1.8
-## # ... with 1 more variable: Species <fct>
-1:10 %>% {
-c(min(.), max(.))
- }
## [1] 1 10
-当dplyr::mutate
遇到purrr::map
,情况就复杂很多了。然而,这种情况,tidyverse比比皆是。我就多说几句吧
%>%
- iris head(3) %>%
- mutate(., r_sum = pmap_dbl(select_if(., is.numeric), sum))
## # A tibble: 3 x 6
-## Sepal.Length Sepal.Width Petal.Length Petal.Width
-## <dbl> <dbl> <dbl> <dbl>
-## 1 5.1 3.5 1.4 0.2
-## 2 4.9 3 1.4 0.2
-## 3 4.7 3.2 1.3 0.2
-## # ... with 2 more variables: Species <fct>,
-## # r_sum <dbl>
-这里mutate()
行,有两个.
, 实际这两个.
都是等待iris %>% head(3)
传来的data.frame
<- tibble(
- df mean = c(1, 2),
- sd = c(2, 4)
-
- ) df
## # A tibble: 2 x 2
-## mean sd
-## <dbl> <dbl>
-## 1 1 2
-## 2 2 4
-%>%
- df ::mutate(., rand = map(mean, ~ rnorm(5, .))) %>%
- dplyr::unnest_wider(rand) tidyr
## # A tibble: 2 x 7
-## mean sd ...1 ...2 ...3 ...4 ...5
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 1 2 0.238 -0.0633 0.651 2.24 0.941
-## 2 2 4 2.63 2.59 1.68 2.00 0.196
-.
, 是df
.
, 是df
中的mean
%>%
- df ::mutate(rand = map2(mean, sd, ~ rnorm(5, .x, .y))) %>%
- dplyr::unnest_wider(rand) tidyr
## # A tibble: 2 x 7
-## mean sd ...1 ...2 ...3 ...4 ...5
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 1 2 -0.141 -1.16 1.68 2.26 -0.419
-## 2 2 4 -1.28 1.07 -1.49 6.01 -2.63
-mean
传给 .x
sd
传给 .y
再来一个变态的。(我们不一定要这样写,但我们尽可能的要明白它的意思。)
-<- tribble(
- df ~a, ~b,
- 1, 10,
- 2, 11
-
- )
-
-%>%
- df ::mutate(., sum = purrr::pmap_dbl(., ~ sum(...))) dplyr
## # A tibble: 2 x 3
-## a b sum
-## <dbl> <dbl> <dbl>
-## 1 1 10 11
-## 2 2 11 13
-<- function(...) {
- commas ::str_c(..., collapse = ", ")
- stringr
- }
-
-commas(letters[1:10])
## [1] "a, b, c, d, e, f, g, h, i, j"
-注意:有些函数的参数前缀是 .
-mutate_all(.tbl, .funs, ...)
-
-mutate_if(.tbl, .predicate, .funs, ...)
-
-mutate_at(.tbl, .vars, .funs, ..., .cols = NULL)
-
-select_all(.tbl, .funs = list(), ...)
-
-rename_all(.tbl, .funs = list(), ...)
%>%
一起)现在回答本章开始的问题
-read_csv("./demo_data/wages.csv") %>%
-::mutate(letter = str_extract(race, "(?<=h)(.)")) %>%
- dplyr::select(., -letter) %>%
- dplyr::mutate_at(vars(race), ~ as.factor(.)) %>%
- dplyr::mutate_at(vars(sex), ~ if_else(. == "male", 1, 0)) %>%
- dplyr::filter_if(~ is.numeric(.), all_vars(. != 0)) %>%
- dplyrsplit(.$sex) %>%
- ::map(~ lm(earn ~ ., data = .)) %>%
- purrr::map_dfr(., ~ broom::tidy(.), .id = "sex") purrr
## # A tibble: 8 x 6
-## sex term estimate std.error statistic p.value
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 1 (Interc~ -121846. 37449. -3.25 1.21e- 3
-## 2 1 height 977. 515. 1.90 5.84e- 2
-## 3 1 sex NA NA NA NA
-## 4 1 racehis~ 578. 7934. 0.0728 9.42e- 1
-## 5 1 raceoth~ -2035. 11514. -0.177 8.60e- 1
-## 6 1 racewhi~ 12823. 5284. 2.43 1.56e- 2
-## 7 1 ed 5234. 601. 8.71 4.30e-17
-## 8 1 age 406. 95.5 4.25 2.52e- 5
-.
代表当前位置,如果是..
表示上一级目录select(-letter)
~ as.factor(.)
也可以简写as.factor
,~
和(.)
要么都写,要么都不写.
代表lambda函数; 第二个.
也是lambda函数,但这里它是all_vars(expr)
中expr的一种特有写法,代表所有数值型变量,*行方向构成的向量, all_vars(. != 0)
函数返回TRUE或FALSE,从而帮助filter()
是否筛选该行lm
中,第一个.
代表除因变量earn之外所有的变量,第二个.
占位符,留给上面的数据框.
是占位符,代表上面传来的list,第二个.
lambda函数,依次对list的元素迭代处理,第二个.
是参数名,.id
是特有的一个符号。本章我们介绍tidyverse里数据处理的神器dplyr宏包。首先,我们加载该宏包
-library(dplyr)
dplyr 定义了数据处理的规范语法,其中主要包含以下九个主要的函数。
-mutate()
, select()
, filter()
summarise()
, group_by()
, arrange()
left_join()
, right_join()
, full_join()
我们依次介绍
-mutate()
假定我们有一数据框,包含三位学生的英语和数学
-<- data.frame(
- df name = c("Alice", "Alice", "Bob", "Bob", "Carol", "Carol"),
- type = c("english", "math", "english", "math", "english", "math")
-
- )
- df
## name type
-## 1 Alice english
-## 2 Alice math
-## 3 Bob english
-## 4 Bob math
-## 5 Carol english
-## 6 Carol math
-这里有他们的最近的考试成绩,我们想增加到数据框里去
-<- c(80.2, 90.5, 92.2, 90.8, 82.5, 84.6)
- score2020 score2020
## [1] 80.2 90.5 92.2 90.8 82.5 84.6
-使用传统的方法
-$newscore <- score2020
- df df
## name type newscore
-## 1 Alice english 80.2
-## 2 Alice math 90.5
-## 3 Bob english 92.2
-## 4 Bob math 90.8
-## 5 Carol english 82.5
-## 6 Carol math 84.6
-dplyr语法这样写
-mutate(df, newscore = score2020)
## name type newscore
-## 1 Alice english 80.2
-## 2 Alice math 90.5
-## 3 Bob english 92.2
-## 4 Bob math 90.8
-## 5 Carol english 82.5
-## 6 Carol math 84.6
-mutate()
函数
mutate(.data = df, newscore = score2020)
df
,newscore = score2020
,等号左边的newscore
是我们打算创建一个新列,而取的列名;
-等号右边是装着学生成绩的向量(注意,向量 的长度要与数据框的行数相等,比如这里长度都是6)管道
%>%这里有必要介绍下管道操作符%>%
.
c(1:10)
## [1] 1 2 3 4 5 6 7 8 9 10
-sum(c(1:10))
## [1] 55
-与下面的写法是等价的,
-c(1:10) %>% sum()
## [1] 55
-这条语句的意思,向量c(1:10)
通过管道操作符 %>%
,传递到函数sum()
的第一个参数位置,即sum(c(1:10))
, 这个%>%
管道操作符还是很形象的,
当对执行多个函数操作的时候,就显得格外方便,代码可读性更强。
-sqrt(sum(abs(c(-10:10))))
## [1] 10.49
-# sqrt(sum(abs(c(-10:10))))
-c(-10:10) %>% abs() %>% sum() %>% sqrt()
## [1] 10.49
-那么,上面增加学生成绩的语句mutate(df, newscore = score2020)
就可以使用管道
# 等价于
-%>% mutate(newscore = score2020) df
## name type newscore
-## 1 Alice english 80.2
-## 2 Alice math 90.5
-## 3 Bob english 92.2
-## 4 Bob math 90.8
-## 5 Carol english 82.5
-## 6 Carol math 84.6
-是不是很赞?
-注意此时df
没有变化喔。好比把df
传给了f()
执行了f(df)
, 但df
本身没有变化。
-如果想保留f(df)
结果,需要把f(df)
赋值给新的对象,当然也可以赋值给df
, 即替换.
<- df %>% mutate(newscore = score2020)
- df df
## name type newscore
-## 1 Alice english 80.2
-## 2 Alice math 90.5
-## 3 Bob english 92.2
-## 4 Bob math 90.8
-## 5 Carol english 82.5
-## 6 Carol math 84.6
-select()
select()
顾名思义选择
,就是选择数据框的某一列,或者某几列
我们还是以学生成绩的数据框为例
-我们可以选择name
列, 结果是只有一列的数据框(仍然数据框喔)
"name"] df[
## name
-## 1 Alice
-## 2 Alice
-## 3 Bob
-## 4 Bob
-## 5 Carol
-## 6 Carol
-%>% select(name) df
## name
-## 1 Alice
-## 2 Alice
-## 3 Bob
-## 4 Bob
-## 5 Carol
-## 6 Carol
-%>% select(name, newscore) df
## name newscore
-## 1 Alice 80.2
-## 2 Alice 90.5
-## 3 Bob 92.2
-## 4 Bob 90.8
-## 5 Carol 82.5
-## 6 Carol 84.6
-如果不想要某列, 可以在变量前面加-
, 结果与上面的一样
%>% select(-type) df
## name newscore
-## 1 Alice 80.2
-## 2 Alice 90.5
-## 3 Bob 92.2
-## 4 Bob 90.8
-## 5 Carol 82.5
-## 6 Carol 84.6
-filter()
select
是列方向的选择,我们还可以对数据行方向的选择和筛选,选出符合我们条件的某些行
比如这里把成绩高于90分的同学筛选出来
-%>% filter(newscore >= 90) df
## name type newscore
-## 1 Alice math 90.5
-## 2 Bob english 92.2
-## 3 Bob math 90.8
-也可以限定多个条件进行筛选, 英语成绩高于90分的筛选出来
-%>% filter(type == "english", newscore >= 90) df
## name type newscore
-## 1 Bob english 92.2
-summarise()
统计summarise()
主要用于统计,往往与其他函数配合使用,比如计算所有同学考试成绩的均值
%>% summarise( mean_score = mean(newscore)) df
## mean_score
-## 1 86.8
-比如,计算所有同学的考试成绩的标准差
-%>% summarise( mean_score = sd(newscore)) df
## mean_score
-## 1 5.015
-还同时完成多个统计
-%>% summarise(
- df mean_score = mean(newscore),
- median_score = median(newscore),
- n = n(),
- sum = sum(newscore)
- )
## mean_score median_score n sum
-## 1 86.8 87.55 6 520.8
-注意,mutate()
, select()
和 filter()
是在原数据框的基础上增减, 而summarise()
返回的是一个新的数据框。
group_by()
分组事实上,summarise()
往往配合group_by()
一起使用,即,先分组再统计。比如,我们想统计每个学生的平均成绩,那么就需要先按学生name
分组,然后求平均
%>%
- df group_by(name) %>%
- summarise(
- mean_score = mean(newscore),
- sd_score = sd(newscore)
- )
## # A tibble: 3 x 3
-## name mean_score sd_score
-## <chr> <dbl> <dbl>
-## 1 Alice 85.4 7.28
-## 2 Bob 91.5 0.990
-## 3 Carol 83.6 1.48
-arrange()
排序这个很好理解的。比如我们按照考试成绩从低到高排序,然后输出
-%>% arrange(newscore) df
## name type newscore
-## 1 Alice english 80.2
-## 2 Carol english 82.5
-## 3 Carol math 84.6
-## 4 Alice math 90.5
-## 5 Bob math 90.8
-## 6 Bob english 92.2
-如果从高到低排序呢,有两种方法:
-%>% arrange(-newscore) df
## name type newscore
-## 1 Bob english 92.2
-## 2 Bob math 90.8
-## 3 Alice math 90.5
-## 4 Carol math 84.6
-## 5 Carol english 82.5
-## 6 Alice english 80.2
-写成下面这种形式也是降序排列,但可读性更强些
-%>% arrange(desc(newscore)) df
## name type newscore
-## 1 Bob english 92.2
-## 2 Bob math 90.8
-## 3 Alice math 90.5
-## 4 Carol math 84.6
-## 5 Carol english 82.5
-## 6 Alice english 80.2
-也可对多个变量先后排序。先按学科排,然后按照成绩从高到底排序
-%>%
- df arrange(type, desc(newscore))
## name type newscore
-## 1 Bob english 92.2
-## 2 Carol english 82.5
-## 3 Alice english 80.2
-## 4 Bob math 90.8
-## 5 Alice math 90.5
-## 6 Carol math 84.6
-left_join()
数据框合并,假定我们已经统计了每个同学的平均成绩,存放在df1
<- df %>%
- df1 group_by(name) %>%
- summarise( mean_score = mean(newscore) )
-
- df1
## # A tibble: 3 x 2
-## name mean_score
-## <chr> <dbl>
-## 1 Alice 85.4
-## 2 Bob 91.5
-## 3 Carol 83.6
-我们有新一个数据框df2
,包含同学们的年龄信息
<- tibble(
- df2 name = c("Alice", "Bob"),
- age = c(12, 13)
-
- )
- df2
## # A tibble: 2 x 2
-## name age
-## <chr> <dbl>
-## 1 Alice 12
-## 2 Bob 13
-可以用 left_join
把两个数据框df1
和df2
,合并连接再一起, 两个数据框是通过姓名name
连接的,因此需要指定by = "name"
left_join(df1, df2, by = "name")
## # A tibble: 3 x 3
-## name mean_score age
-## <chr> <dbl> <dbl>
-## 1 Alice 85.4 12
-## 2 Bob 91.5 13
-## 3 Carol 83.6 NA
-#
-%>% left_join(df2, by = "name") df1
## # A tibble: 3 x 3
-## name mean_score age
-## <chr> <dbl> <dbl>
-## 1 Alice 85.4 12
-## 2 Bob 91.5 13
-## 3 Carol 83.6 NA
-大家注意到最后一行Carol的年龄是NA
, 大家想想为什么呢?
right_join()
我们再试试right_join()
%>% right_join(df2, by = "name") df1
## # A tibble: 2 x 3
-## name mean_score age
-## <chr> <dbl> <dbl>
-## 1 Alice 85.4 12
-## 2 Bob 91.5 13
-Carol同学的信息没有了? 大家想想又为什么呢?
-事实上,答案就在函数的名字上,left_join()
是左合并,即以左边数据框df1
中的学生姓名name
为准,在右边数据框df2
里,有Alice
和Bob
的年龄,那么就对应合并过来,没有Carol
,就为缺失值NA
right_join()
是右合并,即以右边数据框df2
中的学生姓名name
为准,只有Alice
和Bob
,因此而df1
只需要把Alice
和Bob
的信息粘过来。
这是一份Ames房屋数据,您可以把它想象为房屋中介推出的成都市武侯区、锦江区以及高新区等各区县的房屋信息
-library(tidyverse)
-<- read_csv("./demo_data/ames_houseprice.csv") %>%
- ames ::clean_names()
- janitor
-glimpse(ames)
## Rows: 1,460
-## Columns: 81
-## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9,...
-## $ ms_sub_class <dbl> 60, 20, 60, 70, 60, 50, 20...
-## $ ms_zoning <chr> "RL", "RL", "RL", "RL", "R...
-## $ lot_frontage <dbl> 65, 80, 68, 60, 84, 85, 75...
-## $ lot_area <dbl> 8450, 9600, 11250, 9550, 1...
-## $ street <chr> "Pave", "Pave", "Pave", "P...
-## $ alley <chr> NA, NA, NA, NA, NA, NA, NA...
-## $ lot_shape <chr> "Reg", "Reg", "IR1", "IR1"...
-## $ land_contour <chr> "Lvl", "Lvl", "Lvl", "Lvl"...
-## $ utilities <chr> "AllPub", "AllPub", "AllPu...
-## $ lot_config <chr> "Inside", "FR2", "Inside",...
-## $ land_slope <chr> "Gtl", "Gtl", "Gtl", "Gtl"...
-## $ neighborhood <chr> "CollgCr", "Veenker", "Col...
-## $ condition1 <chr> "Norm", "Feedr", "Norm", "...
-## $ condition2 <chr> "Norm", "Norm", "Norm", "N...
-## $ bldg_type <chr> "1Fam", "1Fam", "1Fam", "1...
-## $ house_style <chr> "2Story", "1Story", "2Stor...
-## $ overall_qual <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7,...
-## $ overall_cond <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5,...
-## $ year_built <dbl> 2003, 1976, 2001, 1915, 20...
-## $ year_remod_add <dbl> 2003, 1976, 2002, 1970, 20...
-## $ roof_style <chr> "Gable", "Gable", "Gable",...
-## $ roof_matl <chr> "CompShg", "CompShg", "Com...
-## $ exterior1st <chr> "VinylSd", "MetalSd", "Vin...
-## $ exterior2nd <chr> "VinylSd", "MetalSd", "Vin...
-## $ mas_vnr_type <chr> "BrkFace", "None", "BrkFac...
-## $ mas_vnr_area <dbl> 196, 0, 162, 0, 350, 0, 18...
-## $ exter_qual <chr> "Gd", "TA", "Gd", "TA", "G...
-## $ exter_cond <chr> "TA", "TA", "TA", "TA", "T...
-## $ foundation <chr> "PConc", "CBlock", "PConc"...
-## $ bsmt_qual <chr> "Gd", "Gd", "Gd", "TA", "G...
-## $ bsmt_cond <chr> "TA", "TA", "TA", "Gd", "T...
-## $ bsmt_exposure <chr> "No", "Gd", "Mn", "No", "A...
-## $ bsmt_fin_type1 <chr> "GLQ", "ALQ", "GLQ", "ALQ"...
-## $ bsmt_fin_sf1 <dbl> 706, 978, 486, 216, 655, 7...
-## $ bsmt_fin_type2 <chr> "Unf", "Unf", "Unf", "Unf"...
-## $ bsmt_fin_sf2 <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0...
-## $ bsmt_unf_sf <dbl> 150, 284, 434, 540, 490, 6...
-## $ total_bsmt_sf <dbl> 856, 1262, 920, 756, 1145,...
-## $ heating <chr> "GasA", "GasA", "GasA", "G...
-## $ heating_qc <chr> "Ex", "Ex", "Ex", "Gd", "E...
-## $ central_air <chr> "Y", "Y", "Y", "Y", "Y", "...
-## $ electrical <chr> "SBrkr", "SBrkr", "SBrkr",...
-## $ x1st_flr_sf <dbl> 856, 1262, 920, 961, 1145,...
-## $ x2nd_flr_sf <dbl> 854, 0, 866, 756, 1053, 56...
-## $ low_qual_fin_sf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
-## $ gr_liv_area <dbl> 1710, 1262, 1786, 1717, 21...
-## $ bsmt_full_bath <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0,...
-## $ bsmt_half_bath <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0,...
-## $ full_bath <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2,...
-## $ half_bath <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0,...
-## $ bedroom_abv_gr <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2,...
-## $ kitchen_abv_gr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2,...
-## $ kitchen_qual <chr> "Gd", "TA", "Gd", "Gd", "G...
-## $ tot_rms_abv_grd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8,...
-## $ functional <chr> "Typ", "Typ", "Typ", "Typ"...
-## $ fireplaces <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2,...
-## $ fireplace_qu <chr> NA, "TA", "TA", "Gd", "TA"...
-## $ garage_type <chr> "Attchd", "Attchd", "Attch...
-## $ garage_yr_blt <dbl> 2003, 1976, 2001, 1998, 20...
-## $ garage_finish <chr> "RFn", "RFn", "RFn", "Unf"...
-## $ garage_cars <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2,...
-## $ garage_area <dbl> 548, 460, 608, 642, 836, 4...
-## $ garage_qual <chr> "TA", "TA", "TA", "TA", "T...
-## $ garage_cond <chr> "TA", "TA", "TA", "TA", "T...
-## $ paved_drive <chr> "Y", "Y", "Y", "Y", "Y", "...
-## $ wood_deck_sf <dbl> 0, 298, 0, 0, 192, 40, 255...
-## $ open_porch_sf <dbl> 61, 0, 42, 35, 84, 30, 57,...
-## $ enclosed_porch <dbl> 0, 0, 0, 272, 0, 0, 0, 228...
-## $ x3ssn_porch <dbl> 0, 0, 0, 0, 0, 320, 0, 0, ...
-## $ screen_porch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
-## $ pool_area <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0,...
-## $ pool_qc <chr> NA, NA, NA, NA, NA, NA, NA...
-## $ fence <chr> NA, NA, NA, NA, NA, "MnPrv...
-## $ misc_feature <chr> NA, NA, NA, NA, NA, "Shed"...
-## $ misc_val <dbl> 0, 0, 0, 0, 0, 700, 0, 350...
-## $ mo_sold <dbl> 2, 5, 9, 2, 12, 10, 8, 11,...
-## $ yr_sold <dbl> 2008, 2007, 2008, 2006, 20...
-## $ sale_type <chr> "WD", "WD", "WD", "WD", "W...
-## $ sale_condition <chr> "Normal", "Normal", "Norma...
-## $ sale_price <dbl> 208500, 181500, 223500, 14...
-感谢曾倬同学提供的解释说明文档
-<- readxl::read_excel("./demo_data/ames_houseprice_explanation.xlsx")
- explanation %>%
- explanation ::kable() knitr
-列名 - | --description - | --解释 - | -
---|---|---|
-MSSubClass - | --Identifies the type of dwelling involved in the sale. - | --住宅概况 - | -
-MSZoning - | --Identifies the general zoning classification of the sale. - | --建筑性质(农业、商业、高/低密度住宅) - | -
-LotFrontage - | --Linear feet of street connected to property - | --建筑离街道的距离 - | -
-LotArea - | --Lot size in square feet - | --占地面积 - | -
-Street - | --Type of road access to property - | --建筑附近的路面材质 - | -
-Alley - | --Type of alley access to property - | --建筑附近小巷的修建材质 - | -
-LotShape - | --General shape of property - | --建筑物的形状 - | -
-LandContour - | --Flatness of the property - | --地面平坦程度 - | -
-Utilities - | --Type of utilities available - | --可用公用设施类型 - | -
-LotConfig - | --Lot configuration - | --房屋哪里配置多 - | -
-LandSlope - | --Slope of property - | --建筑的斜率 - | -
-Neighborhood - | --Physical locations within Ames city limits - | --建筑在Ames城市的位置 - | -
-Condition1 - | --Proximity to various conditions - | --建筑附近的交通网络 - | -
-Condition2 - | --Proximity to various conditions (if more than one is present) - | --建筑附近的交通网络 - | -
-BldgType - | --Type of dwelling - | --住宅类别(联排别墅、独栋别墅…) - | -
-HouseStyle - | --Style of dwelling - | --建筑风格 - | -
-OverallQual - | --Rates the overall material and finish of the house - | --房屋装饰材质水平 - | -
-OverallCond - | --Rates the overall condition of the house - | --房屋整体状况评估 - | -
-YearBuilt - | --Original construction date - | --房屋修建日期 - | -
-YearRemodAdd - | --Remodel date (same as construction date if no remodeling or additions) - | --房屋改建日期 - | -
-RoofStyle - | --Type of roof - | --屋顶类型 - | -
-RoofMatl - | --Roof material - | --屋顶材质 - | -
-Exterior1st - | --Exterior covering on house - | --建筑外立面材质 - | -
-Exterior2nd - | --Exterior covering on house (if more than one material) - | --建筑外立面材质 - | -
-MasVnrType - | --Masonry veneer type - | --建筑表层砌体类型 - | -
-MasVnrArea - | --Masonry veneer area in square feet - | --每平方英尺的砌体面积 - | -
-ExterQual - | --Evaluates the quality of the material on the exterior - | --建筑表层砌体材料质量评估 - | -
-ExterCond - | --Evaluates the present condition of the material on the exterior - | --建筑表层砌体材料现状评估 - | -
-Foundation - | --Type of foundation - | --建筑基础的类型 - | -
-BsmtQual - | --Evaluates the height of the basement - | --地下室高度评估 - | -
-BsmtCond - | --Evaluates the general condition of the basement - | --地下室总体状况评估 - | -
-BsmtExposure - | --Refers to walkout or garden level walls - | --走廊/花园外墙的评估 - | -
-BsmtFinType1 - | --Rating of basement finished area - | --地下室完工区域的等级评价 - | -
-BsmtFinSF1 - | --Type 1 finished square feet - | --地下室完工区域的面积 - | -
-BsmtFinType2 - | --Rating of basement finished area (if multiple types) - | --其他地下室完工区域的等级评价 - | -
-BsmtFinSF2 - | --Type 2 finished square feet - | --其他地下室完工区域的面积 - | -
-BsmtUnfSF - | --Unfinished square feet of basement area - | --地下室未完工部分的面积 - | -
-TotalBsmtSF - | --Total square feet of basement area - | --地下室总面积 - | -
-Heating - | --Type of heating - | --房屋暖气类型(地暖、墙暖….) - | -
-HeatingQC - | --Heating quality and condition - | --暖气设施的质量和条件 - | -
-CentralAir - | --Central air conditioning - | --是否有中央空调 - | -
-Electrical - | --Electrical system - | --电器系统配置标准 - | -
-1stFlrSF - | --First Floor square feet - | --一楼面积 - | -
-2ndFlrSF - | --Second floor square feet - | --二楼面积 - | -
-LowQualFinSF - | --Low quality finished square feet (all floors) - | --所有楼层中低质量施工面积 - | -
-GrLivArea - | --Above grade (ground) living area square feet - | --地上居住面积 - | -
-BsmtFullBath - | --Basement full bathrooms - | --地下室标准卫生间个数 - | -
-BsmtHalfBath - | --Basement half bathrooms - | --地下室简易卫生间个数 - | -
-FullBath - | --Full bathrooms above grade - | --地上楼层标准卫生间个数 - | -
-HalfBath - | --Half baths above grade - | --地上楼层简易卫生间个数 - | -
-BedroomAbvGr - | --Bedrooms above grade (does NOT include basement bedrooms) - | --地上楼层卧室个数 - | -
-KitchenAbvGr - | --Kitchens above grade - | --地上楼层厨房个数 - | -
-KitchenQual - | --Kitchen quality - | --厨房质量评估 - | -
-TopRmsAbvGrd - | --Total rooms above grade (does not include bathrooms) - | --地上楼层房间总数(除去卧室) - | -
-Functional - | --Home functionality (Assume typical unless deductions are warranted) - | --房屋功能情况 - | -
-Fireplaces - | --Number of fireplaces - | --壁炉个数 - | -
-FireplaceQu - | --Fireplace quality - | --壁炉质量 - | -
-GarageType - | --Garage location - | --车库位置 - | -
-GarageYrBlt - | --Year garage was built - | --车库建成年份 - | -
-GarageFinish - | --Interior finish of the garage - | --车库内部装饰情况 - | -
-GarageCars - | --Size of garage in car capacity - | --车库容量 - | -
-GarageArea - | --Size of garage in square feet - | --车库占地面积 - | -
-GarageQual - | --Garage quality - | --车库质量 - | -
-GarageCond - | --Garage condition - | --车库条件 - | -
-PavedDrive - | --Paved driveway - | --车道施工方式 - | -
-WoodDeckSF - | --Wood deck area in square feet - | --木甲板面积 - | -
-OpenPorchSF - | --Open porch area in square feet - | --开放式门廊面积 - | -
-EnclosedPorch - | --Enclosed porch area in square feet - | --封闭式门廊面积 - | -
-3SsnPorch - | --Three season porch area in square feet - | --三季门廊面积 - | -
-ScreenPorch - | --Screen porch area in square feet - | --纱窗门廊面积 - | -
-PoolArea - | --Pool area in square feet - | --游泳池面积 - | -
-PoolQC - | --Pool quality - | --游泳池质量 - | -
-Fence - | --Fence quality - | --栅栏质量 - | -
-MiscFeature - | --Miscellaneous feature not covered in other categories - | --其他配套设施(网球场、电梯…) - | -
-MiscVal - | --$Value of miscellaneous feature - | --其他配套设施的费用 - | -
-MoSold - | --Month Sold (MM) - | --销售月份 - | -
-YrSold - | --Year Sold (YYYY) - | --销售年份 - | -
-SaleType - | --Type of sale - | --支付方式 - | -
-SaleCondition - | --Condition of sale - | --房屋出售的情况 - | -
<- ames %>%
- d select(sale_price,
- # 建筑离街道的距离
- lot_frontage, # 占地面积
- lot_area, # 建筑在城市的位置
- neighborhood, # 地上居住面积
- gr_liv_area, # 住宅类别(联排别墅、独栋别墅...)
- bldg_type, # 房屋修建日期
- year_built
- ) d
## # A tibble: 1,460 x 7
-## sale_price lot_frontage lot_area neighborhood
-## <dbl> <dbl> <dbl> <chr>
-## 1 208500 65 8450 CollgCr
-## 2 181500 80 9600 Veenker
-## 3 223500 68 11250 CollgCr
-## 4 140000 60 9550 Crawfor
-## 5 250000 84 14260 NoRidge
-## 6 143000 85 14115 Mitchel
-## 7 307000 75 10084 Somerst
-## 8 200000 NA 10382 NWAmes
-## 9 129900 51 6120 OldTown
-## 10 118000 50 7420 BrkSide
-## # ... with 1,450 more rows, and 3 more variables:
-## # gr_liv_area <dbl>, bldg_type <chr>,
-## # year_built <dbl>
-%>%
- d summarise(
- across(everything(), function(x) sum(is.na(x)) )
- )
## # A tibble: 1 x 7
-## sale_price lot_frontage lot_area neighborhood
-## <int> <int> <int> <int>
-## 1 0 259 0 0
-## # ... with 3 more variables: gr_liv_area <int>,
-## # bldg_type <int>, year_built <int>
-找出来看看
-%>%
- d filter_all(
- any_vars(is.na(.))
- )
## # A tibble: 259 x 7
-## sale_price lot_frontage lot_area neighborhood
-## <dbl> <dbl> <dbl> <chr>
-## 1 200000 NA 10382 NWAmes
-## 2 144000 NA 12968 Sawyer
-## 3 157000 NA 10920 NAmes
-## 4 149000 NA 11241 NAmes
-## 5 154000 NA 8246 Sawyer
-## 6 149350 NA 8544 Sawyer
-## 7 144000 NA 9180 SawyerW
-## 8 130250 NA 9200 CollgCr
-## 9 177000 NA 13869 Gilbert
-## 10 219500 NA 9375 CollgCr
-## # ... with 249 more rows, and 3 more variables:
-## # gr_liv_area <dbl>, bldg_type <chr>,
-## # year_built <dbl>
-library(visdat)
-
-%>% vis_dat() d
如果不选择lot_frontage
就不会有缺失值,如何选择,自己抉择
%>%
- d select(-lot_frontage) %>%
- ::vis_dat() visdat
我个人觉得这个变量很重要,所以还是保留,牺牲一点样本量吧
-<- d %>%
- d drop_na()
%>% visdat::vis_dat() d
<- function(x) {
- standard - mean(x)) / sd(x)
- (x
- }
-%>%
- d mutate(
- across(where(is.numeric), standard),
- across(where(is.character), as.factor)
- )
## # A tibble: 1,201 x 7
-## sale_price lot_frontage lot_area neighborhood
-## <dbl> <dbl> <dbl> <fct>
-## 1 0.333 -0.208 -0.190 CollgCr
-## 2 0.00875 0.410 -0.0444 Veenker
-## 3 0.512 -0.0844 0.164 CollgCr
-## 4 -0.489 -0.414 -0.0507 Crawfor
-## 5 0.830 0.574 0.544 NoRidge
-## 6 -0.453 0.616 0.525 Mitchel
-## 7 1.51 0.204 0.0167 Somerst
-## 8 -0.610 -0.784 -0.484 OldTown
-## 9 -0.753 -0.826 -0.319 BrkSide
-## 10 -0.615 -0.00206 0.158 Sawyer
-## # ... with 1,191 more rows, and 3 more variables:
-## # gr_liv_area <dbl>, bldg_type <fct>,
-## # year_built <dbl>
-%>%
- d mutate(
- log_sale_price = log(sale_price)
- )
## # A tibble: 1,201 x 8
-## sale_price lot_frontage lot_area neighborhood
-## <dbl> <dbl> <dbl> <chr>
-## 1 208500 65 8450 CollgCr
-## 2 181500 80 9600 Veenker
-## 3 223500 68 11250 CollgCr
-## 4 140000 60 9550 Crawfor
-## 5 250000 84 14260 NoRidge
-## 6 143000 85 14115 Mitchel
-## 7 307000 75 10084 Somerst
-## 8 129900 51 6120 OldTown
-## 9 118000 50 7420 BrkSide
-## 10 129500 70 11200 Sawyer
-## # ... with 1,191 more rows, and 4 more variables:
-## # gr_liv_area <dbl>, bldg_type <chr>,
-## # year_built <dbl>, log_sale_price <dbl>
-%>%
- d mutate(
- across(where(is.numeric), log),
- across(where(is.character), as.factor)
- )
## # A tibble: 1,201 x 7
-## sale_price lot_frontage lot_area neighborhood
-## <dbl> <dbl> <dbl> <fct>
-## 1 12.2 4.17 9.04 CollgCr
-## 2 12.1 4.38 9.17 Veenker
-## 3 12.3 4.22 9.33 CollgCr
-## 4 11.8 4.09 9.16 Crawfor
-## 5 12.4 4.43 9.57 NoRidge
-## 6 11.9 4.44 9.55 Mitchel
-## 7 12.6 4.32 9.22 Somerst
-## 8 11.8 3.93 8.72 OldTown
-## 9 11.7 3.91 8.91 BrkSide
-## 10 11.8 4.25 9.32 Sawyer
-## # ... with 1,191 more rows, and 3 more variables:
-## # gr_liv_area <dbl>, bldg_type <fct>,
-## # year_built <dbl>
-选择哪一种,我们看图说话
-%>%
- d ggplot(aes(x = sale_price)) +
- geom_density()
%>%
- d ggplot(aes(x = log(sale_price))) +
- geom_density()
我们选择对数化,并保存结果
-<- d %>%
- d mutate(
- across(where(is.numeric),
- .fns = list(log = log),
- .names = "{.fn}_{.col}"
-
- ),across(where(is.character), as.factor)
- )
%>% count(neighborhood) d
## # A tibble: 25 x 2
-## neighborhood n
-## <fct> <int>
-## 1 Blmngtn 14
-## 2 Blueste 2
-## 3 BrDale 16
-## 4 BrkSide 51
-## 5 ClearCr 13
-## 6 CollgCr 126
-## 7 Crawfor 41
-## 8 Edwards 92
-## 9 Gilbert 49
-## 10 IDOTRR 34
-## # ... with 15 more rows
-%>%
- d group_by(neighborhood) %>%
- summarise(
- mean_sale = mean(sale_price)
- %>%
- )
- ggplot(
- aes(x = mean_sale, y = fct_reorder(neighborhood, mean_sale))
- +
- ) geom_col(aes(fill = mean_sale < 150000), show.legend = FALSE) +
- geom_text(aes(label = round(mean_sale, 0)), hjust = 1) +
- # scale_x_continuous(
- # expand = c(0, 0),
- # breaks = c(0, 100000, 200000, 300000),
- # labels = c(0, "1w", "2w", "3w")
- # ) +
- scale_x_continuous(
- expand = c(0, 0),
- labels = scales::dollar
- +
- ) scale_fill_viridis_d(option = "D") +
- theme_classic() +
- labs(x = NULL, y = NULL)
%>%
- d ggplot(aes(x = log_lot_area, y = log_sale_price)) +
- geom_point(colour = "blue") +
- geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
%>%
- d ggplot(aes(x = log_lot_area, y = log_sale_price)) +
- geom_point(aes(colour = neighborhood)) +
- geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
%>%
- d ggplot(aes(x = log_lot_area, y = log_sale_price)) +
- geom_point(colour = "blue") +
- geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
- facet_wrap(~neighborhood) +
- theme(strip.background = element_blank())
%>%
- d ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
- geom_point(aes(colour = neighborhood)) +
- geom_smooth(method = lm, se = FALSE, formula = "y ~ x")
%>%
- d ggplot(aes(x = log_gr_liv_area, y = log_sale_price)) +
- geom_point() +
- geom_smooth(method = lm, se = FALSE, formula = "y ~ x", fullrange = TRUE) +
- facet_wrap(~neighborhood) +
- theme(strip.background = element_blank())
车库大小是否对销售价格有帮助?
-%>%
- ames #select(garage_cars, garage_area, sale_price) %>%
- ggplot(aes(x = garage_area, y = sale_price)) +
- geom_point(
- data = select(ames, -garage_cars),
- color = "gray50"
- +
- ) geom_point(aes(color = as_factor(garage_cars))) +
- facet_wrap(vars(garage_cars)) +
- theme(legend.position = "none") +
- ggtitle("This is the influence of garage for sale price")
lm(log_sale_price ~ 1 + log_gr_liv_area + neighborhood, data = d) %>%
-::tidy() broom
## # A tibble: 26 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) 7.53 0.154 48.7 2.21e-284
-## 2 log_gr_liv_a~ 0.638 0.0200 31.9 3.76e-161
-## 3 neighborhood~ -0.314 0.149 -2.10 3.55e- 2
-## 4 neighborhood~ -0.466 0.0724 -6.43 1.80e- 10
-## 5 neighborhood~ -0.336 0.0597 -5.62 2.44e- 8
-## 6 neighborhood~ -0.103 0.0762 -1.35 1.76e- 1
-## 7 neighborhood~ 0.00332 0.0556 0.0597 9.52e- 1
-## 8 neighborhood~ -0.0870 0.0612 -1.42 1.55e- 1
-## 9 neighborhood~ -0.365 0.0567 -6.44 1.79e- 10
-## 10 neighborhood~ -0.0621 0.0599 -1.04 3.00e- 1
-## # ... with 16 more rows
-library(lme4)
-lmer(log_sale_price ~ 1 + log_gr_liv_area + (log_gr_liv_area | neighborhood),
-data = d) %>%
- ::tidy() broom.mixed
## # A tibble: 6 x 6
-## effect group term estimate std.error statistic
-## <chr> <chr> <chr> <dbl> <dbl> <dbl>
-## 1 fixed <NA> (Interce~ 6.88 0.334 20.6
-## 2 fixed <NA> log_gr_l~ 0.705 0.0493 14.3
-## 3 ran_pa~ neigh~ sd__(Int~ 1.34 NA NA
-## 4 ran_pa~ neigh~ cor__(In~ -0.993 NA NA
-## 5 ran_pa~ neigh~ sd__log_~ 0.205 NA NA
-## 6 ran_pa~ Resid~ sd__Obse~ 0.191 NA NA
-
-在可视化章节,我们提到 Anscombe’s quartet这个数据集,
-::anscombe ?datasets
在其官方文档,我们可看到它是这样描述的:
---Four x-y datasets which have the same traditional statistical properties (mean, variance, correlation, regression line, etc.), yet are quite different.
-
<- datasets::anscombe
- d head(d)
## x1 x2 x3 x4 y1 y2 y3 y4
-## 1 10 10 10 8 8.04 9.14 7.46 6.58
-## 2 8 8 8 8 6.95 8.14 6.77 5.76
-## 3 13 13 13 8 7.58 8.74 12.74 7.71
-## 4 9 9 9 8 8.81 8.77 7.11 8.84
-## 5 11 11 11 8 8.33 9.26 7.81 8.47
-## 6 14 14 14 8 9.96 8.10 8.84 7.04
-library(tidyverse)
本节课的内容,就是用tidyverse的方法去探索下这个数据集:
-我们再看看数据
-head(d)
## x1 x2 x3 x4 y1 y2 y3 y4
-## 1 10 10 10 8 8.04 9.14 7.46 6.58
-## 2 8 8 8 8 6.95 8.14 6.77 5.76
-## 3 13 13 13 8 7.58 8.74 12.74 7.71
-## 4 9 9 9 8 8.81 8.77 7.11 8.84
-## 5 11 11 11 8 8.33 9.26 7.81 8.47
-## 6 14 14 14 8 9.96 8.10 8.84 7.04
-实际上,这是四组(x1, y1), (x2, y2), (x3, y3), (x4, y4)
。那要怎么样规整数据,
-或者说怎么样把数据弄成tidy呢。这里有个技巧,你可以想象,数据能ggplot()
可视化的基本上就是tidy的。
%>%
- d ggplot(aes(x = x, y = y)) +
- geom_point() +
- facet_wrap(~set)
那么,我们希望我们的数据是这样的格式
-set | -x | -y | -
---|---|---|
1 | -10 | -8.04 | -
1 | -8 | -6.95 | -
… | -- | - |
2 | -10 | -9.14 | -
2 | -8 | -8.14 | -
… | -- | - |
我们之前讲过,数据变形中,宽表格变成长表格,
-需要用到tidyr::pivot_longer()
函数
-
比如
-<- tibble(id = c("a", "b"), x_1 = 1:2, x_2 = 3:4, y_1 = 5:6, y_2 = 8:9)
- dt dt
## # A tibble: 2 x 5
-## id x_1 x_2 y_1 y_2
-## <chr> <int> <int> <int> <int>
-## 1 a 1 3 5 8
-## 2 b 2 4 6 9
-%>% pivot_longer(-id,
- dt names_to = "name",
- values_to = "vaules"
- )
## # A tibble: 8 x 3
-## id name vaules
-## <chr> <chr> <int>
-## 1 a x_1 1
-## 2 a x_2 3
-## 3 a y_1 5
-## 4 a y_2 8
-## 5 b x_1 2
-## 6 b x_2 4
-## 7 b y_1 6
-## 8 b y_2 9
-有时候,我们不想要下划线后面的编号,只想保留前面的第一个字母
-%>% pivot_longer(
- dt cols = -id,
- names_to = "name",
- names_pattern = "(.)_.",
- values_to = "vaules"
- )
## # A tibble: 8 x 3
-## id name vaules
-## <chr> <chr> <int>
-## 1 a x 1
-## 2 a x 3
-## 3 a y 5
-## 4 a y 8
-## 5 b x 2
-## 6 b x 4
-## 7 b y 6
-## 8 b y 9
-有时候人的需求是多样的,比如不想要前面的第一个字母,只要下划线后面的编号
-%>% pivot_longer(
- dt cols = -id,
- names_to = "name",
- names_pattern = "._(.)",
- values_to = "vaules"
- )
## # A tibble: 8 x 3
-## id name vaules
-## <chr> <chr> <int>
-## 1 a 1 1
-## 2 a 2 3
-## 3 a 1 5
-## 4 a 2 8
-## 5 b 1 2
-## 6 b 2 4
-## 7 b 1 6
-## 8 b 2 9
-有时候我们都想要呢?
-%>% pivot_longer(
- dt cols = -id,
- names_to = c("name", "group"),
- names_pattern = "(.)_(.)",
- values_to = "vaules"
- )
## # A tibble: 8 x 4
-## id name group vaules
-## <chr> <chr> <chr> <int>
-## 1 a x 1 1
-## 2 a x 2 3
-## 3 a y 1 5
-## 4 a y 2 8
-## 5 b x 1 2
-## 6 b x 2 4
-## 7 b y 1 6
-## 8 b y 2 9
-有时候,我们希望"x", "y"
保留在列名,那么匹配出来的第一个字母,就不能给"name"
,而是传给特殊的符号".value"
,它会收集匹配出来的字符,然后放在列名中
%>% pivot_longer(
- dt cols = -id,
- names_to = c(".value", "group"),
- names_pattern = "(.)_(.)",
- values_to = "vaules"
- )
## # A tibble: 4 x 4
-## id group x y
-## <chr> <chr> <int> <int>
-## 1 a 1 1 5
-## 2 a 2 3 8
-## 3 b 1 2 6
-## 4 b 2 4 9
-是不是觉得很强大?
-具体来说,我们希望 x1
按照指定的正则表达式分成了两个部分 x
和 1
,那么1
放在set
下,而 x
传给了.value
当作变型后的列名.
::include_graphics("images/pivot_longer_values.jpg") knitr
那么和上面的情况一样,使用tidyr::pivot_longer()
函数
<- d %>%
- tidy_d pivot_longer(
- cols = everything(),
- names_to = c(".value", "set"),
- names_pattern = "(.)(.)"
-
- ) tidy_d
## # A tibble: 44 x 3
-## set x y
-## <chr> <dbl> <dbl>
-## 1 1 10 8.04
-## 2 2 10 9.14
-## 3 3 10 7.46
-## 4 4 8 6.58
-## 5 1 8 6.95
-## 6 2 8 8.14
-## 7 3 8 6.77
-## 8 4 8 5.76
-## 9 1 13 7.58
-## 10 2 13 8.74
-## # ... with 34 more rows
-再啰嗦下参数的含义:
-cols = everything()
表示选择所有列names_to = c(".value", "set")
希望变型后的列名是c(".value", "set")
, 这里 ".value"
是个特殊的符号,代表着names_pattern
匹配过来的值,一般情况下,是多个值,如果传给".value"
的"x, y, z"
,那么列名就会变成c("x", "y", "z", "set")
names_pattern = "(.)(.)"
将变换前的列名按照指定的正则表达式匹配,并且传递给names_to
的对应的参数,比如这里第一个(.)
传递给.value
;第二个(.)
传递给set
.数据规整了,统计就很简单了
-<- tidy_d %>%
- tidy_d_summary group_by(set) %>%
- summarise(across(
- .cols = everything(),
- .fns = lst(mean, sd, var),
- .names = "{col}_{fn}"
-
- )) tidy_d_summary
## # A tibble: 4 x 7
-## set x_mean x_sd x_var y_mean y_sd y_var
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 1 9 3.32 11 7.50 2.03 4.13
-## 2 2 9 3.32 11 7.50 2.03 4.13
-## 3 3 9 3.32 11 7.5 2.03 4.12
-## 4 4 9 3.32 11 7.50 2.03 4.12
-具体参考第 22 章整理的四种方法
-%>%
- tidy_d group_nest(set) %>%
- mutate(
- fit = map(data, ~ lm(y ~ x, data = .x)),
- tidy = map(fit, broom::tidy),
- glance = map(fit, broom::glance)
- %>%
- ) unnest(tidy)
感觉大家更喜欢这种
-%>%
- tidy_d group_by(set) %>%
- group_modify(
- ~ broom::tidy(lm(y ~ x, data = .))
- )
## # A tibble: 8 x 6
-## # Groups: set [4]
-## set term estimate std.error statistic p.value
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 1 (Intercep~ 3.00 1.12 2.67 0.0257
-## 2 1 x 0.500 0.118 4.24 0.00217
-## 3 2 (Intercep~ 3.00 1.13 2.67 0.0258
-## 4 2 x 0.5 0.118 4.24 0.00218
-## 5 3 (Intercep~ 3.00 1.12 2.67 0.0256
-## 6 3 x 0.500 0.118 4.24 0.00218
-## 7 4 (Intercep~ 3.00 1.12 2.67 0.0256
-## 8 4 x 0.500 0.118 4.24 0.00216
-%>%
- tidy_d group_by(set) %>%
- summarise(
- ::tidy(lm(y ~ x, data = cur_data()))
- broom )
## # A tibble: 8 x 6
-## # Groups: set [4]
-## set term estimate std.error statistic p.value
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 1 (Intercep~ 3.00 1.12 2.67 0.0257
-## 2 1 x 0.500 0.118 4.24 0.00217
-## 3 2 (Intercep~ 3.00 1.13 2.67 0.0258
-## 4 2 x 0.5 0.118 4.24 0.00218
-## 5 3 (Intercep~ 3.00 1.12 2.67 0.0256
-## 6 3 x 0.500 0.118 4.24 0.00218
-## 7 4 (Intercep~ 3.00 1.12 2.67 0.0256
-## 8 4 x 0.500 0.118 4.24 0.00216
-%>%
- tidy_d ggplot(aes(x = x, y = y, colour = set)) +
- geom_point() +
- geom_smooth(method = "lm", se = FALSE) +
- theme(legend.position = "none") +
- facet_wrap(~set)
library(tidyverse)
-
-<-
- example ::tribble(
- tibble~name, ~english, ~chinese, ~math, ~sport, ~psy, ~edu,
- "A", 133, 100, 102, 56, 89, 89,
- "B", 120, 120, 86, 88, 45, 75,
- "C", 98, 109, 114, 87, NA, 84,
- "D", 120, 78, 106, 68, 86, 69,
- "E", 110, 99, 134, 98, 75, 70,
- "F", NA, 132, 130, NA, 68, 88
-
- )
- example
## # A tibble: 6 x 7
-## name english chinese math sport psy edu
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 133 100 102 56 89 89
-## 2 B 120 120 86 88 45 75
-## 3 C 98 109 114 87 NA 84
-## 4 D 120 78 106 68 86 69
-## 5 E 110 99 134 98 75 70
-## 6 F NA 132 130 NA 68 88
-我们需要判断每一列的缺失值
-%>%
- example summarise(
- na_in_english = sum(is.na(english)),
- na_in_chinese = sum(is.na(chinese)),
- na_in_math = sum(is.na(math)),
- na_in_sport = sum(is.na(sport)),
- na_in_psy = sum(is.na(math)), # tpyo here
- na_in_edu = sum(is.na(edu))
- )
## # A tibble: 1 x 6
-## na_in_english na_in_chinese na_in_math na_in_sport
-## <int> <int> <int> <int>
-## 1 1 0 0 1
-## # ... with 2 more variables: na_in_psy <int>,
-## # na_in_edu <int>
-我们发现,这种写法比较笨,而且容易出错,比如na_in_psy = sum(is.na(math))
就写错了。那么有没有既偷懒又安全
的方法呢?有的。但代价是需要学会across()
函数,大家可以在Console中输入?dplyr::across
查看帮助文档,或者看第 22 章。
%>%
- example summarise(
- across(everything(), mean)
- )
## # A tibble: 1 x 7
-## name english chinese math sport psy edu
-## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 NA NA 106. 112 NA NA 79.2
-%>%
- example summarise(
- across(everything(), function(x) sum(is.na(x)) )
- )
## # A tibble: 1 x 7
-## name english chinese math sport psy edu
-## <int> <int> <int> <int> <int> <int> <int>
-## 1 0 1 0 0 1 1 0
-%>% drop_na() example
## # A tibble: 4 x 7
-## name english chinese math sport psy edu
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 133 100 102 56 89 89
-## 2 B 120 120 86 88 45 75
-## 3 D 120 78 106 68 86 69
-## 4 E 110 99 134 98 75 70
-<- example %>%
- d mutate(
- across(where(is.numeric), ~ if_else(is.na(.), mean(., na.rm = T), .))
-
- ) d
## # A tibble: 6 x 7
-## name english chinese math sport psy edu
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 133 100 102 56 89 89
-## 2 B 120 120 86 88 45 75
-## 3 C 98 109 114 87 72.6 84
-## 4 D 120 78 106 68 86 69
-## 5 E 110 99 134 98 75 70
-## 6 F 116. 132 130 79.4 68 88
-%>%
- d rowwise() %>%
- mutate(
- total = sum(c_across(-name))
- )
## # A tibble: 6 x 8
-## # Rowwise:
-## name english chinese math sport psy edu total
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 133 100 102 56 89 89 569
-## 2 B 120 120 86 88 45 75 534
-## 3 C 98 109 114 87 72.6 84 565.
-## 4 D 120 78 106 68 86 69 527
-## 5 E 110 99 134 98 75 70 586
-## 6 F 116. 132 130 79.4 68 88 614.
-%>%
- d rowwise() %>%
- mutate(
- mean = mean(c_across(-name))
- )
## # A tibble: 6 x 8
-## # Rowwise:
-## name english chinese math sport psy edu mean
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 133 100 102 56 89 89 94.8
-## 2 B 120 120 86 88 45 75 89
-## 3 C 98 109 114 87 72.6 84 94.1
-## 4 D 120 78 106 68 86 69 87.8
-## 5 E 110 99 134 98 75 70 97.7
-## 6 F 116. 132 130 79.4 68 88 102.
-<- function(x) {
- standard - mean(x)) / sd(x)
- (x }
%>%
- d mutate(
- across(where(is.numeric), standard)
- )
## # A tibble: 6 x 7
-## name english chinese math sport psy edu
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 A 1.44 -0.339 -0.555 -1.54 1.04 1.10
-## 2 B 0.326 0.731 -1.44 0.566 -1.75 -0.464
-## 3 C -1.56 0.143 0.111 0.500 0 0.538
-## 4 D 0.326 -1.51 -0.333 -0.75 0.852 -1.13
-## 5 E -0.531 -0.392 1.22 1.22 0.153 -1.02
-## 6 F 0 1.37 0.999 0 -0.292 0.984
-感谢康钦虹同学提供的数据,但这里有几点需要注意的地方:
-事项 | -问题 | -解决办法 | -
---|---|---|
文件名 | -excel的文件名是中文 | -用英文,比如 data.xlsx |
-
列名 | -列名中有-号,大小写不统一 | -规范列名,或用janitor::clean_names() 偷懒 |
-
预处理 | -直接在原始数据中新增 | -不要在原始数据上改动,统计工作可以在R里实现 | -
文件管理 | -没有层级 | -新建data 文件夹装数据,与code.Rmd 并列 |
-
<- readxl::read_excel("demo_data/career-decision.xlsx", skip = 1) %>%
- data ::clean_names()
- janitor
-#glimpse(data)
<- data %>% select(1:61)
- d #glimpse(d)
%>%
- d summarise(
- across(everything(), ~sum(is.na(.)))
- )
## # A tibble: 1 x 61
-## sex majoy grade from z1 z2 z3 z4 z5
-## <int> <int> <int> <int> <int> <int> <int> <int> <int>
-## 1 0 0 0 0 0 0 0 0 0
-## # ... with 52 more variables: z6 <int>, z7 <int>,
-## # z8 <int>, z9 <int>, z10 <int>, z11 <int>,
-## # z12 <int>, z13 <int>, z14 <int>, z15 <int>,
-## # z16 <int>, z17 <int>, z18 <int>, j1 <int>,
-## # j2 <int>, j3 <int>, j4 <int>, j5 <int>, j6 <int>,
-## # j7 <int>, j8 <int>, j9 <int>, j10 <int>,
-## # j11 <int>, j12 <int>, j13 <int>, j14 <int>,
-## # j15 <int>, j16 <int>, j17 <int>, j18 <int>,
-## # j19 <int>, j20 <int>, j21 <int>, j22 <int>,
-## # j23 <int>, j24 <int>, j25 <int>, j26 <int>,
-## # j27 <int>, j28 <int>, j29 <int>, j30 <int>,
-## # j31 <int>, j32 <int>, j33 <int>, j34 <int>,
-## # j35 <int>, j36 <int>, j37 <int>, j38 <int>,
-## # j39 <int>
-没有缺失值,挺好
-采用利克特式 5 点计分… (这方面你们懂得比我多)
-<- d %>%
- d rowwise() %>%
- mutate(
- environment_exploration = sum(c_across(z1:z5)),
- self_exploration = sum(c_across(z6:z9)),
- objective_system_exploration = sum(c_across(z10:z15)),
- info_quantity_exploration = sum(c_across(z16:z18)),
-
- self_evaluation = sum(c_across(j1:j6)),
- information_collection = sum(c_across(j7:j15)),
- target_select = sum(c_across(j16:j24)),
- formulate = sum(c_across(j25:j32)),
- problem_solving = sum(c_across(j33:j39)),
-
-career_exploration = sum(c_across(z1:z18)),
- career_decision_making = sum(c_across(j1:j39))
- %>%
- ) select(-starts_with("z"), -starts_with("j")) %>%
- ungroup() %>%
- mutate(pid = 1:n(), .before = sex) %>%
- mutate(
- across(c(pid, sex, majoy, grade, from), as_factor)
-
- )
-#glimpse(d)
<- function(x) {
- standard - mean(x)) / sd(x)
- (x
- }
-<- d %>%
- d mutate(
- across(where(is.numeric), standard)
-
- ) d
## # A tibble: 304 x 16
-## pid sex majoy grade from environment_exp~
-## <fct> <fct> <fct> <fct> <fct> <dbl>
-## 1 1 1 4 4 2 -1.63
-## 2 2 1 4 4 1 -1.87
-## 3 3 2 4 4 2 0.0802
-## 4 4 2 4 4 1 -1.87
-## 5 5 2 4 4 1 -0.895
-## 6 6 1 1 4 3 -0.651
-## 7 7 1 4 4 3 -2.36
-## 8 8 1 4 4 1 -0.407
-## 9 9 1 4 4 3 -0.651
-## 10 10 1 4 4 2 0.324
-## # ... with 294 more rows, and 10 more variables:
-## # self_exploration <dbl>,
-## # objective_system_exploration <dbl>,
-## # info_quantity_exploration <dbl>,
-## # self_evaluation <dbl>,
-## # information_collection <dbl>, target_select <dbl>,
-## # formulate <dbl>, problem_solving <dbl>,
-## # career_exploration <dbl>,
-## # career_decision_making <dbl>
-以性别为例。因为性别变量是男女,仅仅2组,所以检查男女在各自指标上的均值差异,可以用T检验。
-%>%
- d group_by(sex) %>%
- summarise(
- across(where(is.numeric), mean)
- )
## # A tibble: 2 x 12
-## sex environment_exp~ self_exploration
-## <fct> <dbl> <dbl>
-## 1 1 -0.147 -0.0829
-## 2 2 0.165 0.0933
-## # ... with 9 more variables:
-## # objective_system_exploration <dbl>,
-## # info_quantity_exploration <dbl>,
-## # self_evaluation <dbl>,
-## # information_collection <dbl>, target_select <dbl>,
-## # formulate <dbl>, problem_solving <dbl>,
-## # career_exploration <dbl>,
-## # career_decision_making <dbl>
-你可以给这个图颜色弄得更好看点?
-library(ggridges)
-%>%
- d ggplot(aes(x = career_exploration, y = sex, fill = sex)) +
- geom_density_ridges()
<- t.test(career_exploration ~ sex, data = d, var.equal = TRUE) %>%
- t_test_eq ::tidy()
- broom t_test_eq
## # A tibble: 1 x 10
-## estimate estimate1 estimate2 statistic p.value
-## <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 -0.367 -0.173 0.194 -3.24 0.00132
-## # ... with 5 more variables: parameter <dbl>,
-## # conf.low <dbl>, conf.high <dbl>, method <chr>,
-## # alternative <chr>
-<- t.test(career_exploration ~ sex, data = d, var.equal = FALSE) %>%
- t_test_uneq ::tidy()
- broom t_test_uneq
## # A tibble: 1 x 10
-## estimate estimate1 estimate2 statistic p.value
-## <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 -0.367 -0.173 0.194 -3.27 0.00121
-## # ... with 5 more variables: parameter <dbl>,
-## # conf.low <dbl>, conf.high <dbl>, method <chr>,
-## # alternative <chr>
-当然,也可以用第 32 章介绍的统计推断的方法
-library(infer)
-
-<- d %>%
- obs_diff specify(formula = career_exploration ~ sex) %>%
- calculate("diff in means", order = c("1", "2"))
- obs_diff
## # A tibble: 1 x 1
-## stat
-## <dbl>
-## 1 -0.367
-<- d %>%
- null_dist specify(formula = career_exploration ~ sex) %>%
- hypothesize(null = "independence") %>%
- generate(reps = 5000, type = "permute") %>%
- calculate(stat = "diff in means", order = c("1", "2"))
- null_dist
## # A tibble: 5,000 x 2
-## replicate stat
-## <int> <dbl>
-## 1 1 -0.0114
-## 2 2 0.0656
-## 3 3 0.00208
-## 4 4 -0.0663
-## 5 5 0.0155
-## 6 6 -0.0736
-## 7 7 -0.0798
-## 8 8 -0.0443
-## 9 9 0.0412
-## 10 10 0.105
-## # ... with 4,990 more rows
-%>%
- null_dist visualize() +
- shade_p_value(obs_stat = obs_diff, direction = "two_sided")
%>%
- null_dist get_p_value(obs_stat = obs_diff, direction = "two_sided") %>%
- #get_p_value(obs_stat = obs_diff, direction = "less") %>%
- mutate(p_value_clean = scales::pvalue(p_value))
## # A tibble: 1 x 2
-## p_value p_value_clean
-## <dbl> <chr>
-## 1 0.00120 0.001
-也可以用tidyverse的方法一次性的搞定所有指标
-%>%
- d pivot_longer(
- cols = -c(pid, sex, majoy, grade, from),
- names_to = "index",
- values_to = "value"
- %>%
- ) group_by(index) %>%
- summarise(
- ::tidy( t.test(value ~ sex, data = cur_data()))
- broom%>%
- ) select(index, estimate, statistic, p.value) %>%
- arrange(p.value)
## # A tibble: 11 x 4
-## index estimate statistic p.value
-## <chr> <dbl> <dbl> <dbl>
-## 1 career_decision_making -0.494 -4.53 8.62e-6
-## 2 problem_solving -0.470 -4.26 2.70e-5
-## 3 target_select -0.449 -4.07 6.09e-5
-## 4 formulate -0.411 -3.72 2.35e-4
-## 5 information_collection -0.411 -3.70 2.53e-4
-## 6 self_evaluation -0.404 -3.65 3.15e-4
-## 7 objective_system_explo~ -0.382 -3.40 7.65e-4
-## 8 career_exploration -0.367 -3.27 1.21e-3
-## 9 environment_exploration -0.312 -2.75 6.29e-3
-## 10 info_quantity_explorat~ -0.274 -2.42 1.62e-2
-## 11 self_exploration -0.176 -1.54 1.26e-1
-以生源地为例。因为生源地有3类,所以可以使用方差分析。
-aov(career_exploration ~ from, data = d) %>%
-TukeyHSD(which = "from") %>%
- ::tidy() broom
## # A tibble: 3 x 7
-## term contrast null.value estimate conf.low conf.high
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 from 2-1 0 0.382 0.0623 0.701
-## 2 from 3-1 0 0.287 -0.0386 0.613
-## 3 from 3-2 0 -0.0943 -0.446 0.257
-## # ... with 1 more variable: adj.p.value <dbl>
-library(ggridges)
-%>%
- d ggplot(aes(x = career_exploration, y = from, fill = from)) +
- geom_density_ridges()
也可以一次性的搞定所有指标
-%>%
- d pivot_longer(
- cols = -c(pid, sex, majoy, grade, from),
- names_to = "index",
- values_to = "value"
- %>%
- ) group_by(index) %>%
- summarise(
- ::tidy( aov(value ~ from, data = cur_data()))
- broom%>%
- ) select(index, term, statistic, p.value) %>%
- filter(term != "Residuals") %>%
- arrange(p.value)
## # A tibble: 11 x 4
-## # Groups: index [11]
-## index term statistic p.value
-## <chr> <chr> <dbl> <dbl>
-## 1 problem_solving from 14.6 9.18e-7
-## 2 career_decision_making from 14.2 1.26e-6
-## 3 formulate from 12.2 7.81e-6
-## 4 information_collection from 10.2 5.27e-5
-## 5 self_evaluation from 8.91 1.74e-4
-## 6 target_select from 8.45 2.70e-4
-## 7 info_quantity_exploration from 5.78 3.44e-3
-## 8 career_exploration from 4.48 1.21e-2
-## 9 objective_system_explora~ from 4.06 1.81e-2
-## 10 environment_exploration from 3.69 2.60e-2
-## 11 self_exploration from 0.699 4.98e-1
-可以用第 28 章线性模型来探索
-lm(career_decision_making ~ career_exploration, data = d)
##
-## Call:
-## lm(formula = career_decision_making ~ career_exploration, data = d)
-##
-## Coefficients:
-## (Intercept) career_exploration
-## 2.15e-15 7.83e-01
-不要因为我讲课讲的很垃圾,就错过了R的美,瑕不掩瑜啦。要相信自己,你们是川师研究生中最聪明的。
- - -本章我们分析加拿大哥伦比亚林地驯鹿追踪数据,数据包含了从1988年到2016年期间260只驯鹿,近250000个位置标签。
-大家可以在这里了解数据集的信息,它包含了两个数据集
-# devtools::install_github("thebioengineer/tidytuesdayR")
-library(tidytuesdayR)
-
-<- tidytuesdayR::tt_load("2020-06-23")
- tuesdata # or
-# tuesdata <- tidytuesdayR::tt_load(2020, week = 26)
library(tidyverse)
-library(lubridate)
-library(gganimate)
-
-<- readr::read_csv("./demo_data/caribou/individuals.csv")
- individuals <- readr::read_csv("./demo_data/caribou/locations.csv") locations
%>% glimpse() individuals
## Rows: 286
-## Columns: 14
-## $ animal_id <chr> "HR_151.510", "GR_C04...
-## $ sex <chr> "f", "f", "f", "f", "...
-## $ life_stage <chr> NA, NA, NA, NA, NA, N...
-## $ pregnant <lgl> NA, NA, NA, NA, NA, N...
-## $ with_calf <lgl> NA, NA, NA, NA, NA, N...
-## $ death_cause <chr> NA, NA, NA, NA, NA, N...
-## $ study_site <chr> "Hart Ranges", "Graha...
-## $ deploy_on_longitude <dbl> NA, NA, NA, NA, NA, N...
-## $ deploy_on_latitude <dbl> NA, NA, NA, NA, NA, N...
-## $ deploy_on_comments <chr> NA, NA, NA, NA, NA, N...
-## $ deploy_off_longitude <dbl> NA, NA, NA, NA, NA, N...
-## $ deploy_off_latitude <dbl> NA, NA, NA, NA, NA, N...
-## $ deploy_off_type <chr> "unknown", "unknown",...
-## $ deploy_off_comments <chr> NA, NA, NA, NA, NA, N...
-%>% count(animal_id) individuals
## # A tibble: 260 x 2
-## animal_id n
-## <chr> <int>
-## 1 BP_car022 1
-## 2 BP_car023 1
-## 3 BP_car032 1
-## 4 BP_car043 1
-## 5 BP_car100 1
-## 6 BP_car101 1
-## 7 BP_car115 1
-## 8 BP_car144 1
-## 9 BP_car145 1
-## 10 GR_C01 2
-## # ... with 250 more rows
-我们发现有重复id的,怎么办?
-%>% janitor::get_dupes(animal_id) individuals
## # A tibble: 50 x 15
-## animal_id dupe_count sex life_stage pregnant
-## <chr> <int> <chr> <chr> <lgl>
-## 1 GR_C01 2 f <NA> NA
-## 2 GR_C01 2 f <NA> NA
-## 3 GR_C02 2 f <NA> NA
-## 4 GR_C02 2 f <NA> NA
-## 5 GR_C04 2 f <NA> NA
-## 6 GR_C04 2 f <NA> NA
-## 7 GR_C05 2 f <NA> NA
-## 8 GR_C05 2 f <NA> NA
-## 9 GR_C06 2 f <NA> NA
-## 10 GR_C06 2 f <NA> NA
-## # ... with 40 more rows, and 10 more variables:
-## # with_calf <lgl>, death_cause <chr>,
-## # study_site <chr>, deploy_on_longitude <dbl>,
-## # deploy_on_latitude <dbl>,
-## # deploy_on_comments <chr>,
-## # deploy_off_longitude <dbl>,
-## # deploy_off_latitude <dbl>, deploy_off_type <chr>,
-## # deploy_off_comments <chr>
-%>%
- individuals filter(deploy_on_latitude > 50) %>%
- ggplot(aes(x = deploy_on_longitude, y = deploy_on_latitude)) +
- geom_point(aes(color = study_site)) #+
# borders("world", regions = "china")
简单点说,就是哪个驯鹿在什么时间出现在什么地方
-%>%
- locations ggplot(aes(x = longitude, y = latitude)) +
- geom_point(aes(color = study_site))
<-
- top_animal_ids count(locations, animal_id, sort = TRUE) %>%
- slice(1:10) %>%
- pull(animal_id)
-
-
-%>%
- locations filter(animal_id %in% top_animal_ids) %>%
- arrange(animal_id, timestamp) %>%
- group_by(animal_id) %>%
- mutate(measurement_n = row_number()) %>%
- ggplot(aes(
- x = longitude,
- y = latitude,
- color = animal_id,
- alpha = measurement_n
- +
- )) geom_point(show.legend = FALSE, size = 1) +
- geom_path(show.legend = FALSE, size = 1) +
- # scale_color_manual(values = ) +
- theme_minimal() +
- theme(
- plot.title = element_text(size = 20, face = "bold"),
- plot.subtitle = element_text(size = 10),
- text = element_text(color = "White"),
- panel.grid.minor = element_blank(),
- panel.grid.major = element_line(color = "gray60", size = 0.05),
- plot.background = element_rect(fill = "gray10"),
- axis.text = element_text(color = "white")
- +
- ) labs(
- x = "\nLongitude", y = "Latitude\n",
- title = "Caribou movement tracking",
- subtitle = "Latitude and longitude locations of the animals with the highest number of measurements\n",
- caption = "Tidy Tuesday: Caribou Location Tracking"
- )
%>%
- locations ::filter(animal_id %in% c("QU_car143")) %>%
- dplyr::arrange(animal_id, timestamp) %>%
- dplyr::group_by(animal_id) %>%
- dplyr::mutate(measurement_n = row_number()) %>%
- dplyrggplot(aes(
- x = longitude,
- y = latitude,
- color = measurement_n,
- alpha = measurement_n
- +
- )) geom_point(show.legend = FALSE, size = 1) +
- geom_path(show.legend = FALSE, size = 1) +
- scale_color_gradient(low = "white", high = "firebrick3") +
- theme_minimal() +
- theme(
- plot.title = element_text(size = 20, face = "bold"),
- plot.subtitle = element_text(size = 10),
- text = element_text(color = "White"),
- panel.grid.minor = element_blank(),
- panel.grid.major = element_line(color = "gray60", size = 0.05),
- plot.background = element_rect(fill = "gray10"),
- axis.text = element_text(color = "white")
- +
- ) labs(
- x = "\nLongitude", y = "Latitude\n",
- title = "QU_car143 movement tracking",
- subtitle = "Latitude and longitude locations of the animals with the highest number of measurements\n Ligher colors indicate earlier measurements",
- caption = "Tidy Tuesday: Caribou Location Tracking"
- )
<- locations %>%
- example_animal ::filter(animal_id == sample(animal_id, 1)) %>%
- dplyr::arrange(timestamp)
- dplyr example_animal
## # A tibble: 2,039 x 7
-## event_id animal_id study_site season
-## <dbl> <chr> <chr> <chr>
-## 1 2.27e9 QU_car110 Quintette Winter
-## 2 2.27e9 QU_car110 Quintette Winter
-## 3 2.27e9 QU_car110 Quintette Winter
-## 4 2.27e9 QU_car110 Quintette Winter
-## 5 2.27e9 QU_car110 Quintette Winter
-## 6 2.27e9 QU_car110 Quintette Winter
-## 7 2.27e9 QU_car110 Quintette Winter
-## 8 2.27e9 QU_car110 Quintette Winter
-## 9 2.27e9 QU_car110 Quintette Winter
-## 10 2.27e9 QU_car110 Quintette Winter
-## # ... with 2,029 more rows, and 3 more variables:
-## # timestamp <dttm>, longitude <dbl>, latitude <dbl>
-"2010-03-28 21:00:44" %>% lubridate::as_date()
-"2010-03-28 21:00:44" %>% lubridate::as_datetime()
-"2010-03-28 21:00:44" %>% lubridate::quarter()
%>%
- example_animal ::mutate(date = lubridate::as_date(timestamp)) %>%
- dplyrggplot(aes(x = longitude, y = latitude, color = date)) +
- geom_path()
%>%
- example_animal ::mutate(quarter = lubridate::quarter(timestamp) %>% as.factor()) %>%
- dplyrggplot(aes(x = longitude, y = latitude, color = quarter)) +
- geom_path() +
- facet_wrap(vars(quarter)) +
- labs(title = "一只小驯鹿到处啊跑")
看看驯鹿夏季和冬季运动模式,这段代码来自gkaramanis
-<- locations %>%
- movement filter(study_site != "Hart Ranges") %>%
- mutate(
- season = fct_rev(season),
- longitude = round(longitude, 2),
- latitude = round(latitude, 2)
- %>%
- ) distinct(season, study_site, longitude, latitude)
-
-
-
-ggplot(movement) +
-geom_point(aes(longitude, latitude,
- group = study_site,
- colour = study_site
- size = 0.1) +
- ), ::gghighlight(
- gghighlightunhighlighted_params = list(colour = "grey70"), use_direct_label = FALSE
- +
- ) scale_colour_manual(
- values = c("#ffe119", "#4363d8", "#f58231", "#e6194B", "#800000", "#000075", "#f032e6", "#3cb44b"),
- breaks = c("Graham", "Scott", "Moberly", "Burnt Pine", "Kennedy", "Quintette", "Narraway")
- +
- ) guides(colour = guide_legend(title = "Herd", override.aes = list(size = 3))) +
- coord_fixed(ratio = 1.5) +
- facet_wrap(vars(season), ncol = 2) +
- # labs(
- # title = "Migration patterns of Northern Caribou\nin the South Peace of British Columbia",
- # subtitle = str_wrap("In summer, most caribou migrate towards the central core of the Rocky Mountains where they use alpine and subalpine habitat. The result of this movement to the central core of the Rocky Mountains is that some of the east side herds can overlap with west side herds during the summer.", 100),
- # caption = str_wrap("Source: Seip DR, Price E (2019) Data from: Science update for the South Peace Northern Caribou (Rangifer tarandus caribou pop. 15) in British Columbia. Movebank Data Repository. https://doi.org/10.5441/001/1.p5bn656k | Graphic: Georgios Karamanis", 70)
- # ) +
- theme_void() +
- theme(
- legend.position = c(0.5, 0.6),
- legend.text = element_text(size = 11, colour = "#F9EED9"),
- legend.title = element_text(size = 16, hjust = 0.5, colour = "#F9EED9"),
- panel.spacing.x = unit(3, "lines"),
- plot.margin = margin(20, 20, 20, 20),
- plot.background = element_rect(fill = "#7A6A4F", colour = NA),
- strip.text = element_text(colour = "#F9EED9", size = 18),
- plot.title = element_text(colour = "white", size = 20, hjust = 0, lineheight = 1),
- plot.subtitle = element_text(colour = "white", size = 12, hjust = 0, lineheight = 1, margin = margin(10, 0, 50, 0)),
- plot.caption = element_text(colour = "grey80", size = 7, hjust = 1, margin = margin(30, 0, 10, 0))
- )
<- locations %>%
- location_with_speed ::group_by(animal_id) %>%
- dplyr::mutate(
- dplyrlast_longitude = lag(longitude),
- last_latitude = lag(latitude),
- hours = as.numeric(difftime(timestamp, lag(timestamp), units = "hours")),
- km = geosphere::distHaversine(
- cbind(longitude, latitude), cbind(last_longitude, last_latitude)
- / 1000,
- ) speed = km / hours
- %>%
- ) ::ungroup()
- dplyr
- location_with_speed
## # A tibble: 249,450 x 12
-## event_id animal_id study_site season
-## <dbl> <chr> <chr> <chr>
-## 1 2.26e9 GR_C01 Graham Winter
-## 2 2.26e9 GR_C01 Graham Winter
-## 3 2.26e9 GR_C01 Graham Winter
-## 4 2.26e9 GR_C01 Graham Winter
-## 5 2.26e9 GR_C01 Graham Winter
-## 6 2.26e9 GR_C01 Graham Winter
-## 7 2.26e9 GR_C01 Graham Winter
-## 8 2.26e9 GR_C01 Graham Winter
-## 9 2.26e9 GR_C01 Graham Winter
-## 10 2.26e9 GR_C01 Graham Winter
-## # ... with 249,440 more rows, and 8 more variables:
-## # timestamp <dttm>, longitude <dbl>, latitude <dbl>,
-## # last_longitude <dbl>, last_latitude <dbl>,
-## # hours <dbl>, km <dbl>, speed <dbl>
-%>%
- location_with_speed ggplot(aes(x = speed)) +
- geom_histogram() +
- scale_x_log10()
library(gganimate)
-
-%>%
- example_animal ggplot(aes(x = longitude, y = latitude)) +
- geom_point() +
- transition_time(time = timestamp) +
- shadow_mark(past = TRUE) +
- labs(title = "date is {frame_time}")
<- locations %>%
- df ::filter(
- dplyr== "Graham",
- study_site year(timestamp) == 2002
- %>%
- ) ::group_by(animal_id) %>%
- dplyr::filter(
- dplyras_date(min(timestamp)) == "2002-01-01",
- as_date(max(timestamp)) == "2002-12-31"
- %>%
- ) ::ungroup() %>%
- dplyr::mutate(date = as_date(timestamp)) %>%
- dplyr::group_by(animal_id, date) %>%
- dplyr::summarise(
- dplyrlongitude_centroid = mean(longitude),
- latitude_centroid = mean(latitude)
- %>%
- ) ::ungroup() %>%
- dplyr::complete(animal_id, date) %>%
- tidyr::arrange(animal_id, date) %>%
- dplyr::fill(longitude_centroid, latitude_centroid, .direction = "down") tidyr
<- df %>%
- p ggplot(aes(longitude_centroid, latitude_centroid, colour = animal_id)) +
- geom_point(size = 2) +
- coord_map() +
- theme_void() +
- theme(legend.position = "none") +
- transition_time(time = date) +
- shadow_mark(alpha = 0.2, size = 0.8) +
- ggtitle("Caribou location on {frame_time}")
- p
library(tidyverse)
-library(lubridate)
-library(maps)
-library(viridis)
-library(ggrepel)
-library(paletteer)
-library(shadowtext)
-library(showtext)
-showtext_auto()
新型冠状病毒(俗称武汉肺炎)疫情在多国蔓延,本章通过分析疫情数据,了解疫情发展,祝愿人类早日会战胜病毒!
-我们打开链接https://github.com/CSSEGISandData/COVID-19,
- -找到疫情时间序列数据,你可以通过点击该网页Clone or download
直接下载的方式获取数据。
假定你已经下载了数据,比如time_series_covid19_confirmed_global.csv
, 那么我们可以用readr::read_csv()
函数直接读取, 关于在R语言里文件读取的方法可以参考第 5 章。
<- read_csv("./demo_data/time_series_covid19_confirmed_global.csv")
- d d
## # A tibble: 256 x 74
-## `Province/State` `Country/Region` Lat Long
-## <chr> <chr> <dbl> <dbl>
-## 1 <NA> Afghanistan 33 65
-## 2 <NA> Albania 41.2 20.2
-## 3 <NA> Algeria 28.0 1.66
-## 4 <NA> Andorra 42.5 1.52
-## 5 <NA> Angola -11.2 17.9
-## 6 <NA> Antigua and Bar~ 17.1 -61.8
-## 7 <NA> Argentina -38.4 -63.6
-## 8 <NA> Armenia 40.1 45.0
-## 9 Australian Capi~ Australia -35.5 149.
-## 10 New South Wales Australia -33.9 151.
-## # ... with 246 more rows, and 70 more variables:
-## # `1/22/20` <dbl>, `1/23/20` <dbl>, `1/24/20` <dbl>,
-## # `1/25/20` <dbl>, `1/26/20` <dbl>, `1/27/20` <dbl>,
-## # `1/28/20` <dbl>, `1/29/20` <dbl>, `1/30/20` <dbl>,
-## # `1/31/20` <dbl>, `2/1/20` <dbl>, `2/2/20` <dbl>,
-## # `2/3/20` <dbl>, `2/4/20` <dbl>, `2/5/20` <dbl>,
-## # `2/6/20` <dbl>, `2/7/20` <dbl>, `2/8/20` <dbl>,
-## # `2/9/20` <dbl>, `2/10/20` <dbl>, `2/11/20` <dbl>,
-## # `2/12/20` <dbl>, `2/13/20` <dbl>, `2/14/20` <dbl>,
-## # `2/15/20` <dbl>, `2/16/20` <dbl>, `2/17/20` <dbl>,
-## # `2/18/20` <dbl>, `2/19/20` <dbl>, `2/20/20` <dbl>,
-## # `2/21/20` <dbl>, `2/22/20` <dbl>, `2/23/20` <dbl>,
-## # `2/24/20` <dbl>, `2/25/20` <dbl>, `2/26/20` <dbl>,
-## # `2/27/20` <dbl>, `2/28/20` <dbl>, `2/29/20` <dbl>,
-## # `3/1/20` <dbl>, `3/2/20` <dbl>, `3/3/20` <dbl>,
-## # `3/4/20` <dbl>, `3/5/20` <dbl>, `3/6/20` <dbl>,
-## # `3/7/20` <dbl>, `3/8/20` <dbl>, `3/9/20` <dbl>,
-## # `3/10/20` <dbl>, `3/11/20` <dbl>, `3/12/20` <dbl>,
-## # `3/13/20` <dbl>, `3/14/20` <dbl>, `3/15/20` <dbl>,
-## # `3/16/20` <dbl>, `3/17/20` <dbl>, `3/18/20` <dbl>,
-## # `3/19/20` <dbl>, `3/20/20` <dbl>, `3/21/20` <dbl>,
-## # `3/22/20` <dbl>, `3/23/20` <dbl>, `3/24/20` <dbl>,
-## # `3/25/20` <dbl>, `3/26/20` <dbl>, `3/27/20` <dbl>,
-## # `3/28/20` <dbl>, `3/29/20` <dbl>, `3/30/20` <dbl>,
-## # `3/31/20` <dbl>
-探索数据之前,我们一定要对数据存储结构、数据变量名及其含义要非常清楚,重要的事情说三遍。
-glimpse(d)
## Rows: 256
-## Columns: 74
-## $ `Province/State` <chr> NA, NA, NA, NA, NA, NA, N...
-## $ `Country/Region` <chr> "Afghanistan", "Albania",...
-## $ Lat <dbl> 33.00, 41.15, 28.03, 42.5...
-## $ Long <dbl> 65.000, 20.168, 1.660, 1....
-## $ `1/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/24/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/25/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/26/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/27/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/28/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/29/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/30/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `1/31/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/1/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/2/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/3/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/4/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/5/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/6/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/7/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/8/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/9/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/10/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/11/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/12/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/13/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/14/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/15/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/16/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/17/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/18/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/19/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/20/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/21/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/22/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/23/20` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/24/20` <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0...
-## $ `2/25/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0...
-## $ `2/26/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0...
-## $ `2/27/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0...
-## $ `2/28/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0...
-## $ `2/29/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0...
-## $ `3/1/20` <dbl> 1, 0, 1, 0, 0, 0, 0, 1, 0...
-## $ `3/2/20` <dbl> 1, 0, 3, 1, 0, 0, 0, 1, 0...
-## $ `3/3/20` <dbl> 1, 0, 5, 1, 0, 0, 1, 1, 0...
-## $ `3/4/20` <dbl> 1, 0, 12, 1, 0, 0, 1, 1, ...
-## $ `3/5/20` <dbl> 1, 0, 12, 1, 0, 0, 1, 1, ...
-## $ `3/6/20` <dbl> 1, 0, 17, 1, 0, 0, 2, 1, ...
-## $ `3/7/20` <dbl> 1, 0, 17, 1, 0, 0, 8, 1, ...
-## $ `3/8/20` <dbl> 4, 0, 19, 1, 0, 0, 12, 1,...
-## $ `3/9/20` <dbl> 4, 2, 20, 1, 0, 0, 12, 1,...
-## $ `3/10/20` <dbl> 5, 10, 20, 1, 0, 0, 17, 1...
-## $ `3/11/20` <dbl> 7, 12, 20, 1, 0, 0, 19, 1...
-## $ `3/12/20` <dbl> 7, 23, 24, 1, 0, 0, 19, 4...
-## $ `3/13/20` <dbl> 7, 33, 26, 1, 0, 1, 31, 8...
-## $ `3/14/20` <dbl> 11, 38, 37, 1, 0, 1, 34, ...
-## $ `3/15/20` <dbl> 16, 42, 48, 1, 0, 1, 45, ...
-## $ `3/16/20` <dbl> 21, 51, 54, 2, 0, 1, 56, ...
-## $ `3/17/20` <dbl> 22, 55, 60, 39, 0, 1, 68,...
-## $ `3/18/20` <dbl> 22, 59, 74, 39, 0, 1, 79,...
-## $ `3/19/20` <dbl> 22, 64, 87, 53, 0, 1, 97,...
-## $ `3/20/20` <dbl> 24, 70, 90, 75, 1, 1, 128...
-## $ `3/21/20` <dbl> 24, 76, 139, 88, 2, 1, 15...
-## $ `3/22/20` <dbl> 40, 89, 201, 113, 2, 1, 2...
-## $ `3/23/20` <dbl> 40, 104, 230, 133, 3, 3, ...
-## $ `3/24/20` <dbl> 74, 123, 264, 164, 3, 3, ...
-## $ `3/25/20` <dbl> 84, 146, 302, 188, 3, 3, ...
-## $ `3/26/20` <dbl> 94, 174, 367, 224, 4, 7, ...
-## $ `3/27/20` <dbl> 110, 186, 409, 267, 4, 7,...
-## $ `3/28/20` <dbl> 110, 197, 454, 308, 5, 7,...
-## $ `3/29/20` <dbl> 120, 212, 511, 334, 7, 7,...
-## $ `3/30/20` <dbl> 170, 223, 584, 370, 7, 7,...
-## $ `3/31/20` <dbl> 174, 243, 716, 376, 7, 7,...
-select()
%>% select(-c(1:4))
- d %>% select(5:ncol(.))
- d %>% select(matches("/20"))
- d %>% select(ends_with("/20"))
- d
-# 应该还有其他的方法
pivot_longer()
宽表格变长表格,需要用到pivot_longer()
和 pivot_wider()
, 比如
table4a
## # A tibble: 3 x 3
-## country `1999` `2000`
-## * <chr> <int> <int>
-## 1 Afghanistan 745 2666
-## 2 Brazil 37737 80488
-## 3 China 212258 213766
-<- table4a %>%
- longer pivot_longer(
- cols = `1999`:`2000`,
- names_to = "year",
- values_to = "cases"
-
- )
- longer
## # A tibble: 6 x 3
-## country year cases
-## <chr> <chr> <int>
-## 1 Afghanistan 1999 745
-## 2 Afghanistan 2000 2666
-## 3 Brazil 1999 37737
-## 4 Brazil 2000 80488
-## 5 China 1999 212258
-## 6 China 2000 213766
-pivot_wider()
有时候我们想折腾下,比如把长表格再变回宽表格
-%>%
- longer pivot_wider(
- names_from = year,
- values_from = cases
- )
## # A tibble: 3 x 3
-## country `1999` `2000`
-## <chr> <int> <int>
-## 1 Afghanistan 745 2666
-## 2 Brazil 37737 80488
-## 3 China 212258 213766
-有时候,我会遇到日期date
这种数据类型,我推荐使用lubridate
包来处理,比如
c("2020-3-25", "20200325", "20-03-25", "2020 03 25") %>% lubridate::ymd()
## [1] "2020-03-25" "2020-03-25" "2020-03-25" "2020-03-25"
-c("3/25/20", "03-25-20", "3-25/2020") %>% lubridate::mdy()
## [1] "2020-03-25" "2020-03-25" "2020-03-25"
-遇到这种010210
日期的,请把输入数据的人扁一顿,他会告诉你的
::dmy(010210)
- lubridate::dym(010210)
- lubridate::mdy(010210)
- lubridate::myd(010210)
- lubridate::ymd(010210)
- lubridate::ydm(010210) lubridate
difftime(ymd("2020-03-24"),
-ymd("2020-03-23"),
- units = "days"
- )
## Time difference of 1 days
-或者更直观的表述
-ymd("2020-03-24") - ymd("2020-03-23")
## Time difference of 1 days
-转换为天数
-ymd("2020-03-24") - ymd("2020-03-23")) %>% as.numeric() (
## [1] 1
-<- tibble(
- tb days_since_100 = 0:18,
- cases = 100 * 1.33^days_since_100
-
- )
-
-<- tb %>%
- p1 ggplot(aes(days_since_100, cases)) +
- geom_line(size = 0.8) +
- geom_point(pch = 21, size = 1)
-
-<- tb %>%
- p2 ggplot(aes(days_since_100, log10(cases))) +
- geom_line(size = 0.8) +
- geom_point(pch = 21, size = 1)
-
-
-<- tb %>%
- p3 ggplot(aes(days_since_100, cases)) +
- geom_line(size = 0.8) +
- geom_point(pch = 21, size = 1) +
- scale_y_log10()
-
-library(patchwork)
-+ p2 + p3 p1
<- d %>%
- d1 pivot_longer(
- cols = 5:ncol(.),
- names_to = "date",
- values_to = "cases"
- %>%
- ) mutate(date = lubridate::mdy(date)) %>%
- ::clean_names() %>%
- janitorgroup_by(country_region, date) %>%
- summarise(cases = sum(cases)) %>%
- ungroup()
-
- d1
## # A tibble: 12,600 x 3
-## country_region date cases
-## <chr> <date> <dbl>
-## 1 Afghanistan 2020-01-22 0
-## 2 Afghanistan 2020-01-23 0
-## 3 Afghanistan 2020-01-24 0
-## 4 Afghanistan 2020-01-25 0
-## 5 Afghanistan 2020-01-26 0
-## 6 Afghanistan 2020-01-27 0
-## 7 Afghanistan 2020-01-28 0
-## 8 Afghanistan 2020-01-29 0
-## 9 Afghanistan 2020-01-30 0
-## 10 Afghanistan 2020-01-31 0
-## # ... with 12,590 more rows
-%>%
- d1 group_by(date) %>%
- summarise(confirmed = sum(cases))
## # A tibble: 70 x 2
-## date confirmed
-## <date> <dbl>
-## 1 2020-01-22 555
-## 2 2020-01-23 654
-## 3 2020-01-24 941
-## 4 2020-01-25 1434
-## 5 2020-01-26 2118
-## 6 2020-01-27 2927
-## 7 2020-01-28 5578
-## 8 2020-01-29 6166
-## 9 2020-01-30 8234
-## 10 2020-01-31 9927
-## # ... with 60 more rows
-【WHO:2019冠状病毒全球大流行正在“加速”】世界卫生组织(WHO)昨日发出警告,指2019冠状病毒全球感染者已超过30万人,全球大流行正在“加速”。世卫组织指,从首例病例报告到感染者达到10万人用了67天;感染人数增至20万用了11天;从20万到突破30万则只用了4天。
-%>%
- d1 group_by(date) %>%
- summarise(confirmed = sum(cases)) %>%
- ggplot(aes(x = date, y = confirmed)) +
- geom_point() +
- scale_x_date(
- date_labels = "%m-%d",
- date_breaks = "1 week"
- +
- ) scale_y_continuous(
- breaks = c(0, 50000, 100000, 200000, 300000, 500000, 900000),
- labels = scales::comma
- )
# d1 %>% distinct(country_region) %>% pull(country_region)
-%>% distinct(country_region) d1
## # A tibble: 180 x 1
-## country_region
-## <chr>
-## 1 Afghanistan
-## 2 Albania
-## 3 Algeria
-## 4 Andorra
-## 5 Angola
-## 6 Antigua and Barbuda
-## 7 Argentina
-## 8 Armenia
-## 9 Australia
-## 10 Austria
-## # ... with 170 more rows
-%>%
- d1 filter(country_region == "China")
## # A tibble: 70 x 3
-## country_region date cases
-## <chr> <date> <dbl>
-## 1 China 2020-01-22 548
-## 2 China 2020-01-23 643
-## 3 China 2020-01-24 920
-## 4 China 2020-01-25 1406
-## 5 China 2020-01-26 2075
-## 6 China 2020-01-27 2877
-## 7 China 2020-01-28 5509
-## 8 China 2020-01-29 6087
-## 9 China 2020-01-30 8141
-## 10 China 2020-01-31 9802
-## # ... with 60 more rows
-%>%
- d1 filter(country_region == "China") %>%
- ggplot(aes(x = date, y = cases)) +
- geom_point() +
- scale_x_date(date_breaks = "1 week", date_labels = "%m-%d") +
- scale_y_log10(labels = scales::comma)
%>%
- d1 group_by(country_region) %>%
- filter(max(cases) >= 20000) %>%
- ungroup() %>%
- ggplot(aes(x = date, y = cases, color = country_region)) +
- geom_point() +
- scale_x_date(date_breaks = "1 week", date_labels = "%m-%d") +
- scale_y_log10() +
- facet_wrap(vars(country_region), ncol = 2) +
- theme(
- axis.text.x = element_text(angle = 45, hjust = 1)
- +
- ) theme(legend.position = "none")
网站https://www.ft.com/coronavirus-latest 这张图很受关注,于是打算重复
-这张图想表达的是,出现100个案例后,各国确诊人数的爆发趋势
-那么,我们需要对数据的时间轴做相应的变形
-100
的国家case >= 100
的日期,date[cases >= 100]
min(date[cases >= 100])
mutate( days_since_100 = date - min(date[cases >= 100])
days_since_100
转换成数值型as.numeric()
<- d1 %>%
- d2 group_by(country_region) %>%
- filter(max(cases) >= 100) %>%
- mutate(
- days_since_100 = date - min(date[cases >= 100])
- %>%
- ) mutate(days_since_100 = as.numeric(days_since_100)) %>%
- filter(days_since_100 >= 0) %>%
- ungroup()
- d2
## # A tibble: 1,710 x 4
-## country_region date cases days_since_100
-## <chr> <date> <dbl> <dbl>
-## 1 Afghanistan 2020-03-27 110 0
-## 2 Afghanistan 2020-03-28 110 1
-## 3 Afghanistan 2020-03-29 120 2
-## 4 Afghanistan 2020-03-30 170 3
-## 5 Afghanistan 2020-03-31 174 4
-## 6 Albania 2020-03-23 104 0
-## 7 Albania 2020-03-24 123 1
-## 8 Albania 2020-03-25 146 2
-## 9 Albania 2020-03-26 174 3
-## 10 Albania 2020-03-27 186 4
-## # ... with 1,700 more rows
--大家都谈过恋爱,也有可能失恋。大家失恋时间是不同的,若把失恋的当天作为第 0 day, 就可以比较失恋若干天后每个人精神波动情况。参照《失恋33天》 -
-<- d2 %>%
- d2_most group_by(country_region) %>%
- top_n(1, days_since_100) %>%
- filter(cases >= 10000) %>%
- ungroup() %>%
- arrange(desc(cases))
- d2_most
## # A tibble: 13 x 4
-## country_region date cases days_since_100
-## <chr> <date> <dbl> <dbl>
-## 1 US 2020-03-31 188172 28
-## 2 Italy 2020-03-31 105792 37
-## 3 Spain 2020-03-31 95923 29
-## 4 China 2020-03-31 82279 69
-## 5 Germany 2020-03-31 71808 30
-## 6 France 2020-03-31 52827 31
-## 7 Iran 2020-03-31 44605 34
-## 8 United Kingdom 2020-03-31 25481 26
-## 9 Switzerland 2020-03-31 16605 26
-## 10 Turkey 2020-03-31 13531 12
-## 11 Belgium 2020-03-31 12775 25
-## 12 Netherlands 2020-03-31 12667 25
-## 13 Austria 2020-03-31 10180 23
-%>%
- d2 bind_rows(
- tibble(country = "33% daily rise", days_since_100 = 0:30) %>%
- mutate(cases = 100 * 1.33^days_since_100)
- %>%
- )
- ggplot(aes(days_since_100, cases, color = country_region)) +
- geom_hline(yintercept = 100) +
- geom_vline(xintercept = 0) +
- geom_line(size = 0.8) +
- geom_point(pch = 21, size = 1) +
- # scale_colour_manual(
-# values = c(
-# "US" = "#EB5E8D",
-# "Italy" = "black",
-# "Spain" = "#c2b7af",
-# "China" = "red",
-# "Germany" = "#c2b7af",
-# "France" = "#c2b7af",
-# "Iran" = "#9dbf57",
-# "United Kingdom" = "#ce3140",
-# "Korea, South" = "#208fce",
-# "Japan" = "#208fce",
-# "Singapore" = "#1E8FCC",
-# "33% daily rise" = "#D9CCC3",
-# "Switzerland" = "#c2b7af",
-# "Turkey" = "#208fce",
-# "Belgium" = "#c2b7af",
-# "Netherlands" = "#c2b7af",
-# "Austria" = "#c2b7af",
-# "Hong Kong" = "#1E8FCC",
-# # gray
-# "India" = "#c2b7af",
-# "Switzerland" = "#c2b7af",
-# "Belgium" = "#c2b7af",
-# "Norway" = "#c2b7af",
-# "Sweden" = "#c2b7af",
-# "Austria" = "#c2b7af",
-# "Australia" = "#c2b7af",
-# "Denmark" = "#c2b7af",
-# "Canada" = "#c2b7af",
-# "Brazil" = "#c2b7af",
-# "Portugal" = "#c2b7af"
-# )
-# ) +
-
- geom_shadowtext(
- data = d2_most, aes(label = paste0(" ", country_region)),
- bg.color = "white"
- +
- ) scale_y_log10(
- expand = expansion(mult = c(0, .1)),
- breaks = c(100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000),
- labels = scales::comma
- +
- ) scale_x_continuous(
- expand = expansion(mult = c(0, .1)),
- breaks = c(0, 5, 10, 15, 20, 25, 30)
- +
- ) theme_minimal() +
- theme(
- panel.grid.minor = element_blank(),
- plot.background = element_rect(fill = "#FFF1E6"),
- legend.position = "none",
- panel.spacing = margin(3, 15, 3, 15, "mm")
- +
- ) labs(
- x = "Number of days since 100th case",
- y = "",
- title = "Country by country: how coronavirus case trajectories compare",
- subtitle = "Cumulative number of cases, by Number of days since 100th case",
- caption = "data source from @www.ft.com"
- )
有点乱,还有很多细节没有实现,后面再弄弄了
-<- d1 %>%
- d2a group_by(country_region) %>%
- filter(cases >= 100) %>%
- mutate(days_since_100 = 0:(n() - 1)) %>%
- # same as
- # mutate(edate = as.numeric(date - min(date)))
- ungroup()
- d2a
## # A tibble: 1,710 x 4
-## country_region date cases days_since_100
-## <chr> <date> <dbl> <int>
-## 1 Afghanistan 2020-03-27 110 0
-## 2 Afghanistan 2020-03-28 110 1
-## 3 Afghanistan 2020-03-29 120 2
-## 4 Afghanistan 2020-03-30 170 3
-## 5 Afghanistan 2020-03-31 174 4
-## 6 Albania 2020-03-23 104 0
-## 7 Albania 2020-03-24 123 1
-## 8 Albania 2020-03-25 146 2
-## 9 Albania 2020-03-26 174 3
-## 10 Albania 2020-03-27 186 4
-## # ... with 1,700 more rows
-这里的d2a
和d2
是一样的了,但方法简单很多。
<- d2a %>%
- d3 group_by(country_region) %>%
- filter(days_since_100 == max(days_since_100)) %>%
- # same as
- # top_n(1, days_since_100) %>%
- ungroup() %>%
- arrange(desc(days_since_100))
- d3
## # A tibble: 110 x 4
-## country_region date cases days_since_100
-## <chr> <date> <dbl> <int>
-## 1 China 2020-03-31 82279 69
-## 2 Diamond Princess 2020-03-31 712 50
-## 3 Korea, South 2020-03-31 9786 40
-## 4 Japan 2020-03-31 1953 39
-## 5 Italy 2020-03-31 105792 37
-## 6 Iran 2020-03-31 44605 34
-## 7 France 2020-03-31 52827 31
-## 8 Singapore 2020-03-31 926 31
-## 9 Germany 2020-03-31 71808 30
-## 10 Spain 2020-03-31 95923 29
-## # ... with 100 more rows
-<- d3 %>%
- highlight top_n(10, days_since_100) %>%
- pull(country_region)
- highlight
## [1] "China" "Diamond Princess"
-## [3] "Korea, South" "Japan"
-## [5] "Italy" "Iran"
-## [7] "France" "Singapore"
-## [9] "Germany" "Spain"
-%>%
- d2a bind_rows(
- tibble(country = "33% daily rise", days_since_100 = 0:30) %>%
- mutate(cases = 100 * 1.33^days_since_100)
- %>%
- ) ggplot(aes(days_since_100, cases, color = country_region)) +
- geom_hline(yintercept = 100) +
- geom_vline(xintercept = 0) +
- geom_line(size = 0.8) +
- geom_point(pch = 21, size = 1) +
- scale_y_log10(
- expand = expansion(mult = c(0, .1)),
- breaks = c(100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000, 100000),
- labels = scales::comma
- +
- ) scale_x_continuous(
- expand = expansion(mult = c(0, .1)),
- breaks = c(0, 5, 10, 15, 20, 25, 30, 40, 50, 60)
- +
- ) theme_minimal() +
- theme(
- panel.grid.minor = element_blank(),
- plot.background = element_rect(fill = "#FFF1E6"),
- legend.position = "none",
- panel.spacing = margin(3, 15, 3, 15, "mm")
- +
- ) labs(
- x = "Number of days since 100th case",
- y = "",
- title = "Country by country: how coronavirus case trajectories compare",
- subtitle = "Cumulative number of cases, by Number of days since 100th case",
- caption = "data source from @www.ft.com"
- +
- ) ::gghighlight(country_region %in% highlight,
- gghighlightlabel_key = country_region, use_direct_label = TRUE,
- label_params = list(segment.color = NA, nudge_x = 1),
- use_group_by = FALSE
- )
灰色线条的国家名,有点不好弄,在想办法
-笨办法,实际上是4张表共同完成
-<- c(
- highlight "China", "Spain", "US", "United Kingdom", "Korea, South",
- "Italy", "Japan", "Singapore", "Germany", "France", "Iran"
-
- )
-<- c(
- gray "India", "Switzerland", "Belgium", "Netherlands",
- "Sweden", "Austria", "Australia", "Denmark",
- "Canada", "Brazil", "Portugal"
-
- )
-<- d2a %>% filter(country_region %in% highlight)
- d3_highlight
-<- d2a %>% filter(country_region %in% gray) d3_gray
%>%
- d2a ggplot(aes(days_since_100, cases, group = country_region)) +
- geom_hline(yintercept = 100) +
- geom_vline(xintercept = 0) +
- geom_line(size = 0.8, color = "gray70") +
- geom_point(pch = 21, size = 1, color = "gray70") +
-
-# highlight country
- geom_line(data = d3_highlight, aes(color = country_region)) +
- geom_point(data = d3_highlight, aes(color = country_region)) +
- geom_text(
- data = d3_highlight %>%
- group_by(country_region) %>%
- top_n(1, days_since_100) %>%
- ungroup(),
- aes(color = country_region, label = country_region),
- hjust = 0,
- vjust = 0,
- nudge_x = 0.5
- +
- )
-
-# gray country
- geom_text(
- data = d3_gray %>%
- group_by(country_region) %>%
- top_n(1, days_since_100) %>%
- ungroup(),
- aes(label = country_region),
- color = "gray50",
- hjust = 0,
- vjust = 0,
- nudge_x = 0.5
- +
- ) geom_point(
- data = d3_gray %>%
- group_by(country_region) %>%
- top_n(1, days_since_100) %>%
- ungroup(),
- size = 2,
- color = "gray50"
- +
- ) scale_y_log10(
- expand = expansion(mult = c(0, .1)),
- breaks = c(100, 200, 500, 2000, 5000, 10000, 20000, 50000, 100000, 150000),
- labels = scales::comma
- +
- ) scale_x_continuous(
- expand = expansion(mult = c(0, .1)),
- breaks = c(0, 5, 10, 15, 20, 25, 30, 40, 50, 60)
- +
- ) theme_minimal() +
- theme(
- panel.grid.minor = element_blank(),
- plot.background = element_rect(fill = "#FFF1E6"),
- legend.position = "none",
- panel.spacing = margin(3, 15, 3, 15, "mm")
- +
- ) labs(
- x = "Number of days since 100th case",
- y = "",
- title = "Country by country: how coronavirus case trajectories compare",
- subtitle = "Cumulative number of cases, by Number of days since 100th case",
- caption = "data source from @www.ft.com"
- )
差强人意,再想想有没有好的办法
-对数据框d2a增加两列属性(有无标签,有无颜色),然后手动改颜色
-<- d2a %>%
- highlight_country group_by(country_region) %>%
- filter(days_since_100 == max(days_since_100)) %>%
- ungroup() %>%
- arrange(desc(days_since_100)) %>%
- top_n(10, days_since_100) %>%
- pull(country_region)
-
- highlight_country
## [1] "China" "Diamond Princess"
-## [3] "Korea, South" "Japan"
-## [5] "Italy" "Iran"
-## [7] "France" "Singapore"
-## [9] "Germany" "Spain"
-
-## Colors
-<- c(prismatic::clr_darken(paletteer_d("ggsci::category20_d3"), 0.2)[1:length(highlight_country)], "gray70")
- cgroup_cols ::show_col(cgroup_cols) scales
%>%
- d2a group_by(country_region) %>%
- filter(max(days_since_100) > 9) %>%
- mutate(
- end_label = ifelse(days_since_100 == max(days_since_100), country_region, NA_character_)
- %>%
- ) mutate(end_label = case_when(country_region %in% highlight_country ~ end_label,
- TRUE ~ NA_character_),
- cgroup = case_when(country_region %in% highlight_country ~ country_region,
- TRUE ~ "ZZOTHER")) %>% # length(highlight_country) + gray
-
-
- ggplot(aes(x = days_since_100, y = cases,
- color = cgroup, label = end_label,
- group = country_region)) +
- geom_line(size = 0.8) +
- geom_text_repel(nudge_x = 1.1,
- nudge_y = 0.1,
- segment.color = NA) +
- guides(color = FALSE) +
- scale_color_manual(values = cgroup_cols) +
- scale_y_continuous(labels = scales::comma_format(accuracy = 1),
- breaks = 10^seq(2, 8),
- trans = "log10"
- +
- ) labs(x = "Days Since 100 Confirmed Death",
- y = "Cumulative Number of Deaths (log10 scale)",
- title = "Cumulative Number of Reported Deaths from COVID-19, Selected Countries",
- subtitle = "Cumulative number of cases, by Number of days since 100th case",
- caption = "data source from @www.ft.com")
感觉这样是最好的方案。
-%>%
- d2 group_by(country_region) %>%
- filter(max(cases) >= 1000) %>%
- ungroup()
## # A tibble: 1,060 x 4
-## country_region date cases days_since_100
-## <chr> <date> <dbl> <dbl>
-## 1 Argentina 2020-03-20 128 0
-## 2 Argentina 2020-03-21 158 1
-## 3 Argentina 2020-03-22 266 2
-## 4 Argentina 2020-03-23 301 3
-## 5 Argentina 2020-03-24 387 4
-## 6 Argentina 2020-03-25 387 5
-## 7 Argentina 2020-03-26 502 6
-## 8 Argentina 2020-03-27 589 7
-## 9 Argentina 2020-03-28 690 8
-## 10 Argentina 2020-03-29 745 9
-## # ... with 1,050 more rows
-%>%
- d2 group_by(country_region) %>%
- filter(max(cases) >= 1000) %>%
- ungroup() %>%
- ggplot(aes(days_since_100, cases)) +
- geom_line(size = 0.8) +
- geom_line(
- data = d2 %>% rename(country = country_region),
- aes(days_since_100, cases, group = country),
- color = "grey"
- +
- ) geom_point(pch = 21, size = 1, color = "red") +
- scale_y_log10(
- expand = expansion(mult = c(0, .1)),
- breaks = c(100, 1000, 10000, 50000)
- +
- ) scale_x_continuous(
- expand = expansion(mult = c(0, 0)),
- breaks = c(0, 5, 10, 20, 30, 50)
- +
- ) facet_wrap(vars(country_region), scales = "free_x") +
- theme(
- panel.background = element_rect(fill = "#FFF1E6"),
- plot.background = element_rect(fill = "#FFF1E6")
- +
- ) labs(
- x = "Number of days since 100th case",
- y = "",
- title = "Outbreak are now underway in dozens of other countries, with some on the same trajectory as Italy",
- subtitle = "Cumulative number of cases, by Number of days since 100th case",
- caption = "data source from @www.ft.com"
- )
library(countrycode)
-# countrycode('Albania', 'country.name', 'iso3c')
-
-%>%
- d2_newest mutate(ISO3 = countrycode(country_region,
- origin = "country.name", destination = "iso3c"
- ))
我们选取最新的日期
-<- d %>%
- d_newest select(Long, Lat, last_col()) %>%
- set_names("Long", "Lat", "newest_date")
- d_newest
## # A tibble: 256 x 3
-## Long Lat newest_date
-## <dbl> <dbl> <dbl>
-## 1 65 33 174
-## 2 20.2 41.2 243
-## 3 1.66 28.0 716
-## 4 1.52 42.5 376
-## 5 17.9 -11.2 7
-## 6 -61.8 17.1 7
-## 7 -63.6 -38.4 1054
-## 8 45.0 40.1 532
-## 9 149. -35.5 80
-## 10 151. -33.9 2032
-## # ... with 246 more rows
-<- map_data("world")
- world
-
-ggplot() +
-geom_polygon(
- data = world,
- aes(x = long, y = lat, group = group),
- fill = "grey", alpha = 0.3
- +
- ) geom_point(
- data = d_newest,
- aes(x = Long, y = Lat, size = newest_date, color = newest_date),
- stroke = F, alpha = 0.7
- +
- ) scale_size_continuous(
- name = "Cases", trans = "log",
- range = c(1, 7),
- breaks = c(1, 20, 100, 1000, 50000),
- labels = c("1-19", "20-99", "100-999", "1,000-49,999", "50,000+")
- +
- ) scale_color_viridis_c(
- option = "inferno",
- name = "Cases",
- trans = "log",
- breaks = c(1, 20, 100, 1000, 50000),
- labels = c("1-19", "20-99", "100-999", "1,000-49,999", "50,000+")
- +
- ) theme_void() +
- guides(colour = guide_legend()) +
- labs(
- title = "Mapping the coronavirus outbreak",
- subtitle = "",
- caption = "Source: JHU Unviersity, CSSE; FT research @www.FT.com"
- +
- ) theme(
- legend.position = "bottom",
- text = element_text(color = "#22211d"),
- plot.background = element_rect(fill = "#ffffff", color = NA),
- panel.background = element_rect(fill = "#ffffff", color = NA),
- legend.background = element_rect(fill = "#ffffff", color = NA)
- )
library(tidyverse)
这是一份身高和体重的数据集
-<- read_csv("./demo_data/weight-height.csv")
- d d
## # A tibble: 10,000 x 3
-## Gender Height Weight
-## <chr> <dbl> <dbl>
-## 1 Male 73.8 242.
-## 2 Male 68.8 162.
-## 3 Male 74.1 213.
-## 4 Male 71.7 220.
-## 5 Male 69.9 206.
-## 6 Male 67.3 152.
-## 7 Male 68.8 184.
-## 8 Male 68.3 168.
-## 9 Male 67.0 176.
-## 10 Male 63.5 156.
-## # ... with 9,990 more rows
-%>% summarise(
- d across(everything(), ~ sum(is.na(.)))
- )
## # A tibble: 1 x 3
-## Gender Height Weight
-## <int> <int> <int>
-## 1 0 0 0
-常规答案
-%>%
- d ggplot(aes(x = Height, fill = Gender)) +
- geom_density(alpha = 0.5)
%>%
- d ggplot(aes(x = Height, fill = Gender)) +
- geom_density(alpha = 0.5) +
- facet_wrap(vars(Gender))
刚才我们看到了分面的操作,全局数据按照某个变量分组后,形成的若干个子集在不同的面板中分别展示出来。
-这种方法很适合子集之间对比。事实上,我们看到每个子集的情况后,还很想知道全局的情况,以及子集在全局中的分布、状态或者位置。也就说,想对比子集和全局的情况。
-所以我们期望(子集之间对比,子集与全局对比)。
-具体方法:用分面的方法高亮展示子集,同时在每个分面上添加全局(灰色背景)
-%>%
- d ggplot(aes(x = Height)) +
- geom_density() +
- facet_wrap(vars(Gender))
%>%
- d ggplot(aes(x = Height)) +
- geom_density(
- data = d %>% select(-Gender)
- +
- ) geom_density() +
- facet_wrap(vars(Gender))
%>%
- d ggplot(aes(x = Height, y = after_stat(count))) +
- geom_density(
- data = d %>% select(-Gender)
- +
- ) geom_density() +
- facet_wrap(vars(Gender))
“Male,” “Female” 是Gender已经存在的分组。另外,我们在背景图层,新增了一个组“all people”,这样,整个图就有三个分组(三个color组),那么,我们可以在scale_fill_manual中统一设置和指定。
-<- c(
- density_colors "Male" = "#247BA0",
- "Female" = "#F25F5C",
- "all people" = "grey85"
- )
%>%
- d ggplot(aes(x = Height, y = after_stat(count))) +
- geom_density(
- data = df %>% select(-Gender),
- aes(fill = "all people", color = "all people")
- +
- ) geom_density(aes(color = Gender, fill = Gender)) +
- facet_wrap(vars(Gender)) +
- scale_fill_manual(name = NULL, values = density_colors) +
- scale_color_manual(name = NULL, values = density_colors) +
- theme_minimal() +
- theme(legend.position = "bottom")
<- c(
- density_colors "Male" = "#247BA0",
- "Female" = "#F25F5C",
- "all people" = "grey80"
-
- )
-::show_col(density_colors) scales
%>%
- d ggplot(aes(x = Height, y = after_stat(count))) +
- geom_density(
- data = d %>% dplyr::select(-Gender),
- aes(fill = "all people", color = "all people")
- +
- ) geom_density(aes(color = Gender, fill = Gender)) +
- facet_wrap(vars(Gender)) +
- scale_fill_manual(name = NULL, values = density_colors) +
- scale_color_manual(name = NULL, values = density_colors) +
- theme_minimal() +
- theme(legend.position = "bottom")
或者,用不同的主题风格
-<- c(
- density_colors "Male" = "#56B4E9",
- "Female" = "#EF8A17",
- "all participants" = "grey85"
-
- )
-%>%
- d ggplot(aes(x = Height, y = after_stat(count))) +
- geom_density(
- data = function(x) dplyr::select(x, -Gender),
- aes(fill = "all participants", color = "all participants")
- +
- ) geom_density(aes(fill = Gender, color = Gender)) +
- facet_wrap(vars(Gender)) +
- scale_color_manual(name = NULL, values = density_colors) +
- scale_fill_manual(name = NULL, values = density_colors) +
- ::theme_minimal_hgrid(16) +
- cowplottheme(legend.position = "bottom", legend.justification = "center")
%>%
- d ggplot(aes(x = Weight, fill = Gender)) +
- geom_density(alpha = 0.5)
%>%
- d ggplot(aes(x = Height, y = Weight, color = Gender)) +
- geom_point()
<- lm(Weight ~ 1 + Height, data = d)
- fit summary(fit)
##
-## Call:
-## lm(formula = Weight ~ 1 + Height, data = d)
-##
-## Residuals:
-## Min 1Q Median 3Q Max
-## -51.93 -8.24 -0.12 8.26 46.84
-##
-## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) -350.7372 2.1115 -166 <2e-16 ***
-## Height 7.7173 0.0318 243 <2e-16 ***
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Residual standard error: 12.2 on 9998 degrees of freedom
-## Multiple R-squared: 0.855, Adjusted R-squared: 0.855
-## F-statistic: 5.9e+04 on 1 and 9998 DF, p-value: <2e-16
-::tidy(fit) broom
## # A tibble: 2 x 5
-## term estimate std.error statistic p.value
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 (Intercept) -351. 2.11 -166. 0
-## 2 Height 7.72 0.0318 243. 0
-%>%
- d group_by(Gender) %>%
- group_modify(
- ~ broom::tidy(lm(Weight ~ 1 + Height, data = .))
- )
## # A tibble: 4 x 6
-## # Groups: Gender [2]
-## Gender term estimate std.error statistic p.value
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Female (Interce~ -246. 3.36 -73.3 0
-## 2 Female Height 5.99 0.0526 114. 0
-## 3 Male (Interce~ -224. 3.41 -65.8 0
-## 4 Male Height 5.96 0.0494 121. 0
-%>%
- d ggplot(aes(x = Height, y = Weight, group = Gender)) +
- geom_point(aes(color = Gender)) +
- geom_smooth(method = lm)
探索性数据分析(exporatory data analysis)是各种知识的综合运用。本章通过一个案例,讲解探索性数据分析的基本思路,也算是对前面几章内容的一次总结复习。
-数据准备(对数据要做到心中有数)
-数据探索(围绕探索的目标)
-这是一个诺贝尔奖获得者的数据集,
- -library(tidyverse)
-library(lubridate)
<- read_csv("./demo_data/nobel_winners.csv")
- df df
## # A tibble: 969 x 18
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 1901 Chemist~ The ~ "\"in rec~ 1/1
-## 2 1901 Literat~ The ~ "\"in spe~ 1/1
-## 3 1901 Medicine The ~ "\"for hi~ 1/1
-## 4 1901 Peace The ~ <NA> 1/2
-## 5 1901 Peace The ~ <NA> 1/2
-## 6 1901 Physics The ~ "\"in rec~ 1/1
-## 7 1902 Chemist~ The ~ "\"in rec~ 1/1
-## 8 1902 Literat~ The ~ "\"the gr~ 1/1
-## 9 1902 Medicine The ~ "\"for hi~ 1/1
-## 10 1902 Peace The ~ <NA> 1/2
-## # ... with 959 more rows, and 13 more variables:
-## # laureate_id <dbl>, laureate_type <chr>,
-## # full_name <chr>, birth_date <date>,
-## # birth_city <chr>, birth_country <chr>,
-## # gender <chr>, organization_name <chr>,
-## # organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>
-# 如果是xlsx格式
-::read_excel("myfile.xlsx")
- readxl
-# 如果是csv格式
-::read_csv("myfile.csv") readr
-这里有个小小的提示: -
-一行就是一个诺奖获得者的记录? 确定?
-缺失值及其处理
-%>% map_df(~ sum(is.na(.))) df
## # A tibble: 1 x 18
-## prize_year category prize motivation prize_share
-## <int> <int> <int> <int> <int>
-## 1 0 0 0 88 0
-## # ... with 13 more variables: laureate_id <int>,
-## # laureate_type <int>, full_name <int>,
-## # birth_date <int>, birth_city <int>,
-## # birth_country <int>, gender <int>,
-## # organization_name <int>, organization_city <int>,
-## # organization_country <int>, death_date <int>,
-## # death_city <int>, death_country <int>
-性别缺失怎么造成的?
-%>% count(laureate_type) df
## # A tibble: 2 x 2
-## laureate_type n
-## <chr> <int>
-## 1 Individual 939
-## 2 Organization 30
-你想关心哪些问题,可能是
-%>% count(category) df
## # A tibble: 6 x 2
-## category n
-## <chr> <int>
-## 1 Chemistry 194
-## 2 Economics 83
-## 3 Literature 113
-## 4 Medicine 227
-## 5 Peace 130
-## 6 Physics 222
-%>%
- df count(category) %>%
- ggplot(aes(x = category, y = n, fill = category)) +
- geom_col() +
- geom_text(aes(label = n), vjust = -0.25) +
- labs(title = "不同学科诺贝奖获奖次数对比", x = "学科", y = "数量") +
- theme(legend.position = "none")
%>%
- df count(category) %>%
- ggplot(aes(x = fct_reorder(category, n), y = n, fill = category)) +
- geom_col() +
- geom_text(aes(label = n), vjust = -0.25) +
- labs(title = "不同学科诺贝奖获奖次数对比", x = "学科", y = "数量") +
- theme(legend.position = "none")
也可以使用别人定义好的配色方案
-library(ggthemr) # install.packages("devtools")
-# devtools::install_github('cttobin/ggthemr')
-ggthemr("dust")
-
-%>%
- df count(category) %>%
- ggplot(aes(x = fct_reorder(category, n), y = n, fill = category)) +
- geom_col() +
- labs(title = "不同学科诺贝奖获奖次数对比", x = "学科", y = "数量") +
- theme(legend.position = "none")
这个配色方案感觉挺好看的呢,比较适合我这种又挑剔又懒惰的人。
-当然,也可以自己DIY,或者使用配色网站的主题方案(https://learnui.design/tools/data-color-picker.html#palette)
-%>%
- df count(category) %>%
- ggplot(aes(x = fct_reorder(category, n), y = n)) +
- geom_col(fill = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +
- labs(title = "不同学科诺贝奖获奖次数对比", x = "学科", y = "数量") +
- theme(legend.position = "none")
让图骚动起来吧
-library(gganimate) # install.packages("gganimate", dependencies = T)
-
-%>%
- df count(category) %>%
- mutate(category = fct_reorder(category, n)) %>%
- ggplot(aes(x = category, y = n)) +
- geom_text(aes(label = n), vjust = -0.25) +
- geom_col(fill = c("#003f5c", "#444e86", "#955196", "#dd5182", "#ff6e54", "#ffa600")) +
- labs(title = "不同学科诺贝奖获奖次数对比", x = "学科", y = "数量") +
- theme(legend.position = "none") +
- transition_states(category) +
- shadow_mark(past = TRUE)
和ggplot2的分面一样,动态图可以增加数据展示的维度。
-%>%
- df ::filter(birth_country == "China") %>%
- dplyr::select(full_name, prize_year, category) dplyr
## # A tibble: 12 x 3
-## full_name prize_year category
-## <chr> <dbl> <chr>
-## 1 Walter Houser Brattain 1956 Physics
-## 2 Chen Ning Yang 1957 Physics
-## 3 Tsung-Dao (T.D.) Lee 1957 Physics
-## 4 Edmond H. Fischer 1992 Medicine
-## 5 Daniel C. Tsui 1998 Physics
-## 6 Gao Xingjian 2000 Literature
-## 7 Charles Kuen Kao 2009 Physics
-## 8 Charles Kuen Kao 2009 Physics
-## 9 Ei-ichi Negishi 2010 Chemistry
-## 10 Liu Xiaobo 2010 Peace
-## 11 Mo Yan 2012 Literature
-## 12 Youyou Tu 2015 Medicine
-我们发现获奖者有多个地址,就会有重复的情况,比如 Charles Kuen Kao在2009年Physics有两次,为什么重复计数了呢?
-下面我们去重吧, 去重可以用distinct()
函数
<- tibble::tribble(
- dt ~x, ~y, ~z,
- 1, 1, "a",
- 1, 1, "b",
- 1, 2, "c",
- 1, 2, "d"
-
- )
- dt
## # A tibble: 4 x 3
-## x y z
-## <dbl> <dbl> <chr>
-## 1 1 1 a
-## 2 1 1 b
-## 3 1 2 c
-## 4 1 2 d
-%>% distinct_at(vars(x), .keep_all = T) dt
## # A tibble: 1 x 3
-## x y z
-## <dbl> <dbl> <chr>
-## 1 1 1 a
-%>% distinct_at(vars(x, y), .keep_all = T) dt
## # A tibble: 2 x 3
-## x y z
-## <dbl> <dbl> <chr>
-## 1 1 1 a
-## 2 1 2 c
-<- df %>%
- nobel_winners mutate_if(is.character, tolower) %>%
- distinct_at(vars(full_name, prize_year, category), .keep_all = TRUE) %>%
- mutate(
- decade = 10 * (prize_year %/% 10),
- prize_age = prize_year - year(birth_date)
-
- )
- nobel_winners
## # A tibble: 911 x 20
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 1901 chemist~ the ~ "\"in rec~ 1/1
-## 2 1901 literat~ the ~ "\"in spe~ 1/1
-## 3 1901 medicine the ~ "\"for hi~ 1/1
-## 4 1901 peace the ~ <NA> 1/2
-## 5 1901 peace the ~ <NA> 1/2
-## 6 1901 physics the ~ "\"in rec~ 1/1
-## 7 1902 chemist~ the ~ "\"in rec~ 1/1
-## 8 1902 literat~ the ~ "\"the gr~ 1/1
-## 9 1902 medicine the ~ "\"for hi~ 1/1
-## 10 1902 peace the ~ <NA> 1/2
-## # ... with 901 more rows, and 15 more variables:
-## # laureate_id <dbl>, laureate_type <chr>,
-## # full_name <chr>, birth_date <date>,
-## # birth_city <chr>, birth_country <chr>,
-## # gender <chr>, organization_name <chr>,
-## # organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>
--这是时候,我们才对数据有了一个初步的了解 -
-再来看看我的祖国
-%>%
- nobel_winners ::filter(birth_country == "china") %>%
- dplyr::select(full_name, prize_year, category) dplyr
## # A tibble: 11 x 3
-## full_name prize_year category
-## <chr> <dbl> <chr>
-## 1 walter houser brattain 1956 physics
-## 2 chen ning yang 1957 physics
-## 3 tsung-dao (t.d.) lee 1957 physics
-## 4 edmond h. fischer 1992 medicine
-## 5 daniel c. tsui 1998 physics
-## 6 gao xingjian 2000 literature
-## 7 charles kuen kao 2009 physics
-## 8 ei-ichi negishi 2010 chemistry
-## 9 liu xiaobo 2010 peace
-## 10 mo yan 2012 literature
-## 11 youyou tu 2015 medicine
-%>% count(full_name, sort = T) nobel_winners
## # A tibble: 904 x 2
-## full_name n
-## <chr> <int>
-## 1 "comité international de la croix rouge (inte~ 3
-## 2 "frederick sanger" 2
-## 3 "john bardeen" 2
-## 4 "linus carl pauling" 2
-## 5 "marie curie, née sklodowska" 2
-## 6 "office of the united nations high commission~ 2
-## 7 " lie ducommun" 1
-## 8 "a. michael spence" 1
-## 9 "aage niels bohr" 1
-## 10 "aaron ciechanover" 1
-## # ... with 894 more rows
-%>%
- nobel_winners group_by(full_name) %>%
- mutate(
- number_prize = n(),
- number_cateory = n_distinct(category)
- %>%
- ) arrange(desc(number_prize), full_name) %>%
- ::filter(number_cateory == 2) dplyr
## # A tibble: 4 x 22
-## # Groups: full_name [2]
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 1954 chemist~ the ~ "\"for hi~ 1/1
-## 2 1962 peace the ~ <NA> 1/1
-## 3 1903 physics the ~ "\"in rec~ 1/4
-## 4 1911 chemist~ the ~ "\"in rec~ 1/1
-## # ... with 17 more variables: laureate_id <dbl>,
-## # laureate_type <chr>, full_name <chr>,
-## # birth_date <date>, birth_city <chr>,
-## # birth_country <chr>, gender <chr>,
-## # organization_name <chr>, organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>, number_prize <int>,
-## # number_cateory <int>
-%>%
- nobel_winners count(prize_age) %>%
- ggplot(aes(x = prize_age, y = n)) +
- geom_col()
%>%
- nobel_winners group_by(category) %>%
- summarise(mean_prize_age = mean(prize_age, na.rm = T))
## # A tibble: 6 x 2
-## category mean_prize_age
-## <chr> <dbl>
-## 1 chemistry 58.0
-## 2 economics 67.2
-## 3 literature 64.7
-## 4 medicine 58.0
-## 5 peace 61.4
-## 6 physics 55.4
-%>%
- nobel_winners mutate(category = fct_reorder(category, prize_age, median, na.rm = TRUE)) %>%
- ggplot(aes(category, prize_age)) +
- geom_point() +
- geom_boxplot() +
- coord_flip()
%>%
- nobel_winners ::filter(!is.na(prize_age)) %>%
- dplyrgroup_by(decade, category) %>%
- summarize(
- average_age = mean(prize_age),
- median_age = median(prize_age)
- %>%
- ) ggplot(aes(decade, average_age, color = category)) +
- geom_line()
library(ggridges)
-
-%>%
- nobel_winners ggplot(aes(
- x = prize_age,
- y = category,
- fill = category
- +
- )) geom_density_ridges()
他们60多少岁才得诺奖,大家才23或24岁,还年轻,不用焦虑喔。
-%>%
- nobel_winners
-ggplot(aes(x = prize_age, fill = category, color = category)) +
- geom_density() +
- facet_wrap(vars(category)) +
- theme(legend.position = "none")
有同学说要一个个的画,至于group_split()
函数,下次课在讲
%>%
- nobel_winners group_split(category) %>%
- map(
- ~ ggplot(data = .x, aes(x = prize_age)) +
- geom_density() +
- ggtitle(.x$category)
- )
## [[1]]
-
-##
-## [[2]]
-
-##
-## [[3]]
-
-##
-## [[4]]
-
-##
-## [[5]]
-
-##
-## [[6]]
-
-也可以用强大的group_by() + group_map()
组合,我们会在第 21 章讲到
%>%
- nobel_winners group_by(category) %>%
- group_map(
- ~ ggplot(data = .x, aes(x = prize_age)) +
- geom_density() +
- ggtitle(.y)
- )
%>%
- nobel_winners ::filter(laureate_type == "individual") %>%
- dplyrcount(category, gender) %>%
- group_by(category) %>%
- mutate(prop = n / sum(n))
## # A tibble: 12 x 4
-## # Groups: category [6]
-## category gender n prop
-## <chr> <chr> <int> <dbl>
-## 1 chemistry female 4 0.0229
-## 2 chemistry male 171 0.977
-## 3 economics female 1 0.0128
-## 4 economics male 77 0.987
-## 5 literature female 14 0.124
-## 6 literature male 99 0.876
-## 7 medicine female 12 0.0569
-## 8 medicine male 199 0.943
-## 9 peace female 14 0.14
-## 10 peace male 86 0.86
-## 11 physics female 2 0.00980
-## 12 physics male 202 0.990
-各年代性别比例
-%>%
- nobel_winners ::filter(laureate_type == "individual") %>%
- dplyr# mutate(decade = glue::glue("{round(prize_year - 1, -1)}s")) %>%
- count(decade, category, gender) %>%
- group_by(decade, category) %>%
- mutate(prop = n / sum(n)) %>%
- ggplot(aes(decade, category, fill = prop)) +
- geom_tile(size = 0.7) +
- # geom_text(aes(label = scales::percent(prop, accuracy = .01))) +
- geom_text(aes(label = scales::number(prop, accuracy = .01))) +
- facet_grid(vars(gender)) +
- scale_fill_gradient(low = "#FDF4E9", high = "#834C0D")
library(ggbeeswarm) # install.packages("ggbeeswarm")
-
-%>%
- nobel_winners ggplot(aes(
- x = category,
- y = prize_age,
- colour = gender,
- alpha = gender
- +
- )) ::geom_beeswarm() +
- ggbeeswarmcoord_flip() +
- scale_color_manual(values = c("#BB1288", "#5867A6")) +
- scale_alpha_manual(values = c(1, .4)) +
- theme_minimal() +
- theme(legend.position = "top") +
- labs(
- title = "诺奖获得者性别不平衡",
- subtitle = "1901年-2016年数据",
- colour = "Gender",
- alpha = "Gender",
- x = "学科",
- y = "获奖年龄"
- )
%>%
- nobel_winners count(decade,
-
- category,gender = coalesce(gender, laureate_type)
- %>%
- ) group_by(decade, category) %>%
- mutate(percent = n / sum(n)) %>%
- ggplot(aes(decade, n, fill = gender)) +
- geom_col() +
- facet_wrap(~category) +
- labs(
- x = "Decade",
- y = "# of nobel prize winners",
- fill = "Gender",
- title = "Nobel Prize gender distribution over time"
- )
%>%
- nobel_winners select(category, birth_date) %>%
- mutate(year = floor(year(birth_date) / 10) * 10) %>%
- count(category, year) %>%
- ::filter(!is.na(year)) %>%
- dplyrggplot(aes(x = year, y = n)) +
- geom_col() +
- scale_x_continuous(breaks = seq(1810, 1990, 20)) +
- geom_text(aes(label = n), vjust = -0.25) +
- facet_wrap(vars(category))
课堂练习,哪位同学能把图弄得好看些?
-%>%
- nobel_winners ::filter(prize_age == min(prize_age, na.rm = T)) dplyr
## # A tibble: 1 x 20
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 2014 peace the ~ "\"for th~ 1/2
-## # ... with 15 more variables: laureate_id <dbl>,
-## # laureate_type <chr>, full_name <chr>,
-## # birth_date <date>, birth_city <chr>,
-## # birth_country <chr>, gender <chr>,
-## # organization_name <chr>, organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>
-%>%
- nobel_winners ::filter(
- dplyrrank(prize_year - year(birth_date)) == 1
- )
## # A tibble: 1 x 20
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 2014 peace the ~ "\"for th~ 1/2
-## # ... with 15 more variables: laureate_id <dbl>,
-## # laureate_type <chr>, full_name <chr>,
-## # birth_date <date>, birth_city <chr>,
-## # birth_country <chr>, gender <chr>,
-## # organization_name <chr>, organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>
-%>%
- nobel_winners arrange(
- - year(birth_date)
- prize_year )
## # A tibble: 911 x 20
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 2014 peace the ~ "\"for th~ 1/2
-## 2 1915 physics the ~ "\"for th~ 1/2
-## 3 1932 physics the ~ "\"for th~ 1/1
-## 4 1933 physics the ~ "\"for th~ 1/2
-## 5 1936 physics the ~ "\"for hi~ 1/2
-## 6 1957 physics the ~ "\"for th~ 1/2
-## 7 1923 medicine the ~ "\"for th~ 1/2
-## 8 1961 physics the ~ "\"for hi~ 1/2
-## 9 1976 peace the ~ <NA> 1/2
-## 10 2011 peace the ~ "\"for th~ 1/3
-## # ... with 901 more rows, and 15 more variables:
-## # laureate_id <dbl>, laureate_type <chr>,
-## # full_name <chr>, birth_date <date>,
-## # birth_city <chr>, birth_country <chr>,
-## # gender <chr>, organization_name <chr>,
-## # organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>
-%>%
- nobel_winners top_n(1, year(birth_date) - prize_year)
## # A tibble: 1 x 20
-## prize_year category prize motivation prize_share
-## <dbl> <chr> <chr> <chr> <chr>
-## 1 2014 peace the ~ "\"for th~ 1/2
-## # ... with 15 more variables: laureate_id <dbl>,
-## # laureate_type <chr>, full_name <chr>,
-## # birth_date <date>, birth_city <chr>,
-## # birth_country <chr>, gender <chr>,
-## # organization_name <chr>, organization_city <chr>,
-## # organization_country <chr>, death_date <date>,
-## # death_city <chr>, death_country <chr>,
-## # decade <dbl>, prize_age <dbl>
-<- nobel_winners %>%
- df1 group_by(category) %>%
- summarise(
- mean_prise_age = mean(prize_age, na.rm = T),
- total_num = n()
-
- ) df1
## # A tibble: 6 x 3
-## category mean_prise_age total_num
-## <chr> <dbl> <int>
-## 1 chemistry 58.0 175
-## 2 economics 67.2 78
-## 3 literature 64.7 113
-## 4 medicine 58.0 211
-## 5 peace 61.4 130
-## 6 physics 55.4 204
-%>%
- df1 ggplot(aes(mean_prise_age, total_num)) +
- geom_point(aes(color = category)) +
- geom_smooth(method = lm, se = FALSE)
<- nobel_winners %>%
- nobel_winners_clean mutate_at(
- vars(birth_country, death_country),
- ~ ifelse(str_detect(., "\\("), str_extract(., "(?<=\\().*?(?=\\))"), .)
- %>%
- ) mutate_at(
- vars(birth_country, death_country),
- ~ case_when(
- == "scotland" ~ "united kingdom",
- . == "northern ireland" ~ "united kingdom",
- . str_detect(., "czech") ~ "czechia",
- str_detect(., "germany") ~ "germany",
- TRUE ~ .
-
- )%>%
- ) select(full_name, prize_year, category, birth_date, birth_country, gender, organization_name, organization_country, death_country)
%>% count(death_country, sort = TRUE) nobel_winners_clean
## # A tibble: 45 x 2
-## death_country n
-## <chr> <int>
-## 1 <NA> 329
-## 2 united states of america 203
-## 3 united kingdom 79
-## 4 germany 56
-## 5 france 51
-## 6 sweden 28
-## 7 switzerland 26
-## 8 italy 14
-## 9 russia 11
-## 10 spain 10
-## # ... with 35 more rows
-%>%
- nobel_winners_clean mutate(
- colour = case_when(
- == "united states of america" ~ "#FF2B4F",
- death_country == "germany" ~ "#fcab27",
- death_country == "united kingdom" ~ "#3686d3",
- death_country == "france" ~ "#88398a",
- death_country == "switzerland" ~ "#20d4bc",
- death_country TRUE ~ "gray60"
-
- )%>%
- ) ggplot(aes(
- x = 0,
- y = fct_rev(factor(birth_country)),
- xend = death_country,
- yend = 1,
- colour = colour,
- alpha = (colour != "gray60")
- +
- )) geom_curve(
- curvature = -0.5,
- arrow = arrow(length = unit(0.01, "npc"))
- +
- ) scale_x_discrete() +
- scale_y_discrete() +
- scale_color_identity() +
- scale_alpha_manual(values = c(0.1, 0.2), guide = F) +
- scale_size_manual(values = c(0.1, 0.4), guide = F) +
- theme_minimal() +
- theme(
- panel.grid = element_blank(),
- plot.background = element_rect(fill = "#F0EFF1", colour = "#F0EFF1"),
- legend.position = "none",
- axis.text.x = element_text(angle = 40, hjust = 1)
- )
library(here)
-library(sf)
-library(countrycode)
-
-# countrycode('Albania', 'country.name', 'iso3c')
-
-<- nobel_winners_clean %>%
- nobel_winners_birth_country count(birth_country) %>%
- filter(!is.na(birth_country)) %>%
- mutate(ISO3 = countrycode(birth_country,
- origin = "country.name", destination = "iso3c"
-
- ))
-
-<-
- global ::st_read("./demo_data/worldmap/TM_WORLD_BORDERS_SIMPL-0.3.shp") %>%
- sfst_transform(4326)
## Reading layer `TM_WORLD_BORDERS_SIMPL-0.3' from data source `G:\R_for_Data_Science\demo_data\worldmap\TM_WORLD_BORDERS_SIMPL-0.3.shp' using driver `ESRI Shapefile'
-## Simple feature collection with 246 features and 11 fields
-## geometry type: MULTIPOLYGON
-## dimension: XY
-## bbox: xmin: -180 ymin: -90 xmax: 180 ymax: 83.57
-## geographic CRS: WGS 84
-%>%
- global full_join(nobel_winners_birth_country, by = "ISO3") %>%
- ggplot() +
- geom_sf(aes(fill = n),
- color = "white",
- size = 0.1
- +
- ) labs(
- x = NULL, y = NULL,
- title = "Nobel Winners by country",
- subtitle = "color of map indicates number of Nobel lauretes",
- fill = "num of Nobel lauretes",
- caption = "Made: wang_minjie"
- +
- ) scale_fill_gradientn(colors = c("royalblue1", "magenta", "orange", "gold"), na.value = "white") +
- # scale_fill_gradient(low = "wheat1", high = "red") +
- theme_void() +
- theme(
- legend.position = c(0.1, 0.3),
- plot.background = element_rect(fill = "gray")
- )
# Determine to 10 Countries
-<- nobel_winners_clean %>%
- topCountries count(birth_country, sort = TRUE) %>%
- na.omit() %>%
- top_n(8)
-
- topCountries
## # A tibble: 8 x 2
-## birth_country n
-## <chr> <int>
-## 1 united states of america 259
-## 2 united kingdom 99
-## 3 germany 80
-## 4 france 54
-## 5 sweden 29
-## 6 poland 26
-## 7 russia 26
-## 8 japan 24
-<- nobel_winners_clean %>%
- df4 filter(birth_country %in% topCountries$birth_country) %>%
- group_by(birth_country, category, prize_year) %>%
- summarise(prizes = n()) %>%
- mutate(cumPrizes = cumsum(prizes))
-
- df4
## # A tibble: 489 x 5
-## # Groups: birth_country, category [47]
-## birth_country category prize_year prizes cumPrizes
-## <chr> <chr> <dbl> <int> <int>
-## 1 france chemistry 1906 1 1
-## 2 france chemistry 1912 2 3
-## 3 france chemistry 1913 1 4
-## 4 france chemistry 1935 2 6
-## 5 france chemistry 1970 1 7
-## 6 france chemistry 1987 1 8
-## 7 france chemistry 2016 1 9
-## 8 france economics 1983 1 1
-## 9 france economics 1988 1 2
-## 10 france economics 2014 1 3
-## # ... with 479 more rows
-library(gganimate)
-%>%
- df4 mutate(prize_year = as.integer(prize_year)) %>%
- ggplot(aes(x = birth_country, y = category, color = birth_country)) +
- geom_point(aes(size = cumPrizes), alpha = 0.6) +
- # geom_text(aes(label = cumPrizes)) +
- scale_size_continuous(range = c(2, 30)) +
- transition_reveal(prize_year) +
- labs(
- title = "诺奖获得者最多的10个国家",
- subtitle = "Year: {frame_along}",
- y = "Category"
- +
- ) theme_minimal() +
- theme(
- plot.title = element_text(size = 22),
- axis.title = element_blank()
- +
- ) scale_color_brewer(palette = "RdYlBu") +
- theme(legend.position = "none") +
- theme(plot.margin = margin(5.5, 5.5, 5.5, 5.5))
%>%
- nobel_winners_clean select(category, birth_country, death_country) %>%
- mutate(immigration = if_else(birth_country == death_country, 0, 1))
## # A tibble: 911 x 4
-## category birth_country death_country immigration
-## <chr> <chr> <chr> <dbl>
-## 1 chemistry netherlands germany 1
-## 2 literature france france 0
-## 3 medicine poland germany 1
-## 4 peace switzerland switzerland 0
-## 5 peace france france 0
-## 6 physics germany germany 0
-## 7 chemistry germany germany 0
-## 8 literature germany germany 0
-## 9 medicine india united kingdom 1
-## 10 peace switzerland switzerland 0
-## # ... with 901 more rows
-%>%
- nobel_winners separate(prize_share, into = c("num", "deno"), sep = "/", remove = FALSE)
%>%
- nobel_winners filter(category == "medicine") %>%
- mutate(
- num_a = as.numeric(str_sub(prize_share, 1, 1)),
- num_b = as.numeric(str_sub(prize_share, -1)),
- share = num_a / num_b,
- year = prize_year %% 10,
- decade = 10 * (prize_year %/% 10)
- %>%
- ) group_by(prize_year) %>%
- mutate(n = row_number()) %>%
- ggplot() +
- geom_col(aes(x = "", y = share, fill = as.factor(n)),
- show.legend = FALSE
- +
- ) coord_polar("y") +
- facet_grid(decade ~ year, switch = "both") +
- labs(title = "每年诺贝尔奖分享情况") +
- theme_void() +
- theme(
- plot.title = element_text(face = "bold", vjust = 8),
- strip.text.x = element_text(
- size = 7,
- margin = margin(t = 5)
-
- ),strip.text.y = element_text(
- size = 7,
- angle = 180, hjust = 1, margin = margin(r = 10)
-
- ) )
没有回答的问题,大家自己花时间探索下。
-这是Nature期刊上的一篇文章Nature. 2004 September 30; 431(7008),
- -虽然觉得这个结论不太严谨,但我却无力反驳。
-于是在文章补充材料里,我找到了文章使用的数据,现在的任务是,重复这张图和文章的分析过程。
-library(tidyverse)
-library(readxl)
<- read_excel("./demo_data/olympics.xlsx")
- d d
## # A tibble: 27 x 3
-## Olympic_year Men_score Women_score
-## <dbl> <dbl> <dbl>
-## 1 1900 11 NA
-## 2 1904 11 NA
-## 3 1908 10.8 NA
-## 4 1912 10.8 NA
-## 5 1916 NA NA
-## 6 1920 10.8 NA
-## 7 1924 10.6 NA
-## 8 1928 10.8 12.2
-## 9 1932 10.3 11.9
-## 10 1936 10.3 11.5
-## # ... with 17 more rows
-我们先画图看看
-%>%
- d ggplot() +
- geom_point(aes(x = Olympic_year, y = Men_score), color = "blue") +
- geom_point(aes(x = Olympic_year, y = Women_score), color = "red")
-这样写也是可以的,只不过最好先tidy数据
- -<- d %>%
- d1 pivot_longer(
- cols = -Olympic_year,
- names_to = "sex",
- values_to = "winning_time"
-
- )
- d1
## # A tibble: 54 x 3
-## Olympic_year sex winning_time
-## <dbl> <chr> <dbl>
-## 1 1900 Men_score 11
-## 2 1900 Women_score NA
-## 3 1904 Men_score 11
-## 4 1904 Women_score NA
-## 5 1908 Men_score 10.8
-## 6 1908 Women_score NA
-## 7 1912 Men_score 10.8
-## 8 1912 Women_score NA
-## 9 1916 Men_score NA
-## 10 1916 Women_score NA
-## # ... with 44 more rows
-然后在画图
-%>%
- d1 ggplot(aes(x = Olympic_year, y = winning_time, color = sex)) +
- geom_point() +
- # geom_smooth(method = "lm") +
- scale_color_manual(
- values = c("Men_score" = "blue", "Women_score" = "red")
- +
- ) scale_x_continuous(
- breaks = seq(1900, 2004, by = 4),
- labels = seq(1900, 2004, by = 4)
- +
- ) theme(axis.text.x = element_text(
- size = 10, angle = 45, colour = "black",
- vjust = 1, hjust = 1
- ))
建立年份与成绩的线性关系 -\[ -\text{score}_i = \alpha + \beta \times \text{year}_i + \epsilon_i; \qquad \epsilon_i\in \text{Normal}(\mu, \sigma) -\]
-我们需要求出其中系数\(\alpha\)和\(\beta\),写R语言代码如下
-(lm(y ~ 1 + x,data = d)
, 要求得 \(\alpha\)和\(\beta\),就是对应 1 和 x 前的系数)
<- lm(Men_score ~ 1 + Olympic_year, data = d)
- fit_1
-summary(fit_1)
##
-## Call:
-## lm(formula = Men_score ~ 1 + Olympic_year, data = d)
-##
-## Residuals:
-## Min 1Q Median 3Q Max
-## -0.26371 -0.05270 0.00738 0.08005 0.21456
-##
-## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 31.826453 1.679643 18.9 4.1e-15 ***
-## Olympic_year -0.011006 0.000859 -12.8 1.1e-11 ***
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Residual standard error: 0.135 on 22 degrees of freedom
-## (3 observations deleted due to missingness)
-## Multiple R-squared: 0.882, Adjusted R-squared: 0.876
-## F-statistic: 164 on 1 and 22 DF, p-value: 1.13e-11
-<- lm(Women_score ~ 1 + Olympic_year, data = d)
- fit_2
-summary(fit_2)
##
-## Call:
-## lm(formula = Women_score ~ 1 + Olympic_year, data = d)
-##
-## Residuals:
-## Min 1Q Median 3Q Max
-## -0.3758 -0.0846 0.0093 0.0829 0.3223
-##
-## Coefficients:
-## Estimate Std. Error t value Pr(>|t|)
-## (Intercept) 44.34705 4.28425 10.35 1.7e-08 ***
-## Olympic_year -0.01682 0.00218 -7.73 8.6e-07 ***
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## Residual standard error: 0.21 on 16 degrees of freedom
-## (9 observations deleted due to missingness)
-## Multiple R-squared: 0.789, Adjusted R-squared: 0.776
-## F-statistic: 59.8 on 1 and 16 DF, p-value: 8.63e-07
-使用predict()
完成预测
<- data.frame(Olympic_year = 2020)
- df
-predict(fit_1, newdata = df)
## 1
-## 9.595
-为了图片中的一致,我们使用1900年到2252年(seq(1900, 2252, by = 4)
)建立预测项,并整理到数据框里
<- tibble(
- grid Olympic_year = as.numeric(seq(1900, 2252, by = 4))
-
- ) grid
## # A tibble: 89 x 1
-## Olympic_year
-## <dbl>
-## 1 1900
-## 2 1904
-## 3 1908
-## 4 1912
-## 5 1916
-## 6 1920
-## 7 1924
-## 8 1928
-## 9 1932
-## 10 1936
-## # ... with 79 more rows
-<- grid %>%
- tb mutate(
- Predict_Men = predict(fit_1, newdata = grid),
- Predict_Women = predict(fit_2, newdata = grid)
-
- ) tb
## # A tibble: 89 x 3
-## Olympic_year Predict_Men Predict_Women
-## <dbl> <dbl> <dbl>
-## 1 1900 10.9 12.4
-## 2 1904 10.9 12.3
-## 3 1908 10.8 12.3
-## 4 1912 10.8 12.2
-## 5 1916 10.7 12.1
-## 6 1920 10.7 12.0
-## 7 1924 10.7 12.0
-## 8 1928 10.6 11.9
-## 9 1932 10.6 11.8
-## 10 1936 10.5 11.8
-## # ... with 79 more rows
-有时候我喜欢用modelr::add_predictions()
函数实现相同的功能
library(modelr)
-%>%
- grid add_predictions(fit_1, var = "Predict_Men") %>%
- add_predictions(fit_2, var = "Predict_Women")
## # A tibble: 89 x 3
-## Olympic_year Predict_Men Predict_Women
-## <dbl> <dbl> <dbl>
-## 1 1900 10.9 12.4
-## 2 1904 10.9 12.3
-## 3 1908 10.8 12.3
-## 4 1912 10.8 12.2
-## 5 1916 10.7 12.1
-## 6 1920 10.7 12.0
-## 7 1924 10.7 12.0
-## 8 1928 10.6 11.9
-## 9 1932 10.6 11.8
-## 10 1936 10.5 11.8
-## # ... with 79 more rows
-<- tb %>%
- tb1 pivot_longer(
- cols = -Olympic_year,
- names_to = "sex",
- values_to = "winning_time"
-
- ) tb1
## # A tibble: 178 x 3
-## Olympic_year sex winning_time
-## <dbl> <chr> <dbl>
-## 1 1900 Predict_Men 10.9
-## 2 1900 Predict_Women 12.4
-## 3 1904 Predict_Men 10.9
-## 4 1904 Predict_Women 12.3
-## 5 1908 Predict_Men 10.8
-## 6 1908 Predict_Women 12.3
-## 7 1912 Predict_Men 10.8
-## 8 1912 Predict_Women 12.2
-## 9 1916 Predict_Men 10.7
-## 10 1916 Predict_Women 12.1
-## # ... with 168 more rows
-%>%
- tb1 ggplot(aes(
- x = Olympic_year,
- y = winning_time,
- color = sex
- +
- )) geom_line(size = 2) +
- geom_point(data = d1) +
- scale_color_manual(
- name = "标记",
- values = c(
- "Men_score" = "blue",
- "Women_score" = "red",
- "Predict_Men" = "#588B8B",
- "Predict_Women" = "#C8553D"
-
- ),labels = c(
- "Men_score" = "男性历史成绩",
- "Women_score" = "女性历史成绩",
- "Predict_Men" = "男性预测成绩",
- "Predict_Women" = "女性预测成绩"
-
- )+
- ) scale_x_continuous(
- breaks = seq(1900, 2252, by = 16),
- labels = as.character(seq(1900, 2252, by = 16))
- +
- ) theme(axis.text.x = element_text(
- size = 10, angle = 45, colour = "black",
- vjust = 1, hjust = 1
- ))
-早知道nature文章这么简单,10年前我也可以写啊!
-这里是另外的一种方法
-library(modelr)
<- d %>%
- d1 pivot_longer(
- cols = -Olympic_year,
- names_to = "sex",
- values_to = "winning_time"
-
- )
-<- function(df) lm(winning_time ~ Olympic_year, data = df)
- fit_model
-<- d1 %>%
- d2 group_nest(sex) %>%
- mutate(
- mod = map(data, fit_model)
-
- ) d2
## # A tibble: 2 x 3
-## sex data mod
-## <chr> <list<tbl_df[,2]>> <list>
-## 1 Men_score [27 x 2] <lm>
-## 2 Women_score [27 x 2] <lm>
-# d2 %>% mutate(p = list(grid, grid))
-# d3 <- d2 %>% mutate(p = list(grid, grid))
-# d3
-# d3 %>%
-# mutate(
-# predictions = map2(p, mod, add_predictions),
-# )
-
-# or
-<- d2 %>%
- tb4 mutate(
- predictions = map(mod, ~ add_predictions(grid, .))
- %>%
- ) select(sex, predictions) %>%
- unnest(predictions)
-
-%>%
- tb4 ggplot(aes(
- x = Olympic_year,
- y = pred,
- group = sex,
- color = sex
- +
- )) geom_point() +
- geom_line(size = 2) +
- geom_point(
- data = d1,
- aes(
- x = Olympic_year,
- y = winning_time,
- group = sex,
- color = sex
-
- )+
- ) scale_x_continuous(
- breaks = seq(1900, 2252, by = 16),
- labels = as.character(seq(1900, 2252, by = 16))
- +
- ) theme(axis.text.x = element_text(
- size = 10, angle = 45, colour = "black",
- vjust = 1, hjust = 1
- ))
今天讲一个关于企鹅的数据故事。这个故事来源于科考人员记录的大量企鹅体征数据,图片来源这里.
- -可通过宏包palmerpenguins::penguins
获取数据,也可以读取本地penguins.csv
文件,
-我们采取后面一种方法:
library(tidyverse)
-<- read_csv("./demo_data/penguins.csv") %>%
- penguins ::clean_names()
- janitor
-%>%
- penguins head()
## # A tibble: 6 x 8
-## species island bill_length_mm bill_depth_mm
-## <chr> <chr> <dbl> <dbl>
-## 1 Adelie Torge~ 39.1 18.7
-## 2 Adelie Torge~ 39.5 17.4
-## 3 Adelie Torge~ 40.3 18
-## 4 Adelie Torge~ NA NA
-## 5 Adelie Torge~ 36.7 19.3
-## 6 Adelie Torge~ 39.3 20.6
-## # ... with 4 more variables: flipper_length_mm <dbl>,
-## # body_mass_g <dbl>, sex <chr>, year <dbl>
-variable | -class | -description | -
---|---|---|
species | -integer | -企鹅种类 (Adelie, Gentoo, Chinstrap) | -
island | -integer | -所在岛屿 (Biscoe, Dream, Torgersen) | -
bill_length_mm | -double | -嘴峰长度 (单位毫米) | -
bill_depth_mm | -double | -嘴峰深度 (单位毫米) | -
flipper_length_mm | -integer | -鰭肢长度 (单位毫米) | -
body_mass_g | -integer | -体重 (单位克) | -
sex | -integer | -性别 | -
year | -integer | -记录年份 | -
检查缺失值(NA)这个很重要!
-%>% summarise(
- penguins across(everything(), ~ sum(is.na(.)))
- )
## # A tibble: 1 x 8
-## species island bill_length_mm bill_depth_mm
-## <int> <int> <int> <int>
-## 1 0 0 2 2
-## # ... with 4 more variables: flipper_length_mm <int>,
-## # body_mass_g <int>, sex <int>, year <int>
-有缺失值的地方找出来看看
-%>% filter_all(
- penguins any_vars(is.na(.))
- )
## # A tibble: 11 x 8
-## species island bill_length_mm bill_depth_mm
-## <chr> <chr> <dbl> <dbl>
-## 1 Adelie Torge~ NA NA
-## 2 Adelie Torge~ 34.1 18.1
-## 3 Adelie Torge~ 42 20.2
-## 4 Adelie Torge~ 37.8 17.1
-## 5 Adelie Torge~ 37.8 17.3
-## 6 Adelie Dream 37.5 18.9
-## 7 Gentoo Biscoe 44.5 14.3
-## 8 Gentoo Biscoe 46.2 14.4
-## 9 Gentoo Biscoe 47.3 13.8
-## 10 Gentoo Biscoe 44.5 15.7
-## 11 Gentoo Biscoe NA NA
-## # ... with 4 more variables: flipper_length_mm <dbl>,
-## # body_mass_g <dbl>, sex <chr>, year <dbl>
-发现共有11行至少有一处有缺失值,于是我们就删除这些行
-<- penguins %>% drop_na()
- penguins penguins
## # A tibble: 333 x 8
-## species island bill_length_mm bill_depth_mm
-## <chr> <chr> <dbl> <dbl>
-## 1 Adelie Torge~ 39.1 18.7
-## 2 Adelie Torge~ 39.5 17.4
-## 3 Adelie Torge~ 40.3 18
-## 4 Adelie Torge~ 36.7 19.3
-## 5 Adelie Torge~ 39.3 20.6
-## 6 Adelie Torge~ 38.9 17.8
-## 7 Adelie Torge~ 39.2 19.6
-## 8 Adelie Torge~ 41.1 17.6
-## 9 Adelie Torge~ 38.6 21.2
-## 10 Adelie Torge~ 34.6 21.1
-## # ... with 323 more rows, and 4 more variables:
-## # flipper_length_mm <dbl>, body_mass_g <dbl>,
-## # sex <chr>, year <dbl>
-大家可以提出自己想探索的内容:
-%>%
- penguins count(species, sort = T)
## # A tibble: 3 x 2
-## species n
-## <chr> <int>
-## 1 Adelie 146
-## 2 Gentoo 119
-## 3 Chinstrap 68
-%>%
- penguins count(island, sort = T)
## # A tibble: 3 x 2
-## island n
-## <chr> <int>
-## 1 Biscoe 163
-## 2 Dream 123
-## 3 Torgersen 47
-%>%
- penguins group_by(species) %>%
- summarize(across(where(is.numeric), mean, na.rm = TRUE))
## # A tibble: 3 x 6
-## species bill_length_mm bill_depth_mm flipper_length_~
-## <chr> <dbl> <dbl> <dbl>
-## 1 Adelie 38.8 18.3 190.
-## 2 Chinst~ 48.8 18.4 196.
-## 3 Gentoo 47.6 15.0 217.
-## # ... with 2 more variables: body_mass_g <dbl>,
-## # year <dbl>
-%>%
- penguins ggplot(aes(x = bill_length_mm)) +
- geom_density() +
- facet_wrap(vars(species), scales = "free")
%>%
- penguins ggplot(aes(x = bill_length_mm)) +
- geom_density(aes(fill = sex)) +
- facet_wrap(vars(species), scales = "free")
-男宝宝的嘴巴要长些,哈哈。
-来张更好看点的
-%>%
- penguins ggplot(aes(x = bill_length_mm, fill = sex)) +
- geom_histogram(
- position = "identity",
- alpha = 0.7,
- bins = 25
- +
- ) scale_fill_manual(values = c("#66b3ff", "#8c8c8c")) +
- ylab("number of penguins") +
- xlab("length (mm)") +
- theme_minimal() +
- theme(
- legend.position = "bottom",
- legend.text = element_text(size = 11),
- legend.title = element_blank(),
- panel.grid.minor = element_blank(),
- axis.title = element_text(color = "white", size = 10),
- plot.title = element_text(size = 20),
- plot.subtitle = element_text(size = 12, hjust = 1)
- +
- ) facet_wrap(vars(species), scales = "free")
同理,可以画出其他属性的分布。当然,我更喜欢用山峦图来呈现不同分组的分布,因为竖直方向可以更方便比较
-library(ggridges)
-%>%
- penguins ggplot(aes(x = bill_length_mm, y = species, fill = species)) +
- ::geom_density_ridges() ggridges
同样,我们也用颜色区分下性别,这样不同种类、不同性别企鹅的嘴峰长度分布一目了然
-%>%
- penguins ggplot(aes(x = bill_length_mm, y = species, fill = sex)) +
- geom_density_ridges(alpha = 0.5)
同样的代码,类似地画个其他体征的分布,
-%>%
- penguins ggplot(aes(x = bill_depth_mm, fill = species)) +
- ::geom_density_ridges(aes(y = species)) ggridges
%>%
- penguins ggplot(aes(x = bill_depth_mm, fill = sex)) +
- ::geom_density_ridges(aes(y = species)) ggridges
%>%
- penguins ggplot(aes(x = body_mass_g, y = species, fill = sex)) +
- ::geom_density_ridges(alpha = 0.5) ggridges
但这样一个特征一个特征的画,好麻烦。你知道程序员都是偷懒的,于是我们还有更骚的操作
-%>%
- penguins ::select(species, bill_length_mm:body_mass_g) %>%
- dplyrpivot_longer(-species, names_to = "measurement", values_to = "value") %>%
- ggplot(aes(x = value)) +
- geom_density(aes(color = species, fill = species), size = 1.2, alpha = 0.2) +
- facet_wrap(vars(measurement), ncol = 2, scales = "free")
%>%
- penguins ::select(species, bill_length_mm:body_mass_g) %>%
- dplyrpivot_longer(-species, names_to = "measurement", values_to = "value") %>%
- ggplot(aes(x = species, y = value)) +
- geom_boxplot(aes(color = species, fill = species), size = 1.2, alpha = 0.2) +
- facet_wrap(vars(measurement), ncol = 2, scales = "free")
%>%
- penguins ::select(species, bill_length_mm:body_mass_g) %>%
- dplyrpivot_longer(-species, names_to = "measurement", values_to = "value") %>%
- ggplot(aes(x = value, y = species, fill = species)) +
- ::geom_density_ridges() +
- ggridgesfacet_wrap(vars(measurement), scales = "free")
%>%
- penguins ::select(species,sex, bill_length_mm:body_mass_g) %>%
- dplyrpivot_longer(-c(species, sex), names_to = "measurement", values_to = "value") %>%
- ggplot(aes(x = value, y = species, fill = sex)) +
- ::geom_density_ridges() +
- ggridgesfacet_wrap(vars(measurement), scales = "free")
我若有所思的看着这张图,似乎看到了一些特征(pattern)了。
-嘴巴越长,嘴巴也会越厚?
-%>%
- penguins ggplot(aes(
- x = bill_length_mm, y = bill_depth_mm,
- shape = species, color = species
- +
- )) geom_point()
我们把不同的种类,用不同的颜色区分看看
-%>%
- penguins ggplot(aes(
- x = bill_length_mm, y = bill_depth_mm,
- shape = species, color = species
- +
- )) geom_point(aes(size = body_mass_g))
感觉这是一个辛普森佯谬, 我们画图看看
-%>%
- penguins ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point(aes(color = species, shape = species)) +
- geom_smooth(method = lm) +
- geom_smooth(method = lm, aes(color = species))
翅膀越长,体重越大?
-%>%
- penguins group_by(species, island, sex) %>%
- ggplot(aes(
- x = body_mass_g, y = reorder(species, -body_mass_g),
- color = species
- +
- )) geom_jitter(position = position_jitter(seed = 2020, width = 0.2), alpha = 0.4, size = 2) +
- stat_summary(fun = mean, geom = "point", size = 5, alpha = 1)
library(ggtext)
-%>%
- penguins ggplot(aes(flipper_length_mm, body_mass_g, group = species)) +
- geom_point(aes(colour = species, shape = species), alpha = 0.7) +
- scale_color_manual(values = c("darkorange", "purple", "cyan4")) +
- labs(
- title = "Penguin Size, Palmer Station LTER",
- subtitle = "Flipper length and body mass for <span style = 'color:darkorange;'>Adelie</span>, <span style = 'color:purple;'>Chinstrap</span> and <span style = 'color:cyan4;'>Gentoo</span> Penguins",
- x = "flipper length (mm)",
- y = "body mass (g)"
- +
- ) theme_minimal() +
- theme(
- legend.position = "none",
- # text = element_text(family = "Futura"),
- # (I only have 'Light' )
- plot.title = element_text(size = 16),
- plot.subtitle = element_markdown(), # element_markdown from `ggtext` to parse the css in the subtitle
- plot.title.position = "plot",
- plot.caption = element_text(size = 8, colour = "grey50"),
- plot.caption.position = "plot"
- )
先分组计算体重的均值和标准差
-%>%
- penguins group_by(species) %>%
- summarise(
- count = n(),
- mean_body_mass = mean(body_mass_g),
- sd_body_mass = sd(body_mass_g)
- )
## # A tibble: 3 x 4
-## species count mean_body_mass sd_body_mass
-## <chr> <int> <dbl> <dbl>
-## 1 Adelie 146 3706. 459.
-## 2 Chinstrap 68 3733. 384.
-## 3 Gentoo 119 5092. 501.
-%>%
- penguins ggplot(aes(x = species, y = body_mass_g)) +
- geom_boxplot() +
- geom_jitter()
用统计方法验证下我们的猜测吧。记住,我们是有科学精神的的人!
-::aov(formula = body_mass_g ~ species, data = penguins) %>%
- statssummary()
## Df Sum Sq Mean Sq F value Pr(>F)
-## species 2 1.45e+08 72595110 342 <2e-16 ***
-## Residuals 330 7.01e+07 212332
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-p-value 很小,说明不同种类企鹅之间体重是有显著差异的,但aov只给出了species在整体上引起了体重差异(只要有任意两组之间有显著差异,aov给出的p-value都很小),如果想知道不同种类两两之间是否有显著差异,这就需要用到TukeyHSD().
-oneway.test(body_mass_g ~ species, data = penguins)
##
-## One-way analysis of means (not assuming equal
-## variances)
-##
-## data: body_mass_g and species
-## F = 317, num df = 2, denom df = 188, p-value
-## <2e-16
-::aov(formula = body_mass_g ~ species, data = penguins) %>%
- statsTukeyHSD(which = "species") %>%
- ::tidy() broom
## # A tibble: 3 x 7
-## term contrast null.value estimate conf.low conf.high
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 spec~ Chinstr~ 0 26.9 -132. 186.
-## 2 spec~ Gentoo-~ 0 1386. 1252. 1520.
-## 3 spec~ Gentoo-~ 0 1359. 1194. 1524.
-## # ... with 1 more variable: adj.p.value <dbl>
-表格第一行instrap-Adelie 的 p-value = 0.916,没通过显著性检验;而Gentoo-Adelie 和 Gentoo-Chinstrap 他们的p-value都接近0,通过显著性检验,这和图中的结果是一致的。
-作为统计出生的R语言,有很多宏包可以帮助我们验证我们的结论,我这里推荐可视化学统计的宏包ggstatsplot宏包将统计分析的结果写在图片里,统计结果和图形融合在一起,让统计结果更容易懂了。(使用这个宏包辅助我们学习统计)
-library(ggstatsplot)
-
-%>%
- penguins ::ggbetweenstats(
- ggstatsplotx = species, # > 2 groups
- y = body_mass_g,
- type = "parametric",
- pairwise.comparisons = TRUE,
- pairwise.display = "all",
- messages = FALSE,
- var.equal = FALSE
- )
相关介绍看here
-kruskal.test(body_mass_g ~ species, data = penguins)
##
-## Kruskal-Wallis rank sum test
-##
-## data: body_mass_g by species
-## Kruskal-Wallis chi-squared = 212, df = 2,
-## p-value <2e-16
-%>%
- penguins ::ggbetweenstats(
- ggstatsplotx = species,
- y = body_mass_g,
- type = "nonparametric",
- mean.ci = TRUE,
- pairwise.comparisons = TRUE, # <<
- pairwise.display = "all", # ns = only non-significant
- p.adjust.method = "fdr", # <<
- messages = FALSE
- )
哇,原来统计可以这样学!
-%>%
- penguins mutate(ratio = bill_length_mm / bill_depth_mm) %>%
- group_by(species) %>%
- summarise(mean = mean(ratio))
## # A tibble: 3 x 2
-## species mean
-## <chr> <dbl>
-## 1 Adelie 2.12
-## 2 Chinstrap 2.65
-## 3 Gentoo 3.18
-%>%
- penguins mutate(ratio = bill_length_mm / bill_depth_mm) %>%
- ggplot(aes(x = ratio, fill = species)) +
- ::geom_density_ridges(aes(y = species)) ggridges
男宝宝和女宝宝颜色区分下,代码只需要修改一个地方,留给大家自己实践下吧。
-建模需要标准化数据,并对分类变量(比如sex)编码为 1 和 0; (这是第二个好习惯)
-<- function(x) { # 标准化的子函数
- scale_fun - mean(x)) / sd(x)
- (x
- }
-<- penguins %>%
- d select(sex, species, bill_length_mm:body_mass_g) %>%
- mutate(
- across(where(is.numeric), scale_fun)
- %>%
- ) mutate(male = if_else(sex == "male", 1, 0))
- d
## # A tibble: 333 x 7
-## sex species bill_length_mm bill_depth_mm
-## <chr> <chr> <dbl> <dbl>
-## 1 male Adelie -0.895 0.780
-## 2 fema~ Adelie -0.822 0.119
-## 3 fema~ Adelie -0.675 0.424
-## 4 fema~ Adelie -1.33 1.08
-## 5 male Adelie -0.858 1.74
-## 6 fema~ Adelie -0.931 0.323
-## 7 male Adelie -0.876 1.24
-## 8 fema~ Adelie -0.529 0.221
-## 9 male Adelie -0.986 2.05
-## 10 male Adelie -1.72 2.00
-## # ... with 323 more rows, and 3 more variables:
-## # flipper_length_mm <dbl>, body_mass_g <dbl>,
-## # male <dbl>
-按照species分组后,对flipper_length_mm标准化?这样数据会聚拢到一起了喔, 还是不要了
-%>%
- penguins select(sex, species, bill_length_mm:body_mass_g) %>%
- group_by(species) %>%
- mutate(
- across(where(is.numeric), scale_fun)
- %>%
- ) ungroup()
我们将性别sex视为响应变量,其他变量为预测变量。这里性别变量是二元的(0 或者 1),所以我们用logistic回归
-<- glm(
- logit_mod1 ~ 1 + species + bill_length_mm + bill_depth_mm +
- male + body_mass_g,
- flipper_length_mm data = d,
- family = binomial(link = "logit")
-
- )
-summary(logit_mod1)
##
-## Call:
-## glm(formula = male ~ 1 + species + bill_length_mm + bill_depth_mm +
-## flipper_length_mm + body_mass_g, family = binomial(link = "logit"),
-## data = d)
-##
-## Deviance Residuals:
-## Min 1Q Median 3Q Max
-## -3.382 -0.215 0.002 0.155 2.809
-##
-## Coefficients:
-## Estimate Std. Error z value Pr(>|z|)
-## (Intercept) 4.684 1.187 3.95 7.9e-05
-## speciesChinstrap -6.980 1.574 -4.43 9.3e-06
-## speciesGentoo -8.354 2.524 -3.31 0.00093
-## bill_length_mm 3.357 0.716 4.69 2.8e-06
-## bill_depth_mm 3.196 0.655 4.88 1.0e-06
-## flipper_length_mm 0.291 0.670 0.43 0.66405
-## body_mass_g 4.723 0.872 5.41 6.2e-08
-##
-## (Intercept) ***
-## speciesChinstrap ***
-## speciesGentoo ***
-## bill_length_mm ***
-## bill_depth_mm ***
-## flipper_length_mm
-## body_mass_g ***
-## ---
-## Signif. codes:
-## 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-##
-## (Dispersion parameter for binomial family taken to be 1)
-##
-## Null deviance: 461.61 on 332 degrees of freedom
-## Residual deviance: 127.11 on 326 degrees of freedom
-## AIC: 141.1
-##
-## Number of Fisher Scoring iterations: 7
-计算每个变量的平均边际效应
-library(margins)
-
-<- logit_mod1 %>%
- logit_mod1_m margins() %>%
- summary() %>%
- as_tibble()
-
- logit_mod1_m
## # A tibble: 6 x 7
-## factor AME SE z p lower upper
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 bill_~ 0.185 0.0290 6.38 1.82e-10 0.128 0.242
-## 2 bill_~ 0.194 0.0339 5.72 1.04e- 8 0.128 0.261
-## 3 body_~ 0.273 0.0378 7.22 5.08e-13 0.199 0.347
-## 4 flipp~ 0.0169 0.0388 0.434 6.64e- 1 -0.0592 0.0929
-## 5 speci~ -0.373 0.0513 -7.27 3.67e-13 -0.473 -0.272
-## 6 speci~ -0.434 0.0740 -5.86 4.66e- 9 -0.579 -0.289
-%>%
- logit_mod1_m ggplot(aes(
- x = reorder(factor, AME),
- y = AME, ymin = lower, ymax = upper
- +
- )) geom_hline(yintercept = 0, color = "gray80") +
- geom_pointrange() +
- coord_flip() +
- labs(x = NULL, y = "Average Marginal Effect")
library(ggeffects)
-ggpredict(logit_mod1, terms = "bill_length_mm")
library(brms)
-
-<- brm(
- brms_mod2 ~ 1 + bill_length_mm + bill_depth_mm + flipper_length_mm + body_mass_g + (1 | species),
- male data = d,
- family = binomial(link = "logit")
- )
summary(brms_mod2)
library(ggeffects)
-ggpredict(brms_mod2, "bill_depth_mm [all]") %>%
-plot()
%>%
- penguins ggplot(aes(x = flipper_length_mm, y = bill_length_mm, color = species)) +
- geom_point()
<- brm(bill_length_mm ~ flipper_length_mm + (1|species),
- brms_mod3 data = penguins
- )
%>%
- penguins group_by(species) %>%
- ::data_grid(flipper_length_mm) %>%
- modelr::add_fitted_draws(brms_mod3, n = 100) %>%
- tidybayesggplot() +
- geom_point(
- data = penguins,
- aes(flipper_length_mm, bill_length_mm, color = species, shape = species)
- +
- ) geom_line(aes(flipper_length_mm, .value, group = interaction(.draw, species), color = species), alpha = 0.1)
纽约时报报道说,
---美国制药公司辉瑞(Pfizer)和德国生物科技公司(BioNTech)11月9日率先宣布 -,根据在数国临床试验初步结果,其研发的新冠疫苗有效率达到90%以上,星期三,完整结果显示,参加疫苗实验的44000个志愿者中,共有170人确诊感染,其中安慰剂组162人,接种疫苗组仅8人,这证明了辉瑞开发的新冠疫苗有效率高达95%。
-
-group - | --volunteers - | --got_covid - | -
---|---|---|
-placebo - | --22000 - | --162 - | -
-vaccinated - | --22000 - | --8 - | -
新冠疫苗是有效的,且有效率高达95%。 那么,这个95%是怎么计算出来的呢?它的概率是多少以及不确定性是多少呢? -回到这个问题,我们首先需要了解,辉瑞公司是如何定义疫苗有效率的
-\[ -\text{VE} = 1 - \frac{p_{t}}{p_{c}} -\]
-其中\(p_t\)是疫苗组(vaccinated)的感染率,\(p_c\)是安慰剂组(placebo)的感染率。
-library(tidyverse)
-library(tidybayes)
-library(rstan)
-rstan_options(auto_write = TRUE)
-options(mc.cores = parallel::detectCores())
然后,我们建立如下数学模型:
-\[ -\begin{align} -y_{c} \sim \textsf{binomial}(n_{c},p_{c}) \\ -y_{t} \sim \textsf{binomial}(n_{t},p_{t}) \\ -p_{c} \sim \textsf{beta}(1, 1) \\ -p_{t} \sim \textsf{beta}(1, 1) -\end{align} -\]
-通过模型可以直接计算干预效果\(\textsf{effect}\)和疫苗有效性\(VE\)
-\[ -\begin{align} -\text{effect} = p_{t} - p_{c} \\ -\text{VE} = 1 - \frac{p_{t}}{p_{c}} -\end{align} -\]
-具体Stan代码如下
-<- "
- stan_program data {
- int<lower=1> event_c; // num events, control
- int<lower=1> event_t; // num events, treatment
- int<lower=1> n_c; // num of person trial, control
- int<lower=1> n_t; // num of person trial, treatment
-}
-parameters {
- real<lower=0,upper=1> p_c;
- real<lower=0,upper=1> p_t;
-}
-model {
- event_c ~ binomial(n_c, p_c);
- event_t ~ binomial(n_t, p_t);
- p_c ~ beta(1, 1);
- p_t ~ beta(1, 1);
-}
-generated quantities {
- real effect = p_t - p_c;
- real VE = 1- p_t /p_c;
- real log_odds = log(p_t / (1- p_t)) - log(p_c / (1- p_c));
-}
-"
-
-
-<- list(
- stan_data event_c = 162,
- event_t = 8,
- n_c = 4.4e4 / 2,
- n_t = 4.4e4 / 2
-
- )
-<- stan(model_code = stan_program, data = stan_data) mod_vaccine
最后,我们后验概率抽样
-<- mod_vaccine %>%
- draws ::spread_draws(effect, VE, log_odds)
- tidybayes
-%>%
- draws head()
## # A tibble: 6 x 6
-## .chain .iteration .draw effect VE log_odds
-## <int> <int> <int> <dbl> <dbl> <dbl>
-## 1 1 1 1 -0.00915 0.986 -4.30
-## 2 1 2 2 -0.00604 0.959 -3.19
-## 3 1 3 3 -0.00637 0.936 -2.75
-## 4 1 4 4 -0.00761 0.956 -3.14
-## 5 1 5 5 -0.00646 0.947 -2.94
-## 6 1 6 6 -0.00723 0.949 -2.98
-从结果中看到effect中很多负数。事实上,effect中越多的负值,即被感染的可能性越低,说明疫苗干预效果越好
-mean(draws$effect < 0) %>% round(2)
## [1] 1
-结果告诉我们,疫苗有明显的干预效果。比如,我们假定10000个人接受了疫苗,那么被感染的人数以及相应的可能性,如下图
-%>%
- draws ggplot(aes(x = effect * 1e4)) +
- geom_density(fill = "blue", alpha = .2) +
- expand_limits(y = 0) +
- theme_minimal() +
- xlab("效应大小") +
- ggtitle("每10000个接种疫苗的人中被感染新冠的数量")
我们再看看疫苗有效率 VE 的结果
-%>%
- draws select(VE) %>%
- ::median_qi(.width = c(0.90)) ggdist
## # A tibble: 1 x 6
-## VE .lower .upper .width .point .interval
-## <dbl> <dbl> <dbl> <dbl> <chr> <chr>
-## 1 0.947 0.909 0.972 0.9 median qi
-通过数据看出,疫苗的有效性为0.95,在90%的可信赖水平, 中位数区间[0.91, 0.97].
-当然,通过图可能理解的更清晰。
-<- paste("median =", round(median(draws$VE), 2))
- label_txt
-%>%
- draws ggplot(aes(x = VE)) +
- geom_density(fill = "blue", alpha = .2) +
- expand_limits(y = 0) +
- theme_minimal() +
- geom_vline(xintercept = median(draws$VE), size = 0.2) +
- annotate("text", x = 0.958, y = 10, label = label_txt, size = 3) +
- xlab("疫苗有效率") +
- ggtitle("辉瑞公司定义疫苗有效率为 VE = 1 - Pt/Pc")
研究生生涯的主要工作就是学习,而学以致用是最好的学习路径。考虑同学们不同的学科背景,同时也参考国内其它高校的做法,本学期《数据科学中的 R 语言》期末考试安排如下:
-结合所在学科,找一篇与自己研究方向相关的文献,用课堂上学到的R统计编程技能,重复文献的数据分析过程。
-在2020年06月15日前,将以下资料打包并提交38552109@qq.com
邮箱
仅供参考
-本章介绍R语言中的因子类型数据。因子型变量常用于数据处理和可视化中,尤其在希望不以字母顺序排序的时候,因子就格外有用。
-因子是把数据进行分类并标记为不同层级(level,有时候也翻译成因子水平, 我个人觉得翻译为层级,更接近它的特性,因此,我都会用层级来描述)的数据对象,他们可以存储字符串和整数。因子类型有三个属性:
-library(tidyverse)
<- c("low", "high", "medium", "medium", "low", "high", "high")
- income factor(income)
## [1] low high medium medium low high high
-## Levels: high low medium
-因子层级会自动按照字符串的字母顺序排序,比如high low medium
。也可以指定顺序,
factor(income, levels = c("low", "high", "medium") )
## [1] low high medium medium low high high
-## Levels: low high medium
-不属于因子层级中的值, 比如这里因子层只有c("low", "high")
,那么income中的“medium”会被当作缺省值NA
factor(income, levels = c("low", "high") )
## [1] low high <NA> <NA> low high high
-## Levels: low high
-相比较字符串而言,因子类型更容易处理,因子很多函数会自动的将字符串转换为因子来处理,但事实上,这也会造成,不想当做因子的却又当做了因子的情形,最典型的是在R 4.0之前,data.frame()
中stringsAsFactors
选项,默认将字符串类型转换为因子类型,但这个默认也带来一些不方便,因此在R 4.0之后取消了这个默认。在tidyverse集合里,有专门处理因子的宏包forcats
,因此,本章将围绕forcats
宏包讲解如何处理因子类型变量,更多内容可以参考这里。
library(forcats)
前面看到因子层级是按照字母顺序排序
-<- factor(income)
- x x
## [1] low high medium medium low high high
-## Levels: high low medium
-也可以指定顺序
-%>% fct_relevel(levels = c("high", "medium", "low")) x
## [1] low high medium medium low high high
-## Levels: high medium low
-或者让“medium” 移动到最前面
-%>% fct_relevel(levels = c("medium")) x
## [1] low high medium medium low high high
-## Levels: medium high low
-或者让“medium” 移动到最后面
-%>% fct_relevel("medium", after = Inf) x
## [1] low high medium medium low high high
-## Levels: high low medium
-可以按照字符串第一次出现的次序
-%>% fct_inorder() x
## [1] low high medium medium low high high
-## Levels: low high medium
-按照其他变量的中位数的升序排序
-%>% fct_reorder(c(1:7), .fun = median) x
## [1] low high medium medium low high high
-## Levels: low medium high
-调整因子层级有什么用呢?
-这个功能在ggplot可视化中调整分类变量的顺序非常方便。这里为了方便演示,我们假定有数据框
-<- tibble(
- d x = c("a","a", "b", "b", "c", "c"),
- y = c(2, 2, 1, 5, 0, 3)
-
-
- ) d
## # A tibble: 6 x 2
-## x y
-## <chr> <dbl>
-## 1 a 2
-## 2 a 2
-## 3 b 1
-## 4 b 5
-## 5 c 0
-## 6 c 3
-先画个散点图看看吧
-%>%
- d ggplot(aes(x = x, y = y)) +
- geom_point()
我们看到,横坐标上是a-b-c的顺序。
-fct_reorder()
可以让x的顺序按照x中每个分类变量对应y值的中位数升序排序,具体为
c(2, 2)
中位数是median(c(2, 2)) = 2
c(1, 5)
中位数是median(c(1, 5)) = 3
c(0, 3)
中位数是median(c(0, 3)) = 1.5
因此,x的因子层级的顺序调整为c-a-b
-%>%
- d ggplot(aes(x = fct_reorder(x, y, .fun = median), y = y)) +
- geom_point()
当然,我们可以加一个参数.desc = TRUE
让因子层级变为降序排列b-a-c
%>%
- d ggplot(aes(x = fct_reorder(x, y, .fun = median, .desc = TRUE), y = y)) +
- geom_point()
但这样会造成x坐标标签一大串,因此建议可以写mutate()
函数里
%>%
- d mutate(x = fct_reorder(x, y, .fun = median, .desc = TRUE)) %>%
- ggplot(aes(x = x, y = y)) +
- geom_point()
我们还可以按照y值中最小值的大小降序排列
-%>%
- d mutate(x = fct_reorder(x, y, .fun = min, .desc = TRUE)) %>%
- ggplot(aes(x = x, y = y)) +
- geom_point()
按照因子层级的逆序排序
-%>%
- d mutate(x = fct_rev(x)) %>%
- ggplot(aes(x = x, y = y)) +
- geom_point()
%>%
- d mutate(
- x = fct_relevel(x, c("c", "a", "b"))
- %>%
- )
-ggplot(aes(x = x, y = y)) +
- geom_point()
在学术中,很多情形我们都需要画出统计分布图。比如,围绕天气温度数据(美国内布拉斯加州东部,林肯市, 2016年),我们想看每个月份里气温的分布情况
-<- ggridges::lincoln_weather %>%
- lincoln_df mutate(
- month_short = fct_recode(
-
- Month,Jan = "January",
- Feb = "February",
- Mar = "March",
- Apr = "April",
- May = "May",
- Jun = "June",
- Jul = "July",
- Aug = "August",
- Sep = "September",
- Oct = "October",
- Nov = "November",
- Dec = "December"
-
- )%>%
- ) mutate(month_short = fct_rev(month_short)) %>%
- select(Month, month_short, `Mean Temperature [F]`)
-
-%>%
- lincoln_df head(5)
## # A tibble: 5 x 3
-## Month month_short `Mean Temperature [F]`
-## <fct> <fct> <int>
-## 1 January Jan 24
-## 2 January Jan 23
-## 3 January Jan 23
-## 4 January Jan 17
-## 5 January Jan 29
-统计分布图的方法很多,我们下面比较各种方法的优劣。
-画分布图的最简单的方法,就是计算每个月的气温均值或者中位数,并在均值或者中位数位置标出误差棒(error bars),比如图 55.1 。
-<- lincoln_df %>%
- lincoln_errbar ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- stat_summary(
- fun.y = mean, fun.ymax = function(x) {
- mean(x) + 2 * sd(x)
-
- },fun.ymin = function(x) {
- mean(x) - 2 * sd(x)
- geom = "pointrange",
- }, fatten = 5
- +
- ) xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_errbar
但这个图有很多问题,或者说是错误的
-图中只用了一个点和两个误差棒,丢失了很多分布信息。
读者不能很直观的读出这个点的含义(是均值还是中位数?)
误差棒代表的含义不明确(标准差?标准误?还是其他?)
----通过看代码,知道这里用的是,均值加减2倍的标准差,其目的是想表达这个范围涵盖了95%的的数据。 事实上,误差棒一般用于标准误(或者加减2倍的标准误来代表估计均值的95%置信区间),所以这里使用标准差就造成了混淆。
-
----( 标准误:对样本均值估计的不确定性; 标准差:对偏离均值的分散程度 )
-
为了解决以上问题,可以使用箱线图(boxplot),箱线图将数据分成若干段,如图 55.2.
-那么气温分布用箱线图画出来 (图 55.3)。 我们可以看到12月份数据 -偏态(绝大部分时候中等的冷,少部分是极度寒冷),其他月份,比如7月份,数据分布的比较正态
-<- lincoln_df %>%
- lincoln_box ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- geom_boxplot(fill = "grey90") +
- xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_box
箱线图是1970年代统计学家发明的一种可视化方法,这种图可以很方便地用手工画出,所以当时很流行,现在计算机性能大大提升了,所以大家喜欢用视觉上更直观的小提琴图取代箱线图
-在图 55.6, 我们使用小提琴图画图气温分布,可以看到,11月份的时候,有两个高密度区间(两个峰,50 degrees 和 35 degrees Fahrenheit),注意,这个信息在前面两个图中是没有的。
-<- lincoln_df %>%
- lincoln_violin ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- geom_violin(fill = "grey90") +
- xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_violin
事实上,小提琴图也是不完美的,用的是密度分布图,会造成没有数据点的地方,也会有分布。怎么解决呢?
-解决办法就是,把原始数据点打上去,
-<- lincoln_df %>%
- lincoln_points ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- geom_point(size = 0.75) +
- xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_points
但问题又来了,这样会有大量重叠的点。有时候会采用透明度的办法,即给每个点设置透明度,某个位置颜色越深,说明这个位置重叠的越多。当然,最好的办法是,给每个点增加一个随机的很小的“偏移”,即抖散图。
-<- lincoln_df %>%
- lincoln_jitter ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- geom_point(position = position_jitter(width = .15, height = 0, seed = 320), size = 0.75) +
- xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(
- color = "black",
- size = 12
-
- ),plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_jitter
于是,(小提琴图 + 抖散图)= sina 图,这样既可以看到原始的点,又可以看到统计分布,见图 55.9.
-<- lincoln_df %>%
- lincoln_sina ggplot(aes(x = month_short, y = `Mean Temperature [F]`)) +
- geom_violin(color = "transparent", fill = "gray90") +
- # dviz.supp::stat_sina(size = 0.85) +
- geom_jitter(width = 0.25, size = 0.85) +
- xlab("month") +
- ylab("mean temperature (°F)") +
- theme_classic(base_size = 14) +
- theme(
- axis.text = element_text(
- color = "black",
- size = 12
-
- ),plot.margin = margin(3, 7, 3, 1.5)
-
- )
- lincoln_sina
前面的图,分组变量(月份)是顺着x轴,这里介绍的山峦图(重山叠叠的感觉)分组变量是顺着y轴,这种图,在画不同时间的分布图的时候,效果非常不错。 比如图 55.10, 展示气温分布的山峦图。同样,图中很直观地展示了11月份的气温分布有两个峰值。
-<- 3.4
- bandwidth
-%>%
- lincoln_df ggplot(aes(x = `Mean Temperature [F]`, y = `Month`)) +
- geom_density_ridges(
- scale = 3, rel_min_height = 0.01,
- bandwidth = bandwidth, fill = colorspace::lighten("#56B4E9", .3), color = "white"
- +
- ) scale_x_continuous(
- name = "mean temperature (°F)",
- expand = c(0, 0), breaks = c(0, 25, 50, 75)
- +
- ) scale_y_discrete(name = NULL, expand = c(0, .2, 0, 2.6)) +
- theme_minimal(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- axis.text.y = element_text(vjust = 0),
- plot.margin = margin(3, 7, 3, 1.5)
- )
但这种图,也有一个问题,y轴是分组变量,x轴是数据的密度分布,缺少了密度分布的标度(即,缺少了密度图的高度,事实上,小提琴图也有这个毛病),所以这种图不适合比较精确的密度分布展示,但在探索性分析中,比较不同分组的密度分布,可以很方便获取直观的认知感受。
-我们看到
-因此,可以将气温变量映射到位置属性和颜色属性,见图 55.11
-<- 3.4
- bandwidth
-<- lincoln_weather %>%
- lincoln_base ggplot(aes(x = `Mean Temperature [F]`, y = `Month`, fill = ..x..)) +
- geom_density_ridges_gradient(
- scale = 3, rel_min_height = 0.01, bandwidth = bandwidth,
- color = "black", size = 0.25
- +
- ) scale_x_continuous(
- name = "mean temperature (°F)",
- expand = c(0, 0), breaks = c(0, 25, 50, 75), labels = NULL
- +
- ) scale_y_discrete(name = NULL, expand = c(0, .2, 0, 2.6)) +
- ::scale_fill_continuous_sequential(
- colorspacepalette = "Heat",
- l1 = 20, l2 = 100, c2 = 0,
- rev = FALSE
- +
- ) guides(fill = "none") +
- theme_minimal(base_size = 14) +
- theme(
- axis.text = element_text(color = "black", size = 12),
- axis.text.y = element_text(vjust = 0),
- plot.margin = margin(3, 7, 3, 1.5)
-
- )
-
-# x axis labels
-<- data.frame(temp = c(0, 25, 50, 75))
- temps
-# calculate corrected color ranges
-# stat_joy uses the +/- 3*bandwidth calculation internally
-<- min(lincoln_weather$`Mean Temperature [F]`) - 3 * bandwidth
- tmin <- max(lincoln_weather$`Mean Temperature [F]`) + 3 * bandwidth
- tmax
-<- axis_canvas(lincoln_base, axis = "x", ylim = c(0, 2)) +
- xax geom_ridgeline_gradient(
- data = data.frame(temp = seq(tmin, tmax, length.out = 100)),
- aes(x = temp, y = 1.1, height = .9, fill = temp),
- color = "transparent"
- +
- ) geom_text(
- data = temps, aes(x = temp, label = temp),
- color = "black",
- y = 0.9, hjust = 0.5, vjust = 1, size = 14 / .pt
- +
- ) ::scale_fill_continuous_sequential(
- colorspacepalette = "Heat",
- l1 = 20, l2 = 100, c2 = 0,
- rev = FALSE
-
- )
-<- cowplot::insert_xaxis_grob(lincoln_base, xax, position = "bottom", height = unit(0.1, "null"))
- lincoln_final
-ggdraw(lincoln_final)
本章的数据和代码来源于《Fundamentals of Data Visualization》的第9章和第20章。感谢Claus O. Wilke为大家写了这本非常好的书。
- -上节课介绍了R语言的基本数据结构,可能大家有种看美剧的感觉,有些懵。这很正常,我在开始学习R的时候,感觉和大家一样,所以不要惊慌,我们后面会慢慢填补这些知识点。
-这节课,我们介绍R语言最强大的可视化,看看都有哪些炫酷的操作。
-library(tidyverse) # install.packages("tidyverse")
-library(patchwork) # install.packages("patchwork")
我们先从一个故事开始,1854年伦敦爆发严重霍乱,当时流行的观点是霍乱是通过空气传播的,而John Snow医生(不是《权力的游戏》里的 Jon Snow)研究发现,霍乱是通过饮用水传播的。研究过程中,John Snow医生统计每户病亡人数,每死亡一人标注一条横线,分析发现,大多数病例的住所都围绕在Broad Street水泵附近,结合其他证据得出饮用水传播的结论,于是移掉了Broad Street水泵的把手,霍乱最终得到控制。
- -另一个有趣的例子就是辛普森悖论(Simpson’s Paradox)。比如我们想研究下,学习时间和考试成绩的关联。结果发现两者呈负相关性,即补课时间越长,考试成绩反而越差(下图横坐标是学习时间,纵坐标是考试成绩),很明显这个结果有违生活常识。
- -事实上,当我们把学生按照不同年级分成五组,再来观察学习时间和考试成绩之间的关联,发现相关性完全逆转了! 我们可以看到学习时间和考试成绩强烈正相关。
-辛普森悖论在日常生活中层出不穷。 那么如何避免辛普森悖论呢?我们能做的,就是仔细地研究分析各种影响因素,不要笼统概括地、浅尝辄止地看问题。其中,可视化分析为我们提供了一个好的方法。
-我们在图中画一个点,那么这个点就有(形状,大小,颜色,位置,透明度)等属性, -这些属性就是图形属性(有时也称之为图形元素或者视觉元素),下图 7.1列出了常用的图形属性。
-数据可视化的过程,就是我们的数据通过这些视觉上的元素表示出来,即,数值到图形属性的转换(映射)过程。
-ggplot2是RStudio首席科学家Hadley Wickham在2005年读博士期间的作品。很多人学习R语言,就是因为ggplot2宏包。目前, -ggplot2已经发展成为最受欢迎的R宏包,没有之一。
-我们可以看看它2019年cran的下载量
-library(cranlogs)
-
-<- cran_downloads(package = "ggplot2", from = "2019-01-01", to = "2019-12-31")
- d
-sum(d$count)
## [1] 9889742
-ggplot2有一套优雅的绘图语法,包名中“gg”是grammar of graphics的简称。
- - - - -ggplot()
函数包括9个部件:
其中前三个是必需的。
-Hadley Wickham将这套可视化语法诠释为:
-一张统计图形就是从数据到几何对象(geometric object,缩写geom)的图形属性(aesthetic attribute,缩写aes)的一个映射。
- -此外,图形中还可能包含数据的统计变换(statistical transformation,缩写stats),最后绘制在某个特定的坐标系(coordinate -system,缩写coord)中,而分面(facet)则可以用来生成数据不同子集的图形。
-先看一个简单的案例(1880-2014年温度变化和二氧化碳排放量)
-library(tidyverse)
-<- read_csv("./demo_data/temp_carbon.csv")
- d d
## # A tibble: 135 x 5
-## year temp_anomaly land_anomaly ocean_anomaly
-## <dbl> <dbl> <dbl> <dbl>
-## 1 1880 -0.11 -0.48 -0.01
-## 2 1881 -0.08 -0.4 0.01
-## 3 1882 -0.1 -0.48 0
-## 4 1883 -0.18 -0.66 -0.04
-## 5 1884 -0.26 -0.69 -0.14
-## 6 1885 -0.25 -0.56 -0.17
-## 7 1886 -0.24 -0.51 -0.17
-## 8 1887 -0.28 -0.47 -0.23
-## 9 1888 -0.13 -0.41 -0.05
-## 10 1889 -0.09 -0.31 -0.02
-## # ... with 125 more rows, and 1 more variable:
-## # carbon_emissions <dbl>
-library(ggplot2)
-ggplot(data = d, mapping = aes(x = year, y = carbon_emissions)) +
-geom_line() +
- xlab("Year") +
- ylab("Carbon emissions (metric tons)") +
- ggtitle("Annual global carbon emissions, 1880-2014")
是不是很简单?
-我们这里用科考人员收集的企鹅体征数据来演示。
- -library(tidyverse)
-<- read_csv("./demo_data/penguins.csv") %>%
- penguins ::clean_names() %>%
- janitordrop_na()
-
-%>%
- penguins head()
## # A tibble: 6 x 8
-## species island bill_length_mm bill_depth_mm
-## <chr> <chr> <dbl> <dbl>
-## 1 Adelie Torge~ 39.1 18.7
-## 2 Adelie Torge~ 39.5 17.4
-## 3 Adelie Torge~ 40.3 18
-## 4 Adelie Torge~ 36.7 19.3
-## 5 Adelie Torge~ 39.3 20.6
-## 6 Adelie Torge~ 38.9 17.8
-## # ... with 4 more variables: flipper_length_mm <dbl>,
-## # body_mass_g <dbl>, sex <chr>, year <dbl>
-variable | -class | -description | -
---|---|---|
species | -integer | -企鹅种类 (Adelie, Gentoo, Chinstrap) | -
island | -integer | -所在岛屿 (Biscoe, Dream, Torgersen) | -
bill_length_mm | -double | -嘴峰长度 (单位毫米) | -
bill_depth_mm | -double | -嘴峰深度 (单位毫米) | -
flipper_length_mm | -integer | -鰭肢长度 (单位毫米) | -
body_mass_g | -integer | -体重 (单位克) | -
sex | -integer | -性别 | -
year | -integer | -记录年份 | -
这里提出一个问题,嘴巴越长,嘴巴也会越厚?
- -回答这个问题,我们用到penguins数据集其中的四个变量
-%>%
- penguins select(species, sex, bill_length_mm, bill_depth_mm) %>%
- head(4)
为考察嘴峰长度(bill_length_mm)与嘴峰深度(bill_depth_mm)之间的关联,先绘制这两个变量的散点图,
- -ggplot()
表示调用该函数画图,data = penguins
表示使用penguins这个数据框来画图。
aes()
表示数值和视觉属性之间的映射。
aes(x = bill_length_mm, y = bill_depth_mm)
,意思是变量bill_length_mm作为(映射为)x轴方向的位置,变量bill_depth_mm作为(映射为)y轴方向的位置。
aes()
除了位置上映射,还可以实现色彩、形状或透明度等视觉属性的映射。
geom_point()
表示绘制散点图。
+
表示添加图层。
运行脚本后生成图片:
- -刚才看到的是位置上的映射,ggplot()
还包含了颜色、形状以及透明度等图形属性的映射,
比如我们在aes()
里增加一个颜色映射color = species
, 这样做就是希望,不同的企鹅类型, 用不同的颜色来表现。这里,企鹅类型有三组,那么就用三种不同的颜色来表示
ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
- geom_point()
此图绘制不同类型的企鹅,嘴峰长度与嘴峰深度散点图,并用颜色来实现了分组。
-大家试试下面代码呢,
-ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm, size = species)) +
- geom_point()
ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm, shape = species)) +
- geom_point()
ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm, alpha = species)) +
- geom_point()
为什么图中是这样的颜色呢?那是因为ggplot()
内部有一套默认的设置
不喜欢默认的颜色,可以自己定义喔。请往下看
-想把图中的点指定为某一种颜色,可以使用设置语句,比如
-ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point(color = "blue")
大家也可以试试下面
-ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point(size = 5)
ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point(shape = 2)
ggplot(penguins,
-aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point(alpha = 0.5)
思考下左图中aes(color = "blue")
为什么会变成了红色的点?
geom_point()
可以画散点图,也可以使用geom_smooth()
绘制平滑曲线,
<-
- p1 ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point()
- p1
<-
- p2 ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_smooth()
- p2
<-
- p3 ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_point() +
- geom_smooth()
- p3
library(patchwork)
-/ p2) | p3 (p1
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
-geom_point()
ggplot(penguins) +
-geom_point(aes(x = bill_length_mm, y = bill_depth_mm, color = species))
大家可以看到,以上两段代码出来的图是一样。但背后的含义却不同。
-事实上,如果映射关系aes()
写在ggplot()
里,
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
-geom_point()
那么映射关系x = bill_length_mm, y = bill_depth_mm, color = species
为全局变量。因此,当geom_point()
画图时,发现缺少所绘图所需要的映射关系(点的位置、点的大小、点的颜色等等),就会从ggplot()
全局变量中继承映射关系。
如果映射关系 aes()
写在几何对象geom_point()
里, 那么此处的映射关系就为局部变量, 比如。
ggplot(penguins) +
-geom_point(aes(x = bill_length_mm, y = bill_depth_mm, color = species))
此时geom_point()
绘图所需要的映射关系aes(x = bill_length_mm, y = bill_depth_mm, color = species)
已经存在,就不会继承全局变量的映射关系。
再看下面这个例子,
-ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
-geom_point(aes(color = species)) +
- geom_smooth()
这里的 geom_point()
和 geom_smooth()
都会从全局变量中继承位置映射关系。
再看下面这个例子,
-ggplot(penguins,aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
-geom_point(aes(color = sex))
局部变量中的映射关系
-aes(color = )
已经存在,因此不会从全局变量中继承,沿用当前的映射关系。
大家细细体会下,下面两段代码的区别
-ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm, color = species)) +
-geom_smooth(method = lm) +
- geom_point()
ggplot(penguins, aes(x = bill_length_mm, y = bill_depth_mm)) +
-geom_smooth(method = lm) +
- geom_point(aes(color = species))
可以使用ggsave()
函数,将图片保存为所需要的格式,如“.pdf,” “.png”等
<- penguins %>%
- p ggplot(aes(x = bill_length_mm, y = bill_depth_mm)) +
- geom_smooth(method = lm) +
- geom_point(aes(color = species)) +
- ggtitle("This is my first plot")
-
-
-ggsave(
-filename = "myfirst_plot.pdf",
- plot = p,
- width = 8,
- height = 6,
- dpi = 300
- )
补充代码,要求在一张图中画出
-ggplot(penguins, aes(x = ___, y = ___)) +
-geom_point() +
- geom_smooth() +
- geom_smooth()
library(tidyverse)
-library(palmerpenguins)
为了让图更好看,需要在画图中使用配色,但如果从颜色的色相、色度、明亮度三个属性(Hue-Chroma-Luminance )开始学,感觉这样要学的东西太多了 😞. 事实上,大神们已经为我们准备好了很多好看的模板,我们可以偷懒直接拿来用🎵.
-我个人比较喜欢colorspace中的配色,今天我们就讲讲如何使用这个宏包!
-library(colorspace)
colorspace
宏包提供了三种类型的配色模板:
三种类型对应着三个函数 qualitative_hcl()
, sequential_hcl()
, 和 diverging_hcl()
.
hcl_palettes(plot = TRUE)
ggplot2默认
-%>%
- penguins ggplot(aes(bill_length_mm, fill = species)) +
- geom_density(alpha = 0.6)
手动修改
-%>%
- penguins ggplot(aes(bill_length_mm, fill = species)) +
- geom_density(alpha = 0.6) +
- scale_fill_manual(
- breaks = c("Adelie", "Chinstrap", "Gentoo"),
- values = c("darkorange", "purple", "cyan4")
- )
模板配色
-%>%
- penguins ggplot(aes(bill_length_mm, fill = species)) +
- geom_density(alpha = 0.6) +
- scale_fill_discrete_qualitative(palette = "cold")
ggplot2的强大,还在于它的扩展包。本章在介绍ggplot2新的内容的同时还会引入一些新的宏包,需要提前安装
-install.packages(c("sf", "cowplot", "patchwork", "gghighlight", "ggforce"))
如果安装不成功,请先update宏包,再执行上面安装命令
- -library(tidyverse)
-library(gghighlight)
-library(cowplot)
-library(patchwork)
-library(ggforce)
-library(ggridges)
<- ggplot(mpg, aes(x = cty, y = hwy)) +
- p1 geom_point() +
- geom_smooth() +
- labs(title = "1: geom_point() + geom_smooth()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, y = hwy)) +
- p2 geom_hex() +
- labs(title = "2: geom_hex()") +
- guides(fill = FALSE) +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = drv, fill = drv)) +
- p3 geom_bar() +
- labs(title = "3: geom_bar()") +
- guides(fill = FALSE) +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty)) +
- p4 geom_histogram(binwidth = 2, color = "white") +
- labs(title = "4: geom_histogram()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, y = drv, fill = drv)) +
- p5 geom_violin() +
- guides(fill = FALSE) +
- labs(title = "5: geom_violin()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, y = drv, fill = drv)) +
- p6 geom_boxplot() +
- guides(fill = FALSE) +
- labs(title = "6: geom_boxplot()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, fill = drv)) +
- p7 geom_density(alpha = 0.7) +
- guides(fill = FALSE) +
- labs(title = "7: geom_density()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, y = drv, fill = drv)) +
- p8 geom_density_ridges() +
- guides(fill = FALSE) +
- labs(title = "8: ggridges::geom_density_ridges()") +
- theme(plot.title = element_text(face = "bold"))
-
-<- ggplot(mpg, aes(x = cty, y = hwy)) +
- p9 geom_density_2d() +
- labs(title = "9: geom_density_2d()") +
- theme(plot.title = element_text(face = "bold"))
-
-+ p2 + p3 + p4 + p5 + p6 + p7 + p8 + p9 +
- p1 plot_layout(nrow = 3)
<- read_csv("./demo_data/gapminder.csv")
- gapdata gapdata
## # A tibble: 1,704 x 6
-## country continent year lifeExp pop gdpPercap
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Afghanist~ Asia 1952 28.8 8.43e6 779.
-## 2 Afghanist~ Asia 1957 30.3 9.24e6 821.
-## 3 Afghanist~ Asia 1962 32.0 1.03e7 853.
-## 4 Afghanist~ Asia 1967 34.0 1.15e7 836.
-## 5 Afghanist~ Asia 1972 36.1 1.31e7 740.
-## 6 Afghanist~ Asia 1977 38.4 1.49e7 786.
-## 7 Afghanist~ Asia 1982 39.9 1.29e7 978.
-## 8 Afghanist~ Asia 1987 40.8 1.39e7 852.
-## 9 Afghanist~ Asia 1992 41.7 1.63e7 649.
-## 10 Afghanist~ Asia 1997 41.8 2.22e7 635.
-## # ... with 1,694 more rows
-%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- ggtitle("My Plot Title") +
- xlab("The X Variable") +
- ylab("The Y Variable")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- labs(
- title = "My Plot Title",
- subtitle = "My Plot subtitle",
- x = "The X Variable",
- y = "The Y Variable"
- )
我喜欢用这两个函数定制喜欢的绘图色彩,scale_colour_manual()
和 scale_fill_manual()
. 更多方法可以参考 Colours chapter in Cookbook for R
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- scale_color_manual(
- values = c("#195744", "#008148", "#C6C013", "#EF8A17", "#EF2917")
- )
我们有时候想把多张图组合到一起
-可以使用 cowplot
宏包的plot_grid()
函数完成多张图片的组合,使用方法很简单。
<- gapdata %>%
- p1 ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = lifeExp > mean(lifeExp))) +
- scale_x_log10() +
- theme(legend.position = "none") +
- scale_color_manual(values = c("orange", "pink")) +
- labs(
- title = "My Plot Title",
- x = "The X Variable",
- y = "The Y Variable"
- )
<- gapdata %>%
- p2 ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- scale_color_manual(
- values = c("#195744", "#008148", "#C6C013", "#EF8A17", "#EF2917")
- +
- ) theme(legend.position = "none") +
- labs(
- title = "My Plot Title",
- x = "The X Variable",
- y = "The Y Variable"
- )
::plot_grid(
- cowplot
- p1,
- p2,labels = c("A", "B")
- )
也可以使用patchwork宏包,更简单的方法
-library(patchwork)
-+ p2 p1
/ p2 p1
+ p2 +
- p1 plot_annotation(
- tag_levels = "A",
- title = "The surprising truth about mtcars",
- subtitle = "These 3 plots will reveal yet-untold secrets about our beloved data-set",
- caption = "Disclaimer: None of these plots are insightful"
- )
再来一个
-library(palmerpenguins)
-
-<- penguins %>%
- g1 ggplot(aes(bill_length_mm, body_mass_g, color = species)) +
- geom_point() +
- theme_bw(base_size = 14) +
- labs(tag = "(A)", x = "Bill length (mm)", y = "Body mass (g)", color = "Species")
-
- <- penguins %>%
- g2 ggplot(aes(bill_length_mm, bill_depth_mm, color = species)) +
- geom_point() +
- theme_bw(base_size = 14) +
- labs(tag = "(B)", x = "Bill length (mm)", y = "Bill depth (mm)", color = "Species")
-
- + g2 + patchwork::plot_layout(guides = "collect") g1
patchwork 使用方法很简单,根本不需要记
- -使用ggsave()
函数,将图片保存为所需要的格式,如“.pdf,” “.png”等, 还可以指定图片的高度和宽度,默认units
是英寸,也可以使用“cm,” or “mm.”
<- gapdata %>%
- pp ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- scale_color_manual(
- values = c("#195744", "#008148", "#C6C013", "#EF8A17", "#EF2917")
- +
- ) theme(legend.position = "none") +
- labs(
- title = "My Plot Title",
- x = "The X Variable",
- y = "The Y Variable"
-
- )
-# ggsave("demo_plot.pdf", plot = pp, width = 8, height = 6)
library(showtext)
-showtext_auto()
-
-%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- scale_x_log10() +
- scale_color_manual(
- values = c("#195744", "#008148", "#C6C013", "#EF8A17", "#EF2917")
- +
- ) theme(legend.position = "none") +
- labs(
- title = "这是我的标题美美哒",
- x = "这是我的x坐标",
- y = "这是我的y坐标"
- )
# ggsave("myfirst.pdf", width = 8, height = 6)
画图很容易,然而画一张好图,不容易。图片质量好不好,其原则就是不增加看图者的心智负担,有些图片的色彩很丰富,然而需要看图人配合文字和图注等信息才能看懂作者想表达的意思,这样就失去了图片“一图胜千言”的价值。
-分析数据过程中,我们可以使用高亮我们某组数据,突出我们想表达的信息,是非常好的一种可视化探索手段。
-这种方法是将背景部分和高亮部分分两步来画
-<- function(x) select(x, -continent)
- drop_facet
-%>%
- gapdata ggplot() +
- geom_line(
- data = drop_facet,
- aes(x = year, y = lifeExp, group = country), color = "grey",
- +
- ) geom_line(aes(x = year, y = lifeExp, color = country, group = country)) +
- facet_wrap(vars(continent)) +
- theme(legend.position = "none")
再来一个
-%>%
- gapdata mutate(group = country) %>%
- filter(continent == "Asia") %>%
- ggplot() +
- geom_line(
- data = function(d) select(d, -country),
- aes(x = year, y = lifeExp, group = group), color = "grey",
- +
- ) geom_line(aes(x = year, y = lifeExp, group = country), color = "red") +
- facet_wrap(vars(country)) +
- theme(legend.position = "none")
这里推荐gghighlight宏包
-%>% filter(country == "China") gapdata
## # A tibble: 12 x 6
-## country continent year lifeExp pop gdpPercap
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 China Asia 1952 44 556263527 400.
-## 2 China Asia 1957 50.5 637408000 576.
-## 3 China Asia 1962 44.5 665770000 488.
-## 4 China Asia 1967 58.4 754550000 613.
-## 5 China Asia 1972 63.1 862030000 677.
-## 6 China Asia 1977 64.0 943455000 741.
-## 7 China Asia 1982 65.5 1000281000 962.
-## 8 China Asia 1987 67.3 1084035000 1379.
-## 9 China Asia 1992 68.7 1164970000 1656.
-## 10 China Asia 1997 70.4 1230075000 2289.
-## 11 China Asia 2002 72.0 1280400000 3119.
-## 12 China Asia 2007 73.0 1318683096 4959.
-%>%
- gapdata ggplot(
- aes(x = year, y = lifeExp, color = continent, group = country)
- +
- ) geom_line() +
- gghighlight(
- == "China", # which is passed to dplyr::filter().
- country label_key = country
- )
%>% filter(continent == "Asia") gapdata
## # A tibble: 396 x 6
-## country continent year lifeExp pop gdpPercap
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Afghanist~ Asia 1952 28.8 8.43e6 779.
-## 2 Afghanist~ Asia 1957 30.3 9.24e6 821.
-## 3 Afghanist~ Asia 1962 32.0 1.03e7 853.
-## 4 Afghanist~ Asia 1967 34.0 1.15e7 836.
-## 5 Afghanist~ Asia 1972 36.1 1.31e7 740.
-## 6 Afghanist~ Asia 1977 38.4 1.49e7 786.
-## 7 Afghanist~ Asia 1982 39.9 1.29e7 978.
-## 8 Afghanist~ Asia 1987 40.8 1.39e7 852.
-## 9 Afghanist~ Asia 1992 41.7 1.63e7 649.
-## 10 Afghanist~ Asia 1997 41.8 2.22e7 635.
-## # ... with 386 more rows
-%>%
- gapdata filter(continent == "Asia") %>%
- ggplot(aes(year, lifeExp, color = country, group = country)) +
- geom_line(size = 1.2, alpha = .9, color = "#E58C23") +
- theme_minimal(base_size = 14) +
- theme(
- legend.position = "none",
- panel.grid.major.x = element_blank(),
- panel.grid.minor = element_blank()
- +
- ) gghighlight(
- %in% c("China", "India", "Japan", "Korea, Rep."),
- country use_group_by = FALSE,
- use_direct_label = FALSE,
- unhighlighted_params = list(color = "grey90")
- +
- ) facet_wrap(vars(country))
有时候我们想画一个函数图,比如正态分布的函数,可能会想到先产生数据,然后画图,比如下面的代码
-tibble(x = seq(from = -3, to = 3, by = .01)) %>%
-mutate(y = dnorm(x, mean = 0, sd = 1)) %>%
- ggplot(aes(x = x, y = y)) +
- geom_line(color = "grey33")
事实上,stat_function()
可以简化这个过程
ggplot(data = data.frame(x = c(-3, 3)), aes(x = x)) +
-stat_function(fun = dnorm)
当然我们也可以绘制自定义函数
-<- function(x) {
- myfun - 1)**2
- (x
- }
-ggplot(data = data.frame(x = c(-1, 3)), aes(x = x)) +
-stat_function(fun = myfun, geom = "line", colour = "red")
下面这是一个很不错的例子,细细体会下
-<- tibble(x = rnorm(2000, mean = 2, sd = 4))
- d
-ggplot(data = d, aes(x = x)) +
-geom_histogram(aes(y = stat(density))) +
- geom_density() +
- stat_function(fun = dnorm, args = list(mean = 2, sd = 4), colour = "red")
--小时候画地图很容易,长大了画地图却不容易了。
-
这是一个公园🏞地图和公园里松鼠🐿数量的数据集
-<- read_csv("./demo_data/nyc_squirrels.csv")
- nyc_squirrels <- sf::read_sf("./demo_data/central_park") central_park
先来一个地图,
-ggplot() +
-geom_sf(data = central_park)
一个geom_sf
就搞定了🥂,貌似没那么难呢? 好吧,换个姿势,在地图上标注松鼠出现的位置
%>%
- nyc_squirrels drop_na(primary_fur_color) %>%
- ggplot() +
- geom_sf(data = central_park, color = "grey85") +
- geom_point(
- aes(x = long, y = lat, color = primary_fur_color),
- size = .8
- )
分开画呢
-%>%
- nyc_squirrels drop_na(primary_fur_color) %>%
- ggplot() +
- geom_sf(data = central_park, color = "grey85") +
- geom_point(
- aes(x = long, y = lat, color = primary_fur_color),
- size = .8
- +
- ) facet_wrap(vars(primary_fur_color)) +
- theme(legend.position = "none")
<-
- label_colors c("all squirrels" = "grey75", "highlighted group" = "#0072B2")
-
-%>%
- nyc_squirrels drop_na(primary_fur_color) %>%
- ggplot() +
- geom_sf(data = central_park, color = "grey85") +
- geom_point(
- data = function(x) select(x, -primary_fur_color),
- aes(x = long, y = lat, color = "all squirrels"),
- size = .8
- +
- ) geom_point(
- aes(x = long, y = lat, color = "highlighted group"),
- size = .8
- +
- ) ::theme_map(16) +
- cowplottheme(
- legend.position = "bottom",
- legend.justification = "center"
- +
- ) facet_wrap(vars(primary_fur_color)) +
- scale_color_manual(name = NULL, values = label_colors) +
- guides(color = guide_legend(override.aes = list(size = 2)))
# ggsave("Squirrels.pdf", width = 9, height = 6)
当然,也可以用gghighlight
的方法
%>%
- nyc_squirrels drop_na(primary_fur_color) %>%
- ggplot() +
- geom_sf(data = central_park, color = "grey85") +
- geom_point(
- aes(x = long, y = lat, color = primary_fur_color),
- size = .8
- +
- ) gghighlight(
- label_key = primary_fur_color,
- use_direct_label = FALSE
- +
- ) facet_wrap(vars(primary_fur_color)) +
- ::theme_map(16) +
- cowplottheme(legend.position = "none")
library(ggplot2)
-library(showtext)
-showtext_auto()
-
-font_families()
## [1] "sans" "serif" "mono"
-## [4] "wqy-microhei"
-font_paths()
## [1] "C:\\Windows\\Fonts"
-# font_files()
-
-## Add fonts that are available on Windows(默认路径"C:\\Windows\\Fonts")
-font_add("heiti", "simhei.ttf")
-font_add("constan", "constan.ttf", italic = "constani.ttf")
-font_add("kaishu", "simkai.ttf")
-# font_add("Noto", "NotoSansCJKsc-Regular.otf")
-font_add("Yahei", "Yahei.ttf")
-
-# 也可放在指定的目录(尽量英文)
-# https://github.com/yixuan/showtext/issues/18
-font_add("fzfsj", here::here("myfont", "fzfsj.ttf"))
-font_add("fzxbsj", here::here("myfont", "FZXBSJW.ttf"))
-font_add("maoti", here::here("myfont", "maoti.ttf"))
-font_add("fzshuliu", here::here("myfont", "fzshuliu.ttf"))
-font_families()
## [1] "sans" "serif" "mono"
-## [4] "wqy-microhei" "heiti" "constan"
-## [7] "kaishu" "Yahei" "fzfsj"
-## [10] "fzxbsj" "maoti" "fzshuliu"
-## maybe, 保存为pdf图,才能看到有效字体
-ggplot(data = mpg) +
-geom_point(mapping = aes(x = displ, y = hwy)) +
- ggtitle("这是我的小标宋简体") +
- theme(
- plot.title = element_text(family = "fzxbsj")
- +
- ) geom_text(aes(x = 5, y = 40),
- label = "方正仿宋简体",
- family = "fzfsj"
- +
- ) geom_text(aes(x = 5, y = 38),
- label = "这是我的雅黑",
- family = "Yahei"
- +
- ) geom_text(aes(x = 5, y = 35),
- label = "方正楷书简体",
- family = "kaishu"
- +
- ) geom_text(aes(x = 5, y = 30),
- label = "草檀斋毛泽东字体",
- family = "maoti"
- +
- ) geom_text(aes(x = 5, y = 28),
- label = "方正苏新诗柳楷简体",
- family = "fzshuliu"
- )
# ggsave("showtext-example-9.pdf", width = 7, height = 4, dpi = 200)
根据往年大家提交的作业,有同学用rmarkdown生成pdf,图片标题使用了中文字体,但中文字体无法显示
-。解决方案是R code chunks加上fig.showtext=TRUE
```{r, fig.showtext=TRUE}
详细资料可参考这里
-library(ggplot2)
-library(latex2exp)
-
-ggplot(mpg, aes(x = displ, y = hwy)) +
-geom_point() +
- annotate("text",
- x = 4, y = 40,
- label = TeX("$\\alpha^2 + \\theta^2 = \\omega^2 $"),
- size = 9
- +
- ) labs(
- title = TeX("The ratio of 1 and 2 is $\\,\\, \\frac{1}{2}$"),
- x = TeX("$\\alpha$"),
- y = TeX("$\\alpha^2$")
- )
--采菊东篱下,悠然见南山。
-
根据大家投票,觉得ggplot2
是最想掌握的技能,我想这就是R语言中最有质感的部分吧。所以,这里专门拿出一节课讲ggplot2
,也算是补上之前第 7 章数据可视化没讲的内容。
library(tidyverse)
先看一组数据
-<- read_csv("./demo_data/datasaurus.csv")
- df df
## # A tibble: 1,846 x 3
-## dataset x y
-## <chr> <dbl> <dbl>
-## 1 dino 55.4 97.2
-## 2 dino 51.5 96.0
-## 3 dino 46.2 94.5
-## 4 dino 42.8 91.4
-## 5 dino 40.8 88.3
-## 6 dino 38.7 84.9
-## 7 dino 35.6 79.9
-## 8 dino 33.1 77.6
-## 9 dino 29.0 74.5
-## 10 dino 26.2 71.4
-## # ... with 1,836 more rows
-先用dataset
分组后,然后计算每组下x
的均值和方差,y
的均值和方差,以及x,y
两者的相关系数,我们发现每组数据下它们几乎都是相等的
%>%
- df group_by(dataset) %>%
- summarise(
- across(everything(), list(mean = mean, sd = sd), .names = "{fn}_{col}")
- %>%
- ) mutate(
- across(is.numeric, round, 3)
- )
## # A tibble: 13 x 5
-## dataset mean_x sd_x mean_y sd_y
-## <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 away 54.3 16.8 47.8 26.9
-## 2 bullseye 54.3 16.8 47.8 26.9
-## 3 circle 54.3 16.8 47.8 26.9
-## 4 dino 54.3 16.8 47.8 26.9
-## 5 dots 54.3 16.8 47.8 26.9
-## 6 h_lines 54.3 16.8 47.8 26.9
-## 7 high_lines 54.3 16.8 47.8 26.9
-## 8 slant_down 54.3 16.8 47.8 26.9
-## 9 slant_up 54.3 16.8 47.8 26.9
-## 10 star 54.3 16.8 47.8 26.9
-## 11 v_lines 54.3 16.8 47.8 26.9
-## 12 wide_lines 54.3 16.8 47.8 26.9
-## 13 x_shape 54.3 16.8 47.8 26.9
-如果上面代码不熟悉,可以用第 6 章的代码重新表达,也是一样的
-%>%
- df group_by(dataset) %>%
- summarize(
- mean_x = mean(x),
- mean_y = mean(y),
- std_dev_x = sd(x),
- std_dev_y = sd(y),
- corr_x_y = cor(x, y)
- )
## # A tibble: 13 x 6
-## dataset mean_x mean_y std_dev_x std_dev_y corr_x_y
-## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 away 54.3 47.8 16.8 26.9 -0.0641
-## 2 bullseye 54.3 47.8 16.8 26.9 -0.0686
-## 3 circle 54.3 47.8 16.8 26.9 -0.0683
-## 4 dino 54.3 47.8 16.8 26.9 -0.0645
-## 5 dots 54.3 47.8 16.8 26.9 -0.0603
-## 6 h_lines 54.3 47.8 16.8 26.9 -0.0617
-## 7 high_lin~ 54.3 47.8 16.8 26.9 -0.0685
-## 8 slant_do~ 54.3 47.8 16.8 26.9 -0.0690
-## 9 slant_up 54.3 47.8 16.8 26.9 -0.0686
-## 10 star 54.3 47.8 16.8 26.9 -0.0630
-## 11 v_lines 54.3 47.8 16.8 26.9 -0.0694
-## 12 wide_lin~ 54.3 47.8 16.8 26.9 -0.0666
-## 13 x_shape 54.3 47.8 16.8 26.9 -0.0656
-那么,我们是否能得出结论,每组的数据长的差不多呢?然而,我们画图发现
-ggplot(df, aes(x = x, y = y, colour = dataset)) +
-geom_point() +
- # geom_smooth(method = lm) +
- theme(legend.position = "none") +
- facet_wrap(~dataset, ncol = 3)
事实上,每张图都相差很大。所以,这里想说明的是,眼见为实。换句话说,可视化是数据探索中非常重要的部分。本章的目的就是带领大家学习ggplot2基本的绘图技能。
-data
: 数据框data.frame (注意,不支持向量vector和列表list类型)
aes
: 数据框中的数据变量映射到图形属性。什么叫图形属性?就是图中点的位置、形状,大小,颜色等眼睛能看到的东西。什么叫映射?就是一种对应关系,比如数学中的函数b = f(a)
就是a
和b
之间的一种映射关系, a
的值决定或者控制了b
的值,在ggplot2语法里,a
就是我们输入的数据变量,b
就是图形属性, 这些图形属性包括:
geoms
: 几何对象,确定我们想画什么样的图,一个geom_***
确定一种图形。更多几何对象推荐阅读这里
geom_bar()
geom_density()
geom_freqpoly()
geom_histogram()
geom_violin()
geom_boxplot()
geom_col()
geom_point()
geom_smooth()
geom_tile()
geom_density2d()
geom_bin2d()
geom_hex()
geom_count()
geom_text()
geom_sf()
stats
: 统计变换scales
: 标度coord
: 坐标系统facet
: 分面layer
: 增加图层theme
: 主题风格save
: 保存图片ggplot2图层语法框架
- --前面讲到R语言数据类型有字符串型、数值型、因子型、逻辑型、日期型等,ggplot2会将字符串型、因子型、逻辑型、日期型默认为离散变量,而数值型默认为连续变量。我们在而呈现数据的时候,可能会同时用到多种类型的数据,比如 -
--一个离散 -
--一个连续 -
--两个离散 -
--两个连续 -
--一个离散, 一个连续 -
--三个连续 -
-<- read_csv("./demo_data/gapminder.csv")
- gapdata gapdata
## # A tibble: 1,704 x 6
-## country continent year lifeExp pop gdpPercap
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Afghanist~ Asia 1952 28.8 8.43e6 779.
-## 2 Afghanist~ Asia 1957 30.3 9.24e6 821.
-## 3 Afghanist~ Asia 1962 32.0 1.03e7 853.
-## 4 Afghanist~ Asia 1967 34.0 1.15e7 836.
-## 5 Afghanist~ Asia 1972 36.1 1.31e7 740.
-## 6 Afghanist~ Asia 1977 38.4 1.49e7 786.
-## 7 Afghanist~ Asia 1982 39.9 1.29e7 978.
-## 8 Afghanist~ Asia 1987 40.8 1.39e7 852.
-## 9 Afghanist~ Asia 1992 41.7 1.63e7 649.
-## 10 Afghanist~ Asia 1997 41.8 2.22e7 635.
-## # ... with 1,694 more rows
-# 是否有缺失值
-
-%>%
- gapdata summarise(
- across(everything(), ~ sum(is.na(.)))
- )
## # A tibble: 1 x 6
-## country continent year lifeExp pop gdpPercap
-## <int> <int> <int> <int> <int> <int>
-## 1 0 0 0 0 0 0
-country
代表国家countinet
表示所在的洲year
时间lifeExp
平均寿命pop
人口数量gdpPercap
人均GDP-接下来,我们需要思考我们应该选择什么样的图,呈现这些不同类型的数据,探索数据背后的故事 -
-常用于一个离散变量
-%>%
- gapdata ggplot(aes(x = continent)) +
- geom_bar()
%>%
- gapdata ggplot(aes(x = reorder(continent, continent, length))) +
- geom_bar()
%>%
- gapdata ggplot(aes(x = reorder(continent, continent, length))) +
- geom_bar() +
- coord_flip()
# geom_bar vs stat_count
-%>%
- gapdata ggplot(aes(x = continent)) +
- stat_count()
%>% count(continent) gapdata
## # A tibble: 5 x 2
-## continent n
-## <chr> <int>
-## 1 Africa 624
-## 2 Americas 300
-## 3 Asia 396
-## 4 Europe 360
-## 5 Oceania 24
-可见,geom_bar() 自动完成了这个统计,更多geom与stat对应关系见这里
-%>%
- gapdata distinct(continent, country) %>%
- ggplot(aes(x = continent)) +
- geom_bar()
我个人比较喜欢先统计,然后画图
-%>%
- gapdata distinct(continent, country) %>%
- group_by(continent) %>%
- summarise(num = n()) %>%
- ggplot(aes(x = continent, y = num)) +
- geom_col()
常用于一个连续变量
-%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_histogram() # 对应的stat_bin()
%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_histogram(binwidth = 1)
#' histograms, 默认使用 `position = "stack"`
-%>%
- gapdata ggplot(aes(x = lifeExp, fill = continent)) +
- geom_histogram()
#' 使用`position = "identity"`
-%>%
- gapdata ggplot(aes(x = lifeExp, fill = continent)) +
- geom_histogram(position = "identity")
%>%
- gapdata ggplot(aes(x = lifeExp, color = continent)) +
- geom_freqpoly()
#' smooth histogram = densityplot
-%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_density()
如果不喜欢下面那条线,可以这样
-%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_line(stat = "density")
# adjust 调节bandwidth,
-# adjust = 1/2 means use half of the default bandwidth.
-%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_density(adjust = 1)
%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_density(adjust = 0.2)
%>%
- gapdata ggplot(aes(x = lifeExp, color = continent)) +
- geom_density()
%>%
- gapdata ggplot(aes(x = lifeExp, fill = continent)) +
- geom_density(alpha = 0.2)
%>%
- gapdata filter(continent != "Oceania") %>%
- ggplot(aes(x = lifeExp, fill = continent)) +
- geom_density(alpha = 0.2)
%>%
- gapdata ggplot(aes(x = lifeExp)) +
- geom_density() +
- # facet_wrap(vars(continent))
- facet_grid(. ~ continent)
%>%
- gapdata filter(continent != "Oceania") %>%
- ggplot(aes(x = lifeExp, fill = continent)) +
- geom_histogram() +
- facet_grid(continent ~ .)
直方图和密度图画在一起。注意y = stat(density)
表示y是由x新生成的变量,这是一种固定写法,类似的还有stat(count)
, stat(level)
%>%
- gapdata filter(continent != "Oceania") %>%
- ggplot(aes(x = lifeExp, y = stat(density))) +
- geom_histogram(aes(fill = continent)) +
- geom_density() +
- facet_grid(continent ~ .)
一个离散变量 + 一个连续变量
-#' 思考下结果为什么是这样?
-%>%
- gapdata ggplot(aes(x = year, y = lifeExp)) +
- geom_boxplot()
# 数据框中的year变量是数值型,需要先转换成因子型,弄成离散型变量
-%>%
- gapdata ggplot(aes(x = as.factor(year), y = lifeExp)) +
- geom_boxplot()
# 明确指定分组变量
-%>%
- gapdata ggplot(aes(x = year, y = lifeExp)) +
- geom_boxplot(aes(group = year))
%>%
- gapdata ggplot(aes(x = year, y = lifeExp)) +
- geom_violin(aes(group = year)) +
- geom_jitter(alpha = 1 / 4) +
- geom_smooth(se = FALSE)
点重叠的处理方案
-%>% ggplot(aes(x = continent, y = lifeExp)) +
- gapdata geom_point()
%>% ggplot(aes(x = continent, y = lifeExp)) +
- gapdata geom_jitter()
%>% ggplot(aes(x = continent, y = lifeExp)) +
- gapdata geom_boxplot()
%>% ggplot(aes(x = continent, y = lifeExp)) +
- gapdata geom_boxplot() +
- geom_jitter()
%>%
- gapdata ggplot(aes(x = continent, y = lifeExp)) +
- geom_jitter() +
- stat_summary(fun.y = median, colour = "red", geom = "point", size = 5)
%>%
- gapdata ggplot(aes(reorder(x = continent, lifeExp), y = lifeExp)) +
- geom_jitter() +
- stat_summary(fun.y = median, colour = "red", geom = "point", size = 5)
注意到我们已经提到过 stat_count / stat_bin / stat_summary
-%>%
- gapdata ggplot(aes(x = continent, y = lifeExp)) +
- geom_violin(
- trim = FALSE,
- alpha = 0.5
- +
- ) stat_summary(
- fun.y = mean,
- fun.ymax = function(x) {
- mean(x) + sd(x)
-
- },fun.ymin = function(x) {
- mean(x) - sd(x)
-
- },geom = "pointrange"
- )
常用于一个离散变量 + 一个连续变量
-%>%
- gapdata ggplot(aes(
- x = lifeExp,
- y = continent,
- fill = continent
- +
- )) ::geom_density_ridges() ggridges
# https://learnui.design/tools/data-color-picker.html#palette
-%>%
- gapdata ggplot(aes(
- x = lifeExp,
- y = continent,
- fill = continent
- +
- )) ::geom_density_ridges() +
- ggridgesscale_fill_manual(
- values = c("#003f5c", "#58508d", "#bc5090", "#ff6361", "#ffa600")
- )
%>%
- gapdata ggplot(aes(
- x = lifeExp,
- y = continent,
- fill = continent
- +
- )) ::geom_density_ridges() +
- ggridgesscale_fill_manual(
- values = colorspace::sequential_hcl(5, palette = "Peach")
- )
常用于两个连续变量
-%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point()
%>%
- gapdata ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
- geom_point()
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point() +
- scale_x_log10() # A better way to log transform
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent))
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(alpha = (1 / 3), size = 2)
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point() +
- geom_smooth()
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE)
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point(show.legend = FALSE) +
- facet_wrap(~continent)
<- c("Canada", "Rwanda", "Cambodia", "Mexico")
- jCountries
-%>%
- gapdata filter(country %in% jCountries) %>%
- ggplot(aes(x = year, y = lifeExp, color = country)) +
- geom_line() +
- geom_point()
%>%
- gapdata filter(country %in% jCountries) %>%
- ggplot(aes(
- x = year, y = lifeExp,
- color = reorder(country, -1 * lifeExp, max)
- +
- )) geom_line() +
- geom_point()
这是一种技巧,但我更推荐以下方法
-<- gapdata %>%
- d1 filter(country %in% jCountries) %>%
- group_by(country) %>%
- mutate(end_label = if_else(year == max(year), country, NA_character_))
-
- d1
## # A tibble: 48 x 7
-## # Groups: country [4]
-## country continent year lifeExp pop gdpPercap
-## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
-## 1 Cambod~ Asia 1952 39.4 4.69e6 368.
-## 2 Cambod~ Asia 1957 41.4 5.32e6 434.
-## 3 Cambod~ Asia 1962 43.4 6.08e6 497.
-## 4 Cambod~ Asia 1967 45.4 6.96e6 523.
-## 5 Cambod~ Asia 1972 40.3 7.45e6 422.
-## 6 Cambod~ Asia 1977 31.2 6.98e6 525.
-## 7 Cambod~ Asia 1982 51.0 7.27e6 624.
-## 8 Cambod~ Asia 1987 53.9 8.37e6 684.
-## 9 Cambod~ Asia 1992 55.8 1.02e7 682.
-## 10 Cambod~ Asia 1997 56.5 1.18e7 734.
-## # ... with 38 more rows, and 1 more variable:
-## # end_label <chr>
-%>% ggplot(aes(
- d1 x = year, y = lifeExp, color = country
- +
- )) geom_line() +
- geom_point() +
- geom_label(aes(label = end_label)) +
- theme(legend.position = "none")
如果觉得麻烦,就用gghighlight
宏包吧
%>%
- gapdata filter(country %in% jCountries) %>%
- ggplot(aes(
- x = year, y = lifeExp, color = country
- +
- )) geom_line() +
- geom_point() +
- ::gghighlight() gghighlight
%>%
- gapdata filter(continent == "Asia" & year == 2007) %>%
- ggplot(aes(x = lifeExp, y = country)) +
- geom_point()
%>%
- gapdata filter(continent == "Asia" & year == 2007) %>%
- ggplot(aes(
- x = lifeExp,
- y = reorder(country, lifeExp)
- +
- )) geom_point(color = "blue", size = 2) +
- geom_segment(aes(
- x = 40,
- xend = lifeExp,
- y = reorder(country, lifeExp),
- yend = reorder(country, lifeExp)
-
- ),color = "lightgrey"
- +
- ) labs(
- x = "Life Expectancy (years)",
- y = "",
- title = "Life Expectancy by Country",
- subtitle = "GapMinder data for Asia - 2007"
- +
- ) theme_minimal() +
- theme(
- panel.grid.major = element_blank(),
- panel.grid.minor = element_blank()
- )
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point() +
- ::geom_mark_ellipse(aes(
- ggforcefilter = gdpPercap > 70000,
- label = "有钱的国家",
- description = "他们是什么国家?"
- ))
<- gapdata %>%
- ten_countries distinct(country) %>%
- pull() %>%
- sample(10)
library(ggrepel)
-%>%
- gapdata filter(year == 2007) %>%
- mutate(
- label = ifelse(country %in% ten_countries, as.character(country), "")
- %>%
- ) ggplot(aes(log(gdpPercap), lifeExp)) +
- geom_point(
- size = 3.5,
- alpha = .9,
- shape = 21,
- col = "white",
- fill = "#0162B2"
- +
- ) geom_text_repel(
- aes(label = label),
- size = 4.5,
- point.padding = .2,
- box.padding = .3,
- force = 1,
- min.segment.length = 0
- +
- ) theme_minimal(14) +
- theme(
- legend.position = "none",
- panel.grid.minor = element_blank()
- +
- ) labs(
- x = "log(GDP per capita)",
- y = "life expectancy"
- )
<- gapdata %>%
- avg_gapdata group_by(continent) %>%
- summarise(
- mean = mean(lifeExp),
- sd = sd(lifeExp)
-
- ) avg_gapdata
## # A tibble: 5 x 3
-## continent mean sd
-## <chr> <dbl> <dbl>
-## 1 Africa 48.9 9.15
-## 2 Americas 64.7 9.35
-## 3 Asia 60.1 11.9
-## 4 Europe 71.9 5.43
-## 5 Oceania 74.3 3.80
-%>%
- avg_gapdata ggplot(aes(continent, mean, fill = continent)) +
- # geom_col(alpha = 0.5) +
- geom_point() +
- geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = 0.25)
%>%
- gapdata ggplot(aes(x = log(gdpPercap), y = lifeExp)) +
- geom_point() +
- stat_ellipse(type = "norm", level = 0.95)
与一维的情形geom_density()
类似,
-geom_density_2d()
, geom_bin2d()
, geom_hex()
常用于刻画两个变量构成的二维区间的密度
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_bin2d()
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_hex()
geom_tile()
, geom_contour()
, geom_raster()
常用于3个变量
%>%
- gapdata group_by(continent, year) %>%
- summarise(mean_lifeExp = mean(lifeExp)) %>%
- ggplot(aes(x = year, y = continent, fill = mean_lifeExp)) +
- geom_tile() +
- scale_fill_viridis_c()
事实上可以有更好的呈现方式
-%>%
- gapdata group_by(continent, year) %>%
- summarise(mean_lifeExp = mean(lifeExp)) %>%
- ggplot(aes(x = year, y = continent, size = mean_lifeExp)) +
- geom_point()
%>%
- gapdata group_by(continent, year) %>%
- summarise(mean_lifeExp = mean(lifeExp)) %>%
- ggplot(aes(x = year, y = continent, size = mean_lifeExp)) +
- geom_point(shape = 21, color = "red", fill = "white") +
- scale_size_continuous(range = c(7, 15)) +
- geom_text(aes(label = round(mean_lifeExp, 2)), size = 3, color = "black") +
- theme(legend.position = "none")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ggtitle("Life expectancy over time by continent")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- theme_grey() # the default
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- theme_bw()
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_calc() +
- ggthemesggtitle("ggthemes::theme_calc()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_economist() +
- ggthemesggtitle("ggthemes::theme_economist()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_economist_white() +
- ggthemesggtitle("ggthemes::theme_economist_white()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_few() +
- ggthemesggtitle("ggthemes::theme_few()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_gdocs() +
- ggthemesggtitle("ggthemes::theme_gdocs()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_tufte() +
- ggthemesggtitle("ggthemes::theme_tufte()")
%>%
- gapdata ggplot(aes(x = gdpPercap, y = lifeExp, color = continent)) +
- geom_point() +
- geom_smooth(lwd = 3, se = FALSE, method = "lm") +
- ::theme_wsj() +
- ggthemesggtitle("ggthemes::theme_wsj()")
这节课,我们讲如何让我们的图动起来。(因为渲染需要花费很长时间,所以文档中的动图代码都没有执行。)
-gganimate
宏包动图可以将其理解为多张静态图堆在一起,当然不是随意的堆放,而是按照一定的规则,比如按照时间的顺序,或者类别的顺序。一般而言,动图制作包括两个步骤: 静态图制作及图形组装。静态图制作,前面几章我们讲过主要用ggplot2宏包实现;对于图形组装,需要用到今天我们要讲Thomas Lin Pedersen的gganimate
宏包,来自同一工厂的产品,用起来自然是无缝衔接啦。
install.packages("gganimate")
library(tidyverse)
-library(covdata) # remotes::install_github("kjhealy/covdata")
-library(gganimate)
::covnat %>%
- covdata::filter(iso3 == "USA") %>%
- dplyr::filter(cu_cases > 0) %>%
- dplyrggplot(aes(x = date, y = cases)) +
- geom_path() +
- labs(
- title = "美国新冠肺炎累积确诊病例",
- subtitle = "数据来源https://kjhealy.github.io/covdata/"
- )
让它动起来,我们只需要增加一行代码!
-::covnat %>%
- covdata::filter(iso3 == "USA") %>%
- dplyr::filter(cu_cases > 0) %>%
- dplyrggplot(aes(x = date, y = cases)) +
- geom_path() +
- labs(
- title = "美国新冠肺炎累积确诊病例 {frame_along}",
- subtitle = "数据来源https://kjhealy.github.io/covdata/"
- +
- ) transition_reveal(along = date)
library(datasauRus)
-ggplot(datasaurus_dozen) +
-aes(x, y, color = dataset) +
- geom_point()
用分面展示
-ggplot(datasaurus_dozen) +
-aes(x, y, color = dataset) +
- geom_point() +
- facet_wrap(~dataset)
可以用动图展示
-ggplot(datasaurus_dozen) +
-aes(x, y, color = dataset) +
- geom_point() +
- transition_states(dataset, 3, 1) + # <<
- labs(title = "Dataset: {closest_state}")
是不是很炫酷,下面我们就一个个讲解其中的函数。
-使用gganimate
做动画,只需要掌握以下五类函数:
transition_*()
: 定义动画是根据哪个变量进行”动”,以及如何”动”view_*()
: 定义坐标轴随数据变化.shadow_*()
: 影子(旧数据的历史记忆)?定义点相继出现的方式.enter_*()/exit_*()
: 定义新数据出现和旧数据退去的方式.ease_aes()
: 美观定义,控制变化的节奏(如何让整个动画看起来更舒适).下面通过案例依次讲解这些函数功能。
-变量如何选择,这需要从变量类型和变量代表的信息来确定。
-transition_states(states = )
, 这里的参数states往往带有分组信息,可以等价于静态图中的分面。%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point()
%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point() +
- facet_wrap(vars(color))
%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point() +
- transition_states(states = color, transition_length = 3, state_length = 1)
transition_time(time = )
, 这里的time一般认为是连续的值,相比于transition_states
,没有了transtion_length
这个选项,是因为transtion_length
默认为time. 事实上,transition_time
是transition_states
的一种特例,但其实也有分组的要求<- gapminder::gapminder %>%
- p ggplot(aes(x = gdpPercap, y = lifeExp, size = pop, colour = country)) +
- geom_point(alpha = 0.7, show.legend = FALSE) +
- scale_size(range = c(2, 12)) +
- scale_x_log10() +
- labs(
- x = "GDP per capita",
- y = "life expectancy"
-
- ) p
<- p +
- anim transition_time(time = year) +
- labs(title = "year: {frame_time}")
- anim
transition_reveal(along = )
, along 这个词可以看出,它是按照某个变量依次显示的意思,比如顺着x轴显示ggplot(data = economics) +
-aes(x = date, y = unemploy) +
- geom_line()
ggplot(economics) +
-aes(x = date, y = unemploy) +
- geom_line() +
- transition_reveal(along = date) +
- labs(title = "now is {frame_along}")
transition_filter( 至少2个筛选条件,transition_length = , filter_length =)
, 动图将会在这些筛选条件对应的子图之间转换%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point() +
- transition_filter(
- transition_length = 3,
- filter_length = 1,
-
-== "Ideal",
- cut Deep = depth >= 60
- )
transition_layers()
: 依次显示每个图层%>%
- mtcars ggplot(aes(mpg, disp)) +
- geom_point() +
- geom_smooth(colour = "grey", se = FALSE) +
- geom_smooth(aes(colour = factor(gear))) +
- transition_layers(
- layer_length = 1, transition_length = 2,
- from_blank = FALSE, keep_layers = c(Inf, 0, 0)
- +
- ) enter_fade() +
- exit_fade()
transition_manual()
transition_components()
transition_events()
动画过程中,绘图窗口怎么变化呢?
-ggplot(iris, aes(Sepal.Length, Sepal.Width)) +
-geom_point() +
- labs(title = "{closest_state}") +
- transition_states(Species, transition_length = 4, state_length = 1) +
- view_follow()
view_step()
view_step_manual()
view_zoom()
view_zoom_manual()
shadow_wake(wake_length =, )
旧数据消退时,制造点小小的尾迹的效果(wake除了叫醒,还有尾迹的意思,合起来就是记忆_尾迹)shadow_trail(distance = 0.05)
旧数据消退时,制造面包屑一样的残留痕迹(记忆_零星残留)shadow_mark(past = TRUE, future = FALSE)
将旧数据和新数据当作背景(记忆_标记)+
- p transition_time(time = year) +
- labs(title = "year: {frame_time}") +
- shadow_wake(wake_length = 0.1, alpha = FALSE)
ggplot(iris, aes(Petal.Length, Sepal.Length)) +
-geom_point(size = 2) +
- labs(title = "{closest_state}") +
- transition_states(Species, transition_length = 4, state_length = 1) +
- shadow_wake(wake_length = 0.1)
+
- p transition_time(time = year) +
- labs(title = "year: {frame_time}") +
- shadow_trail(distance = 0.1)
ggplot(iris, aes(Petal.Length, Sepal.Length)) +
-geom_point(size = 2) +
- labs(title = "{closest_state}") +
- transition_states(Species, transition_length = 4, state_length = 1) +
- shadow_trail(distance = 0.1)
+
- p transition_time(time = year) +
- labs(title = "year: {frame_time}") +
- shadow_mark(alpha = 0.3, size = 0.5)
ggplot(airquality, aes(Day, Temp)) +
-geom_line(color = "red", size = 1) +
- transition_time(Month) +
- shadow_mark(colour = "black", size = 0.75)
出现和退去的函数是成对的
-透明度上的变化,我这里用柱状图展示,效果要明显一点。
-tibble(
-x = month.name,
- y = sample.int(12)
- %>%
- ) ggplot(aes(x = x, y = y)) +
- geom_col() +
- theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
- transition_states(states = month.name)
tibble(
-x = month.name,
- y = sample.int(12)
- %>%
- ) ggplot(aes(x = x, y = y)) +
- geom_col() +
- theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
- transition_states(states = month.name) +
- shadow_mark(past = TRUE) +
- enter_fade()
+
- p transition_time(time = year) +
- labs(title = "year: {frame_time}") +
- enter_fade()
大小上的变化
-tibble(
-x = month.name,
- y = sample.int(12)
- %>%
- ) ggplot(aes(x = x, y = y)) +
- geom_col() +
- theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
- transition_states(states = month.name) +
- shadow_mark(past = TRUE) +
- enter_grow()
+
- p transition_time(time = year) +
- labs(title = "year: {frame_time}") +
- enter_grow() +
- enter_fade()
控制数据点变化的快慢
-+ ease_aes({aesthetic} = {ease})
- p + ease_aes(x = "cubic") p
::include_graphics("images/ease.png") knitr
Source: https://easings.net/
-看下面的案例:
-%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point() +
- transition_states(color, transition_length = 3, state_length = 1) +
- ease_aes("cubic-in") # Change easing of all aesthetics
%>%
- diamonds ggplot(aes(carat, price)) +
- geom_point() +
- transition_states(color, transition_length = 3, state_length = 1) +
- ease_aes(x = "elastic-in") # Only change `x` (others remain “linear”)
我们可能需要在标题中加入每张动画的信息,常用罗列如下
-transition_states(states = ) +
-labs(title = "previous is {previous_state},
- current is {closest_state},
- next is {next_state}")
-
-transition_layers() +
-labs(title = "previous is {previous_layers},
- current is {closest_layers},
- next is {next_layers}")
-
-transition_time(time = ) +
-labs(title = "now is {frame_time}")
-
-
-transition_reveal(along = ) +
-labs(title = "now is {frame_along}")
## # A tibble: 6 x 2
-## Function Description
-## <chr> <chr>
-## 1 gifski_rende~ Default, super fast gif renderer.
-## 2 magick_rende~ Somewhat slower gif renderer.
-## 3 ffmpeg_rende~ Uses ffmpeg to create a video from the~
-## 4 av_renderer Uses the av package to create a video ~
-## 5 file_renderer Dumps a list of image frames from the ~
-## 6 sprite_rende~ Creates a spritesheet from frames of t~
-一般用anim_save()
保存为 gif 格式,方法类似ggsave()
<- diamonds %>%
- animation_to_save ggplot(aes(carat, price)) +
- geom_point() +
- transition_states(color, transition_length = 3, state_length = 1) +
- ease_aes("cubic-in")
-
-anim_save("first_saved_animation.gif", animation = animation_to_save)
这是网上有段时间比较火的racing_bar图
-<- covdata::covnat %>%
- ranked_by_date group_by(date) %>%
- arrange(date, desc(cu_cases)) %>%
- mutate(rank = 1:n()) %>%
- filter(rank <= 10) %>%
- ungroup()
%>%
- ranked_by_date filter(date >= "2020-05-01") %>%
- ggplot(
- aes(x = rank, y = cname, group = cname, fill = cname)
- +
- ) geom_tile(
- aes(
- y = cu_cases / 2,
- height = cu_cases,
- width = 0.9
-
- ),alpha = 0.8,
- show.legend = F
- +
- ) geom_text(aes(
- y = cu_cases,
- label = cname
-
- ),show.legend = FALSE
- +
- ) scale_x_reverse(
- breaks = c(1:10),
- label = c(1:10)
- +
- ) theme_minimal() +
- coord_flip(clip = "off", expand = FALSE) +
- labs(
- title = "日期: {closest_state}",
- x = "",
- caption = "Source: github/kjhealy/covdata"
- +
- ) transition_states(date,
- transition_length = 4,
- state_length = 1,
- wrap = TRUE
- +
- ) ease_aes("cubic-in-out")
<- readr::read_csv("./demo_data/bats-subset.csv") %>%
- bats ::mutate(id = factor(id)) dplyr
%>%
- bats ggplot(aes(
- x = longitude,
- y = latitude,
- group = id,
- color = id
- +
- )) geom_point()
%>%
- bats ggplot(aes(
- x = longitude,
- y = latitude,
- group = id,
- color = id
- +
- )) geom_point() +
- transition_time(time) +
- shadow_mark(past = TRUE)
%>%
- bats ggplot(aes(
- x = longitude,
- y = latitude,
- group = id,
- color = id
- +
- )) geom_path() +
- transition_time(time) +
- shadow_mark(past = TRUE)
%>%
- bats ::mutate(
- dplyrimage = "images/bat-cartoon.png"
- %>%
- ) ggplot(aes(
- x = longitude,
- y = latitude,
- group = id,
- color = id
- +
- )) geom_path() +
- ::geom_image(aes(image = image), size = 0.1) +
- ggimagetransition_reveal(time)
全球R-Ladies组织,会议活动的情况,我们在地图上用动图展示
-<- read_csv("./demo_data/rladies.csv")
- rladies rladies
这里需要一个地图,可以这样
-ggplot() +
-::borders("world", colour = "gray85", fill = "gray80") +
- ggplot2::theme_map() ggthemes
当然,最好是这样
-library(maps)
-<- map_data("world")
- world
-<- ggplot() +
- world_map geom_polygon(data = world,
- aes(x = long, y = lat, group = group),
- color = "white", fill = "gray80"
- +
- ) ::theme_map()
- ggthemes
- world_map
然后把点打上去
-+
- world_map geom_point(
- data = rladies,
- aes(x = lon, y = lat, size = followers),
- colour = "purple", alpha = .5
- +
- ) scale_size_continuous(
- range = c(1, 8),
- breaks = c(250, 500, 750, 1000)
- +
- ) labs(size = "Followers")
用动图展示(这种方法常用在流行病传播的展示上)
-+
- world_map geom_point(aes(x = lon, y = lat, size = followers),
- data = rladies,
- colour = "purple", alpha = .5
- +
- ) scale_size_continuous(
- range = c(1, 8),
- breaks = c(250, 500, 750, 1000)
- +
- ) transition_states(created_at) +
- shadow_mark(past = TRUE) +
- labs(title = "Day: {closest_state}")
把下图弄成你喜欢的样子
-library(gapminder)
-theme_set(theme_bw())
-
-ggplot(gapminder) +
-aes(
- x = gdpPercap, y = lifeExp,
- size = pop, colour = country
- +
- ) geom_point(show.legend = FALSE) +
- scale_x_log10() +
- scale_color_viridis_d() +
- scale_size(range = c(2, 12)) +
- labs(x = "GDP per capita", y = "Life expectancy") +
- transition_time(year) +
- labs(title = "Year: {frame_time}")
那请说说这以下三个的区别?
-%>%
- bats ::filter(id == 1) %>%
- dplyrggplot(
- aes(
- x = longitude,
- y = latitude
-
- )+
- ) geom_point() +
- transition_reveal(time) # <<
-
-
-
-%>%
- bats ::filter(id == 1) %>%
- dplyrggplot(
- aes(
- x = longitude,
- y = latitude
-
- )+
- ) geom_point() +
- transition_states(time) # <<
-
-
-
-%>%
- bats ::filter(id == 1) %>%
- dplyrggplot(
- aes(
- x = longitude,
- y = latitude
-
- )+
- ) geom_point() +
- transition_time(time) # <<
这一章,我们一起学习ggplot2中的图例系统,内容相对简单,但还是推荐大家阅读ggplot2官方文档
-为了方便演示,我们还是用熟悉的配方ggplot2::mpg
library(tidyverse)
-
-%>%
- mpg ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point()
如果想调整图例的样式,可以使用guides()
函数,用法类似上节课中的theme
函数, 具体参数为:
字符串
(i.e. "color = colorbar"
or "color = legend"
),特定的函数
(i.e. color = guide_colourbar()
or color = guide_legend()
)%>%
- mpg ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- guides(color = "legend")
%>%
- mpg ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- guides(color = guide_bins(
- title = "my title",
- label.hjust = 1
-
- ) )
%>%
- mpg ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- guides(color = guide_legend(
- ncol = 4
-
- ) )
%>%
- mpg ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- guides(color = guide_legend(
- title = "标题好像有点高",
- title.position = "top",
- title.vjust = 5,
- label.position = "left",
- label.hjust = 1,
- label.theme = element_text(size = 15,
- face = "italic",
- colour = "red",
- angle = 0),
- keywidth = 5,
- reverse = TRUE
-
- ) )
%>%
- mpg ggplot(aes(x = displ, y = hwy, color = class, size = cyl)) +
- geom_point()
比如,我们想删除size这个图例,那么需要这样做
-%>%
- mpg ggplot(aes(x = displ, y = hwy, color = class, size = cyl)) +
- geom_point() +
- guides(color = guide_legend("汽车类型"), # keep
- size = FALSE # remove
- )
到了这里,ggplot2内容的差不多介绍完了,最后做下自我测试,能读懂下面代码(来源 Emi Tanaka)的意思?
-%>%
- mtcars as_tibble() %>%
- ggplot(aes(x = wt, y = mpg, shape = factor(vs), color = hp)) +
- geom_point(size = 3) +
- ::scale_color_continuous_sequential(palette = "Dark Mint") +
- colorspacescale_shape_discrete(labels = c("V-shaped", "Straight")) +
- labs(
- x = "Weight (1000 lbs)", y = "Miles per gallon",
- title = "Motor Trend Car Road Tests",
- shape = "Engine", color = "Horsepower"
- +
- ) theme(
- text = element_text(size = 18, color = "white"),
- rect = element_rect(fill = "black"),
- panel.background = element_rect(fill = "black"),
- legend.key = element_rect(fill = "black"),
- axis.text = element_text(color = "white"),
- plot.title.position = "plot",
- plot.margin = margin(10, 10, 10, 10)
- +
- ) guides(
- shape =
- guide_legend(override.aes = list(color = "white"))
- )
这一章我们一起学习ggplot2中的scales语法,推荐大家阅读Hadley Wickham最新版的《ggplot2: Elegant Graphics for Data Analysis》,但如果需要详细了解标度参数体系,还是要看ggplot2官方文档
-在 13章,我们了解到ggplot2中,映射是数据转化到图形属性,这里的图形属性是指视觉可以感知的东西,比如大小,形状,颜色和位置等。我们今天讨论的标度(scale)是控制着数据到图形属性映射的函数,每一种标度都是从数据空间的某个区域(标度的定义域)到图形属性空间的某个区域(标度的值域)的一个函数。
-简单点来说,标度是用于调整数据映射的图形属性。
-在ggplot2中,每一种图形属性都拥有一个默认的标度,也许你对这个默认的标度不满意,可以就需要学习如何修改默认的标度。比如,
-系统默认"a"
对应红色,"b"
对应蓝色,我们想让"a"
对应紫色,"b"
对应橙色。
还是用我们熟悉的ggplot2::mpg
,可能有同学说,我画图没接触到scale
啊,比如
library(tidyverse)
-%>%
- mpg ggplot(aes(x = displ, y = hwy)) +
- geom_point(aes(colour = class))
能画个很漂亮的图,那是因为ggplot2默认缺省条件下,已经很美观了。(据说Hadley Wickham很后悔使用了这么漂亮的缺省值,因为很漂亮了大家都不认真学画图了。马云好像也说后悔创立了阿里巴巴?)
-事实上,根据映射关系和变量名,我们将标度写完整,应该是这样的
-ggplot(mpg, aes(x = displ, y = hwy)) +
-geom_point(aes(colour = class)) +
-
- scale_x_continuous() +
- scale_y_continuous() +
- scale_colour_discrete()
如果每次都要手动设置一次标度函数,那将是比较繁琐的事情。因此ggplot2使用了默认了设置,如果不满意ggplot2的默认值,可以手动调整或者改写标度,比如
-ggplot(mpg, aes(x = displ, y = hwy)) +
-geom_point(aes(colour = class)) +
-
- scale_x_continuous(name = "这是我的x坐标") +
- scale_y_continuous(name = "这是我的y坐标") +
- scale_colour_brewer()
注意到,标度函数是由"_"分割的三个部分构成的 -- scale -- 视觉属性名 (e.g., colour, shape or x) -- 标度名 (e.g., continuous, discrete, brewer).
- -每个标度函数内部都有丰富的参数系统
-scale_colour_manual(
-palette = function(),
- limits = NULL,
- name = waiver(),
- labels = waiver(),
- breaks = waiver(),
- minor_breaks = waiver(),
- values = waiver(),
-
- ... )
参数name
,坐标和图例的名字,如果不想要图例的名字,就可以 name = NULL
参数limits
, 坐标或图例的范围区间。连续性c(n, m)
,离散型c("a", "b", "c")
参数breaks
, 控制显示在坐标轴或者图例上的值(元素)
参数labels
, 坐标和图例的间隔标签
breaks
提供的字符型向量一一对应breaks
提供的字符型向量当做函数的输入NULL
,就是去掉标签参数values
指的是(颜色、形状等)视觉属性值,
breaks
提供的字符型向量长度一致c("数据标签" = "视觉属性")
提供参数expand
, 控制参数溢出量
参数range
, 设置尺寸大小范围,比如针对点的相对大小
下面,我们通过具体的案例讲解如何使用参数,把图形变成我们想要的模样。
-先导入一个数据
-<- read_csv("./demo_data/gapminder.csv") gapdata
<- gapdata %>%
- newgapdata group_by(continent, country) %>%
- summarise(
- across(c(lifeExp, gdpPercap, pop), mean)
-
- ) newgapdata
## # A tibble: 142 x 5
-## # Groups: continent [5]
-## continent country lifeExp gdpPercap pop
-## <chr> <chr> <dbl> <dbl> <dbl>
-## 1 Africa Algeria 59.0 4426. 1.99e7
-## 2 Africa Angola 37.9 3607. 7.31e6
-## 3 Africa Benin 48.8 1155. 4.02e6
-## 4 Africa Botswana 54.6 5032. 9.71e5
-## 5 Africa Burkina Faso 44.7 844. 7.55e6
-## 6 Africa Burundi 44.8 472. 4.65e6
-## 7 Africa Cameroon 48.1 1775. 9.82e6
-## 8 Africa Central African~ 43.9 959. 2.56e6
-## 9 Africa Chad 46.8 1165. 5.33e6
-## 10 Africa Comoros 52.4 1314. 3.62e5
-## # ... with 132 more rows
-%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_continuous()
%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10()
%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10(breaks = c(500, 1000, 3000, 10000, 30000),
- labels = scales::dollar)
%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10(
- name = "人均GDP",
- breaks = c(500, 1000, 3000, 10000, 30000),
- labels = scales::unit_format(unit = "美元"))
%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10() +
- scale_color_viridis_d()
离散变量映射到色彩的情形,可以使用ColorBrewer色彩。
-%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10() +
- scale_color_brewer(type = "qual", palette = "Set1")
%>%
- newgapdata ggplot(aes(x = gdpPercap, y = lifeExp)) +
- geom_point(aes(color = continent, size = pop)) +
- scale_x_log10() +
- scale_color_manual(
- name = "五大洲",
- values = c("Africa" = "red", "Americas" = "blue", "Asia" = "orange",
- "Europe" = "black", "Oceania" = "gray"),
- breaks = c("Africa", "Americas", "Asia", "Europe", "Oceania"),
- labels = c("非洲", "美洲", "亚洲", "欧洲", "大洋洲")
- +
- ) scale_size(
- name = "人口数量",
- breaks = c(2e8, 5e8, 7e8),
- labels = c("2亿", "5亿", "7亿")
- )
那什么时候用标度,什么时候用主题?这里有个原则:主题风格不会增加标签,也不会改变变量的范围,主题只会改变字体、大小、颜色等等。
-用 ggplot2 重复这张lego图
- - -美学映射是图形语法中非常重要的一个概念,变量映射到视觉元素,然后通过几何对象GEOM
画出图形。(下图是每个几何对象所对应的视觉元素)
比如geom_point(mapping = aes(x = mass, y = height))
将会画出散点图,这里的x
轴代表mass
变量,而y
轴代表height
变量.
因为geom_*()
很强大而且也很容易理解,所以一般我们不会去思考我们的数据在喂给ggplot()
后发生了什么,只希望能出图就行了。比如下面的直方图例子
library(tidyverse)
-library(palmerpenguins)
-
-ggplot(data = penguins, mapping = aes(x = body_mass_g)) +
-geom_histogram()
这里发生了什么呢?你可能看到body_mass_g
这个变量代表了x轴,这个没错,但想弄清楚这个直方图,需要回答下面的问题
x
轴的变量被分成了若干离散的小区间(bins)y
轴上是一个新的变量x
变量和经过计算处理后的y
变量,共同确定了柱状图中每个柱子的位置和高度我并不是说,不能给出geom_histogram()
详细说明就是一个傻子。相反,我这里的本意是强调数据->视觉元素的映射并不是理所当然的,尽管看上去往往非常自然、直观和客观。
我们这里是提醒下,我们是否想过,修改上面中间过程,比如第1步和第2步,然后看看输出的图形是否还是直方图。
-这个想法非常重要,但我们很少想到。某种程度是因为在我们最初学习ggplot画图的时候,ggplot已经影响了我们的思维方式。比如,初学者可能经历过拿到数据却还不出图形的受挫感,举个例子来说,这里有个数据
-<- tibble::tribble(
- d ~variable, ~subject1, ~subject2, ~subject3,
- "mass", 75, 70, 55,
- "height", 154, 172, 144
-
- ) d
## # A tibble: 2 x 4
-## variable subject1 subject2 subject3
-## <chr> <dbl> <dbl> <dbl>
-## 1 mass 75 70 55
-## 2 height 154 172 144
-用geom_point(aes(x = mass, y = height))
画图,却报错了。初学者可能苦苦搜索答案,然后被告知,ggplot画图需要先弄成tidy格式
%>% pivot_longer(
- d cols = subject1:subject3,
- names_to = "subject",
- names_pattern = "subject(\\d)",
- values_to = "value"
- %>%
- ) pivot_wider(names_from = variable,
- values_from = value)
## # A tibble: 3 x 3
-## subject mass height
-## <chr> <dbl> <dbl>
-## 1 1 75 154
-## 2 2 70 172
-## 3 3 55 144
-现在数据tidy了,你可以使用ggplot(),问题得以解决。于是我们得出了一个结论:想要ggplot工作就需要tidy data。 如果这样想,那么今天的内容ggplot2统计图层
就更加有必要了。
你可能每天都在用ggplot
,却用不到stat_*()
函数,这样也可以胜任很多工作。事实上,因为我们仅仅只使用geom_*()
函数,你会发现stat_*()
是开发者才使用的深奥和神秘的部分,如果这样想,你可能怀疑你是否有必要了解这些stat_*()
函数。
好吧,学习 STAT
最主要的原因
--“Even though the data is tidy, it may not represent the values you want to display”
-
我们这里再用一个例子说明,假定我们有一数据框simple_data
<- tibble(group = factor(rep(c("A", "B"), each = 15)),
- simple_data subject = 1:30,
- score = c(rnorm(15, 40, 20), rnorm(15, 60, 10)))
- simple_data
## # A tibble: 30 x 3
-## group subject score
-## <fct> <int> <dbl>
-## 1 A 1 76.9
-## 2 A 2 66.2
-## 3 A 3 28.5
-## 4 A 4 31.4
-## 5 A 5 18.3
-## 6 A 6 46.0
-## 7 A 7 64.9
-## 8 A 8 46.8
-## 9 A 9 17.9
-## 10 A 10 51.6
-## # ... with 20 more rows
-假定我们现在想画一个柱状图,一个柱子代表每一组group,柱子的高度代表的score的均值。
-好比,按照我们的想法,我们首先规整(tidy)数据,并且确保数据包含每个geom所需的美学映射,最后传递给ggplot()
%>%
- simple_data group_by(group) %>%
- summarize(
- mean_score = mean(score),
- .groups = 'drop'
- %>%
- ) ggplot(aes(x = group, y = mean_score)) +
- geom_col()
那么,传递给ggplot()
的数据是
%>%
- simple_data group_by(group) %>%
- summarize(
- mean_score = mean(score),
- .groups = 'drop'
- )
## # A tibble: 2 x 2
-## group mean_score
-## <fct> <dbl>
-## 1 A 41.8
-## 2 B 61.5
-需求很简单,很容易搞定。但如果我们想加误差棒(stand error)呢? 那我们需要再对数据整理统计,然后再传给ggplot()
.
于是,我们再计算误差棒,这里变型的数据是这个样子的
-%>%
- simple_data group_by(group) %>%
- summarize(
- mean_score = mean(score),
- se = sqrt(var(score)/length(score)),
- .groups = 'drop'
- %>%
- ) mutate(
- lower = mean_score - se,
- upper = mean_score + se
- )
## # A tibble: 2 x 5
-## group mean_score se lower upper
-## <fct> <dbl> <dbl> <dbl> <dbl>
-## 1 A 41.8 4.82 37.0 46.7
-## 2 B 61.5 2.83 58.6 64.3
-然后把变型的数据传递给ggplot()
%>%
- simple_data group_by(group) %>%
- summarize(
- mean_score = mean(score),
- se = sqrt(var(score)/length(score)),
- .groups = 'drop'
- %>%
- ) mutate(
- lower = mean_score - se,
- upper = mean_score + se
- %>%
- ) ggplot(aes(x = group, y = mean_score, ymin = lower, ymax = upper)) +
- geom_errorbar()
最后,我们把两个数据框组会到一起,一个用于柱状图,一个用于画误差棒。
-<- simple_data %>%
- simple_data_bar group_by(group) %>%
- summarize(
- mean_score = mean(score),
- .groups = 'drop'
-
- )
- <- simple_data %>%
- simple_data_errorbar group_by(group) %>%
- summarize(
- mean_score = mean(score),
- se = sqrt(var(score)/length(score)),
- .groups = 'drop'
- %>%
- ) mutate(
- lower = mean_score - se,
- upper = mean_score + se
-
- )
-ggplot() +
-geom_col(
- aes(x = group, y = mean_score),
- data = simple_data_bar
- +
- ) geom_errorbar(
- aes(x = group, y = mean_score, ymin = lower, ymax = upper),
- data = simple_data_errorbar
- )
OMG, 为了画一个简单的图,我们需要写这么长的一段代码。究其原因就是,我们认为,一定要准备好一个tidy的数据,并且把想画的几何对象所需要的美学映射,都整理到这个tidy的数据框中
-事实上,理论上讲,simple_data_bar
和 simple_data_errorbar
并不是真正的tidy
格式。因为按照Hadley Wickham的对tidy的定义是,一行代表一次观察。
-而这里的柱子的高度以及误差棒的两端不是观察出来的,而是统计计算出来的。
-所以我们的观点是,辛辛苦苦创建一个(包含每个几何对象所需的美学映射)的数据框,太低效了,而且这种方法也不支持tidy原则。 -
-既然 simple_data_bar
和 simple_data_errorbar
都来源于simple_data
,那为何不直接传递simple_data
给ggplot()
,让数据在内部转换,得到每个几何对象所需的美学映射呢?
或许,你想要的是这样?
-%>%
- simple_data ggplot(aes(group, score)) +
- stat_summary(geom = "bar") +
- stat_summary(geom = "errorbar")
Bingo
-这一节,我们用一个很长的数据整理的代码,借助geom_*()
画了一张含有误差棒的柱状图,而用stat_summary()
不需要数据整理,只需要两行代码就实现相同效果。
-感受到了stat_summary()
的强大了?
不忙,好戏才慢慢开始…
-前面讲到的 stat_summary()
是学习和理解 stat_*()
很好的例子,理解了stat_summary()
的工作原理,其它的stat_*()
也就都明白了,
-事实上,stat_summary()
也是在数据视化中最常用的,因此我们接着讲它。
那么,我们现在模拟一个测试数据height_df
<- tibble(group = "A",
- height_df height = rnorm(30, 170, 10))
用我们熟悉的geom_point()
%>%
- height_df ggplot(aes(x = group, y = height)) +
- geom_point()
然后用stat_summary()
代替geom_point()
,然后看看发生了什么
%>%
- height_df ggplot(aes(x = group, y = height)) +
- stat_summary()
看到了一个点和经过这个点的一条线,实际上,它也是一个几何对象pointrange.
-那么geom_pointrange()
是怎么数据转换的呢?回答这个问题,我们需要了解下geom_pointrange()
需要哪些美学映射(参见图 18.1):
所以,我们回去看看ggplot(aes(x = group, y = height))
中aes()
里的参数,group 映射到 x
, height映射到了y
, 但我们没有发现有ymin / xmin
或者ymax / xmax
的踪迹。问题来了,我们没有给出geom_pointrange()
需要的美学映射,那stat_summmary()
是怎么画出pointrange
的呢?
我们先猜测一下,stat_summary()
先计算出必要的数据值,然后传递给pointrange
?
-是不是呢?我们先看上图过程中有个提示
`mean_se()` No summary function supplied, defaulting to
看到了吧,summary function
,说明我们猜对了,这就是stat_*()
神秘的地方。
stat_summary()
中的fun.data
参数,它的默认值是mean_se()
mean_se
function (x, mult = 1)
-
- {<- stats::na.omit(x)
- x <- mult * sqrt(stats::var(x)/length(x))
- se <- mean(x)
- mean new_data_frame(list(y = mean, ymin = mean - se, ymax = mean +
- n = 1)
- se),
- }<bytecode: 0x0000021aef28aa10>
-<environment: namespace:ggplot2>
这个mean_se()
函数有两个参数,一个是x
,一个是mult
(默认为1), 那么这个函数的功能,一步步来说
NA
se
, 公式为\(SE = \sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}\)x
的均值y = mean, ymin = mean - se, ymax = mean + se
很酷的一件事情是,mean_se()
看上去是在ggplot()
内部使用,实际上加载ggplot2
宏包后,在全局环境变量里就可以访问到,不妨试试看, 注意到stat_summary()
是对向量(单维度)做统计,因此要传height_df$height
给它
mean_se(height_df$height)
## y ymin ymax
-## 1 170.2 168.6 171.9
-数据看上去和我们前面 stat_summary()
画的点线图一样。当然为了保险起见,我们还是核对下,这里用到ggplot2
包中的一个神奇的函数layer_data()
, 它可以拉取在图层中使用的数据,第二个参数是指定拉取哪个图层的数据,这里只有唯一的一个图层,因此指定为1。
<- height_df %>%
- pointrange_plot ggplot(aes(x = group, y = height)) +
- stat_summary()
-
-layer_data(pointrange_plot, 1)
## x group y ymin ymax PANEL flipped_aes colour
-## 1 1 1 170.2 168.6 171.9 1 FALSE black
-## size linetype shape fill alpha stroke
-## 1 0.5 1 19 NA NA 1
-喔喔,结果很丰富,我们注意到y, ymin, and ymax
的值与 mean_se()
计算的结果一致。
我们揭开了stat_summary()
统计图层的神秘面纱的一角:
stat_summary()
里若没有指定数据,那就会从ggplot(data = .)
里继承fun.data
会调用函数将数据变形,这个函数默认是mean_se()
fun.data
返回的是数据框,这个数据框将用于geom参数画图,这里缺省的geom是pointrangefun.data
返回的数据框包含了所需要的美学映射,图形就会显示出来。为了让大家看的更明白,我们在stat_summary()
中显式地给出fun.data
和geom
两个参数
%>%
- height_df ggplot(aes(x = group, y = height)) +
- stat_summary(
- geom = "pointrange",
- fun.data = mean_se
- )
Look, it’s the same plot!
-现在我们进入了stat_summary()
有趣的环节: 调整其中的参数画出各种图
我们用企鹅数据画出不同性别sex下的企鹅体重均值,同时误差棒要给出95%的置信区间( -即均值加减 1.96倍的标准误)
-<- na.omit(penguins)
- my_penguins
-%>%
- my_penguins ggplot(aes(sex, body_mass_g)) +
- stat_summary(
- fun.data = ~mean_se(., mult = 1.96), # Increase `mult` value for bigger interval!
- geom = "errorbar",
- )
那么这里在stat_summary()
函数内部发生了什么呢?
分组分别各自的mean_se()
,
<- my_penguins %>%
- female_mean_se filter(sex == "female") %>%
- pull(body_mass_g) %>%
- mean_se(., mult = 1.96)
-
-<- my_penguins %>%
- male_mean_se filter(sex == "male") %>%
- pull(body_mass_g) %>%
- mean_se(., mult = 1.96)
-
-bind_rows(female_mean_se, male_mean_se)
## y ymin ymax
-## 1 3862 3761 3964
-## 2 4546 4427 4665
-当ggplot()
中提供了分组变量(比如这里的sex
),stat_summary()
会分组计算,
-再次感受到ggplot2的强大气息!
不同的企鹅种类,画出bill_length_mm
长度的中位数(不再是均值),同时,让中位数小于40的用粉红色标出。这里需要自定义fun.data
函数
<- function(x, threshold = 40) {
- calc_median_and_color tibble(y = median(x)) %>%
- mutate(fill = ifelse(y < threshold, "pink", "grey35"))
-
- }
-%>%
- my_penguins ggplot(aes(species, bill_length_mm)) +
- stat_summary(
- fun.data = calc_median_and_color,
- geom = "bar"
- )
我们再来看看,stat_summary()内部发生了什么?
-%>%
- my_penguins group_split(species) %>%
- map(~ pull(., bill_length_mm)) %>%
- map_dfr(calc_median_and_color)
## # A tibble: 3 x 2
-## y fill
-## <dbl> <chr>
-## 1 38.8 pink
-## 2 49.6 grey35
-## 3 47.4 grey35
-注意到,fun.data
中的定制函数还可以计算fill
美学映射,最后一起传递给geom画图,强大!
我们现在想画不同岛屿islands上企鹅bill_depth_mm
均值,要求点线图中点的大小随观测数量(该岛屿企鹅的数量)变化
%>%
- my_penguins ggplot(aes(species, bill_depth_mm)) +
- stat_summary(
- fun.data = function(x) {
-
- <- length(x)/nrow(my_penguins)
- scaled_size
- mean_se(x) %>%
- mutate(size = scaled_size)
-
- } )
这张图其实听酷的,每个岛屿观察值越小(也就说样本量越小),pointrange的不确定性就越大(图中的误差棒范围就越长)。我们再看看,这里的stat_summary()
内部发生了什么,或者说数据是怎么转换的。
%>%
- my_penguins group_split(species) %>%
- map(~ pull(., bill_depth_mm)) %>%
- map_dfr(
- function(x) {
-
- <- length(x)/nrow(my_penguins)
- scaled_size
- mean_se(x) %>%
- mutate(size = scaled_size)
-
- } )
## y ymin ymax size
-## 1 18.35 18.25 18.45 0.4384
-## 2 18.42 18.28 18.56 0.2042
-## 3 15.00 14.91 15.09 0.3574
-尽管数据是tidy的,但它未必能代表你想展示的值
解决办法不是去规整数据以符合几何对象的要求,而是将原初tidy数据传递给ggplot()
,
-让stat_*()
函数在内部实现变型
可以stat_*()
函数可以定制geom以及相应的变形函数。当然,定制自己的函数,需要核对stat_*()
所需要的变量和数据类型
如果想用不同的geom,确保变换函数能计算出(几何对象所需要的)美学映射
尽管我们在谈论geom_*()
的局限性,从而衬托出stat_*()
的强大,但并不意味了后者可以取代前者,因为这不是一个非此即彼的问题,事实上,他们彼此依赖– 我们看到stat_summary()
有 geom
参数, geom_*()
也有 stat
参数。
-在更高的层级上讲,stat_*()
和 geom_*()
都只是ggplot里构建图层的layer()
函数的一个便利的方法,用曹植的《七步诗》来说, 本是同根生,相煎何太急。
将layer()
分成stat_*()
和 geom_*()
两块,或许是一个失误,最后我们用Hadley的原话来结束本章内容
-- - -Unfortunately, due to an early design mistake I called these either stat_() or geom_(). A better decision would have been to call them layer_() functions: that’s a more accurate description because every layer involves a stat and a geom
-
这一章我们一起学习ggplot2中的theme elements -语法,感谢Henry Wang提供了很好的思路。如果需要详细了解,可以参考Hadley Wickham最新版的《ggplot2: Elegant Graphics for Data Analysis》,最推荐的是ggplot2官方文档
-theme(element_name = element_function())
这里element_function()
有四个
element_text()
-element_line()
-element_rect()
-element_blank()
望文生义吧,内置元素函数有四个基础类型:
-element_text()
, 文本,一般用于控制标签和标题的字体风格element_line()
, 线条,一般用于控制线条或线段的颜色或线条类型element_rect()
, 矩形区域,一般用于控制背景矩形的颜色或者边界线条类型element_blank()
, 空白,就是不分配相应的绘图空间,即删去这个地方的绘图元素。每个元素函数都有一系列控制外观的参数,下面我们通过具体的案例来一一介绍吧。
-library(tidyverse)
还是用让人生厌的ggplot2::mpg
数据包吧,具体介绍请见?? 章。
glimpse(mpg)
## Rows: 234
-## Columns: 11
-## $ manufacturer <chr> "audi", "audi", "audi", "audi...
-## $ model <chr> "a4", "a4", "a4", "a4", "a4",...
-## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8,...
-## $ year <int> 1999, 1999, 2008, 2008, 1999,...
-## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4,...
-## $ trans <chr> "auto(l5)", "manual(m5)", "ma...
-## $ drv <chr> "f", "f", "f", "f", "f", "f",...
-## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 1...
-## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 2...
-## $ fl <chr> "p", "p", "p", "p", "p", "p",...
-## $ class <chr> "compact", "compact", "compac...
-稍微做点数据整理
-<- mpg %>%
- df as_tibble() %>%
- filter(class != "2seater", manufacturer %in% c("toyota", "volkswagen"))
- df
## # A tibble: 61 x 11
-## manufacturer model displ year cyl trans drv
-## <chr> <chr> <dbl> <int> <int> <chr> <chr>
-## 1 toyota 4run~ 2.7 1999 4 manu~ 4
-## 2 toyota 4run~ 2.7 1999 4 auto~ 4
-## 3 toyota 4run~ 3.4 1999 6 auto~ 4
-## 4 toyota 4run~ 3.4 1999 6 manu~ 4
-## 5 toyota 4run~ 4 2008 6 auto~ 4
-## 6 toyota 4run~ 4.7 2008 8 auto~ 4
-## 7 toyota camry 2.2 1999 4 manu~ f
-## 8 toyota camry 2.2 1999 4 auto~ f
-## 9 toyota camry 2.4 2008 4 manu~ f
-## 10 toyota camry 2.4 2008 4 auto~ f
-## # ... with 51 more rows, and 4 more variables:
-## # cty <int>, hwy <int>, fl <chr>, class <chr>
-我相信这种图你们已经会画了吧
-%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy")
想让这张图,符合你的想法?如何控制呢?come on
-图表整体元素包括:
-描述 | -主题元素 | -类型 | -
---|---|---|
整个图形背景 | -plot.background | -element_rect() | -
图形标题 | -plot.title | -element_text() | -
图形边距 | -plot.margin | -margin() | -
%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- plot.background = element_rect(fill = "orange", color = "black", size = 10),
- plot.title = element_text(hjust = 1, color = "red", face = "italic"),
- plot.margin = margin(t = 20, r = 20, b = 20, l = 20, unit = "pt")
- )
坐标轴元素包括:
-描述 | -主题元素 | -类型 | -
---|---|---|
坐标轴刻度 | -axis.ticks | -element_line() | -
坐标轴标题 | -axis.title | -element_text() | -
坐标轴标签 | -axis.text | -element_text() | -
直线和坐标轴 | -axis.line | -element_line() | -
%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- axis.line = element_line(color = "orange", size = 2),
- axis.title = element_text(color = "red", face = "italic"),
- axis.ticks = element_line(color = "purple", size = 3),
- axis.text = element_text(color = "blue"),
- axis.text.x = element_text(angle = 45, hjust = 1)
- )
面板元素包括:
-描述 | -主题元素 | -类型 | -
---|---|---|
面板背景 | -panel.background | -element_rect() | -
面板网格线 | -panel.grid | -element_line() | -
面板边界 | -panel.border | -element_rect() | -
%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- panel.background = element_rect(fill = "orange", color = "red"),
- panel.grid = element_line(color = "grey80", size = 0.5)
- )
或者
-%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- panel.background = element_rect(fill = "orange"),
- panel.grid = element_line(color = "grey80", size = 0.5),
- panel.border = element_rect(color = "red", fill = NA)
- )
图例元素包括:
-描述 | -主题元素 | -类型 | -
---|---|---|
图例背景 | -legend.background | -element_rect() | -
图例符号 | -legend.key | -element_rect() | -
图例标签 | -legend.text | -element_text() | -
图例标题 | -legend.title | -element_text() | -
图例边距 | -legend.margin | -margin | -
图例位置 | -legend.postion | -“top,” “bottom,” “left,” “right” | -
%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- legend.background = element_rect(fill = "orange"),
- legend.title = element_text(color = "blue", size = 10),
- legend.key = element_rect(fill = "grey80"),
- legend.text = element_text(color = "red"),
- legend.margin = margin(t = 20, r = 20, b = 20, l = 20, unit = "pt"),
- legend.position = "bottom"
- )
分面元素包括:
-描述 | -主题元素 | -类型 | -
---|---|---|
分面标签背景 | -strip.background | -element_rect() | -
条状文本 | -strip.text | -element_text() | -
分面间隔 | -panel.spacing | -unit | -
%>%
- df ggplot(aes(x = displ, y = hwy, color = factor(cyl))) +
- geom_point() +
- facet_grid(vars(manufacturer), vars(class)) +
- ggtitle("这是我的标题") +
- labs(x = "x_displ", y = "y_hwy") +
- theme(
- strip.background = element_rect(fill = "orange"),
- strip.text = element_text(color = "red"),
- panel.spacing = unit(0.3, "inch") # ,
- # strip.switch.pad.grid =
- )
%>%
- diamonds ggplot(aes(carat, price)) +
- geom_hex() +
- labs(title = "Diamond") +
- theme(
- axis.title.x = element_text(
- size = 30,
- color = "red",
- face = "bold",
- angle = 10
-
- ),legend.title = element_text(
- size = 25,
- color = "#ff6361",
- margin = margin(b = 5)
-
- ),plot.title = element_text(
- size = 35,
- face = "bold",
- color = "blue"
-
- ) )
-你肯定不会觉得这图好看。
-library(palmerpenguins)
-%>%
- penguins ggplot(aes(bill_length_mm, bill_depth_mm)) +
- geom_point() +
- theme(
- axis.line.y = element_line(
- color = "black",
- size = 1.2,
- arrow = grid::arrow()
-
- ),axis.line.x = element_line(
- linetype = "dashed",
- color = "brown",
- size = 1.2
-
- ),axis.ticks = element_line(color = "red", size = 1.1),
- axis.ticks.length = unit(3, "mm"),
- panel.grid.major = element_line(
- color = "blue",
- size = 1.2
-
- ),panel.grid.minor = element_line(
- color = "#58508d",
- size = 1.2,
- linetype = "dotted"
-
- ) )
%>%
- penguins ggplot(aes(bill_length_mm, bill_depth_mm)) +
- geom_point(aes(color = species)) +
- theme(
- legend.background = element_rect(
- fill = "#fff6c2",
- color = "black",
- linetype = "dashed"
-
- ),legend.key = element_rect(fill = "grey", color = "brown"),
- panel.background = element_rect(
- fill = "#005F59",
- color = "red", size = 3
-
- ),panel.border = element_rect(
- color = "black",
- fill = "transparent",
- linetype = "dashed", size = 3
-
- ),plot.background = element_rect(
- fill = "#a1dce9",
- color = "black",
- size = 1.3
-
- ),legend.position = "bottom"
- )
ggplot2中 plot 与 panel 有区别?
假定数据是这样
library(tidyverse)
-set.seed(12)
-
-<- data.frame(x = rnorm(50, 10, 2), type = "Island #1")
- d1 <- data.frame(x = rnorm(50, 18, 1.2), type = "Island #2")
- d2
-<- bind_rows(d1, d2) %>%
- dd set_names(c("Height", "Location"))
-
-head(dd)
## Height Location
-## 1 7.039 Island #1
-## 2 13.154 Island #1
-## 3 8.087 Island #1
-## 4 8.160 Island #1
-## 5 6.005 Island #1
-## 6 9.455 Island #1
-你画图后,交给老板看
-%>%
- dd ggplot(aes(x = Height, fill = Location)) +
- geom_histogram(binwidth = 1, color = "white") +
- scale_fill_manual(values = c("green3", "turquoise3"))
然而,老板有点不满意,希望你要这样改 -
-请用前后两章学到的内容让老板满意吧
- -2021-01-09
-你好,这里是四川师范大学研究生公选课《数据科学中的R语言》的课程内容。考虑到大家来自不同的学院,有着不同的学科背景,因此讲授的内容不会太深奥(要有信心喔)。
-比如在课程中以下内容就不会出现
-\[ -f(x)=\frac{1}{\sqrt{2 \pi}} e^{-\frac{1}{2} x^{2}} -\]
-而出现更多的是
-library(tidyverse)
-<- weather %>%
- summary_monthly_temp group_by(month) %>%
- summarize(mean = mean(temp),
- std_dev = sd(temp))
在跟进本课程的同时, 我强烈推荐大家阅读Hadley Wickham的 -r4ds这本书 (Grolemund and Wickham 2017)。作者可是2019年8月刚刚获得考普斯总统奖(被誉为统计学的诺贝尔奖)的大神喔,点击这里可以看他照片。
- -1、课程安排是这样的,每个章节研究的内容都是彼此独立的,大家可以单独阅读每章及运行代码。
-2、课件源代码和数据
-我将持续改进课件,所以欢迎大家提出建议
- - - -4、关于课程目标
-课程目标: 熟悉数据科学流程,掌握统计编程技能,能运用探索性分析方法,解决基本的实际应用问题,做到学以致用,不是 learning R,而是 learning with R
授课方式:
-课堂要求
-科学脚手架
-5、关于如何提问
-有的同学,这样一上来就问:老师,我的代码怎么运行不出来呢?或者图省事,干脆手机拍个照片一发。
-<-
- my_packages c("brms", "broom", "broom.mixed", "colorspace", "corrr", "countrycode", "cowplot", "cranlogs", "datapasta", "datasauRus", "devtools", "dplyr", "equatiomatic", "forcats", "gapminder", "geoshpere", "gganimate", "ggbeeswarm", "ggeffects", "ggforce", "gghighlight", "ggimage", "ggplot2", "ggpubr", "ggraph", "ggrepel", "ggridges", "ggstatsplot", "ggtext", "ggthemes", "gt", "gtsummary", "haven", "here", "janitor", "knitr", "latex2exp", "lme4", "lubridate", "maps", "margins", "MASS", "modelr", "naniar", "nycflights13", "ordinal", "pacman", "pacman", "paletteer", "palmerpenguins", "patchwork", "performance", "purrr", "readr", "readxl", "remotes", "reprex", "rlang", "rmarkdown", "rstan", "rvest", "scales", "sf", "shadowtext", "showtext", "slider", "stars", "statsExpressions", "stringr", "styler", "tibble", "tibbletime", "tidybayes", "tidygraph", "tidymodels", "tidyr", "tidytext", "tidyverse", "tinytex", "viridis", "visdat", "namer")
install.packages(my_packages, repos = "http://cran.rstudio.com", dependencies = T)
可能用到的开发版本的宏包
-#remotes::install_github("datalorax/equatiomatic")
-::install_github("easystats/report")
- devtools::install_github("kassambara/navdata")
- devtools::install_github('cttobin/ggthemr')
- devtools::install_github("daranzolin/inferregex")
- remotes::install_github("EmilHvitfeldt/gganonymize")
- devtools::install_github("ThinkR-open/remedy")
- remotes::install_git("https://git.rud.is/hrbrmstr/hrbraddins.git")
- remotes::install_github("hadley/emo")
- devtools::install_github("romainfrancois/lay")
- remotes::install_github("kjhealy/covdata")
- remotes::install_github("kbodwin/flair")
- devtools::install_github("seasmith/AlignAssign") devtools
非常感谢川师研究生院的信任,有了您的支持,才会有更多的川师学子了解R的美!
- -
-王敏杰
-于 川师图书馆某角落
-
Statistical Inference: A Tidy Approach
-这是一个关于电影评分的数据集3,
-library(tidyverse)
-<- ggplot2movies::movies
- d d
## # A tibble: 58,788 x 24
-## title year length budget rating votes r1 r2
-## <chr> <int> <int> <int> <dbl> <int> <dbl> <dbl>
-## 1 $ 1971 121 NA 6.4 348 4.5 4.5
-## 2 $1000 ~ 1939 71 NA 6 20 0 14.5
-## 3 $21 a ~ 1941 7 NA 8.2 5 0 0
-## 4 $40,000 1996 70 NA 8.2 6 14.5 0
-## 5 $50,00~ 1975 71 NA 3.4 17 24.5 4.5
-## 6 $pent 2000 91 NA 4.3 45 4.5 4.5
-## 7 $windle 2002 93 NA 5.3 200 4.5 0
-## 8 '15' 2002 25 NA 6.7 24 4.5 4.5
-## 9 '38 1987 97 NA 6.6 18 4.5 4.5
-## 10 '49-'17 1917 61 NA 6 51 4.5 0
-## # ... with 58,778 more rows, and 16 more variables:
-## # r3 <dbl>, r4 <dbl>, r5 <dbl>, r6 <dbl>, r7 <dbl>,
-## # r8 <dbl>, r9 <dbl>, r10 <dbl>, mpaa <chr>,
-## # Action <int>, Animation <int>, Comedy <int>,
-## # Drama <int>, Documentary <int>, Romance <int>,
-## # Short <int>
-数据集包含58788 行 和 24 变量
-variable | -description | -
---|---|
title | -电影名 | -
year | -发行年份 | -
budget | -预算金额 | -
length | -电影时长 | -
rating | -平均得分 | -
votes | -投票人数 | -
r1-10 | -各分段投票人占比 | -
mpaa | -MPAA 分级 | -
action | -动作片 | -
animation | -动画片 | -
comedy | -喜剧片 | -
drama | -戏剧 | -
documentary | -纪录片 | -
romance | -爱情片 | -
short | -短片 | -
我们想看下爱情片与动作片(不是爱情动作片)的平均得分是否显著不同。
-<- d %>%
- movies_genre_sample select(title, year, rating, Action, Romance) %>%
- filter(!(Action == 1 & Romance == 1)) %>% # 既是爱情片又是动作片的,删去
- mutate(genre = case_when(
- == 1 ~ "Action",
- Action == 1 ~ "Romance",
- Romance TRUE ~ "Neither"
- %>%
- )) filter(genre != "Neither") %>%
- select(-Action, -Romance) %>%
- group_by(genre) %>%
- slice_sample(n = 34) %>% # 每种题材的电影只选取了34个
- ungroup()
-
- movies_genre_sample
## # A tibble: 68 x 4
-## title year rating genre
-## <chr> <int> <dbl> <chr>
-## 1 Alley Cat 1982 6.1 Acti~
-## 2 Purple Plain, The 1954 6.3 Acti~
-## 3 Martial Law 1990 4 Acti~
-## 4 Vodkaa, komisario Palmu 1969 5.7 Acti~
-## 5 Vendetta 1986 5.1 Acti~
-## 6 Confessione di un commissario di~ 1971 7 Acti~
-## 7 Full Clip 2004 4.1 Acti~
-## 8 Tailspin Tommy in The Great Air ~ 1935 9.6 Acti~
-## 9 Cyclone 1978 3.1 Acti~
-## 10 Gharaana Mogudu 1992 9 Acti~
-## # ... with 58 more rows
-%>%
- movies_genre_sample ggplot(aes(x = genre, y = rating)) +
- geom_boxplot() +
- geom_jitter()
%>%
- movies_genre_sample ggplot(mapping = aes(x = rating)) +
- geom_histogram(binwidth = 1, color = "white") +
- facet_grid(vars(genre))
<- movies_genre_sample %>%
- summary_ratings group_by(genre) %>%
- summarize(
- mean = mean(rating),
- std_dev = sd(rating),
- n = n()
-
- ) summary_ratings
## # A tibble: 2 x 4
-## genre mean std_dev n
-## <chr> <dbl> <dbl> <int>
-## 1 Action 5.54 1.72 34
-## 2 Romance 6.08 1.41 34
-假设:
-零假设:
-备选假设:
-两种可能的结论:
-<- t.test(rating ~ genre,
- t_test_eq data = movies_genre_sample,
- var.equal = TRUE
- %>%
- ) ::tidy()
- broom t_test_eq
## # A tibble: 1 x 10
-## estimate estimate1 estimate2 statistic p.value
-## <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 -0.541 5.54 6.08 -1.42 0.161
-## # ... with 5 more variables: parameter <dbl>,
-## # conf.low <dbl>, conf.high <dbl>, method <chr>,
-## # alternative <chr>
-<- t.test(rating ~ genre,
- t_test_uneq data = movies_genre_sample,
- var.equal = FALSE
- %>%
- ) ::tidy()
- broom t_test_uneq
## # A tibble: 1 x 10
-## estimate estimate1 estimate2 statistic p.value
-## <dbl> <dbl> <dbl> <dbl> <dbl>
-## 1 -0.541 5.54 6.08 -1.42 0.161
-## # ... with 5 more variables: parameter <dbl>,
-## # conf.low <dbl>, conf.high <dbl>, method <chr>,
-## # alternative <chr>
-所有的假设检验都符合这个框架4:
-library(infer)
-
-<- movies_genre_sample %>%
- obs_diff specify(formula = rating ~ genre) %>%
- calculate(
- stat = "diff in means",
- order = c("Romance", "Action")
-
- ) obs_diff
## # A tibble: 1 x 1
-## stat
-## <dbl>
-## 1 0.541
-<- movies_genre_sample %>%
- null_dist specify(formula = rating ~ genre) %>%
- hypothesize(null = "independence") %>%
- generate(reps = 5000, type = "permute") %>%
- calculate(
- stat = "diff in means",
- order = c("Romance", "Action")
-
- )head(null_dist)
## # A tibble: 6 x 2
-## replicate stat
-## <int> <dbl>
-## 1 1 0.135
-## 2 2 0.0353
-## 3 3 -0.7
-## 4 4 0.294
-## 5 5 -0.141
-## 6 6 0.247
-%>%
- null_dist visualize()
%>%
- null_dist visualize() +
- shade_p_value(obs_stat = obs_diff, direction = "both")
# shade_p_value(bins = 100, obs_stat = obs_diff, direction = "both")
<- null_dist %>%
- pvalue get_pvalue(obs_stat = obs_diff, direction = "two_sided")
-
- pvalue
## # A tibble: 1 x 1
-## p_value
-## <dbl>
-## 1 0.164
-在构建的虚拟(\(\Delta = 0\))的平行世界里,出现实际观察值(0.5412)的概率很小,这里是(0.1644)。 如果以(p< 0.05)为标准,那我们有足够的证据证明,H0不成立,即爱情电影和动作电影的评分均值存在显著差异,具体来说,动作电影的平均评分要比爱情电影低些。
-美国国家航空航天局的预算是否存在党派门户之见?
-<- read_rds("./demo_data/gss.rds")
- gss
-%>%
- gss select(NASA, party) %>%
- count(NASA, party) %>%
- head(8)
## # A tibble: 8 x 3
-## NASA party n
-## <fct> <fct> <int>
-## 1 TOO LITTLE Dem 8
-## 2 TOO LITTLE Ind 13
-## 3 TOO LITTLE Rep 9
-## 4 ABOUT RIGHT Dem 22
-## 5 ABOUT RIGHT Ind 37
-## 6 ABOUT RIGHT Rep 17
-## 7 TOO MUCH Dem 13
-## 8 TOO MUCH Ind 22
-%>%
- gss ggplot(aes(x = party, fill = NASA)) +
- geom_bar()
假设:
-零假设 \(H_0\):
-备选假设 \(H_a\):
-两种可能的结论:
-chisq.test(gss$party, gss$NASA)
##
-## Pearson's Chi-squared test
-##
-## data: gss$party and gss$NASA
-## X-squared = 1.3, df = 4, p-value = 0.9
-或者
-%>%
- gss chisq_test(NASA ~ party) %>%
- ::select(p_value) %>%
- dplyr::pull() dplyr
## [1] 0.8569
-<- gss %>%
- obs_stat specify(NASA ~ party) %>%
- calculate(stat = "Chisq")
- obs_stat
## # A tibble: 1 x 1
-## stat
-## <dbl>
-## 1 1.33
-<- gss %>%
- null_dist specify(NASA ~ party) %>% # (1)
- hypothesize(null = "independence") %>% # (2)
- generate(reps = 5000, type = "permute") %>% # (3)
- calculate(stat = "Chisq") # (4)
- null_dist
## # A tibble: 5,000 x 2
-## replicate stat
-## <int> <dbl>
-## 1 1 4.85
-## 2 2 1.01
-## 3 3 6.20
-## 4 4 4.08
-## 5 5 2.46
-## 6 6 2.72
-## 7 7 2.63
-## 8 8 1.27
-## 9 9 3.96
-## 10 10 3.24
-## # ... with 4,990 more rows
-%>%
- null_dist visualize() +
- shade_p_value(obs_stat = obs_stat, method = "both", direction = "right")
%>%
- null_dist get_pvalue(obs_stat = obs_stat, direction = "greater")
## # A tibble: 1 x 1
-## p_value
-## <dbl>
-## 1 0.851
-看到 p_value > 0.05
,不能拒绝 \(H_0\),我们没有足够的证据证明党派之间有显著差异
案例 quine
数据集有 146 行 5 列,包含学生的生源、文化、性别和学习成效,具体说明如下
<- MASS::quine %>%
- td as_tibble() %>%
- mutate(
- across(c(Sex, Eth), as_factor)
-
- ) td
## # A tibble: 146 x 5
-## Eth Sex Age Lrn Days
-## <fct> <fct> <fct> <fct> <int>
-## 1 A M F0 SL 2
-## 2 A M F0 SL 11
-## 3 A M F0 SL 14
-## 4 A M F0 AL 5
-## 5 A M F0 AL 5
-## 6 A M F0 AL 13
-## 7 A M F0 AL 20
-## 8 A M F0 AL 22
-## 9 A M F1 SL 6
-## 10 A M F1 SL 6
-## # ... with 136 more rows
-从民族背景有两组(A, N)来看,性别为 F 的占比 是否有区别?
-%>% count(Eth, Sex) td
## # A tibble: 4 x 3
-## Eth Sex n
-## <fct> <fct> <int>
-## 1 A F 38
-## 2 A M 31
-## 3 N F 42
-## 4 N M 35
-prop.test(table(td$Eth, td$Sex), correct = FALSE)
##
-## 2-sample test for equality of proportions
-## without continuity correction
-##
-## data: table(td$Eth, td$Sex)
-## X-squared = 0.0041, df = 1, p-value = 0.9
-## alternative hypothesis: two.sided
-## 95 percent confidence interval:
-## -0.1564 0.1670
-## sample estimates:
-## prop 1 prop 2
-## 0.5507 0.5455
-<- td %>%
- obs_diff specify(Sex ~ Eth, success = "F") %>% # #被解释变量 sex中F的占比
- calculate(
- stat = "diff in props",
- order = c("A", "N") # 解释变量中两组A,N
-
- )
- obs_diff
## # A tibble: 1 x 1
-## stat
-## <dbl>
-## 1 0.00527
-<- td %>%
- null_distribution specify(Sex ~ Eth, success = "F") %>%
- hypothesize(null = "independence") %>%
- generate(reps = 5000, type = "permute") %>%
- calculate(stat = "diff in props", order = c("A", "N"))
%>%
- null_distribution visualize()
<- null_distribution %>%
- pvalue get_pvalue(obs_stat = obs_diff, direction = "both")
-
- pvalue
## # A tibble: 1 x 1
-## p_value
-## <dbl>
-## 1 1
-%>%
- null_distribution get_ci(level = 0.95, type = "percentile")
## # A tibble: 1 x 2
-## lower_ci upper_ci
-## <dbl> <dbl>
-## 1 -0.160 0.170
-infer
我比较喜欢infer宏包的设计思想,它把统计推断分成了四个步骤
- -下图可以更好的帮助我们理解infer的工作流程 -
-specify()
指定解释变量和被解释变量 (y ~ x
)
hypothesize()
指定零假设 (比如, independence
= y
和 x
彼此独立)
generate()
从基于零假设的平行世界中抽样:
type = "bootstrap"
(有放回的),对应的零假设往往是null = “point” ; 重抽样type = "permuting"
(无放回的),对应的零假设往往是null = “independence,” 指的是y和x之间彼此独立的,因此抽样后会重新排列,也就说原先 value1-group1 可能变成了value1-group2,(因为我们假定他们是独立的啊,这种操作,也不会影响y和x的关系)reps = 1000
)calculate()
计算每组(reps
)的统计值 (stat = "diff in props"
)
visualize()
可视化,对比零假设的分布与实际观察值.
下面是我自己对重抽样的理解 -
-更多统计推断的内容可参考
- - -R 软件是一个自由、开源软件平台,具有统计分析、可视化和编程的强大功能。 -你可以从这里免费下载。 为了更好的使用 R 软件,我推荐大家使用 RStudio这个 IDE。这里有个在线教程帮助我们熟悉 R 和 RStudio。
-我们从官方网站http://cran.r-project.org下载, 网站界面感觉有点朴素:
- -安装完R, 还需要安装RStudio。有同学可能要问 R 与 RStudio 是什么关系呢?打个比方吧,R 就像汽车的发动机, RStudio 就是汽车的仪表盘。但我更觉得 R 是有趣的灵魂,而 Rstudio 是好看的皮囊。
- -同样,我们从官方网站下载并安装,如果你是苹果系统的用户,选择苹果系统对应的rstudio版本即可。
-RStudio Desktop
-这里有个小小的提示: -
-D:/R
-D:/Rstudio
-安装完毕后,从windos开始菜单
,点开rstudio
图标,就打开了rstudio的窗口,界面效果如下
RStudio 的用户界面十分友好,想要运行一段R代码,只需要在 RStudio 控制台面板最下面 (Console)一行内键入R 代码,然后回车即可。比如我们键入1 + 1
并按回车后,RStudio 将显示如下结果
1 + 1
## [1] 2
-log(8)
## [1] 2.079
-1:15
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
-在R中存储的数据称为对象, R语言数据处理实际上就是不断的创建和操控这些对象。创建一个R对象,首先确定一个名称,然后使用
-赋值操作符 <-
,将数据赋值给它。比如,如果想给变量 x 赋值为5,在命令行中可以这样写 x <- 5
,然后回车.
<- 5 x
当键入x
然后回车,就打印出 x 的值。当然也可以使用命令print(x)
,结果一样。
x
## [1] 5
-+ 2 x
## [1] 7
-<- 1:6 die
die
## [1] 1 2 3 4 5 6
-/ 2 die
## [1] 0.5 1.0 1.5 2.0 2.5 3.0
-* die die
## [1] 1 4 9 16 25 36
-%*% die die
## [,1]
-## [1,] 91
-%o% die die
## [,1] [,2] [,3] [,4] [,5] [,6]
-## [1,] 1 2 3 4 5 6
-## [2,] 2 4 6 8 10 12
-## [3,] 3 6 9 12 15 18
-## [4,] 4 8 12 16 20 24
-## [5,] 5 10 15 20 25 30
-## [6,] 6 12 18 24 30 36
-3
## [1] 3
-5000
## [1] 5000
-3e+06
## [1] 3e+06
-class(0.0001)
## [1] "numeric"
-"hello"
## [1] "hello"
-"girl"
## [1] "girl"
-"1" # 注意 1 和 "1" 的区别
## [1] "1"
-class("1")
## [1] "character"
-TRUE
## [1] TRUE
-FALSE
## [1] FALSE
-3 < 4
## [1] TRUE
-class(T)
## [1] "logical"
-3 < 4
## [1] TRUE
-因子型可以看作是字符串向量的增强版,比如 “Alice,” “Bob,” “Carol,” “Ted” 是四个人名的字符串,因子型就在字符串的基础上,告诉计算机他们每个人都是有官阶层级的,比如 “排长”,“团长,” “师长,” “军长,” 也就说“Ted”排第一,“Carol”排第二,“Bob”排第三,“Alice” 排最后, 相比字符串而言,多了官阶层级信息。
-<- factor(c("Alice", "Bob", "Carol", "Ted"),
- fac levels = c("Ted", "Carol", "Bob", "Alice")
-
- ) fac
## [1] Alice Bob Carol Ted
-## Levels: Ted Carol Bob Alice
-class(fac)
## [1] "factor"
-再比如,General上将;Colonel上校;Captain上尉, 如果没有指定层级levels,c("Colonel", "General", "Captain")
就是一个常规的字符串向量,若指定了层级levels,这个字符串就有了军衔信息.
factor(c("Colonel", "General", "Captain"),
-levels = c("General", "Colonel", "Captain")
- )
## [1] Colonel General Captain
-## Levels: General Colonel Captain
-x <- 1
和 x <- c(1, 2, 3)
,这就是最简单的数据对象,叫原子型向量。c
函数将一组数据构造成向量,要求每个元素用逗
-号分隔,且每个元素的数据类型是一致的,可以把它想象成手里拿着一个糖葫芦<- c(2, 4, 3, 1, 5, 7)
- die die
## [1] 2 4 3 1 5 7
-长度为 1 的原子型向量
-<- c(1) # or
- x <- 1 x
强制转换
-<- c("R", 1, TRUE)
- vec class(vec)
## [1] "character"
-你依次输入,就发现三种类型的优先级关系
-c(TRUE, 1) # 被转换成了数值型
## [1] 1 1
-c( 1, "R") # 被转换成了字符串型
## [1] "1" "R"
-c(TRUE, 1, "R") # 被转换成了字符串型
## [1] "TRUE" "1" "R"
-die %o% die
是矩阵类型,矩阵就是二维数组matrix
函数创建,可以想象成糖葫芦太多,一个棒子串不下,就多用几根棒子串。<- matrix(c(2, 4, 3, 1, 5, 7),
- m nrow = 2, ncol = 3, byrow = TRUE
- )
m
## [,1] [,2] [,3]
-## [1,] 2 4 3
-## [2,] 1 5 7
-array
函数生成n
维数组,可以想象成我们吃的土司面包一样。<- array(c(11:14, 21:24, 31:34), dim = c(2, 2, 3))
- ar ar
## , , 1
-##
-## [,1] [,2]
-## [1,] 11 13
-## [2,] 12 14
-##
-## , , 2
-##
-## [,1] [,2]
-## [1,] 21 23
-## [2,] 22 24
-##
-## , , 3
-##
-## [,1] [,2]
-## [1,] 31 33
-## [2,] 32 34
-c
函数创建向量的方式相似,不同的元素用逗号分开。不同的是,列表允许不同的数据类型(数值型,字符型,逻辑型等), 而向量要求每个元素的数据类型必须相同。可以想象成小火车,每节车厢可以装自己喜欢的东西<- list(100:110, "R", c(2, 4, 3, 1, 5, 7))
- list1 list1
## [[1]]
-## [1] 100 101 102 103 104 105 106 107 108 109 110
-##
-## [[2]]
-## [1] "R"
-##
-## [[3]]
-## [1] 2 4 3 1 5 7
-data.frame
函数构建<- data.frame(
- df name = c("ace", "bob", "carl", "kaite"),
- age = c(21, 14, 13, 15),
- sex = c("girl", "boy", "boy", "girl")
-
- ) df
R 对象的数据结构(向量、矩阵、数组、列表和数据框),总结如下
- -为了更好地理解相关概念,建议大家阅读Garrett Grolemund的 -hopr这本书 (Grolemund 2014)。
-R 语言的强大在于使用函数操控各种对象,你可以把对象看作是名词,而函数看作是动词。
-我们用一个简单的例子,sum()
来演示函数如何工作的。这个函数的功能正如它的名字一样,对输入的各个对象求和,然后返回求和后的值,你可以在命令行中键入?sum()
查看其官方文档。
-sum()
后的结果可以直接显示出来,也可以赋名。比如下面代码,首先计算x + 10
并赋以名字y
, 然后第二行中打印出来这个新创建的对象y
<- sum(x, 10)
- y y
## [1] 11
-因为代码的灵活性,可以不断地重新定义对象。只要数据发生改变,原来的代码就会返回新的值。比如,对x
重新赋值为 15, 同样运行sum()
函数,这次我们不赋值给对象y
,而是让它直接显示
<- 15
- x sum(x, 10)
## [1] 25
-再比如
-round(3.14159)
## [1] 3
-mean(1:6)
## [1] 3.5
-<- 100
- n <- seq(1, n)
- x sum(x)
## [1] 5050
-<- mtcars[, 1:4]
- dt head(dt)
cor(dt)
## mpg cyl disp hp
-## mpg 1.0000 -0.8522 -0.8476 -0.7762
-## cyl -0.8522 1.0000 0.9020 0.8324
-## disp -0.8476 0.9020 1.0000 0.7909
-## hp -0.7762 0.8324 0.7909 1.0000
-如果我们已经写好了一段R程序,我们可以保存为脚本文件,脚本文件通常以.R作为文件的后缀名。比如我们可以将刚才创建x
和 y
对象的命令,保存为脚本文件my_script.R
。
-这样我们可以在其它时间修改和重新运行它。
在RStudio中,你可以通过菜单栏依此点击File > New File > R Script
来创建一个新的脚本。
-强烈建议大家在运行代码之前,使用脚本的形式编写和编辑自己的程序,养成这样的习惯后,你今后所有的工作都有案可查,并且具有可重复性。
Run
或者 Source
运行脚本Run
, 运行光标所在行的代码Source
,从头到尾运行全部代码R 语言的强大还在于各种宏包,一般在The Comprehensive R Archive Network (CRAN)下载安装。宏包扩展了R语言本身的各种功能,也为解决问题提供了各种方案。截至撰写本书时止,CRAN上大约有1.4万个宏包可以使用。但由于各种包接口不统一,语法不一致,也带来一些困扰。为了解决这个问题,RStudio 公司的Hadley Wickham 与其带领的团队推出了tidyverse
宏包, tidyverse将常用的宏包整合在一起,并保持了语法的一致性。可以说,tidyverse
宏包是R语言入门 学习的首选。
-本书正是基于tidyverse
宏包而成的,本书也将通过一些例子不断地展示tidyverse
在数据分析和可视化的应用。
可以用如下命令安装 ggplot2
宏包:
# 安装单个包
-install.packages("tidyverse")
# 安装多个包
-install.packages(c("ggplot2", "devtools", "dplyr"))
然后再输入命令install.packages("tidyverse")
,或者直接指定清华大学镜像
install.packages("tidyverse", repos = "http://mirrors.tuna.tsinghua.edu.cn/CRAN")
in install.packages :
- Warning for repository http://cran.rstudio.com/src/contrib:
- unable to access index 'http://cran.rstudio.com/src/contrib/PACKAGES' cannot open URL
输入下面命令后,再试试
-options(download.file.method="libcurl")
或者打开D:\R\etc\Rprofile.site
,添加以下内容:
local({r <- getOption("repos")
-"CRAN"] <- "http://mirrors.tuna.tsinghua.edu.cn/CRAN"
- r[options(repos=r)})
-
-options(download.file.method="libcurl")
问题4:如果每次打开Rstudio非常慢,可以在Rstudio里将这几个选项取消 -
问题5:如果 Rstudio 打开是空白
很大的可能是你的电脑用户名是中文的,修改用户名再试试
-可能你是低版本的windows系统,建议安装旧版本的Rstudio,可以在这里找到旧版本.
-更多Rstudio的使用,可参考这里introducing-the-rstudio。
-Rstudio
右下面板的Help
选项卡)
- ?sqrt
- ?gather
- ?spread
- ?ggplot2
- ?scale ?map_dfr
比如:
- -R 语言社区非常友好,可以在这里找到你问题的答案
-a <- c("a", "c", "e")
的第二个元素?矩阵和列表的时候,又该如何?c(1, FALSE)
与 c("a", TRUE)
会是什么?1 == "1"
和 -1 < FALSE
为什么为真? "one" < 2
为什么为假?马克思曾说过:“一门科学只有当它达到能够成功运用数学时,才算真正得到发展。”数学为数据科学提供了坚实的理论基础,数据科学也为数学与实际应用之间建立起一个直接的桥梁。
-数据科学是综合了统计学、计算机科学和领域知识的交叉学科,其基本内容就是用数据的方法研究科学,用科学的方法研究数据(鄂维南院士)。2010年,Drew Conway画了一张数据科学的韦恩图
- -从数据科学所涉及的学科领域来看,其知识结构不仅仅包括数学、统计学、计算机科学、信息科学等在内的基础性理论,还应该包括社会学、物理学、情报学、生物医学等在内的专业性领域理论。
-(事实上,编程是工具,统计是灵魂,专业是核心,最重要的最下面那个部分,专业领域的知识)
-想了解R语言的发展历史,可阅读The History of R
-官网定义:https://www.r-project.org/
- -R语言是用于统计分析、图形表示和报告的编程语言:
-2019 年 8 月,国际统计学年会将考普斯总统奖(The Committee of Presidents of Statistical Societies -Awards,简称 COPSS 奖,被誉为统计学的诺贝尔奖)奖颁给 tidyverse的作者Hadley Wickham后,充分说明R语言得到了学术界的肯定和认可,我相信未来它在自然科学、社会科学和工业领域中的应用前景会非常光明。
- -Hadley Wickham将数据科学流程分解成6个环节
- -即数据导入、数据规整、数据处理、可视化、建模以及形成可重复性报告,整个分析和探索过程都在一个程序代码中完成,这种方式对训练我们的数据思维非常有帮助。
-tidyverse套餐,其主要成员包括
-功能 | -宏包 | -
---|---|
有颜值担当 | -ggplot2 | -
数据处理王者 | -dplyr | -
数据转换专家 | -tidyr | -
数据载入利器 | -readr | -
循环加速器 | -purrr | -
强化数据框 | -tibble | -
字符串处理 | -stringr | -
因子处理 | -forcats | -
序号 | -内容 | -代码演示 | -
---|---|---|
1 | -统计 | -Download 01_stats.R | -
2 | -可视化 | -Download 02_visual.R | -
3 | -探索性分析 | -Download 03_eda.R | -
4 | -可重复性报告 | -Download 04_reproducible.Rmd | -
看到这图,有同学可能会有同感。我认为,一个学科之所以成为一门科学,必须要有数学作 -为基础。我说这话,相信很多人会反驳我。我接受反驳。但我还是会坚持我的观点。很多同学在选专业的时候,导师会说,这个专业不会用太多数学,事实上被忽悠了,尤其在(新文科建设、跨学科研究)背景下,社会科学(包括心理学、语言学)都在交叉融合,都需要用数学和计算机。所以,我们不是学统计的,但需要用统计。一个更残酷的现实,用统计的,往往不是学统计的。
-我们人,都是视觉动物,都喜欢看漂亮美好的东西。如果文章或者报告太多表格,不会 -给人留下深刻影响;相反,用图片,重点突出、观点明确,一图胜千言,很容易传递信息。当然,前提是,画图要画的好。 事实上,可视化,一半是科学、一半是艺术。
-又一个残酷的现实,在这个看脸的时代,没有好看的皮囊,没人愿意了解你的灵魂。
-为什么要统计编程,回答这个问题,相 -当于回答,为什么不能用 excel 做数据分析?画个图说明下
- -对于数据量不大,或者复杂程度不高的需求来说,excel很方便也很直观。但随着数据量或复杂程度不断增大,excel解决起来难度系数就陡增,或者无法搞定,这就需要借助编程完成。也就说,掌握了编程技能,对于简单的问题和复杂的问题,难度系数是差不多了。
-所以,第三残酷的现实:现在小学生都开始学编程了。 -
-科学的可重复性危机,已经成为举世瞩目的热点议题。 -科研结果可重复性低的原因很多很多。不可重复,说明事情没那么简单。 -或许,科学固有不确定性,但我们需要从研究方法、实验 -设计和统计方法方面改进。 -所以,第四个残酷的现实:科学研究的方向是(开放科学 -框架 (Open Science Framework, OSF)), 正如 Nature 期刊 -要求的一样,需要公布原始数据和如何分析的代码
-我想,R语言之美,你值得拥有,因为它可以缓解你的压力
-所以,R语言之美,体现在好用、好看、好学、好玩。
-序号 | -内容 | -特性 | -评价 | -
---|---|---|---|
1 | -统计分析 | -看家本领 | -好用 | -
2 | -ggplot2画图 | -颜值担当 | -好看 | -
3 | -tidyverse语法 | -人类语言 | -好学 | -
4 | -可重复性报告 | -方便快捷 | -好玩 | -
2016年权威机构KDnuggets做过调研,显示数据科学领域最受欢迎的工具,是python和R两种语言
- -事实上,python和R都是非常强大的工具,两者各有优劣,作为初学者,究竟选择谁? -可以参考这篇文章,这篇文章旗帜鲜明地指出,R语言,是当今最值得学习的数据科学语言。为此做了详细的对比,并罗列了很多理由,其中的3点理由很重要,我圈了出来(传统的统计学,贝叶斯新统计、数据可视化)。
- -事实上,数据科学,是和数据打交道(定义:用科学的方法研究数据,用数据的方法研究科学),目的要利用(计算机和统计知识)推动学科发展,不是把大家培养成程序员。
-所以,我看完这篇文章的感受是:
-%>%
太酷了 )R社区上很多大神,贡献了很多非常优秀的工具,节省了我们的时间,也给我们的生活增添了无限乐趣。我平时逛github的时候时整理一些,现在分享出来供像我一样的懒人用,因此本文档叫“懒人系列”。欢迎大家补充。
-library(tidyverse)
-library(janitor)
-## install.packages("janitor")
-## https://github.com/sfirke/janitor
<- tibble::tribble(
- fake_raw ~id, ~`count/num`, ~W.t, ~Case, ~`time--d`, ~`%percent`,
- "china", 3L, "w", 5L, 25L,
- 1L, "us", 4L, "f", 6L, 34L,
- 2L, "india", 5L, "q", 8L, 78L
- 3L,
- ) fake_raw
## # A tibble: 3 x 6
-## id `count/num` W.t Case `time--d` `%percent`
-## <int> <chr> <int> <chr> <int> <int>
-## 1 1 china 3 w 5 25
-## 2 2 us 4 f 6 34
-## 3 3 india 5 q 8 78
-%>% janitor::clean_names() fake_raw
## # A tibble: 3 x 6
-## id count_num w_t case time_d percent_percent
-## <int> <chr> <int> <chr> <int> <int>
-## 1 1 china 3 w 5 25
-## 2 2 us 4 f 6 34
-## 3 3 india 5 q 8 78
-%>%
- mtcars ::count(cyl) dplyr
## # A tibble: 3 x 2
-## cyl n
-## <dbl> <int>
-## 1 4 11
-## 2 6 7
-## 3 8 14
-%>%
- mtcars ::tabyl(cyl) janitor
## cyl n percent
-## 4 11 0.3438
-## 6 7 0.2188
-## 8 14 0.4375
-<- tribble(
- df ~id, ~date, ~store_id, ~sales,
- 1, "2020-03-01", 1, 100,
- 2, "2020-03-01", 2, 100,
- 3, "2020-03-01", 3, 150,
- 4, "2020-03-02", 1, 110,
- 5, "2020-03-02", 3, 101
-
- )
-%>%
- df ::get_dupes(store_id) janitor
## # A tibble: 4 x 5
-## store_id dupe_count id date sales
-## <dbl> <int> <dbl> <chr> <dbl>
-## 1 1 2 1 2020-03-01 100
-## 2 1 2 4 2020-03-02 110
-## 3 3 2 3 2020-03-01 150
-## 4 3 2 5 2020-03-02 101
-%>%
- df ::get_dupes(date) janitor
## # A tibble: 5 x 5
-## date dupe_count id store_id sales
-## <chr> <int> <dbl> <dbl> <dbl>
-## 1 2020-03-01 3 1 1 100
-## 2 2020-03-01 3 2 2 100
-## 3 2020-03-01 3 3 3 150
-## 4 2020-03-02 2 4 1 110
-## 5 2020-03-02 2 5 3 101
-## install.packages("styler")
安装后,然后这两个地方点两下,就发现你的代码整齐很多了。或者直接输入
-:::style_active_file() styler
library(equatiomatic)
-## https://github.com/datalorax/equatiomatic
<- lm(mpg ~ cyl + disp, mtcars) mod1
extract_eq(mod1)
\[ -\operatorname{mpg} = \alpha + \beta_{1}(\operatorname{cyl}) + \beta_{2}(\operatorname{disp}) + \epsilon -\]
-extract_eq(mod1, use_coefs = TRUE)
\[ -\operatorname{mpg} = 34.66 - 1.59(\operatorname{cyl}) - 0.02(\operatorname{disp}) + \epsilon -\]
-library(report)
-## https://github.com/easystats/report
<- lm(Sepal.Length ~ Species, data = iris)
- model report(model)
We fitted a linear model (estimated using OLS) to predict Sepal.Length with Species (formula = Sepal.Length ~ Species). Standardized parameters were obtained by fitting the model on a standardized version of the dataset. Effect sizes were labelled following Cohen’s (1988) recommendations.
-The model explains a significant and substantial proportion of variance (R2 = 0.62, F(2, 147) = 119.26, p < .001, adj. R2 = 0.61). The model’s intercept, corresponding to Sepal.Length = 0 and Species = setosa, is at 5.01 (SE = 0.07, 95% CI [4.86, 5.15], p < .001). Within this model:
-library(performance)
-
-<- lm(mpg ~ wt * cyl + gear, data = mtcars)
- model ::check_model(model) performance
library(gtsummary)
-## https://github.com/ddsjoberg/gtsummary
-
-
-::trial %>%
- gtsummary::select(trt, age, grade, response) %>%
- dplyr::tbl_summary(
- gtsummaryby = trt,
- missing = "no"
- %>%
- ) ::add_p() %>%
- gtsummary::add_overall() %>%
- gtsummary::add_n() %>%
- gtsummary::bold_labels() gtsummary
直接复制到论文即可
-<-
- t1 glm(response ~ trt + age + grade, trial, family = binomial) %>%
- ::tbl_regression(exponentiate = TRUE)
- gtsummary
-<-
- t2 ::coxph(survival::Surv(ttdeath, death) ~ trt + grade + age, trial) %>%
- survival::tbl_regression(exponentiate = TRUE)
- gtsummary
-
-
-::tbl_merge(
- gtsummarytbls = list(t1, t2),
- tab_spanner = c("**Tumor Response**", "**Time to Death**")
- )
library(ggplot2)
-library(statsExpressions)
-# https://github.com/IndrajeetPatil/statsExpressions
-
-
-ggplot(mtcars, aes(x = mpg, y = wt)) +
-geom_point() +
- geom_smooth(method = "lm") +
- labs(
- title = "Spearman's rank correlation coefficient",
- subtitle = expr_corr_test(mtcars, mpg, wt, type = "nonparametric")
- )
library(inferregex)
-## remotes::install_github("daranzolin/inferregex")
<- "abcd-9999-ab9"
- s infer_regex(s)$regex
## [1] "^[a-z]{4}-\\d{4}-[a-z]{2}\\d$"
-有了它,妈妈再也不担心我的正则表达式了
-library(ggthemr) ## devtools::install_github('cttobin/ggthemr')
-ggthemr("dust")
%>%
- mtcars mutate(cyl = factor(cyl)) %>%
- ggplot(aes(x = mpg, fill = cyl, colour = cyl)) +
- geom_density(alpha = 0.75) +
- labs(fill = "Cylinders", colour = "Cylinders", x = "MPG", y = "Density") +
- legend_top()
用完别忘了
-ggthemr_reset()
scales也是大神的作品,功能多多
-## https://github.com/r-lib/scales
-library(scales)
-
-show_col(viridis_pal()(10))
不推荐个人配色,因为我们不专业。直接用专业的配色网站 -colorbrewer
-先看看颜色,再选择
-library(pacman)
-## p_load(lattice, foreign, boot, rpart)
唉,这个library()
都要偷懒,真服了你们了
## https://github.com/EmilHvitfeldt/gganonymize
-library(ggplot2)
-library(gganonymize)
-
-<-
- ggg ggplot(mtcars, aes(as.factor(cyl))) +
- geom_bar() +
- labs(
- title = "Test title",
- subtitle = "Test subtitle, this one have a lot lot lot lot lot more text then the rest",
- caption = "Test caption",
- tag = 1
- +
- ) facet_wrap(~vs)
-
-gganonomize(ggg)
你可以看我的图,但就不想告诉你图什么意思,因为我加密了
-# remotes::install_github("tjmahr/WrapRmd")
-# remotes::install_github("fkeck/quickview")
-# remotes::install_github("mwip/beautifyR")
直接看官方网站,这里不举例了
-## install.packages("reprex")
-## https://reprex.tidyverse.org/
## beepr::beep(sound = "mario")
你听到了声音吗?
-library(patchwork)
-<- ggplot(mtcars) +
- p1 geom_point(aes(mpg, disp))
- <- ggplot(mtcars) +
- p2 geom_boxplot(aes(gear, disp, group = gear))
- <- ggplot(mtcars) +
- p3 geom_smooth(aes(disp, qsec))
- + p2 + p3 p1
library(naniar)
-## https://github.com/njtierney/naniar
-
-%>%
- airquality group_by(Month) %>%
- ::miss_var_summary() naniar
## # A tibble: 25 x 4
-## # Groups: Month [5]
-## Month variable n_miss pct_miss
-## <int> <chr> <int> <dbl>
-## 1 5 Ozone 5 16.1
-## 2 5 Solar.R 4 12.9
-## 3 5 Wind 0 0
-## 4 5 Temp 0 0
-## 5 5 Day 0 0
-## 6 6 Ozone 21 70
-## 7 6 Solar.R 0 0
-## 8 6 Wind 0 0
-## 9 6 Temp 0 0
-## 10 6 Day 0 0
-## # ... with 15 more rows
-library(visdat)
-
-vis_dat(airquality)
管道都不想写, 写代码还有美感?
-## library(nakepipe)
## https://github.com/daattali/addinslist