update regex RMD

gillopy · Jan 29, 2021 · 7703f1f · 7703f1f
1 parent e83f64b
commit 7703f1f
Show file tree

Hide file tree

Showing 9 changed files with 301 additions and 224 deletions.
diff --git a/02_figures/2020-05-16_greedy-matches.jpg b/02_figures/2020-05-16_greedy-matches.jpg
diff --git a/03_blog_posts/2020-05-16_untangling-strings/2020-05-16_untangling-strings.Rmd b/03_blog_posts/2020-05-16_untangling-strings/2020-05-16_untangling-strings.Rmd
diff --git a/03_blog_posts/2020-05-16_untangling-strings/2020-05-16_untangling-strings.md b/03_blog_posts/2020-05-16_untangling-strings/2020-05-16_untangling-strings.md
diff --git a/...g-strings/2020-05-16_untangling-strings_files/figure-gfm/unnamed-chunk-21-1.png b/...g-strings/2020-05-16_untangling-strings_files/figure-gfm/unnamed-chunk-21-1.png
diff --git a/03_blog_posts/2020-09-12_binomial_distribution/2020-09-12_binomial-distribution.Rmd b/03_blog_posts/2020-09-12_binomial_distribution/2020-09-12_binomial-distribution.Rmd
@@ -378,28 +378,35 @@ CIs <- CIs %>%
 unemployment_data <- bind_rows(unemployment_data, CIs)
 ```
 
-```{r, message = FALSE, warning = FALSE}
+```{r, message = FALSE, warning = FALSE, fig.height = 6, fig.width = 8}
 #-----visualise the funnel plot-----  
 # x-axis displays LGA population size
 # y-axis displays local unemployment rate  
 # control limits are 95% confidence intervals for p  
-
-unemployment_data %>%
-  ggplot(aes(x = lga_pop, y = lga_unemploy_rate, label = lgas)) +
-  geom_point() + 
-  geom_text_repel(colour = "grey30", direction = "both", force = 2, nudge_x = 10000, nudge_y = 0.005) + 
-  geom_hline(yintercept = 0.1, size = 0.5, colour = "firebrick", linetype = "dashed") + 
-  geom_line(aes(x = lga_pop, y = lwr.ci), colour = "steelblue", linetype = "dotted") + 
-  geom_line(aes(x = lga_pop, y = upr.ci), colour = "steelblue", linetype = "dotted") + 
+# LGAs which fall outside of the 95% confidence intervals (drawn in red with black labels; all others in grey)  
+
+subset_lgas <- c("Orange", "Leeton", "Broken Hill", "Griffith")
+
+ggplot(unemployment_data, aes(x = lga_pop, y = lga_unemploy_rate, label = lgas)) +
+  geom_point(colour = if_else(unemployment_data$lgas %in% subset_lgas, "#b2182b", "grey40")) + 
+  geom_text_repel(colour = if_else(unemployment_data$lgas %in% subset_lgas,"black","grey40"),
+                  segment.color = "grey70", point.padding = 0.1, 
+                  size = 3.5, direction = "both", force = 6, nudge_x = 10000, nudge_y = 0.006) + 
+  geom_hline(yintercept = 0.1, size = 0.5, colour = "#2166ac", linetype = "dashed") + 
+  geom_line(aes(x = lga_pop, y = lwr.ci), size = 0.5, colour = "#67a9cf", linetype = "dotted") + 
+  geom_line(aes(x = lga_pop, y = upr.ci), size = 0.5, colour = "#67a9cf", linetype = "dotted") + 
   scale_x_continuous(labels = scales::comma) + 
   scale_y_continuous(limits = c(0.04, 0.16), labels = scales::percent_format(accuracy = 1)) + 
   labs(x = "LGA resident population",
        y = "LGA unemployment rate (%)",
-       title = "Which LGA unemployment rates significantly differ from the national rate?") + 
+       title = "Which LGA unemployment rates significantly differ from the national rate?",
+       caption = "Labour Force, Australia, Detailed - released 24 September 2020") + 
   theme_bw() +
   theme(panel.grid.minor = element_blank(),
         panel.grid.major.x = element_blank(),
-        panel.grid.major.y = element_line(linetype = "dashed"))  
+        panel.grid.major.y = element_line(linetype = "dashed"),
+        plot.title = element_text(hjust = 0.5),
+        plot.caption = element_text(hjust = 0.5, colour = "grey35")) 
 ```
 
 **Note:** The final step in generating a funnel plot would be to calculate overdispersion limits (when we have more variation between LGAs than expected from statistical theory alone). It would also be interesting to apply the [package `nullabor`](https://cran.r-project.org/web/packages/nullabor/vignettes/nullabor.html) to visually confirm that the unemployment rate in Orange is not differently distributed due to chance alone.      

diff --git a/03_blog_posts/2020-09-12_binomial_distribution/2020-09-12_binomial-distribution.md b/03_blog_posts/2020-09-12_binomial_distribution/2020-09-12_binomial-distribution.md
@@ -1,7 +1,7 @@
 Introduction to binomial distributions
 ================
 Erika Duan
-2020-10-03
+2020-10-18
 
   - [Introduction](#introduction)
   - [Bernoulli trial](#bernoulli-trial)
@@ -479,23 +479,30 @@ unemployment_data <- bind_rows(unemployment_data, CIs)
 # x-axis displays LGA population size
 # y-axis displays local unemployment rate  
 # control limits are 95% confidence intervals for p  
-
-unemployment_data %>%
-  ggplot(aes(x = lga_pop, y = lga_unemploy_rate, label = lgas)) +
-  geom_point() + 
-  geom_text_repel(colour = "grey30", direction = "both", force = 2, nudge_x = 10000, nudge_y = 0.005) + 
-  geom_hline(yintercept = 0.1, size = 0.5, colour = "firebrick", linetype = "dashed") + 
-  geom_line(aes(x = lga_pop, y = lwr.ci), colour = "steelblue", linetype = "dotted") + 
-  geom_line(aes(x = lga_pop, y = upr.ci), colour = "steelblue", linetype = "dotted") + 
+# subset the LGAs which fall outside of the 95% confidence intervals  
+
+subset_lgas <- c("Orange", "Leeton", "Broken Hill", "Griffith")
+
+ggplot(unemployment_data, aes(x = lga_pop, y = lga_unemploy_rate, label = lgas)) +
+  geom_point(colour = if_else(unemployment_data$lgas %in% subset_lgas, "#b2182b", "grey40")) + 
+  geom_text_repel(colour = if_else(unemployment_data$lgas %in% subset_lgas,"black","grey40"),
+                  segment.color = "grey70", point.padding = 0.1, 
+                  size = 3.5, direction = "both", force = 6, nudge_x = 10000, nudge_y = 0.006) + 
+  geom_hline(yintercept = 0.1, size = 0.5, colour = "#2166ac", linetype = "dashed") + 
+  geom_line(aes(x = lga_pop, y = lwr.ci), size = 0.5, colour = "#67a9cf", linetype = "dotted") + 
+  geom_line(aes(x = lga_pop, y = upr.ci), size = 0.5, colour = "#67a9cf", linetype = "dotted") + 
   scale_x_continuous(labels = scales::comma) + 
   scale_y_continuous(limits = c(0.04, 0.16), labels = scales::percent_format(accuracy = 1)) + 
   labs(x = "LGA resident population",
        y = "LGA unemployment rate (%)",
-       title = "Which LGA unemployment rates significantly differ from the national rate?") + 
+       title = "Which LGA unemployment rates significantly differ from the national rate?",
+       caption = "Labour Force, Australia, Detailed - released 24 September 2020") + 
   theme_bw() +
   theme(panel.grid.minor = element_blank(),
         panel.grid.major.x = element_blank(),
-        panel.grid.major.y = element_line(linetype = "dashed"))  
+        panel.grid.major.y = element_line(linetype = "dashed"),
+        plot.title = element_text(hjust = 0.5),
+        plot.caption = element_text(hjust = 0.5, colour = "grey35")) 
 ```
 
 ![](2020-09-12_binomial-distribution_files/figure-gfm/unnamed-chunk-13-1.png)<!-- -->

diff --git a/...bution/2020-09-12_binomial-distribution_files/figure-gfm/unnamed-chunk-13-1.png b/...bution/2020-09-12_binomial-distribution_files/figure-gfm/unnamed-chunk-13-1.png
diff --git a/...05_statistical-testing-for-proportions/2020-11-05_statistical-testing-for-proportions.Rmd b/...05_statistical-testing-for-proportions/2020-11-05_statistical-testing-for-proportions.Rmd
@@ -0,0 +1,60 @@
+---
+title: "Untitled"
+author: "Erika Duan"
+date: "21/10/2020"
+output: html_document
+---
+
+If you are using categorical data you can use the Kruskal-Wallis test (the non-parametric equivalent of the one-way ANOVA) to determine group differences. If the test shows there are differences between the 3 groups, you can use the Mann-Whitney test to do pairwise comparisons as a post hoc or follow-up analysis. Since you're only doing a few comparisons (i.e. X vs. Y, X vs. Z, and Y vs. Z) you wouldn't have to worry about family-wise error rates.  
+
+With SPSS, you can run a chi-square, and test for pairwise differences between the pairs of groups. Put your 3 groups in columns, and "ride the tube: yes/no" in the rows.  Use the raw numbers in the 6 cells.  Select the option to use Bonferroni corrections for the pairwise comparisons. Using sub-scripted (super-scripted?) letters, SPSS will tell you which of the pairwise differences are significant, while preserving the overall (experiment-wise) 0.05 significance level.   
+
+	Chi-square test
+
+	Compare three or more unmatched groups	One-way ANOVA	Kruskal-Wallis test	Chi-square test
+
+	https://www.graphpad.com/support/faqid/1790/
+
+	The chi-square test on the counts actually does check for homogeneity of proportions.  
+
+	http://www.sthda.com/english/wiki/two-proportions-z-test-in-r
+
+	https://stats.stackexchange.com/questions/70107/fishers-exact-test-in-r-2x4-table 
+
+	https://data.library.virginia.edu/pairwise-comparisons-of-proportions/    
+
+	https://uc-r.github.io/multivariate_inference  
+
+	https://stats.libretexts.org/Bookshelves/Introductory_Statistics/Book%3A_Introductory_Statistics_(Shafer_and_Zhang)/09%3A_Two-Sample_Problems/9.04%3A_Comparison_of_Two_Population_Proportions 
+
+
+
+	https://uc-r.github.io/multivariate_inference  
+
+
+```{r}
+# packages used regularly
+library(tidyverse)
+
+# full population data; the tibble below mirrors this printed summary (its pct column is named proportion here)
+# A tibble: 4 x 4
+## # Groups: Attrition [2]
+##   Attrition Gender     n   pct
+##   <fctr>    <fctr> <int> <dbl>
+## 1 No        Female   501 0.406
+## 2 No        Male     732 0.594
+## 3 Yes       Female    87 0.367
+## 4 Yes       Male     150 0.633  
+
+attrition_by_gender <- tibble(Attrition = c("No", "No", "Yes", "Yes"),
+                              Gender = c("Female", "Male", "Female", "Male"),
+                              n = c(501, 732, 87, 150),
+                              proportion = c(0.406, 0.594, 0.367, 0.633))
+```
+
+
+```{r}
+
+```
+
+
diff --git a/03_blog_posts/2020-12-30_regression-analysis/2020-12-30_regression-analysis.Rmd b/03_blog_posts/2020-12-30_regression-analysis/2020-12-30_regression-analysis.Rmd
@@ -0,0 +1,45 @@
+---
+title: "Introduction to regression analysis"
+author: "Erika Duan"
+date: "`r Sys.Date()`"
+output: 
+  github_document:  # format options must be indented beneath the format name
+    toc: true  
+    pandoc_args: --webtex 
+---
+
+```{r, echo = TRUE, message = FALSE, warning = FALSE}  
+#-----load required packages-----  
+if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")  # install pacman only when missing; no need to attach it
+pacman::p_load(here,  
+               tidyverse,
+               tidymodels,
+               glmnet,
+               patchwork) 
+```
+
+
+# Introduction 
+
+
+# Linear Regression  
+
+```{r}
+#-----fit a linear model of age on circumference (single predictor)-----  
+orange <- Orange %>% as_tibble()  
+
+lm_fit <- lm(formula = age ~ circumference, data = orange)
+summary(lm_fit)  
+```
+
+```{r}
+#-----pass the fitted model to tidy() and then glance()-----  
+lm_fit %>% tidy()
+
+lm_fit %>% glance()
+```
+
+
+# Resources   
+
++ https://www.tidymodels.org/learn/statistics/tidy-analysis/