GitHub - Ryo-N7/tidy_tuesday: repo for all tidy_tuesday code!

Table of Contents	(#TidyTuesday)
Week 1:	College tuition data
Week 2:	NFL salaries by position
Week 3:	Causes of death across the world
Week 4:	Salary differences by gender in Australia
Week 5:	American Community Survey
Week 6:	Coffee chains across the USA and the World

April 3

College tuition data

library(xlsx)
library(dplyr)
library(stringr)
library(tidyr)
library(ggplot2)
library(scales)

load data, check data

usa_avg_tuition <- read.xlsx(file = "april_3_week_1/us_avg_tuition.xlsx", sheetName = "Table 5")

glimpse(usa_avg_tuition)

## Observations: 50
## Variables: 13
## $ State       <fct> Alabama, Alaska, Arizona, Arkansas, California, Co...
## $ X2004.05    <dbl> 5682.838, 4328.281, 5138.495, 5772.302, 5285.921, ...
## $ X2005.06    <dbl> 5840.550, 4632.623, 5415.516, 6082.379, 5527.881, ...
## $ X2006.07    <dbl> 5753.496, 4918.501, 5481.419, 6231.977, 5334.826, ...
## $ X..2007.08. <dbl> 6008.169, 5069.822, 5681.638, 6414.900, 5672.472, ...
## $ X2008.09    <dbl> 6475.092, 5075.482, 6058.464, 6416.503, 5897.888, ...
## $ X2009.10    <dbl> 7188.954, 5454.607, 7263.204, 6627.092, 7258.771, ...
## $ X2010.11    <dbl> 8071.134, 5759.153, 8839.605, 6900.912, 8193.739, ...
## $ X2011.12    <dbl> 8451.902, 5762.421, 9966.716, 7028.991, 9436.426, ...
## $ X2012.13    <dbl> 9098.069, 6026.143, 10133.503, 7286.580, 9360.574,...
## $ X2013.14    <dbl> 9358.929, 6012.445, 10296.200, 7408.495, 9274.193,...
## $ X2014.15    <dbl> 9496.084, 6148.808, 10413.844, 7606.410, 9186.824,...
## $ X2015.16    <dbl> 9751.101, 6571.340, 10646.278, 7867.297, 9269.844,...

colnames(usa_avg_tuition)

##  [1] "State"       "X2004.05"    "X2005.06"    "X2006.07"    "X..2007.08."
##  [6] "X2008.09"    "X2009.10"    "X2010.11"    "X2011.12"    "X2012.13"   
## [11] "X2013.14"    "X2014.15"    "X2015.16"

deal with annoying column names ==> regex time!

colnames(usa_avg_tuition) <- colnames(usa_avg_tuition) %>% 
  str_replace_all("X", "") %>% 
  str_replace("\\.", "-20")

# manually change 2007-2008 ...

usa_avg_tuition <- usa_avg_tuition %>% rename(`2007-2008` = `-20.2007.08.`)

gather to correct format + create "rank" column

usa_avg_tuition <- usa_avg_tuition %>% gather(key = "year", value = "tuition", -State)

usa_avg_tuition <- usa_avg_tuition %>% 
  arrange(year, desc(tuition)) %>% 
  group_by(year) %>% 
  mutate(rank = dense_rank(desc(tuition)))


usa_avg_tuition <- usa_avg_tuition %>% 
  mutate(state = as.character(State))

`pull()` out the top 10 states for start year and end year ==> assign color to them.

top_states <- usa_avg_tuition %>% 
  filter(year %in% c("2004-2005", "2015-2016") & rank %in% c(1:10)) %>% 
  pull(state) %>% 
  unique()

top_states

##  [1] "Vermont"        "Pennsylvania"   "Ohio"           "New Hampshire" 
##  [5] "New Jersey"     "Massachusetts"  "Maryland"       "Delaware"      
##  [9] "South Carolina" "Illinois"       "Michigan"       "Virginia"

usa_avg_tuition <- usa_avg_tuition %>% 
  mutate(top_tuition = state %in% top_states,
         tuition = round(tuition, digits = 2))

colors = c(
  Vermont = "#EE2C2C",          # red
  Pennsylvania = "lightgreen",        # dark blue
  Ohio = "#00441b",          # green
  `New Hampshire` = "#4a1486",        # purple
  `New Jersey` = "#636363",        # dark grey
  Massachusetts = "#fd8d3c",    # orange
  Maryland = "#000000",          # black
  Delaware = "blue",
  `South Carolina` = "brown",
  Illinois = "pink",
  Michigan = "yellow",
  Virginia = "violet"
)

bump chart theme

library(extrafont)

theme_tuition <-  
  theme(text = element_text(family = "Arial Narrow", color = "#444444", face = "bold"),
        plot.title = element_text(size = 24, hjust = 0.5),
        plot.subtitle = element_text(size = 12, hjust = 0.5),
        axis.title = element_text(size = 14),
        axis.title.y = element_text(angle = 0, vjust = 0.5, margin = margin(r = 15)),
        axis.text = element_text(size = 12),
        axis.text.x = element_text(angle = 35, vjust = 1.3, hjust = 1.1,
                                   margin = margin(t = 15, r = 0, b = 0, l = 0)),
        axis.title.x = element_text(margin = margin(t = 8)),
        panel.grid.minor.y = element_blank(),
        legend.title = element_blank(),
        legend.position = "none")

the final bump chart

usa_avg_tuition %>% 
  filter(top_tuition == TRUE) %>% 
  ggplot(aes(year, rank, group = state)) +
  geom_line(aes(color = state), size = 1.6, alpha = 0.75) +
  geom_point(aes(color = state), size = 3, alpha = 1) +
  scale_y_reverse(breaks = 1:10) +         # show only top 10!
  geom_text(data = usa_avg_tuition %>% filter(year == "2004-2005"),
            aes(label = state, x = -0.3), 
            fontface = "bold", color = "black", size = 3.5) +
  geom_label(data = usa_avg_tuition %>% filter(year == "2004-2005"),
            aes(label = paste("$", tuition)), nudge_x = -1.3, nudge_y = -0.4, 
            fontface = "bold", color = "black", size = 2.5) +
  geom_text(data = usa_avg_tuition %>% filter(year == "2015-2016"),
            aes(label = state, x = 13.25), 
            fontface = "bold", color = "black", size = 3.5) +
  geom_label(data = usa_avg_tuition %>% filter(year == "2015-2016"),
             aes(label = paste("$", tuition)), nudge_x = 1.3, nudge_y = -0.4, 
             fontface = "bold", color = "black", size = 2.5) +
  coord_cartesian(ylim = c(1, 10.3), xlim = c(-0.9, 14)) +
  theme_tuition +
  scale_color_manual(values = colors) +
  labs(x = "Year", 
       y = "Rank\n&\nTuition",
       title = "Average College Tuition Rankings in the United States",
       subtitle = "By State, 2004-2005 to 2015-2016",
       caption = "By: Ryo Nakagawara (@R_by_Ryo) \n Source: https://onlinembapage.com/average-tuition-and-educational-attainment-in-the-united-states/\n#TidyTuesday")

# ALL states
library(viridis)

usa_avg_tuition %>% 
  ggplot(aes(year, rank, group = state)) +
  geom_line(aes(color = state), size = 1.6, alpha = 0.75) +
  geom_point(aes(color = state), size = 3, alpha = 1) +
  #scale_color_viridis(option = "A") +
  scale_y_reverse(breaks = 1:50) +         # show only top 10!
  geom_text(data = usa_avg_tuition %>% filter(year == "2004-2005"),
            aes(label = state, x = -0.3), 
            fontface = "bold", color = "black", size = 2.7) +
  geom_label(data = usa_avg_tuition %>% filter(year == "2004-2005"),
            aes(label = paste("$", tuition)), nudge_x = -1.3, nudge_y = -0.4, 
            fontface = "bold", color = "black", size = 1.5) +
  geom_text(data = usa_avg_tuition %>% filter(year == "2015-2016"),
            aes(label = state, x = 13.25), 
            fontface = "bold", color = "black", size = 2.7) +
  geom_label(data = usa_avg_tuition %>% filter(year == "2015-2016"),
             aes(label = paste("$", tuition)), nudge_x = 1.3, nudge_y = -0.4, 
             fontface = "bold", color = "black", size = 1.5) +
  coord_cartesian(ylim = c(1, 51), xlim = c(-0.9, 14)) +
  theme_tuition +
  labs(x = "Year", 
       y = "Rank\n&\nTuition",
       title = "Average College Tuition Rankings in the United States",
       subtitle = "By State, 2004-2005 to 2015-2016",
       caption = "By: Ryo Nakagawara (@R_by_Ryo) \n Source: https://onlinembapage.com/average-tuition-and-educational-attainment-in-the-united-states/\n#TidyTuesday") +
  theme(axis.text.y = element_text(size = 6.5))

April 10

NFL salaries by position

(did not participate this week)

April 16

Causes of death across the world

load data

library(dplyr)
library(tidyr)
library(ggplot2)
library(xlsx)

## Load file
df <- read.csv(file = "../tidy_tuesday/april_16_week_3/df.csv")

## Easily clean names
df <- df %>% janitor::clean_names()

## Gather cause of deaths under single column 
df_gather <- df %>% 
  gather(key = "cause_of_death", value = "percentages", -country, -country_code, -year) 

## Use stringr functions to make cleaned names work on plot, leave HIV/AIDS for now...
library(stringr)

df_gather <- df_gather %>% 
  mutate(cause_of_death = stringr::str_replace_all(cause_of_death, "_", " "),
         cause_of_death = stringr::str_to_title(cause_of_death),
         cause_of_death = case_when(
           cause_of_death == "Hiv Aids" ~ "HIV/AIDS",
           TRUE ~ cause_of_death
         ))

Grab the "region name" from UNlocations dataset for easy subsetting! Ex. Eastern Europe, Western Africa, etc.

library(wpp2015)

data("UNlocations")

UNlocations <- UNlocations %>% select(ends_with("name"))

df2 <- df_gather %>% 
  inner_join(UNlocations, by = c("country" = "name"))

## Only look at countries in the "Eastern Africa" region
east_afr <- df2 %>% 
  filter(reg_name == "Eastern Africa")

Unfortunately geom_boxplot() does not support "frame" for gganimate...

Therefore, create plot for each year with purrr then use ImageMagick or other to compile into gif/video!

library(purrr)

nested_east_afr <- east_afr %>% 
  mutate(percentages = percentages/100) %>%
  group_by(year) %>% 
  nest() %>% 
  mutate(plot = map2(data, year, 
                     ~ggplot(data = .x, aes(reorder(cause_of_death, percentages), percentages)) +
                       geom_boxplot(fill = "white", color = "darkred", outlier.color = "black") +
                       scale_y_continuous(breaks = scales::pretty_breaks(10), 
                                          limits = c(0, 1),
                                          labels = scales::percent,
                                          expand = c(0.02, 0)) +
                       coord_flip() +
                       labs(title = "Median Proportion of 'Cause of Death' in Countries of the Eastern Africa Region",
                            subtitle = paste0("Year: ", .y), # .y == YEAR!
                            y = "Proportion of All Deaths", x = "",
                            caption = "By: Ryo Nakagawara (@R_by_Ryo) \n Source: ourworldindata.org\n#TidyTuesday") +
                       theme_bw() +
                       theme(panel.grid.major.y = element_blank(),
                             text = element_text(family = "Arial Narrow"),
                             plot.title = element_text(size = 14, hjust = 0.5),
                             plot.subtitle = element_text(size = 12, hjust = 0.5)
                       )
  ))

glimpse(nested_east_afr)

#nested_east_afr$plot[5]
#nested_east_afr$plot[20]
#nested_east_afr$plot[13]
#nested_east_afr$plot[1]

Save each plot (per year) with ggsave!

map2(paste0("april_16_week_3/boxplots/", nested_east_afr$year, ".jpg"), nested_east_afr$plot, ggsave)

Use ImageMagick through Command Console:

Final GIF:

April 23

Salary differences by gender in Australia

April 30

American Community Survey

May 7

Coffee chains across the USA and the World

library(dplyr)
library(readxl)
library(sf)

# read-in data
starbucks_raw <- read_excel("../tidy_tuesday/may_7_week_6/week6_coffee_chains.xlsx", sheet = 1)

# clean, select cols, filter USA, summarize
starbucks_usa <- starbucks_raw %>% 
  janitor::clean_names() %>% 
  select(brand, city, state_province, country, longitude, latitude) %>% 
  filter(country == "US") %>% 
  group_by(state_province) %>% 
  summarize(count = n()) %>% 
  ungroup()

# grab geometries of USA from tigris pkg, turn into sf
states_sf <- tigris::states(cb = TRUE) %>% 
  st_as_sf() %>% 
  select(STUSPS, NAME, geometry) %>% 
  filter(!STUSPS %in% c("VI", "MP", "GU", "PR", "AS")) # filter out territories and Puerto Rico

# join with starbucks data
starbucks_sf <- states_sf %>% left_join(starbucks_usa, by = c("STUSPS" = "state_province"))

# remove Alaska and Hawaii... try to rescale and put them back in another time
starbucks_sf2 <- starbucks_sf %>% filter(!NAME %in% c("Alaska", "Hawaii"))

library(spData) # us_states has pop for each state

states_pop <- us_states %>% 
  select(GEOID, NAME, total_pop_15) %>% 
  filter(!NAME == "District of Columbia") %>% 
  st_set_geometry(NULL)

starbucks_population <- starbucks_sf_norm2 %>% 
  left_join(states_pop, by = "NAME")

starbucks_population <- starbucks_population %>% 
  mutate(pop_norm = (count / total_pop_15 * 100000) %>% ceiling()) # try per 100,000 people?

library(cartogram)
starb_cartogram <- st_transform(starbucks_population, crs = 2163)

starb_sp <- as(starb_cartogram, "Spatial")

# weigh size by state pop
starb_cartogram <- nc_cartogram(starb_sp, weight = "total_pop_15", k = 0.5) 

library(tmap)

# Sequential single hue color palette :: http://colorbrewer2.org/#type=sequential&scheme=Greens&n=5
greenpal <- c('#edf8e9','#bae4b3','#74c476','#31a354','#006d2c')

# legend.reverse for HIGH values on TOP, slight sepia to offset white glare?
# fiddle with margins to fit legend and small title
# plot!
starbucks_cartogram <- tm_shape(starb_cartogram) + 
  tm_borders("grey10") +
  tm_fill(title = "", "pop_norm", 
          palette = greenpal, 
          legend.reverse = TRUE) +
  tm_layout(inner.margins = c(.04,.02, .08, .02),
            main.title = "Number of Starbucks per 100,000 people",
            title = "(Source: https://www.kaggle.com/starbucks/store-locations)\nState size by state population",
            title.position = c("center", "top"), title.size = 0.7,
            fontfamily = "Garamond", fontface = "bold",
            legend.text.size = 0.85, 
            sepia.intensity = 0.1)

starbucks_cartogram

save_tmap(starbucks_cartogram, "starbucks_cartogram_pop_fix.png")

save_tmap(starbucks_cartogram, "starbucks_cartogram.png")

Name		Name	Last commit message	Last commit date
Latest commit History 32 Commits
2019_week_10		2019_week_10
2019_week_3		2019_week_3
2019_week_5		2019_week_5
2019_week_9		2019_week_9
week_11_june_12		week_11_june_12
week_12_june_19		week_12_june_19
week_18_july_30		week_18_july_30
week_19_august_6		week_19_august_6
week_1_april_3		week_1_april_3
week_21_august_20		week_21_august_20
week_24_september_10		week_24_september_10
week_31_october_30		week_31_october_30
week_34_november_20		week_34_november_20
week_3_april_16		week_3_april_16
week_4_april_23		week_4_april_23
week_5_april_30		week_5_april_30
week_6_may_7		week_6_may_7
week_7_may_14		week_7_may_14
week_9_may_28		week_9_may_28
.gitignore		.gitignore
README.md		README.md
Rplot.pdf		Rplot.pdf
Rplot01.png		Rplot01.png
WomensWorldCup.Rmd		WomensWorldCup.Rmd
anime.Rmd		anime.Rmd
blah.html		blah.html
emperor_RiseFall_plot.png		emperor_RiseFall_plot.png
emperor_RiseFall_plot2.png		emperor_RiseFall_plot2.png
emperors_upsetplot.png		emperors_upsetplot.png
gundam_gt.html		gundam_gt.html
gundam_gt.png		gundam_gt.png
plastic_waste.Rmd		plastic_waste.Rmd
ramen.Rmd		ramen.Rmd
ramen_ridges.png		ramen_ridges.png
readme_tidytuesday.rmd		readme_tidytuesday.rmd
roman_emperors.Rmd		roman_emperors.Rmd
roman_emperors_plot.png		roman_emperors_plot.png
starbucks_cartogram.png		starbucks_cartogram.png
student-teacher.Rmd		student-teacher.Rmd
tennis.Rmd		tennis.Rmd
test.png		test.png
test1.png		test1.png
test_plot.png		test_plot.png
tidy_anime.RDS		tidy_anime.RDS
tidy_tuesday.Rproj		tidy_tuesday.Rproj
wines.rmd		wines.rmd
wwc_goal_dist_plot.png		wwc_goal_dist_plot.png

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Repository files navigation

April 3

College tuition data

load data, check data

deal with annoying column names ==> regex time!

gather to correct format + create "rank" column

`pull()` out the top 10 states for start year and end year ==> assign color to them.

bump chart theme

the final bump chart

April 10

NFL salaries by position

April 16

Causes of death across the world

load data

Grab the "region name" from UNlocations dataset for easy subsetting! Ex. Eastern Europe, Western Africa, etc.

Unfortunately geom_boxplot() does not support "frame" for gganimate...

Therefore, create plot for each year with purrr then use ImageMagick or other to compile into gif/video!

Save each plot (per year) with ggsave!

Use ImageMagick through Command Console:

Final GIF:

April 23

Salary differences by gender in Australia

April 30

American Community Survey

May 7

Coffee chains across the USA and the World

About

Releases

Packages

Languages

Ryo-N7/tidy_tuesday

Folders and files

Latest commit

History

Repository files navigation

April 3

College tuition data

load data, check data

deal with annoying column names ==> regex time!

gather to correct format + create "rank" column

pull() out the top 10 states for start year and end year ==> assign color to them.

bump chart theme

the final bump chart

April 10

NFL salaries by position

April 16

Causes of death across the world

load data

Grab the "region name" from UNlocations dataset for easy subsetting! Ex. Eastern Europe, Western Africa, etc.

Unfortunately geom_boxplot() does not support "frame" for gganimate...

Therefore, create plot for each year with purrr then use ImageMagick or other to compile into gif/video!

Save each plot (per year) with ggsave!

Use ImageMagick through Command Console:

Final GIF:

April 23

Salary differences by gender in Australia

April 30

American Community Survey

May 7

Coffee chains across the USA and the World

About

Resources

Stars

Watchers

Forks

Releases

Packages 0

Languages

`pull()` out the top 10 states for start year and end year ==> assign color to them.

Packages