-
Notifications
You must be signed in to change notification settings - Fork 0
/
class-2019-02-26.Rmd
98 lines (85 loc) · 3.57 KB
/
class-2019-02-26.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
---
title: "class-2019-02-26"
author: "Evelyn Cai"
date: "February 26, 2019"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = FALSE)
library(tidyverse)
library(dplyr)
library(readxl)
library(janitor)
library(devtools)
library(gt)
library(fs)
```
```{r enrollment decreases}
# Download the spring 2019 course enrollment data from the Harvard registrar
download.file("https://registrar.fas.harvard.edu/files/fas-registrar/files/class_enrollment_summary_by_term_2.28.19.xlsx",
destfile = "reg_2019.xlsx",
# Ensure compatibility with both OS and Windows
mode = "wb")
# Download the spring 2018 course enrollment data from the Harvard registrar
download.file("https://registrar.fas.harvard.edu/files/fas-registrar/files/class_enrollment_summary_by_term_03.06.18.xlsx",
destfile = "reg_2018.xlsx",
mode = "wb")
# Read in the .xlsx file for 2019 and skip the first three rows to prevent the first three lines of NA values from showing up in dataset
x_2019 <- read_xlsx("reg_2019.xlsx", skip = 3) %>%
# Use janitor library to clean column names
clean_names() %>%
# Filter out courses without names and select the relevant variables
filter(!is.na(course_name)) %>%
select(course_title, course_name, u_grad, course_id)
# Read in the .xlsx for 2018, skipping the first two rows of NA values
x_2018 <- read_xlsx("reg_2018.xlsx", skip = 2) %>%
clean_names() %>%
filter(!is.na(course_name)) %>%
select(course_title, course_name, u_grad, course_id)
# Delete both files that are stored locally
file_delete("reg_2019.xlsx")
file_delete("reg_2018.xlsx")
# Merge the two datasets by course ID, which stays constant, and append the year as the suffix to column names
all <- inner_join(x_2019, x_2018, by = "course_id", suffix = c(".2019", ".2018"))
# Create a gt graphic showing the courses with the greatest decreases in enrollment from spring 2018 to spring 2019
all %>%
# Select for appropriate columns
select(course_title.2019, course_name.2019, u_grad.2018, u_grad.2019) %>%
# Calculate the change in enrollment
mutate(Change = -(u_grad.2019 - u_grad.2018)) %>%
# Order from highest to lowest and take the top 10
arrange(desc(Change)) %>%
slice(1:10) %>%
# Negative Change to show the negative value
mutate(Change = -Change) %>%
# Call gt to create the table
gt() %>%
# Add a title and source note, and change the labels
tab_header(title = "Biggest Enrollment Decreases in Spring 2019") %>%
tab_source_note(
source_note = "Data from the Harvard Registrar") %>%
cols_label("course_title.2019" = "Number",
"course_name.2019" = "Name",
"u_grad.2019" = "2019",
"u_grad.2018" = "2018")
```
``` {r new classes}
# Find the classes present in the 2019 dataset that were not in the 2018 dataset, and merge into one dataset by course ID
new_classes <- anti_join(x_2019, x_2018, by = "course_id")
# Create a second gt graphic showing the new courses with the highest enrollment numbers
new_classes %>%
select(-course_id) %>%
# Arrange from highest to lowest enrollment and take the top 10 values
arrange(desc(u_grad)) %>%
slice(1:10) %>%
# Call gt to create the table
gt() %>%
# Add appropriate titles, source-note, and change the column labels
tab_header(title = "Biggest New Classes in Spring 2019") %>%
tab_source_note(
source_note = "Data from the Harvard Registrar") %>%
cols_label("course_title" = "Number",
"course_name" = "Name",
"u_grad" = "Enrollment")
```
Link to RPubs tables: http://rpubs.com/evelyncai/472664