forked from tidyverse/ggplot2
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtx-housing.R
64 lines (56 loc) · 1.92 KB
/
tx-housing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
library(rvest)
library(tidyr)
library(readr)
library(dplyr)
# Find list of all pages -------------------------------------------------------
# Fetch the index page and build `pages`: one absolute URL per link found
# under ".threecol", named by the link's visible text.
root <- read_html("http://recenter.tamu.edu/Data/hs/")
links <- html_nodes(root, ".threecol a")
pages <- links %>%
  html_attr("href") %>%
  # hrefs may be relative, so resolve them against the index page's own URL.
  # xml_url() is namespaced because rvest >= 1.0.0 no longer attaches xml2.
  url_absolute(xml2::xml_url(root)) %>%
  setNames(html_text(links))
# Extract table from each page -------------------------------------------------
# Coerce every column of `df` to character.
#
# df: a data frame (any column types).
# Returns `df` with the same columns, names and row count, each column
# converted with as.character().
to_char <- function(df) {
  stringified <- lapply(df, function(column) as.character(column))
  # Assigning into df[] swaps the column contents but keeps the data frame
  # container (class, names) intact.
  df[] <- stringified
  df
}
# Download one page and return the contents of its ".dataTable" element as a
# table. Takes a page URL; written as a magrittr functional sequence so it can
# be handed straight to lapply().
# Uses read_html(), matching the index-page fetch; html() is deprecated
# in rvest.
tamu_table <- . %>%
  read_html() %>%
  html_node(".dataTable") %>%
  html_table()
# Scrape every page, then stack the per-page tables into one all-character
# tibble with a `city` column taken from the names of `pages`.
tables <- lapply(pages, tamu_table)

# The first row of each table is discarded before coercing to character.
# NOTE(review): presumably a non-data row (e.g. a sub-header) - confirm
# against the live pages.
# drop = FALSE keeps a data frame even if a table has only one column.
cleaned <- lapply(tables, function(tbl) to_char(tbl[-1, , drop = FALSE]))

data <- Map(
  function(df, city) {
    df$city <- city
    df
  },
  cleaned, names(cleaned)
) %>%
  bind_rows() %>%
  # as_tibble() replaces the deprecated as_data_frame()
  as_tibble()

# Cells holding exactly "-" become NA. Done column by column so the
# replacement does not depend on logical-matrix indexing that contains NAs.
data[] <- lapply(data, function(column) {
  column[which(column == "-")] <- NA
  column
})
# Build the final `txhousing` table from the scraped character data:
#   1. parse the numeric columns,
#   2. split `Date` into a year and a month name,
#   3. fill in missing years and turn month names into 1-12,
#   4. rename columns, add a fractional `date`, and drop unwanted rows/columns.
txhousing <- data %>%
  mutate(
    # parse_number() supersedes readr's deprecated parse_numeric()
    Sales = parse_number(Sales),
    DollarVolume = parse_number(DollarVolume),
    AveragePrice = parse_number(AveragePrice),
    MedianPrice = parse_number(MedianPrice),
    TotalListings = parse_number(TotalListings),
    MonthsInventory = parse_number(MonthsInventory)
  ) %>%
  # Optional leading digits are the year, trailing letters the month name.
  # convert = TRUE type-converts the captures, so an empty year becomes NA.
  tidyr::extract(
    Date, c("Year", "Month"), "(\\d*)-?([a-zA-Z]*)",
    convert = TRUE
  ) %>%
  mutate(
    # Rows without a year inherit the closest year above them.
    # na.rm = FALSE keeps leading NAs, so the result always has one value per
    # row (the default would drop them and break mutate()).
    Year = zoo::na.locf(Year, na.rm = FALSE),
    # month.abb is base R's c("Jan", ..., "Dec")
    Month = match(Month, month.abb)
  ) %>%
  select(
    city, year = Year, month = Month, sales = Sales,
    volume = DollarVolume, average = AveragePrice, median = MedianPrice,
    listings = TotalListings, inventory = MonthsInventory
  ) %>%
  # Fractional year: January is year + 0, December is year + 11/12
  mutate(date = year + (month - 1) / 12) %>%
  # Don't need totals & Palestine is v. low quality
  filter(!(city %in% c("Texas Totals", "Palestine"))) %>%
  # Reduce file size
  filter(year >= 2000) %>%
  select(-average)
# Persist the result twice: as an unquoted CSV without row names under
# data-raw/, and through use_data(), overwriting any existing copy.
# NOTE(review): newer devtools releases may no longer export use_data()
# (it appears to have moved to usethis) - confirm before re-running.
write.csv(
  txhousing,
  file = "data-raw/tx-housing.csv",
  quote = FALSE,
  row.names = FALSE
)
devtools::use_data(txhousing, overwrite = TRUE)