-
Notifications
You must be signed in to change notification settings - Fork 20
/
Copy pathfetchFSI.R
48 lines (37 loc) · 1.51 KB
/
fetchFSI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# This script gets the Fund for Peace's Fragile States Index data files from the web and makes
# a country-year table from them
library(tidyverse)
library(rvest)
library(readxl)
library(lubridate)
options(stringsAsFactors = FALSE)
# scrape the urls for the excel files from the FFP download page
fsi_urls <- read_html("http://fundforpeace.org/fsi/excel/") %>%
# found this by inspecting the html in Chrome
html_nodes("td") %>%
# grab the urls from those nodes, which are basically long strings; I tried http_attr("html")
# and a couple of variations, but it didn't work
lapply(., function(x) str_extract(as.character(x), "http://fundforpeace.org/.{1,}.xlsx")) %>%
# each url is repeated, so get rid of the duplicates
unique(.) %>%
# get rid of the remaining NA
.[!is.na(.)] %>%
# one of the duplicates was a jackelope for some reason, so we need this to get rid of it
.[!grepl("a href", .)]
# download the excel files from those urls to tempfiles and read them into R as data tables
fsi_list <- lapply(fsi_urls, function(x) {
tmp <- tempfile()
download.file(x, tmp)
df <- read_excel(tmp)
})
# collapse that list of annual data tables into one big data frame
fsi_db <- bind_rows(fsi_list)
# clean up the column names
names(fsi_db) <- names(fsi_db) %>%
gsub("[A-Z]{1}[0-9]{1}: ", "", .) %>%
tolower(.) %>%
make.names(.)
# convert the date field to a simple year and drop the messy rank column
fsi_db <- fsi_db %>%
mutate(year = lubridate::year(year)) %>%
select(-rank)