Adds storms data set with documentation. (tidyverse#2431)

* Adds storms data set with documentation. * Adds band, instrument, and instrument2 data sets, which are toy data sets for demonstrating joins, along with documentation. * Hunted down non-ASCII character in storms.Rd (a dash) and fixed in documentation. * Update documentation style of storms and band_ datasets * Rename band, instruments, and instruments2 to prefixed band_members, band_instruments, and band_instruments2 * Adds code used to generate data to /data-raw * Collapses code for generating band_ data sets into a single file * Adds devtools::use_data() commands to data-raw files * Adds missing #' to data-storms.R * Reparses documentation to use @rdname, adds description of difference between band_instruments and band_instruments2 Fixes tidyverse#2094
machow · Feb 16, 2017 · 4cbb308 · 4cbb308
1 parent ec64c5a
commit 4cbb308
Show file tree

Hide file tree

Showing 11 changed files with 242 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -56,10 +56,12 @@ Collate:
     'colwise.R'
     'compute-collect.r'
     'copy-to.r'
+    'data-bands.R'
     'data-lahman.r'
     'data-nasa.r'
     'data-nycflights13.r'
     'data-starwars.R'
+    'data-storms.R'
     'data-temp.r'
     'data.r'
     'dataframe.R'

diff --git a/R/data-bands.R b/R/data-bands.R
@@ -0,0 +1,23 @@
+#' Band membership
+#'
+#' These data sets describe band members of the Beatles and Rolling Stones. They
+#' are toy data sets that can be displayed in their entirety on a slide (e.g. to
+#' demonstrate a join).
+#'
+#' `band_instruments` and `band_instruments2` contain the same data but name
+#' their first column differently: `band_instruments` uses `name`, matching the
+#' key column of `band_members`, whereas `band_instruments2` uses `artist`.
+#'
+#' @format Each is a tibble with two variables and three observations. The
+#'   second variable is `band` in `band_members` and `plays` in the other two.
+#' @examples
+#' band_members
+#' band_instruments
+#' band_instruments2
+"band_members"
+
+#' @rdname band_members
+"band_instruments"
+
+#' @rdname band_members
+"band_instruments2"
diff --git a/R/data-storms.R b/R/data-storms.R
@@ -0,0 +1,25 @@
+#' Storm tracks data
+#'
+#' This data is a subset of the NOAA Atlantic hurricane database best track
+#' data, \url{http://www.nhc.noaa.gov/data/#hurdat}. The data includes the
+#' positions and attributes of 198 tropical storms, measured every six hours
+#' during the lifetime of a storm.
+#'
+#' @format A tibble with 10,010 observations and 13 variables:
+#' \describe{
+#' \item{name}{Storm name}
+#' \item{year,month,day}{Date of report}
+#' \item{hour}{Hour of report (in UTC)}
+#' \item{lat,long}{Location of storm center (south and west are negative)}
+#' \item{status}{Storm classification (Tropical Depression, Tropical Storm,
+#'   or Hurricane)}
+#' \item{category}{Saffir-Simpson storm category (estimated from wind speed;
+#' -1 = Tropical Depression, 0 = Tropical Storm)}
+#' \item{wind}{Storm's maximum sustained wind speed (in knots)}
+#' \item{pressure}{Air pressure at the storm's center (in millibars)}
+#' \item{ts_diameter}{Diameter (in miles) of the area experiencing tropical storm strength winds (34 knots or above)}
+#' \item{hu_diameter}{Diameter (in miles) of the area experiencing hurricane strength winds (64 knots or above)}
+#' }
+#' @examples
+#' storms
+"storms"
diff --git a/data-raw/band_members.R b/data-raw/band_members.R
@@ -0,0 +1,28 @@
+library(tidyverse)
+
+# Toy tables for demonstrating joins. "Mick" appears only in band_members and
+# "Keith" only in the instrument tables; "John" and "Paul" appear in both.
+band_members <- tibble(
+  name = c("Mick", "John", "Paul"),
+  band = c("Stones", "Beatles", "Beatles")
+)
+
+band_instruments <- tibble(
+  name = c("John", "Paul", "Keith"),
+  plays = c("guitar", "bass", "guitar")
+)
+
+# Same rows as band_instruments, but the first column is called `artist`, so
+# it shares no column name with band_members.
+band_instruments2 <- rename(band_instruments, artist = name)
+
+# Save each table as its own file under data/
+devtools::use_data(band_members)
+devtools::use_data(band_instruments)
+devtools::use_data(band_instruments2)
diff --git a/data-raw/storms.R b/data-raw/storms.R
@@ -0,0 +1,97 @@
+library(tidyverse)
+library(stringr)
+library(lubridate)
+
+# Creates storms data set from NOAA Atlantic Hurricane data, which is provided
+# in an unorthodox format: a csv that alternates between header/identifier rows
+# and data rows.
+
+hurdat_url <- "http://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2015-070616.txt"
+hurdat_path <- "data-raw/hurdat2.txt"
+
+# Keep a local copy of the raw file: the per-storm read_csv() calls below index
+# into it by line number, so it has to exist on disk before they run.
+if (!file.exists(hurdat_path)) {
+  download.file(hurdat_url, hurdat_path, mode = "wb")
+}
+
+# Read in data set so each line is a character string
+storm_strings <- read_lines(hurdat_path)
+
+# Identify the header lines that have three commas
+header_locations <- which(str_count(storm_strings, "\\,") == 3)
+
+# Parse each header line into the storm's id, name and number of data rows.
+# `skip` is the header's own line number, i.e. how many lines must be skipped
+# to land on the storm's first data row.
+headers <- as.list(storm_strings[header_locations])
+headers_df <- headers %>%
+  map(str_sub, start = 1, end = -2) %>% # to remove trailing comma
+  map(paste0, "\n") %>%                 # to trigger literal read
+  map_df(read_csv, col_names = c("id", "name", "n_obs")) %>%
+  # storms recorded as "UNNAMED" are labelled with their id instead
+  mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
+  select(name, skip, n_obs)
+
+# Read in the sub-datasets as data frames
+df_names <- c("date", "time", "record_type", "status", "lat", "long", "wind", "pressure",
+              "extent_34_NE",  "extent_34_SE",  "extent_34_SW",  "extent_34_NW",
+              "extent_50_NE",  "extent_50_SE",  "extent_50_SW",  "extent_50_NW",
+              "extent_64_NE",  "extent_64_SE",  "extent_64_SW",  "extent_64_NW", "nas")
+
+storm_dfs <- vector("list", nrow(headers_df))
+names(storm_dfs) <- headers_df$name
+
+# One read per storm: skip to just past its header line and take n_obs rows.
+# "-99" and "-999" are read as missing values.
+for (i in seq_along(headers_df$name)) {
+  storm_dfs[[i]] <- read_csv(hurdat_path,
+                             skip = headers_df$skip[i],
+                             n_max = headers_df$n_obs[i],
+                             col_names = df_names,
+                             na = c("", "-99", "-999"),
+                             col_types = list(time = col_character(),
+                                              pressure = col_integer(),
+                                              extent_34_NE = col_integer(),
+                                              extent_34_SE = col_integer(),
+                                              extent_34_SW = col_integer(),
+                                              extent_34_NW = col_integer(),
+                                              extent_50_NE = col_integer(),
+                                              extent_50_SE = col_integer(),
+                                              extent_50_SW = col_integer(),
+                                              extent_50_NW = col_integer(),
+                                              extent_64_NE = col_integer(),
+                                              extent_64_SE = col_integer(),
+                                              extent_64_SW = col_integer(),
+                                              extent_64_NW = col_integer()
+                                              ))
+}
+
+# Combine and clean the data sets
+storms <- storm_dfs %>%
+  bind_rows(.id = "name") %>%
+  mutate(date = ymd(date),
+         year = year(date),
+         month = month(date),
+         day = day(date),
+         hour = as.numeric(str_sub(time, 1, 2)),
+         # lat/long arrive as text ending in a hemisphere letter; convert them
+         # to signed numbers (N and E positive, anything else negative)
+         lat_hemisphere = str_sub(lat, -1),
+         lat_sign = if_else(lat_hemisphere == "N", 1, -1),
+         lat = as.numeric(str_sub(lat, 1, -2)) * lat_sign,
+         long_hemisphere = str_sub(long, -1),
+         long_sign = if_else(long_hemisphere == "E", 1, -1),
+         long = as.numeric(str_sub(long, 1, -2)) * long_sign,
+         # The breaks are the lowest wind speed (in knots) of each class, so
+         # the intervals are closed on the left: [0, 34), [34, 64), ...,
+         # [137, 500].
+         category = cut(wind, breaks = c(0, 34, 64, 83, 96, 113, 137, 500),
+                        labels = c(-1, 0, 1, 2, 3, 4, 5),
+                        include.lowest = TRUE, right = FALSE,
+                        ordered_result = TRUE),
+         # wind is left in knots (multiply by 1.15078 for mph)
+         # Diameter along each diagonal is the sum of the two opposite quadrant
+         # extents; the larger diagonal is kept.
+         TSradius1 = extent_34_NE + extent_34_SW,
+         TSradius2 = extent_34_NW + extent_34_SE,
+         ts_diameter = pmax(TSradius1, TSradius2) * 1.15078, # to convert from nautical miles to miles
+         HUradius1 = extent_64_NE + extent_64_SW,
+         HUradius2 = extent_64_NW + extent_64_SE,
+         hu_diameter = pmax(HUradius1, HUradius2) * 1.15078, # to convert from nautical miles to miles
+         status = recode(status, "HU" = "hurricane", "TS" = "tropical storm", "TD" = "tropical depression")) %>%
+  select(name, year, month, day, hour, lat, long, status, category, wind, pressure, ts_diameter, hu_diameter)
+
+# Narrow to storms that have complete pressure record
+completeish <- storms %>%
+  group_by(name) %>%
+  summarise(p_pressure = mean(!is.na(pressure))) %>%
+  filter(p_pressure == 1) %>%
+  .[["name"]]
+
+# Keep only the three recoded statuses, and title-case real storm names while
+# leaving id-style names (those starting with "AL0" or "AL1") untouched.
+storms <- storms %>%
+  filter(status %in% c("hurricane", "tropical storm", "tropical depression"),
+         name %in% completeish) %>%
+  mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))
+
+devtools::use_data(storms)
+
diff --git a/data/band_instruments.rda b/data/band_instruments.rda
diff --git a/data/band_instruments2.rda b/data/band_instruments2.rda
diff --git a/data/band_members.rda b/data/band_members.rda
diff --git a/data/storms.rda b/data/storms.rda
diff --git a/man/band_members.Rd b/man/band_members.Rd
diff --git a/man/storms.Rd b/man/storms.Rd