Update storms data to 2020 (tidyverse#6000)
* Update storms data to 2020

* Try to make the storms example pass the check

* Add call to library(ggplot2) in example

* Clarified storm diameter variables

* NEWS

Co-authored-by: Romain Francois <[email protected]>
steveharoz and romainfrancois authored Sep 17, 2021
1 parent a6edc4c commit 06e2fdd
Showing 6 changed files with 87 additions and 46 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -50,6 +50,7 @@ Suggests:
covr,
DBI,
dbplyr (>= 1.4.3),
ggplot2,
knitr,
Lahman,
lobstr,
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# dplyr (development version)

* `storms` data updated to 2020 (@steveharoz, #5899).

* `coalesce()` accepts 1-D arrays (#5557).

* `filter()` forbids matrix results (#5973) and warns about data frame
19 changes: 15 additions & 4 deletions R/data-storms.R
@@ -2,12 +2,12 @@
#'
#' This data is a subset of the NOAA Atlantic hurricane database best track
#' data, \url{https://www.nhc.noaa.gov/data/#hurdat}. The data includes the
#' positions and attributes of 198 tropical storms, measured every six hours
#' positions and attributes of storms from 1975-2020, measured every six hours
#' during the lifetime of a storm.
#'
#' @seealso The script to create the storms data set: \url{https://github.com/tidyverse/dplyr/blob/master/data-raw/storms.R}
#'
#' @format A tibble with 10,010 observations and 13 variables:
#' @format A tibble with 11,859 observations and 13 variables:
#' \describe{
#' \item{name}{Storm Name}
#' \item{year,month,day}{Date of report}
@@ -19,9 +19,20 @@
#' -1 = Tropical Depression, 0 = Tropical Storm)}
#' \item{wind}{storm's maximum sustained wind speed (in knots)}
#' \item{pressure}{Air pressure at the storm's center (in millibars)}
#' \item{ts_diameter}{Diameter of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hu_diameter}{Diameter of the area experiencing hurricane strength winds (64 knots or above)}
#' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above)}
#' }
#' @examples
#'
#' # show a plot of the storm paths
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#' library(ggplot2)
#' ggplot(storms) +
#' aes(x=long, y=lat, color=paste(year, name)) +
#' geom_path() +
#' guides(color='none') +
#' facet_wrap(~year)
#' }
#'
#' storms
"storms"
92 changes: 54 additions & 38 deletions data-raw/storms.R
@@ -4,11 +4,13 @@ library(tidyverse)
# in an unorthodox format: a csv that alternates between header/identifier rows
# and data rows.

# TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code

# Read in data set so each line is a character string
storm_strings <- read_lines("http://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2015-070616.txt")
storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2020-052921.txt")
storm_strings <- read_lines(storm_file_complete)

# Identify the header lines that have three commas
library(stringr)
header_locations <- (1:length(storm_strings))[str_count(storm_strings, "\\,") == 3]

# Extract length of each sub-dataset
@@ -20,47 +22,60 @@ headers_df <- headers %>%
mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
select(name, skip, n_obs)

# Read in the sub-datasets as data frames
df_names <- c(
"date", "time", "record_type", "status", "lat", "long", "wind", "pressure",
"extent_34_NE", "extent_34_SE", "extent_34_SW", "extent_34_NW",
"extent_50_NE", "extent_50_SE", "extent_50_SW", "extent_50_NW",
"extent_64_NE", "extent_64_SE", "extent_64_SW", "extent_64_NW", "nas"
column_types <- list(
date = col_character(),
time = col_character(),
record_type = col_character(),
status = col_character(),
lat = col_character(),
long = col_character(),
wind = col_integer(),
pressure = col_integer(),
extent_34_NE = col_integer(),
extent_34_SE = col_integer(),
extent_34_SW = col_integer(),
extent_34_NW = col_integer(),
extent_50_NE = col_integer(),
extent_50_SE = col_integer(),
extent_50_SW = col_integer(),
extent_50_NW = col_integer(),
extent_64_NE = col_integer(),
extent_64_SE = col_integer(),
extent_64_SW = col_integer(),
extent_64_NW = col_integer(),
nas = col_integer()
)
column_names <- names(column_types)

storm_dfs <- vector("list", nrow(headers_df))
names(storm_dfs) <- headers_df$name

for (i in seq_along(headers_df$name)) {
storm_dfs[[i]] <- read_csv("data-raw/hurdat2.txt",
skip = headers_df$skip[i],
n_max = headers_df$n_obs[i],
col_names = df_names,
na = c("", "-99", "-999"),
col_types = list(
time = col_character(),
pressure = col_integer(),
extent_34_NE = col_integer(),
extent_34_SE = col_integer(),
extent_34_SW = col_integer(),
extent_34_NW = col_integer(),
extent_50_NE = col_integer(),
extent_50_SE = col_integer(),
extent_50_SW = col_integer(),
extent_50_NW = col_integer(),
extent_64_NE = col_integer(),
extent_64_SE = col_integer(),
extent_64_SW = col_integer(),
extent_64_NW = col_integer()
)
#### Parse each storm as its own sub-dataframe
storm_dataframes <- vector("list", nrow(headers_df))
for (i in 1:nrow(headers_df)) {
# get this storm's metadata
row_start = headers_df[i,]$skip + 1
row_end = headers_df[i,]$n_obs + row_start - 1
# subset of rows belonging to this storm
data_subset = storm_strings[row_start:row_end] %>%
paste(collapse = "\n") %>%
paste0("\n")
data_subset = read_csv(
data_subset,
col_names = column_names,
col_types = column_types,
na = c("", "-99", "-999")
)
# name at the front
data_subset$name = headers_df[i,]$name
data_subset = data_subset %>% relocate(name)
# add to list of storms
storm_dataframes[[i]] = data_subset
}

# Combine and clean the data sets
library(lubridate)

storms <- storm_dfs %>%
bind_rows(.id = "name") %>%
storms <- storm_dataframes %>%
bind_rows() %>%
mutate(
date = ymd(date),
year = year(date),
@@ -81,13 +96,13 @@ storms <- storm_dfs %>%
# wind = wind * 1.15078, # transforms knots to mph,
TSradius1 = extent_34_NE + extent_34_SW,
TSradius2 = extent_34_NW + extent_34_SE,
ts_diameter = pmax(TSradius1, TSradius2) * 1.15078, # to convert from nautical miles to miles
tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
HUradius1 = extent_64_NE + extent_64_SW,
HUradius2 = extent_64_NW + extent_64_SE,
hu_diameter = pmax(HUradius1, HUradius2) * 1.15078, # to convert from nautical miles to miles
hurricane_force_diameter = pmax(HUradius1, HUradius2),
status = recode(status, "HU" = "hurricane", "TS" = "tropical storm", "TD" = "tropical depression")
) %>%
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, ts_diameter, hu_diameter)
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)
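# --- Illustrative aside, not part of the original script -------------------
# A quick check of the diameter logic above with made-up quadrant extents
# (nautical miles): each extent_34_* value is the radius of 34-knot winds in
# one quadrant, so opposite quadrants sum to a diameter along that axis, and
# pmax() keeps the wider of the two axes.
demo_NE <- 120; demo_SW <- 60   # NE-SW axis: 120 + 60 = 180
demo_NW <- 90;  demo_SE <- 100  # NW-SE axis:  90 + 100 = 190
pmax(demo_NE + demo_SW, demo_NW + demo_SE)  # 190 -> tropicalstorm_force_diameter
# ---------------------------------------------------------------------------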

# Narrow to storms that have complete pressure record
completeish <- storms %>%
@@ -103,4 +118,5 @@ storms <- storms %>%
) %>%
mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))

devtools::use_data(storms)
# output for the package
usethis::use_data(storms, overwrite = TRUE)
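
For readers unfamiliar with HURDAT2's layout, a small self-contained sketch of the parsing idea the script relies on (the fragment below is fabricated and heavily truncated, so the field values are illustrative only): header rows carry exactly three commas, while data rows carry many more, which is how the script finds where each storm's sub-dataset begins.

library(tidyverse)

# Fabricated HURDAT2-style fragment: one header row per storm, followed by
# that storm's observation rows. In the real file, the third header field is
# the number of rows that belong to the storm.
toy <- c(
  "AL011975, STORM_A, 2,",
  "19750601, 0000, , TS, 28.0N, 79.0W, 40, 1005",
  "19750601, 0600, , TS, 28.5N, 79.5W, 45, 1002",
  "AL021975, STORM_B, 1,",
  "19750615, 1200, , HU, 25.0N, 60.0W, 70, 980"
)

# Header rows are the ones with exactly three commas.
which(str_count(toy, ",") == 3)
#> [1] 1 4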
Binary file modified data/storms.rda
Binary file not shown.
19 changes: 15 additions & 4 deletions man/storms.Rd

Some generated files are not rendered by default.
