Update storms data to 2020 (tidyverse#6000)
* Update storms data to 2020

* Try to make the storms example pass the check

* Add call to library(ggplot2) in example

* Clarified storm diameter variables

* NEWS

Co-authored-by: Romain Francois <[email protected]>
steveharoz and romainfrancois authored Sep 17, 2021
1 parent a6edc4c commit 06e2fdd
Showing 6 changed files with 87 additions and 46 deletions.
1 change: 1 addition & 0 deletions DESCRIPTION
@@ -50,6 +50,7 @@ Suggests:
covr,
DBI,
dbplyr (>= 1.4.3),
ggplot2,
knitr,
Lahman,
lobstr,
2 changes: 2 additions & 0 deletions NEWS.md
@@ -1,5 +1,7 @@
# dplyr (development version)

* `storms` data updated to 2020 (@steveharoz, #5899).

* `coalesce()` accepts 1-D arrays (#5557).

* `filter()` forbids matrix results (#5973) and warns about data frame
19 changes: 15 additions & 4 deletions R/data-storms.R
@@ -2,12 +2,12 @@
#'
#' This data is a subset of the NOAA Atlantic hurricane database best track
#' data, \url{https://www.nhc.noaa.gov/data/#hurdat}. The data includes the
#' positions and attributes of 198 tropical storms, measured every six hours
#' positions and attributes of storms from 1975-2020, measured every six hours
#' during the lifetime of a storm.
#'
#' @seealso The script to create the storms data set: \url{https://github.com/tidyverse/dplyr/blob/master/data-raw/storms.R}
#'
#' @format A tibble with 10,010 observations and 13 variables:
#' @format A tibble with 11,859 observations and 13 variables:
#' \describe{
#' \item{name}{Storm Name}
#' \item{year,month,day}{Date of report}
@@ -19,9 +19,20 @@
#' -1 = Tropical Depression, 0 = Tropical Storm)}
#' \item{wind}{storm's maximum sustained wind speed (in knots)}
#' \item{pressure}{Air pressure at the storm's center (in millibars)}
#' \item{ts_diameter}{Diameter of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hu_diameter}{Diameter of the area experiencing hurricane strength winds (64 knots or above)}
#' \item{tropicalstorm_force_diameter}{Diameter (in nautical miles) of the area experiencing tropical storm strength winds (34 knots or above)}
#' \item{hurricane_force_diameter}{Diameter (in nautical miles) of the area experiencing hurricane strength winds (64 knots or above)}
#' }
#' @examples
#'
#' # show a plot of the storm paths
#' if (requireNamespace("ggplot2", quietly = TRUE)) {
#' library(ggplot2)
#' ggplot(storms) +
#' aes(x=long, y=lat, color=paste(year, name)) +
#' geom_path() +
#' guides(color='none') +
#' facet_wrap(~year)
#' }
#'
#' storms
"storms"
92 changes: 54 additions & 38 deletions data-raw/storms.R
@@ -4,11 +4,13 @@ library(tidyverse)
# in an unorthodox format: a csv that alternates between header/identifier rows
# and data rows.

# TO UPDATE: get the latest URL from https://www.nhc.noaa.gov/data/#hurdat, and rerun this code

# Read in data set so each line is a character string
storm_strings <- read_lines("http://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2015-070616.txt")
storm_file_complete <- read_file("https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2020-052921.txt")
storm_strings <- read_lines(storm_file_complete)

# Identify the header lines that have three commas
library(stringr)
header_locations <- (1:length(storm_strings))[str_count(storm_strings, "\\,") == 3]

# Extract length of each sub-dataset
@@ -20,47 +22,60 @@ headers_df <- headers %>%
mutate(name = recode(name, "UNNAMED" = id), skip = header_locations) %>%
select(name, skip, n_obs)

# Read in the sub-datasets as data frames
df_names <- c(
"date", "time", "record_type", "status", "lat", "long", "wind", "pressure",
"extent_34_NE", "extent_34_SE", "extent_34_SW", "extent_34_NW",
"extent_50_NE", "extent_50_SE", "extent_50_SW", "extent_50_NW",
"extent_64_NE", "extent_64_SE", "extent_64_SW", "extent_64_NW", "nas"
column_types <- list(
date = col_character(),
time = col_character(),
record_type = col_character(),
status = col_character(),
lat = col_character(),
long = col_character(),
wind = col_integer(),
pressure = col_integer(),
extent_34_NE = col_integer(),
extent_34_SE = col_integer(),
extent_34_SW = col_integer(),
extent_34_NW = col_integer(),
extent_50_NE = col_integer(),
extent_50_SE = col_integer(),
extent_50_SW = col_integer(),
extent_50_NW = col_integer(),
extent_64_NE = col_integer(),
extent_64_SE = col_integer(),
extent_64_SW = col_integer(),
extent_64_NW = col_integer(),
nas = col_integer()
)
column_names <- names(column_types)

storm_dfs <- vector("list", nrow(headers_df))
names(storm_dfs) <- headers_df$name

for (i in seq_along(headers_df$name)) {
storm_dfs[[i]] <- read_csv("data-raw/hurdat2.txt",
skip = headers_df$skip[i],
n_max = headers_df$n_obs[i],
col_names = df_names,
na = c("", "-99", "-999"),
col_types = list(
time = col_character(),
pressure = col_integer(),
extent_34_NE = col_integer(),
extent_34_SE = col_integer(),
extent_34_SW = col_integer(),
extent_34_NW = col_integer(),
extent_50_NE = col_integer(),
extent_50_SE = col_integer(),
extent_50_SW = col_integer(),
extent_50_NW = col_integer(),
extent_64_NE = col_integer(),
extent_64_SE = col_integer(),
extent_64_SW = col_integer(),
extent_64_NW = col_integer()
)
#### Parse each storm as its own sub-dataframe
storm_dataframes <- vector("list", nrow(headers_df))
for (i in 1:nrow(headers_df)) {
# get this storm's metadata
row_start = headers_df[i,]$skip + 1
row_end = headers_df[i,]$n_obs + row_start - 1
# subset of rows belonging to this storm
data_subset = storm_strings[row_start:row_end] %>%
paste(collapse = "\n") %>%
paste0("\n")
data_subset = read_csv(
data_subset,
col_names = column_names,
col_types = column_types,
na = c("", "-99", "-999")
)
# name at the front
data_subset$name = headers_df[i,]$name
data_subset = data_subset %>% relocate(name)
# add to list of storms
storm_dataframes[[i]] = data_subset
}

# Combine and clean the data sets
library(lubridate)

storms <- storm_dfs %>%
bind_rows(.id = "name") %>%
storms <- storm_dataframes %>%
bind_rows() %>%
mutate(
date = ymd(date),
year = year(date),
@@ -81,13 +96,13 @@ storms <- storm_dfs %>%
# wind = wind * 1.15078, # transforms knots to mph,
TSradius1 = extent_34_NE + extent_34_SW,
TSradius2 = extent_34_NW + extent_34_SE,
ts_diameter = pmax(TSradius1, TSradius2) * 1.15078, # to convert from nautical miles to miles
tropicalstorm_force_diameter = pmax(TSradius1, TSradius2),
HUradius1 = extent_64_NE + extent_64_SW,
HUradius2 = extent_64_NW + extent_64_SE,
hu_diameter = pmax(HUradius1, HUradius2) * 1.15078, # to convert from nautical miles to miles
hurricane_force_diameter = pmax(HUradius1, HUradius2),
status = recode(status, "HU" = "hurricane", "TS" = "tropical storm", "TD" = "tropical depression")
) %>%
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, ts_diameter, hu_diameter)
select(name, year, month, day, hour, lat, long, status, category, wind, pressure, tropicalstorm_force_diameter, hurricane_force_diameter)
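# --- Illustrative aside, not part of the original script -------------------
# A quick check of the diameter logic above with made-up quadrant extents
# (nautical miles): each extent_34_* value is the radius of 34-knot winds in
# one quadrant, so opposite quadrants sum to a diameter along that axis, and
# pmax() keeps the wider of the two axes.
demo_NE <- 120; demo_SW <- 60   # NE-SW axis: 120 + 60 = 180
demo_NW <- 90;  demo_SE <- 100  # NW-SE axis:  90 + 100 = 190
pmax(demo_NE + demo_SW, demo_NW + demo_SE)  # 190 -> tropicalstorm_force_diameter
# ---------------------------------------------------------------------------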

# Narrow to storms that have complete pressure record
completeish <- storms %>%
@@ -103,4 +118,5 @@ storms <- storms %>%
) %>%
mutate(name = if_else(str_sub(name, 1, 3) %in% c("AL0", "AL1"), name, str_to_title(name)))

devtools::use_data(storms)
# output for the package
usethis::use_data(storms, overwrite = TRUE)
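
For readers unfamiliar with HURDAT2's layout, a small self-contained sketch of the parsing idea the script relies on (the fragment below is fabricated and heavily truncated, so the field values are illustrative only): header rows carry exactly three commas, while data rows carry many more, which is how the script finds where each storm's sub-dataset begins.

library(tidyverse)

# Fabricated HURDAT2-style fragment: one header row per storm, followed by
# that storm's observation rows. In the real file, the third header field is
# the number of rows that belong to the storm.
toy <- c(
  "AL011975, STORM_A, 2,",
  "19750601, 0000, , TS, 28.0N, 79.0W, 40, 1005",
  "19750601, 0600, , TS, 28.5N, 79.5W, 45, 1002",
  "AL021975, STORM_B, 1,",
  "19750615, 1200, , HU, 25.0N, 60.0W, 70, 980"
)

# Header rows are the ones with exactly three commas.
which(str_count(toy, ",") == 3)
#> [1] 1 4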
Binary file modified data/storms.rda
Binary file not shown.
19 changes: 15 additions & 4 deletions man/storms.Rd

Some generated files are not rendered by default.
