diff --git a/.claude/settings.json b/.claude/settings.json index b0d45a18..18f85c0e 100644 --- a/.claude/settings.json +++ b/.claude/settings.json @@ -1,6 +1,19 @@ { "permissions": { "allow": [ + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" -e \":*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla -e \"cat\\(''R works\\\\n''\\)\")", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla check_maps.R)", + "Bash(/tmp/check_nhtsa.R:*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla /tmp/check_nhtsa.R)", + "Bash(/tmp/check_nhtsa2.R:*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla /tmp/check_nhtsa2.R)", + "Bash(/tmp/check_missing.R:*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla /tmp/check_missing.R)", + "Bash(/tmp/check_missing2.R:*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla /tmp/check_missing2.R)", + "Bash(/tmp/check_sd.R:*)", + "Bash(\"/c/Program Files/R/R-4.4.1/bin/Rscript.exe\" --vanilla /tmp/check_sd.R)", "WebFetch(domain:crisistrends.org)", "WebFetch(domain:mysolace.github.io)", "WebFetch(domain:www.globenewswire.com)", @@ -16,6 +29,9 @@ "Bash(curl -s \"https://app.sigmacomputing.com/static/js/index.CkWRZN0h.js\")", "WebFetch(domain:help.sigmacomputing.com)", "Bash(curl -s \"https://api.sigmacomputing.com/v2/workbooks?search=crisis\" -H \"Accept: application/json\")" + ], + "additionalDirectories": [ + "\\tmp" ] } } diff --git a/.gitignore b/.gitignore index 004b9d6d..bea50a9f 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ renv/ *.Rdata .DS_Store data/vaers/raw/staging/ +data/nhtsa_crash/raw/*.zip data/abcs/raw/acs_cache/ pophive_data.db nul @@ -109,31 +110,7 @@ data_commons_export/wisqars/config.json data_commons_export/wisqars/data.tmcf data_commons_export/wisqars/variables.mcf data_commons_export/wisqars/wisqars.csv -_ul -data/nhtsa_crash/raw/FARS2000NationalCSV.zip -data/nhtsa_crash/raw/FARS2001NationalCSV.zip -data/nhtsa_crash/raw/FARS2002NationalCSV.zip -data/nhtsa_crash/raw/FARS2003NationalCSV.zip -data/nhtsa_crash/raw/FARS2004NationalCSV.zip -data/nhtsa_crash/raw/FARS2005NationalCSV.zip -data/nhtsa_crash/raw/FARS2006NationalCSV.zip -data/nhtsa_crash/raw/FARS2007NationalCSV.zip -data/nhtsa_crash/raw/FARS2008NationalCSV.zip -data/nhtsa_crash/raw/FARS2009NationalCSV.zip -data/nhtsa_crash/raw/FARS2010NationalCSV.zip -data/nhtsa_crash/raw/FARS2011NationalCSV.zip -data/nhtsa_crash/raw/FARS2012NationalCSV.zip -data/nhtsa_crash/raw/FARS2013NationalCSV.zip -data/nhtsa_crash/raw/FARS2014NationalCSV.zip -data/nhtsa_crash/raw/FARS2015NationalCSV.zip -data/nhtsa_crash/raw/FARS2016NationalCSV.zip -data/nhtsa_crash/raw/FARS2017NationalCSV.zip -data/nhtsa_crash/raw/FARS2018NationalCSV.zip -data/nhtsa_crash/raw/FARS2019NationalCSV.zip -data/nhtsa_crash/raw/FARS2020NationalCSV.zip -data/nhtsa_crash/raw/FARS2021NationalCSV.zip -data/nhtsa_crash/raw/FARS2022NationalCSV.zip -data/nhtsa_crash/raw/FARS2023NationalCSV.zip +_ul* _ul-MWMJ0G3P8D raw/abcs_census_age_stratified_pop_pre2009.csv raw/abcs_census_age_stratified_pop.csv diff --git a/data/nhtsa_crash/README.md b/data/nhtsa_crash/README.md new file mode 100644 index 00000000..5008ab9c --- /dev/null +++ b/data/nhtsa_crash/README.md @@ -0,0 +1,15 @@ +# nhtsa_crash + +This is a dcf data source project, initialized with `dcf::dcf_add_source`. + +You can use the `dcf` package to check the project: + +```R +dcf_check() +``` + +And process it: + +```R +dcf_process() +``` diff --git a/data/nhtsa_crash/check_maps.R b/data/nhtsa_crash/check_maps.R new file mode 100644 index 00000000..5c0ff478 --- /dev/null +++ b/data/nhtsa_crash/check_maps.R @@ -0,0 +1,440 @@ +# ============================================================================= +# NHTSA FARS – County Fatality Rate Choropleth Maps (3 years) +# Run from: data/nhtsa_crash/ +# ============================================================================= + +library(dplyr) +library(tidyr) +library(vroom) +library(sf) +library(tigris) +library(ggplot2) +library(scales) +library(patchwork) + +# Three years spread across the 2000-2023 range +selected_years <- c("2005", "2013", "2023") +selected_times <- paste0(selected_years, "-12-31") + +# ----------------------------------------------------------------------------- +# 1. Load county-level NHTSA data +# Fix SD FIPS: FARS uses 46113 (Shannon Co.) throughout all years, but +# 2023 tigris geometry uses 46102 (Oglala Lakota Co.) after the 2015 rename. +# ----------------------------------------------------------------------------- +cat("Loading NHTSA county data...\n") +county_data <- vroom("standard/data.csv.gz", show_col_types = FALSE) |> + filter(nchar(geography) == 5) |> + mutate( + geography = if_else(geography == "46113", "46102", geography), + time = as.character(time) + ) +View(county_data) +# ----------------------------------------------------------------------------- +# 2. Build county geometry +# - Base: cached 2023 counties (excludes AK, HI, territories) +# - CT fix: 2023 geometry has new planning region codes (09110-09190) but +# FARS uses old county codes (09001-09015). Replace CT with year=2021 +# boundaries so geometries match FARS FIPS codes. +# ----------------------------------------------------------------------------- +cat("Loading county geometries...\n") +counties_base <- readRDS("C:/Users/ashap/OneDrive/Desktop/Ingest/data/noaa_heat_risk/raw/counties.rds") |> + mutate(geography = as.character(geography)) |> + filter( + !substr(geography, 1, 2) %in% c("02", "15"), # drop AK, HI + substr(geography, 1, 2) != "09" # drop new CT planning regions + ) + +cat("Fetching old CT county boundaries (year=2021)...\n") +ct_old <- tigris::counties( + state = "09", cb = TRUE, resolution = "5m", + year = 2021, progress_bar = FALSE +) |> + select(geography = GEOID) |> + mutate(geography = as.character(geography)) + +counties_sf <- bind_rows(counties_base, ct_old) + +# ----------------------------------------------------------------------------- +# 3. Build complete county × year grid; fill missing with 0 +# Missing = county in geometry with no FARS rows for that year → zero +# fatalities, not a boundary mismatch (those were fixed above). +# ----------------------------------------------------------------------------- +complete_grid <- expand.grid( + geography = counties_sf$geography, + time = selected_times, + stringsAsFactors = FALSE +) + +county_filled <- complete_grid |> + left_join( + county_data |> filter(time %in% selected_times), + by = c("geography", "time") + ) |> + mutate(across( + c(nhtsa_fatalities, nhtsa_fatal_crashes, nhtsa_fatality_rate), + \(x) replace_na(x, 0) + )) + +# ----------------------------------------------------------------------------- +# 4. Join geometry and prepare map data +# ----------------------------------------------------------------------------- +map_data <- counties_sf |> + left_join(county_filled, by = "geography") |> + mutate( + date_label = factor( + time, + levels = selected_times, + labels = paste("Year", selected_years) + ) + ) + +cat( + "Rate range (per 100k):", + min(county_filled$nhtsa_fatality_rate, na.rm = TRUE), + "to", + max(county_filled$nhtsa_fatality_rate, na.rm = TRUE), "\n" +) + +# Cap display at 99th percentile to reduce outlier distortion from small counties +rate_cap <- quantile( + county_filled$nhtsa_fatality_rate[county_filled$nhtsa_fatality_rate > 0], + 0.99, na.rm = TRUE +) +map_data <- map_data |> + mutate(rate_display = pmin(nhtsa_fatality_rate, rate_cap)) + +# ----------------------------------------------------------------------------- +# 5. Plot +# ----------------------------------------------------------------------------- +cat("Rendering maps...\n") +p <- ggplot(map_data) + + geom_sf(aes(fill = rate_display), color = NA) + + scale_fill_viridis_c( + option = "inferno", + direction = 1, + trans = "sqrt", + name = "Traffic\nFatalities\nper 100k", + labels = label_number(accuracy = 1), + na.value = "grey80", + guide = guide_colorbar( + barwidth = unit(0.5, "cm"), + barheight = unit(6, "cm"), + ticks = TRUE + ) + ) + + facet_wrap(~date_label, ncol = 1) + + coord_sf(crs = 5070, datum = NA) + + labs( + title = "NHTSA FARS \u2013 County Traffic Fatality Rate", + subtitle = paste0( + "Fatalities per 100,000 population (2021 census denominator)\n", + "Zero-fatality counties shown as black; ", + "fill capped at 99th pctile (~", round(rate_cap, 0), " per 100k); sqrt scale" + ), + caption = paste0( + "Source: NHTSA Fatality Analysis Reporting System (FARS)\n", + "https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/" + ) + ) + + theme_void(base_size = 11) + + theme( + plot.title = element_text(face = "bold", hjust = 0.5, size = 13), + plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 8), + plot.caption = element_text(hjust = 0.5, color = "grey50", size = 7), + strip.text = element_text(face = "bold", size = 10), + legend.position = "right", + plot.margin = margin(10, 10, 10, 10) + ) +p +out_path <- "standard/nhtsa_fatality_rate_choropleth.png" +ggsave(out_path, p, width = 9, height = 14, dpi = 150, bg = "white") +cat("Saved:", out_path, "\n") + +# ============================================================================= +# Shared: State geometry (reused by both NHTSA state and WISQARS sections) +# ============================================================================= + +cat("Loading state geometries...\n") +states_sf <- tigris::states(cb = TRUE, resolution = "5m", year = 2021, + progress_bar = FALSE) |> + filter(!STUSPS %in% c("AK", "HI", "PR", "VI", "GU", "MP", "AS")) |> + select(geography = GEOID) |> + mutate(geography = as.character(geography)) + +# ============================================================================= +# NHTSA FARS – State Fatality Rate Choropleth Maps (3 years) +# ============================================================================= + +cat("Loading NHTSA state data...\n") +nhtsa_state_data <- vroom("standard/data.csv.gz", show_col_types = FALSE) |> + filter( + nchar(geography) == 2, + geography != "00", + time %in% selected_times + ) |> + select(geography, time, nhtsa_fatality_rate) |> + mutate(time = as.character(time)) + +complete_nhtsa_state_grid <- expand.grid( + geography = states_sf$geography, + time = selected_times, + stringsAsFactors = FALSE +) + +nhtsa_state_filled <- complete_nhtsa_state_grid |> + left_join(nhtsa_state_data, by = c("geography", "time")) |> + mutate(nhtsa_fatality_rate = replace_na(nhtsa_fatality_rate, 0)) + +nhtsa_state_map_data <- states_sf |> + left_join(nhtsa_state_filled, by = "geography") |> + mutate( + date_label = factor( + time, + levels = selected_times, + labels = paste("Year", selected_years) + ) + ) + +nhtsa_state_rate_cap <- quantile( + nhtsa_state_filled$nhtsa_fatality_rate[nhtsa_state_filled$nhtsa_fatality_rate > 0], + 0.99, na.rm = TRUE +) +nhtsa_state_map_data <- nhtsa_state_map_data |> + mutate(rate_display = pmin(nhtsa_fatality_rate, nhtsa_state_rate_cap)) + +cat("Rendering NHTSA state maps...\n") +p_nhtsa_state <- ggplot(nhtsa_state_map_data) + + geom_sf(aes(fill = rate_display), color = "white", linewidth = 0.2) + + scale_fill_viridis_c( + option = "inferno", + direction = 1, + trans = "sqrt", + name = "Traffic\nFatalities\nper 100k", + labels = label_number(accuracy = 1), + na.value = "grey80", + guide = guide_colorbar( + barwidth = unit(0.5, "cm"), + barheight = unit(6, "cm"), + ticks = TRUE + ) + ) + + facet_wrap(~date_label, ncol = 1) + + coord_sf(crs = 5070, datum = NA) + + labs( + title = "NHTSA FARS – State Traffic Fatality Rate", + subtitle = paste0( + "Fatalities per 100,000 population (2021 census denominator, crude)\n", + "Fill capped at 99th pctile (~", round(nhtsa_state_rate_cap, 1), " per 100k); sqrt scale" + ), + caption = paste0( + "Source: NHTSA Fatality Analysis Reporting System (FARS)\n", + "https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/" + ) + ) + + theme_void(base_size = 11) + + theme( + plot.title = element_text(face = "bold", hjust = 0.5, size = 13), + plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 8), + plot.caption = element_text(hjust = 0.5, color = "grey50", size = 7), + strip.text = element_text(face = "bold", size = 10), + legend.position = "right", + plot.margin = margin(10, 10, 10, 10) + ) +p_nhtsa_state +out_path_nhtsa_state <- "standard/nhtsa_state_fatality_rate_choropleth.png" +ggsave(out_path_nhtsa_state, p_nhtsa_state, width = 9, height = 14, dpi = 150, bg = "white") +cat("Saved:", out_path_nhtsa_state, "\n") + +# ============================================================================= +# WISQARS – State Motor Vehicle Traffic Fatality Rate Choropleth Maps (3 years) +# ============================================================================= + +# wisqars uses YYYY-01-01 as time; match the same 3 reference years +wisqars_times <- paste0(selected_years, "-01-01") + +cat("Loading WISQARS state data...\n") +wisqars_data <- vroom("../wisqars/standard/data.csv.gz", show_col_types = FALSE) |> + filter( + nchar(geography) == 2, + geography != "00", + age == "Total", + sex == "All", + race == "All", + ethnicity == "All", + time %in% wisqars_times + ) |> + select(geography, time, wisqars_rate_motor_vehicle_traffic) |> + mutate(time = as.character(time)) + +complete_state_grid <- expand.grid( + geography = states_sf$geography, + time = wisqars_times, + stringsAsFactors = FALSE +) + +wisqars_filled <- complete_state_grid |> + left_join(wisqars_data, by = c("geography", "time")) + +wisqars_map_data <- states_sf |> + left_join(wisqars_filled, by = "geography") |> + mutate( + date_label = factor( + time, + levels = wisqars_times, + labels = paste("Year", selected_years) + ) + ) + +cat( + "WISQARS rate range (per 100k):", + min(wisqars_filled$wisqars_rate_motor_vehicle_traffic, na.rm = TRUE), + "to", + max(wisqars_filled$wisqars_rate_motor_vehicle_traffic, na.rm = TRUE), "\n" +) + +wisqars_rate_cap <- quantile( + wisqars_filled$wisqars_rate_motor_vehicle_traffic, + 0.99, na.rm = TRUE +) +wisqars_map_data <- wisqars_map_data |> + mutate(rate_display = pmin(wisqars_rate_motor_vehicle_traffic, wisqars_rate_cap)) + +cat("Rendering WISQARS maps...\n") +p_wisqars <- ggplot(wisqars_map_data) + + geom_sf(aes(fill = rate_display), color = "white", linewidth = 0.2) + + scale_fill_viridis_c( + option = "inferno", + direction = 1, + trans = "sqrt", + name = "Traffic\nFatalities\nper 100k", + labels = label_number(accuracy = 1), + na.value = "grey80", + guide = guide_colorbar( + barwidth = unit(0.5, "cm"), + barheight = unit(6, "cm"), + ticks = TRUE + ) + ) + + facet_wrap(~date_label, ncol = 1) + + coord_sf(crs = 5070, datum = NA) + + labs( + title = "WISQARS – State Motor Vehicle Traffic Fatality Rate", + subtitle = paste0( + "Fatalities per 100,000 population (age-adjusted, all ages)\n", + "Fill capped at 99th pctile (~", round(wisqars_rate_cap, 1), " per 100k); sqrt scale" + ), + caption = paste0( + "Source: CDC WISQARS Fatal Injury Reports\n", + "https://wisqars.cdc.gov/" + ) + ) + + theme_void(base_size = 11) + + theme( + plot.title = element_text(face = "bold", hjust = 0.5, size = 13), + plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 8), + plot.caption = element_text(hjust = 0.5, color = "grey50", size = 7), + strip.text = element_text(face = "bold", size = 10), + legend.position = "right", + plot.margin = margin(10, 10, 10, 10) + ) +p_wisqars +out_path_wisqars <- "standard/wisqars_motor_vehicle_rate_choropleth.png" +ggsave(out_path_wisqars, p_wisqars, width = 9, height = 14, dpi = 150, bg = "white") +cat("Saved:", out_path_wisqars, "\n") + +# ============================================================================= +# Combined: NHTSA state (left) vs WISQARS (right), side by side +# ============================================================================= + +cat("Rendering combined NHTSA state vs WISQARS maps...\n") +p_combined <- patchwork::wrap_plots(p_nhtsa_state, p_wisqars, ncol = 2) + + plot_annotation( + title = "Motor Vehicle Traffic Fatality Rate by State: NHTSA FARS vs. WISQARS", + subtitle = paste0( + "Left: NHTSA FARS crude rate (fatalities per 100k, 2021 census denominator) | ", + "Right: WISQARS age-adjusted rate (fatalities per 100k)\n", + "Years: ", paste(selected_years, collapse = ", ") + ), + theme = theme( + plot.title = element_text(face = "bold", hjust = 0.5, size = 14), + plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 9) + ) + ) +p_combined +out_path_combined <- "standard/nhtsa_vs_wisqars_state_choropleth.png" +ggsave(out_path_combined, p_combined, width = 18, height = 14, dpi = 150, bg = "white") +cat("Saved:", out_path_combined, "\n") + +# ============================================================================= +# Scatter: NHTSA vs WISQARS state rates, faceted by year +# Each point = one state; dashed diagonal = perfect agreement +# ============================================================================= + +state_abbrevs <- vroom("../../resources/all_fips.csv.gz", show_col_types = FALSE) |> + filter(nchar(geography) == 2, geography != "00") |> + select(geography, state) + +nhtsa_all_years <- vroom("standard/data.csv.gz", show_col_types = FALSE) |> + filter(nchar(geography) == 2, geography != "00") |> + mutate(year = substr(as.character(time), 1, 4)) |> + filter(as.integer(year) >= 2005) |> + select(geography, year, nhtsa_fatality_rate) + +wisqars_all_years <- vroom("../wisqars/standard/data.csv.gz", show_col_types = FALSE) |> + filter( + nchar(geography) == 2, geography != "00", + age == "Total", sex == "All", race == "All", ethnicity == "All" + ) |> + mutate(year = substr(as.character(time), 1, 4)) |> + filter(as.integer(year) >= 2005) |> + select(geography, year, wisqars_rate_motor_vehicle_traffic) + +overlap_years <- sort(intersect(unique(nhtsa_all_years$year), unique(wisqars_all_years$year))) +cat("Overlapping years:", paste(overlap_years, collapse = ", "), "\n") + +scatter_data <- nhtsa_all_years |> + filter(year %in% overlap_years) |> + inner_join( + wisqars_all_years |> filter(year %in% overlap_years), + by = c("geography", "year") + ) |> + left_join(state_abbrevs, by = "geography") |> + mutate(year = factor(year, levels = overlap_years)) + +rate_max <- max( + max(scatter_data$nhtsa_fatality_rate, na.rm = TRUE), + max(scatter_data$wisqars_rate_motor_vehicle_traffic, na.rm = TRUE) +) * 1.05 + +p_scatter <- ggplot(scatter_data, + aes(x = nhtsa_fatality_rate, + y = wisqars_rate_motor_vehicle_traffic)) + + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey55") + + geom_point(size = 2.2, alpha = 0.75, color = "#2166ac") + + geom_text(aes(label = state), size = 2.2, vjust = -0.6, check_overlap = TRUE) + + facet_wrap(~year, ncol = ceiling(sqrt(length(overlap_years)))) + + coord_fixed(xlim = c(0, rate_max), ylim = c(0, rate_max)) + + labs( + title = "NHTSA FARS vs. WISQARS Motor Vehicle Traffic Fatality Rate by State", + subtitle = paste0("Each point = one state. Dashed line = 1:1 reference. Years: ", + min(overlap_years), "–", max(overlap_years)), + x = "NHTSA crude rate (fatalities per 100k)", + y = "WISQARS age-adjusted rate (fatalities per 100k)", + caption = "Sources: NHTSA Fatality Analysis Reporting System; CDC WISQARS Fatal Injury Reports." + ) + + theme_bw(base_size = 11) + + theme( + plot.title = element_text(face = "bold", hjust = 0.5, size = 13), + plot.subtitle = element_text(hjust = 0.5, color = "grey40", size = 9), + strip.text = element_text(face = "bold", size = 11), + plot.caption = element_text(color = "grey50", size = 7) + ) +p_scatter +out_path_scatter <- "standard/nhtsa_vs_wisqars_scatter.png" +n_cols <- ceiling(sqrt(length(overlap_years))) +n_rows <- ceiling(length(overlap_years) / n_cols) +ggsave(out_path_scatter, p_scatter, + width = n_cols * 4, + height = n_rows * 4 + 1.5, + dpi = 150, bg = "white") +cat("Saved:", out_path_scatter, "\n") diff --git a/data/nhtsa_crash/ingest.R b/data/nhtsa_crash/ingest.R new file mode 100644 index 00000000..540d7899 --- /dev/null +++ b/data/nhtsa_crash/ingest.R @@ -0,0 +1,365 @@ +# ============================================================================= +# NHTSA FARS (Fatality Analysis Reporting System) Data Ingestion +# Source: https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/ +# Coverage: 2000-2023, annual fatal motor vehicle crash data +# Geography: County (primary), State, National +# ============================================================================= + +library(dplyr) +library(purrr) +library(tidyr) +library(vroom) + +# Initialize process record +process <- dcf::dcf_process_record() + +# ----------------------------------------------------------------------------- +# 1. Download raw FARS zip files (2000-latest) +# ----------------------------------------------------------------------------- +find_latest_fars_year <- function() { + yr <- as.integer(format(Sys.Date(), "%Y")) + while (yr >= 2020) { + url <- sprintf( + paste0( + "https://static.nhtsa.gov/nhtsa/downloads/FARS/", + "%d/National/FARS%dNationalCSV.zip" + ), + yr, yr + ) + status <- tryCatch( + attr(curlGetHeaders(url), "status"), + error = function(e) 0L + ) + if (identical(status, 200L)) return(yr) + yr <- yr - 1 + } + stop("Could not determine latest available FARS year") +} + +years <- 200:find_latest_fars_year() + +dir.create("raw", showWarnings = FALSE) + +download_fars_zip <- function(yr) { + url <- sprintf( + "https://static.nhtsa.gov/nhtsa/downloads/FARS/%d/National/FARS%dNationalCSV.zip", + yr, yr + ) + dest <- sprintf("raw/FARS%dNationalCSV.zip", yr) + if (!file.exists(dest)) { + message(sprintf("Downloading FARS %d...", yr)) + download.file(url, dest, mode = "wb", quiet = TRUE) + } + list(hash = unname(tools::md5sum(dest))) +} + +raw_state_new <- setNames(lapply(years, download_fars_zip), as.character(years)) + +# Only re-process if any year's data has changed +if (!identical(process$raw_state, raw_state_new)) { + + # --------------------------------------------------------------------------- + # 2. Load reference data + # --------------------------------------------------------------------------- + all_fips <- vroom("../../resources/all_fips.csv.gz", show_col_types = FALSE) + + # Valid county FIPS codes (5-digit) for filtering + valid_county_fips <- all_fips %>% + filter(nchar(geography) == 5) %>% + pull(geography) + + # County population from 2021 Census (single-year reference denominator) + pop_county <- vroom( + "../../resources/census_population_2021.csv.xz", + show_col_types = FALSE + ) %>% + filter(nchar(GEOID) == 5) %>% + select(geography = GEOID, population = Total) + + pop_state <- vroom( + "../../resources/census_population_2021.csv.xz", + show_col_types = FALSE + ) %>% + filter(nchar(GEOID) == 2) %>% + select(geography = GEOID, population = Total) + + pop_national <- pop_state %>% + summarize(population = sum(population)) %>% + mutate(geography = "00") + + pop_all <- bind_rows(pop_county, pop_state, pop_national) + + # --------------------------------------------------------------------------- + # 3. Helper: read a named file from a FARS zip archive + # Handles case-insensitive file names and subdirectory paths + # --------------------------------------------------------------------------- + read_fars_file <- function(yr, file_pattern) { + zip_path <- sprintf("raw/FARS%dNationalCSV.zip", yr) + + # List contents to find the right file (case-insensitive) + zip_contents <- tryCatch( + unzip(zip_path, list = TRUE), + error = function(e) { + warning(sprintf("Cannot list zip %s: %s", zip_path, e$message)) + return(NULL) + } + ) + if (is.null(zip_contents)) return(NULL) + + match_idx <- grep(file_pattern, basename(zip_contents$Name), ignore.case = TRUE) + if (length(match_idx) == 0) { + warning(sprintf("Pattern '%s' not found in FARS %d zip", file_pattern, yr)) + return(NULL) + } + target_file <- zip_contents$Name[match_idx[1]] + + readr::read_csv( + unz(zip_path, target_file), + show_col_types = FALSE, + col_types = readr::cols(.default = "c") + ) %>% + mutate(YEAR = yr) + } + + # --------------------------------------------------------------------------- + # 4. Read and combine all years of accident and person data + # --------------------------------------------------------------------------- + message("Reading accident files...") + accident_all <- map_dfr(years, read_fars_file, file_pattern = "^accident\\.csv$") + + message("Reading person files...") + person_all <- map_dfr(years, read_fars_file, file_pattern = "^person\\.csv$") + + # --------------------------------------------------------------------------- + # 5. Build geography fields and normalize key columns + # --------------------------------------------------------------------------- + accident_all <- accident_all %>% + mutate( + # County FIPS: 2-digit state + 3-digit county + geography_county = sprintf("%02d%03d", as.integer(STATE), as.integer(COUNTY)), + geography_state = sprintf("%02d", as.integer(STATE)), + time = paste0(YEAR, "-12-31"), + # Unknown county codes: 0 = not reported, 999 = unknown + county_known = as.integer(COUNTY) > 0 & as.integer(COUNTY) < 999, + # Rural/urban: handle both old (LAND_USE) and new (RUR_URB) column names + rural_urban = dplyr::coalesce( + if ("RUR_URB" %in% names(.)) as.integer(RUR_URB) else NA_integer_, + if ("LAND_USE" %in% names(.)) as.integer(LAND_USE) else NA_integer_ + ), + single_vehicle = !is.na(VE_TOTAL) & as.integer(VE_TOTAL) == 1 + ) + + # --------------------------------------------------------------------------- + # 6. Output 1: data.csv.gz — Annual fatalities & crashes + # Primary: county-level. Also state and national. + # --------------------------------------------------------------------------- + summarize_crashes <- function(df) { + df %>% + group_by(geography, time) %>% + summarize( + nhtsa_fatalities = sum(as.integer(FATALS), na.rm = TRUE), + nhtsa_fatal_crashes = n(), + .groups = "drop" + ) + } + + data_county <- accident_all %>% + filter(county_known) %>% + rename(geography = geography_county) %>% + summarize_crashes() + + data_state <- accident_all %>% + rename(geography = geography_state) %>% + summarize_crashes() + + data_national <- accident_all %>% + mutate(geography = "00") %>% + summarize_crashes() + + data_main <- bind_rows(data_county, data_state, data_national) %>% + left_join(pop_all, by = "geography") %>% + mutate( + nhtsa_fatality_rate = round(nhtsa_fatalities / population * 100000, 2) + ) %>% + select(geography, time, nhtsa_fatalities, nhtsa_fatal_crashes, + nhtsa_fatality_rate) + + vroom_write(data_main, "standard/data.csv.gz", delim = ",") + + # --------------------------------------------------------------------------- + # 7. Output 2: data_person_type.csv.gz — Fatalities by person type + # Based on fatal persons (INJ_SEV == 4) in person.csv + # --------------------------------------------------------------------------- + + # Join person data with accident geography + accident_geo <- accident_all %>% + select(ST_CASE, YEAR, geography_county, geography_state, county_known, time) + + person_typed <- person_all %>% + mutate(YEAR = as.integer(YEAR), ST_CASE = as.integer(ST_CASE)) %>% + left_join( + accident_geo %>% + mutate(ST_CASE = as.integer(ST_CASE), YEAR = as.integer(YEAR)), + by = c("ST_CASE", "YEAR") + ) %>% + filter(as.integer(INJ_SEV) == 4) %>% # fatal persons only + mutate( + person_type = case_when( + as.integer(PER_TYP) == 1 ~ "driver", + as.integer(PER_TYP) %in% 2:4 ~ "passenger", + as.integer(PER_TYP) == 5 ~ "pedestrian", + as.integer(PER_TYP) %in% 6:7 ~ "cyclist", + TRUE ~ "other" + ) + ) %>% + filter(person_type != "other") + + agg_person_type <- function(df, geo_col) { + df %>% + rename(geography = !!sym(geo_col)) %>% + group_by(geography, time, person_type) %>% + summarize(nhtsa_fatalities = n(), .groups = "drop") + } + + data_person_type <- bind_rows( + person_typed %>% filter(county_known) %>% agg_person_type("geography_county"), + person_typed %>% agg_person_type("geography_state"), + person_typed %>% mutate(geography_national = "00") %>% agg_person_type("geography_national") + ) + + vroom_write(data_person_type, "standard/data_person_type.csv.gz", delim = ",") + + # --------------------------------------------------------------------------- + # 8. Output 3: data_age_sex.csv.gz — Fatalities by age group and sex + # --------------------------------------------------------------------------- + person_demo <- person_all %>% + mutate(YEAR = as.integer(YEAR), ST_CASE = as.integer(ST_CASE)) %>% + left_join( + accident_geo %>% + mutate(ST_CASE = as.integer(ST_CASE), YEAR = as.integer(YEAR)), + by = c("ST_CASE", "YEAR") + ) %>% + filter(as.integer(INJ_SEV) == 4) %>% + mutate( + age_num = as.integer(AGE), + age = case_when( + age_num < 15 ~ "0-14", + age_num < 25 ~ "15-24", + age_num < 45 ~ "25-44", + age_num < 65 ~ "45-64", + age_num < 998 ~ "65+", # 998 = not reported, 999 = unknown + TRUE ~ NA_character_ + ), + sex = case_when( + as.integer(SEX) == 1 ~ "Male", + as.integer(SEX) == 2 ~ "Female", + TRUE ~ NA_character_ + ) + ) %>% + filter(!is.na(age), !is.na(sex)) + + agg_demo <- function(df, geo_col) { + df %>% + rename(geography = !!sym(geo_col)) %>% + group_by(geography, time, age, sex) %>% + summarize(nhtsa_fatalities = n(), .groups = "drop") + } + + data_age_sex <- bind_rows( + person_demo %>% filter(county_known) %>% agg_demo("geography_county"), + person_demo %>% agg_demo("geography_state"), + person_demo %>% mutate(geography_national = "00") %>% agg_demo("geography_national") + ) + + vroom_write(data_age_sex, "standard/data_age_sex.csv.gz", delim = ",") + + # --------------------------------------------------------------------------- + # 9. Output 4: data_crash_type.csv.gz — Fatalities by crash characteristic + # Wide format: one column per crash type, stratified by age and sex + # --------------------------------------------------------------------------- + + # Build person-level dataset with crash type flags inherited from the crash + crash_flags <- accident_all %>% + mutate(ST_CASE = as.integer(ST_CASE)) %>% + select(ST_CASE, YEAR, single_vehicle, rural_urban) + + person_crash_flags <- person_all %>% + mutate(YEAR = as.integer(YEAR), ST_CASE = as.integer(ST_CASE)) %>% + left_join( + accident_geo %>% mutate(ST_CASE = as.integer(ST_CASE), YEAR = as.integer(YEAR)), + by = c("ST_CASE", "YEAR") + ) %>% + filter(as.integer(INJ_SEV) == 4) %>% + left_join(crash_flags, by = c("ST_CASE", "YEAR")) %>% + mutate( + age_num = as.integer(AGE), + age = case_when( + age_num < 15 ~ "0-14", + age_num < 25 ~ "15-24", + age_num < 45 ~ "25-44", + age_num < 65 ~ "45-64", + age_num < 998 ~ "65+", + TRUE ~ NA_character_ + ), + sex = case_when( + as.integer(SEX) == 1 ~ "Male", + as.integer(SEX) == 2 ~ "Female", + TRUE ~ NA_character_ + ), + per_typ_int = as.integer(PER_TYP), + nhtsa_single_vehicle = !is.na(single_vehicle) & single_vehicle, + nhtsa_rural = !is.na(rural_urban) & rural_urban == 1, + nhtsa_urban = !is.na(rural_urban) & rural_urban == 2, + nhtsa_pedestrian_involved = per_typ_int == 5, + nhtsa_cyclist_involved = per_typ_int %in% 6:7 + ) + + crash_type_cols <- c("nhtsa_single_vehicle", "nhtsa_rural", "nhtsa_urban", + "nhtsa_pedestrian_involved", "nhtsa_cyclist_involved") + + agg_crash_demo <- function(df, geo_col, group_vars) { + df %>% + rename(geography = !!sym(geo_col)) %>% + group_by(geography, time, across(all_of(group_vars))) %>% + summarize( + nhtsa_fatalities = n(), + nhtsa_fatal_crashes = n_distinct(ST_CASE, YEAR), + across(all_of(crash_type_cols), ~ sum(.x, na.rm = TRUE)), + .groups = "drop" + ) + } + + make_crash_type <- function(df, geo_col) { + overall <- agg_crash_demo(df, geo_col, character(0)) %>% + mutate(age = "Overall", sex = "Overall") + by_age <- df %>% + filter(!is.na(age)) %>% + agg_crash_demo(geo_col, "age") %>% + mutate(sex = "Overall") + by_sex <- df %>% + filter(!is.na(sex)) %>% + agg_crash_demo(geo_col, "sex") %>% + mutate(age = "Overall") + bind_rows(overall, by_age, by_sex) + } + + data_crash_type <- bind_rows( + person_crash_flags %>% filter(county_known) %>% make_crash_type("geography_county"), + person_crash_flags %>% make_crash_type("geography_state"), + person_crash_flags %>% mutate(geography_national = "00") %>% + make_crash_type("geography_national") + ) %>% + select(geography, time, age, sex, nhtsa_fatalities, nhtsa_fatal_crashes, + all_of(crash_type_cols)) + + vroom_write(data_crash_type, "standard/data_crash_type.csv.gz", delim = ",") + + # --------------------------------------------------------------------------- + # 10. Record processed state + # --------------------------------------------------------------------------- + process$raw_state <- raw_state_new + dcf::dcf_process_record(updated = process) + + message("NHTSA FARS ingestion complete.") +} + \ No newline at end of file diff --git a/data/nhtsa_crash/measure_info.json b/data/nhtsa_crash/measure_info.json new file mode 100644 index 00000000..32444fe1 --- /dev/null +++ b/data/nhtsa_crash/measure_info.json @@ -0,0 +1,161 @@ +{ + "nhtsa_fatalities": { + "id": "nhtsa_fatalities", + "short_name": "Motor vehicle fatalities", + "long_name": "Motor vehicle crash fatalities", + "category": "injury", + "short_description": "Annual count of persons killed in motor vehicle traffic crashes.", + "long_description": "Total number of persons killed in fatal motor vehicle traffic crashes, as reported in the NHTSA Fatality Analysis Reporting System (FARS). FARS is a census of all fatal crashes occurring on public roads in the United States in which a motor vehicle was involved and at least one person died within 30 days of the crash. Data are reported annually at the county, state, and national level.", + "statement": "In {location}, {value} people were killed in motor vehicle crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [ + { + "title": "FARS Analytical User's Manual 1975-2023", + "url": "https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/813706" + } + ] + }, + "nhtsa_fatal_crashes": { + "id": "nhtsa_fatal_crashes", + "short_name": "Fatal crashes", + "long_name": "Fatal motor vehicle crashes", + "category": "injury", + "short_description": "Annual count of motor vehicle crashes resulting in at least one fatality.", + "long_description": "Number of motor vehicle traffic crashes in which at least one person died within 30 days of the crash, as reported by NHTSA FARS. A single crash may result in multiple fatalities. This measure counts the crash event rather than individual deaths.", + "statement": "In {location}, there were {value} fatal motor vehicle crashes in {time}.", + "measure_type": "Count", + "unit": "Crashes", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_fatality_rate": { + "id": "nhtsa_fatality_rate", + "short_name": "Fatality rate (per 100k)", + "long_name": "Motor vehicle fatality rate per 100,000 population", + "category": "injury", + "short_description": "Annual motor vehicle fatalities per 100,000 residents. Population denominator is 2021 Census.", + "long_description": "Rate of motor vehicle crash fatalities per 100,000 resident population, calculated using NHTSA FARS death counts and 2021 Census Bureau population estimates as the denominator. Because a single reference-year population is used across all years (2000-2023), this rate reflects changes in crash fatalities more than changes in population size.", + "statement": "In {location}, the motor vehicle fatality rate was {value} per 100,000 people in {time}.", + "measure_type": "Rate", + "unit": "Deaths per 100,000", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_alcohol_related": { + "id": "nhtsa_alcohol_related", + "short_name": "Alcohol-related fatalities", + "long_name": "Fatalities in alcohol-related crashes", + "category": "injury", + "short_description": "Annual fatalities in crashes involving at least one drunk driver.", + "long_description": "Total fatalities (all persons) in crashes where at least one driver was recorded as alcohol-impaired (DRUNK_DR >= 1 in FARS accident file). A crash is classified as alcohol-related when any driver involved had a blood alcohol concentration (BAC) at or above 0.08 g/dL or was otherwise identified as impaired. This measure counts all fatalities in such crashes, not just the impaired driver. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in alcohol-related crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_speeding_related": { + "id": "nhtsa_speeding_related", + "short_name": "Speeding-related fatalities", + "long_name": "Fatalities in speeding-related crashes", + "category": "injury", + "short_description": "Annual fatalities in crashes where speeding was a contributing factor (2010+).", + "long_description": "Total fatalities in crashes flagged as speeding-related (SPEEDREL in 1-4 in FARS accident file), indicating the driver exceeded the speed limit, was racing, or drove too fast for conditions. The SPEEDREL variable was added to FARS in 2010; data are available from 2010 onward only. Values are 0 for years prior to 2010. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in speeding-related crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_single_vehicle": { + "id": "nhtsa_single_vehicle", + "short_name": "Single-vehicle crash fatalities", + "long_name": "Fatalities in single-vehicle crashes", + "category": "injury", + "short_description": "Annual fatalities in crashes involving only one vehicle.", + "long_description": "Total fatalities in crashes involving exactly one motor vehicle (VE_TOTAL = 1 in FARS accident file). Single-vehicle crashes include run-off-road events, rollovers, and collisions with fixed objects or pedestrians. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in single-vehicle crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_rural": { + "id": "nhtsa_rural", + "short_name": "Rural crash fatalities", + "long_name": "Fatalities in rural-area crashes", + "category": "injury", + "short_description": "Annual fatalities in crashes occurring on rural roadways.", + "long_description": "Total fatalities in crashes classified as occurring in a rural land use area (RUR_URB = 1 or LAND_USE = 1 in FARS accident file, depending on year). Rural crashes tend to have higher fatality rates per crash due to higher speeds, longer emergency response times, and less infrastructure. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in rural-area crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_urban": { + "id": "nhtsa_urban", + "short_name": "Urban crash fatalities", + "long_name": "Fatalities in urban-area crashes", + "category": "injury", + "short_description": "Annual fatalities in crashes occurring on urban roadways.", + "long_description": "Total fatalities in crashes classified as occurring in an urban land use area (RUR_URB = 2 or LAND_USE = 2 in FARS accident file, depending on year). Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in urban-area crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_pedestrian_involved": { + "id": "nhtsa_pedestrian_involved", + "short_name": "Pedestrian-involved crash fatalities", + "long_name": "Fatalities in crashes involving a pedestrian", + "category": "injury", + "short_description": "Annual fatalities in crashes where at least one pedestrian was killed.", + "long_description": "Total fatalities in crashes where at least one pedestrian (FARS person type 5) was killed. This counts all persons who died in such crashes, not only the pedestrian victims. Derived by linking the FARS person file (INJ_SEV = 4, PER_TYP = 5) back to the accident file. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in pedestrian-involved crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + "nhtsa_cyclist_involved": { + "id": "nhtsa_cyclist_involved", + "short_name": "Cyclist-involved crash fatalities", + "long_name": "Fatalities in crashes involving a pedalcyclist", + "category": "injury", + "short_description": "Annual fatalities in crashes where at least one cyclist was killed.", + "long_description": "Total fatalities in crashes where at least one pedalcyclist (FARS person types 6-7) was killed. This counts all persons who died in such crashes. Derived by linking the FARS person file (INJ_SEV = 4, PER_TYP in 6-7) back to the accident file. Available in data_crash_type.csv.gz, stratified by age and sex.", + "statement": "In {location}, {value} people were killed in cyclist-involved crashes in {time}.", + "measure_type": "Count", + "unit": "Deaths", + "time_resolution": "Year", + "sources": [{ "id": "nhtsa_fars" }], + "citations": [] + }, + + "_sources": { + "nhtsa_fars": { + "name": "Fatality Analysis Reporting System (FARS)", + "url": "https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/", + "organization": "National Highway Traffic Safety Administration (NHTSA)", + "organization_url": "https://www.nhtsa.gov/", + "location": "NHTSA File Downloads — FARS National CSV archives", + "location_url": "https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/FARS/", + "description": "FARS is a nationwide census providing NHTSA, Congress, and the American public with yearly data on fatal injuries suffered in motor vehicle traffic crashes. FARS contains data on all crashes in the United States involving a fatality in a motor vehicle traffic crash on a public road. A fatal crash is one in which a motor vehicle is involved and at least one person dies within 30 days of the crash. Data are collected from police crash reports, state vehicle registration files, state driver licensing files, state highway department data, vital statistics, and other sources. FARS has been operational since 1975 and covers all 50 states, the District of Columbia, and Puerto Rico (national files cover the 50 states and DC).", + "restrictions": "Public domain. NHTSA data is generally not subject to copyright restrictions.", + "date_accessed": 2025 + } + } +} diff --git a/data/nhtsa_crash/process.json b/data/nhtsa_crash/process.json new file mode 100644 index 00000000..c7394935 --- /dev/null +++ b/data/nhtsa_crash/process.json @@ -0,0 +1,37 @@ +{ + "name": "nhtsa_crash", + "type": "source", + "scripts": [ + { + "path": "ingest.R", + "manual": false, + "frequency": 0, + "last_run": "", + "run_time": "", + "last_status": { + "log": "", + "success": true + } + } + ], + "checked": "", + "check_results": [], + "standalone": false, + "raw_state": { + "2020": { + "hash": "58d91fb16a87ebea5f56778d50c7ddcf" + }, + "2021": { + "hash": "70f8683e6b96f2c1d74d6396b7068f7a" + }, + "2022": { + "hash": "c356f65e0c73f39849992fab8354271f" + }, + "2023": { + "hash": "1b4e902a6f51919f3860b7442d42889b" + }, + "2024": { + "hash": "deae782d5d5f8eaacfb750b62fde55b3" + } + } +} diff --git a/data/nhtsa_crash/standard/data.csv.gz b/data/nhtsa_crash/standard/data.csv.gz new file mode 100644 index 00000000..61a4d04e Binary files /dev/null and b/data/nhtsa_crash/standard/data.csv.gz differ diff --git a/data/nhtsa_crash/standard/data_age_sex.csv.gz b/data/nhtsa_crash/standard/data_age_sex.csv.gz new file mode 100644 index 00000000..96fed8b1 Binary files /dev/null and b/data/nhtsa_crash/standard/data_age_sex.csv.gz differ diff --git a/data/nhtsa_crash/standard/data_crash_type.csv.gz b/data/nhtsa_crash/standard/data_crash_type.csv.gz new file mode 100644 index 00000000..d235ddc7 Binary files /dev/null and b/data/nhtsa_crash/standard/data_crash_type.csv.gz differ diff --git a/data/nhtsa_crash/standard/data_person_type.csv.gz b/data/nhtsa_crash/standard/data_person_type.csv.gz new file mode 100644 index 00000000..433752c5 Binary files /dev/null and b/data/nhtsa_crash/standard/data_person_type.csv.gz differ diff --git a/data/nhtsa_crash/standard/datapackage.json b/data/nhtsa_crash/standard/datapackage.json new file mode 100644 index 00000000..982ba7c1 --- /dev/null +++ b/data/nhtsa_crash/standard/datapackage.json @@ -0,0 +1,6 @@ +{ + "name": "nhtsa_crash", + "title": "Nhtsa Crash", + "licence": [], + "resources": [] +} diff --git a/data/nhtsa_crash/standard/nhtsa_fatality_rate_choropleth.png b/data/nhtsa_crash/standard/nhtsa_fatality_rate_choropleth.png new file mode 100644 index 00000000..0c30a21f Binary files /dev/null and b/data/nhtsa_crash/standard/nhtsa_fatality_rate_choropleth.png differ