UI-Research · wcurrangroome · May 8, 2026 · Mar 5, 2026 · May 8, 2026
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,4 @@ docs
 inst/doc
 /data
 /temporary-scripts
+/cache
diff --git a/CLAUDE.md b/CLAUDE.md
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -31,6 +31,7 @@ Imports:
     ggplot2,
     ipumsr,
     janitor,
+    jsonlite,
     lehdr,
     lubridate,
     magrittr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(cache_it)
 export(convert_table_text_to_dataframe)
+export(download_openfema_datasets)
 export(estimate_units_per_parcel)
 export(estimate_zoning_envelope)
 export(get_box_path)
@@ -28,6 +29,7 @@ export(get_system_username)
 export(get_wildfire_burn_zones)
 export(inflation_adjust)
 export(interpolate_demographics)
+export(list_openfema_endpoints)
 export(polygons_to_linestring)
 export(qualtrics_define_missing)
 export(qualtrics_format_metadata)

diff --git a/R/CLAUDE.md b/R/CLAUDE.md
diff --git a/R/download_openfema_datasets.R b/R/download_openfema_datasets.R
@@ -0,0 +1,183 @@
+#' @title Enumerate the datasets published through OpenFEMA
+#'
+#' @description Fetches the OpenFEMA metadata listing and flattens it into one
+#'   row per dataset, recording each dataset's name, version, record count, and
+#'   the download links advertised for the parquet and CSV formats.
+#'
+#' @returns A tibble with one row per dataset and the columns:
+#'   \describe{
+#'     \item{name}{Endpoint name used by the API (e.g., "DisasterDeclarationsSummaries").}
+#'     \item{title}{Descriptive title of the dataset.}
+#'     \item{version}{Version number reported by the API.}
+#'     \item{record_count}{Count of records reported for the dataset.}
+#'     \item{formats}{All advertised download formats, lowercased and comma-separated.}
+#'     \item{url_parquet}{Link to the parquet file (NA when not offered).}
+#'     \item{url_csv}{Link to the CSV file (NA when not offered).}
+#'   }
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' endpoints <- list_openfema_endpoints()
+#' endpoints |> dplyr::filter(stringr::str_detect(name, "Nfip"))
+#' }
+list_openfema_endpoints <- function() {
+  # Read the metadata listing without simplifying records into data frames
+  payload <- jsonlite::fromJSON(
+    "https://www.fema.gov/api/open/v1/OpenFemaDataSets", simplifyDataFrame = FALSE)
+
+  datasets <- payload[["OpenFemaDataSets"]]
+  if (is.null(datasets)) {
+    stop("Unexpected API response structure from OpenFEMA metadata endpoint.")
+  }
+
+  # Flatten a single metadata record into a one-row tibble
+  flatten_record <- function(rec) {
+    distributions <- rec[["distribution"]] %||% list()
+
+    # One (format, url) pair per distribution entry; formats are lowercased
+    links <- purrr::map(distributions, function(entry) {
+      list(format = tolower(entry[["format"]] %||% ""), url = entry[["accessURL"]] %||% NA_character_)
+    })
+
+    # URL of the first entry advertising `fmt`, or NA when none does
+    url_for <- function(fmt) {
+      hits <- purrr::keep(links, function(link) link$format == fmt)
+      if (length(hits) > 0) hits[[1]]$url else NA_character_
+    }
+
+    tibble::tibble(
+      name = rec[["name"]] %||% NA_character_,
+      title = rec[["title"]] %||% NA_character_,
+      version = rec[["version"]] %||% NA_integer_,
+      record_count = rec[["recordCount"]] %||% NA_integer_,
+      formats = paste(purrr::map_chr(links, "format"), collapse = ", "),
+      url_parquet = url_for("parquet"),
+      url_csv = url_for("csv"))
+  }
+
+  dplyr::bind_rows(purrr::map(datasets, flatten_record))
+}
+
+
+#' @title Download full OpenFEMA datasets
+#'
+#' @description Downloads full data files for OpenFEMA API endpoints. Prefers
+#'   parquet format, falling back to CSV. Uses the OpenFEMA metadata API to
+#'   dynamically resolve download URLs.
+#'
+#' @param endpoints A character vector of dataset endpoint names (e.g.,
+#'   `"DisasterDeclarationsSummaries"`). Use [list_openfema_endpoints()] to see
+#'   available names. Default `NULL` downloads all endpoints.
+#' @param download_directory Directory path where files will be saved. Created if it does
+#'   not exist. Defaults to `"."`.
+#' @param format_preference Character vector specifying format preference order.
+#'   The first available format is used. Only `"parquet"` and `"csv"` can be
+#'   downloaded; any other value is ignored. Defaults to `c("parquet", "csv")`.
+#' @param overwrite Logical. If `FALSE` (default), skips files that already
+#'   exist in `download_directory`.
+#'
+#' @returns A tibble summarizing the results with columns: `name`, `format`,
+#'   `file_path`, `status` (one of "downloaded", "skipped", "failed", "no_format").
+#'
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' # Download a single small dataset
+#' download_openfema_datasets(
+#'   endpoints = "DisasterDeclarationsSummaries",
+#'   download_directory = "data/openfema")
+#'
+#' # Download all datasets
+#' download_openfema_datasets(download_directory = "data/openfema")
+#'
+#' # See what's available first
+#' list_openfema_endpoints()
+#' }
+download_openfema_datasets <- function(
+    endpoints = NULL,
+    download_directory = ".",
+    format_preference = c("parquet", "csv"),
+    overwrite = FALSE) {
+
+  metadata <- list_openfema_endpoints()
+
+  # Validate requested endpoints
+  if (!is.null(endpoints)) {
+    unknown <- setdiff(endpoints, metadata$name)
+    if (length(unknown) > 0) {
+      stop(
+        "Unknown endpoint(s): ", paste(unknown, collapse = ", "),
+        "\nUse list_openfema_endpoints() to see available names."
+      )
+    }
+    metadata <- metadata |> dplyr::filter(.data$name %in% endpoints)
+  }
+
+  if (!dir.exists(download_directory)) {
+    dir.create(download_directory, recursive = TRUE)
+    message("Created directory: ", download_directory)
+  }
+
+  # Large files need a long timeout; never lower a higher user setting, restore on exit
+  old_timeout <- getOption("timeout")
+  on.exit(options(timeout = old_timeout), add = TRUE)
+  options(timeout = max(1200, as.numeric(old_timeout)))
+
+  # Process each dataset
+  results <- purrr::pmap(
+    list(metadata$name, metadata$url_parquet, metadata$url_csv, metadata$formats),
+    function(name, url_parquet, url_csv, formats) {
+      # Metadata only carries parquet and CSV URLs; `[` (unlike `[[`) yields NA
+      # for any other requested format, so unsupported entries are passed over
+      url_lookup <- c(parquet = url_parquet, csv = url_csv)
+      candidates <- url_lookup[tolower(format_preference)]
+      usable <- which(!is.na(candidates))
+      if (length(usable) == 0) {
+        message("[", name, "] No preferred format available (has: ", formats, "). Skipping.")
+        return(tibble::tibble(
+          name = name, format = NA_character_,
+          file_path = NA_character_, status = "no_format"
+        ))
+      }
+      chosen_format <- names(candidates)[[usable[[1]]]]
+      chosen_url <- candidates[[usable[[1]]]]
+
+      date_stamp <- format(Sys.Date(), "%Y_%m_%d")
+      safe_name <- gsub("-", "_", name)
+      file_name <- paste0(safe_name, "_", date_stamp, ".", chosen_format)
+      file_path <- file.path(download_directory, file_name)
+
+      if (file.exists(file_path) && !overwrite) {
+        message("[", name, "] File already exists, skipping. Use overwrite = TRUE to re-download.")
+        return(tibble::tibble(
+          name = name, format = chosen_format,
+          file_path = file_path, status = "skipped"
+        ))
+      }
+
+      # Stage the transfer in a ".part" file so a failed or interrupted download
+      # never leaves a truncated file that a later run would skip as complete
+      partial_path <- paste0(file_path, ".part")
+      on.exit(unlink(partial_path), add = TRUE)
+      message("[", name, "] Downloading ", chosen_format, " from: ", chosen_url)
+      dl_result <- tryCatch({
+        exit_code <- utils::download.file(chosen_url, destfile = partial_path, mode = "wb", quiet = FALSE)
+        if (exit_code != 0) stop("download.file() returned status ", exit_code)
+        if (!file.rename(partial_path, file_path)) stop("could not move the downloaded file into place")
+        "downloaded"
+      }, error = function(e) {
+        warning("[", name, "] Download failed: ", conditionMessage(e), call. = FALSE)
+        "failed"
+      })
+
+      tibble::tibble(
+        name = name, format = chosen_format,
+        file_path = file_path, status = dl_result)
+    }
+  )
+
+  dplyr::bind_rows(results)
+}
diff --git a/R/get_sba_loans.R b/R/get_sba_loans.R
@@ -1,5 +1,3 @@
-# Author: Will Curran-Groome
-
 #' @importFrom magrittr %>%
 
 #' @title Access SBA data on disaster loans
@@ -49,7 +47,7 @@ get_sba_loans = function() {
       sba_eidl_declaration_number = dplyr::if_else(is.na(sba_eidl_declaration_number), sba_eidl_declaration, sba_eidl_declaration_number),
       damaged_property_zip_code = dplyr::if_else(is.na(damaged_property_zip_code), damaged_property_zip, damaged_property_zip_code),
       damaged_property_city_name = dplyr::if_else(is.na(damaged_property_city_name), damaged_property_city, damaged_property_city_name),
-      damaged_property_state_code = dplyr::if_else(is.na(damaged_property_state_code), damaged_property_state, damaged_property_state_code),
+      damaged_property_state_abbreviation = dplyr::if_else(is.na(damaged_property_state_code), damaged_property_state, damaged_property_state_code),
       total_approved_loan_amount = dplyr::if_else(is.na(total_approved_loan_amount), total_approved, total_approved_loan_amount),
       approved_amount_real_estate = dplyr::if_else(is.na(approved_amount_real_estate), approved_amount_real, approved_amount_real_estate),
       total_verified_loss = dplyr::if_else(is.na(total_verified_loss), total_verified, total_verified_loss)) %>%
@@ -77,7 +75,7 @@ get_sba_loans = function() {
       sba_eidl_declaration_number = dplyr::if_else(is.na(sba_eidl_declaration_number), sba_eidl_declaration, sba_eidl_declaration_number),
       damaged_property_zip_code = dplyr::if_else(is.na(damaged_property_zip_code), damaged_property_zip, damaged_property_zip_code),
       damaged_property_city_name = dplyr::if_else(is.na(damaged_property_city_name), damaged_property_city, damaged_property_city_name),
-      damaged_property_state_code = dplyr::if_else(is.na(damaged_property_state_code), damaged_property_state, damaged_property_state_code),
+      damaged_property_state_abbreviation = dplyr::if_else(is.na(damaged_property_state_code), damaged_property_state, damaged_property_state_code),
       total_approved_loan_amount = dplyr::if_else(is.na(total_approved_loan_amount), total_approved, total_approved_loan_amount),
       approved_amount_real_estate = dplyr::if_else(is.na(approved_amount_real_estate), approved_amount_real, approved_amount_real_estate),
       total_verified_loss = dplyr::if_else(is.na(total_verified_loss), total_verified, total_verified_loss)) %>%
@@ -87,18 +85,40 @@ get_sba_loans = function() {
       approved_amount_real)) %>%
     dplyr::rename(
       disaster_number_fema = fema_disaster_number,
+      disaster_number_sba = sba_disaster_number,
       disaster_number_sba_physical = sba_physical_declaration_number,
       disaster_number_sba_eidl = sba_eidl_declaration_number,
       verified_loss_total = total_verified_loss,
       approved_amount_total = total_approved_loan_amount)
 
   result = dplyr::bind_rows(
-    business_loans %>% dplyr::mutate(loan_type = "business"),
-    home_loans %>% dplyr::mutate(loan_type = "residential")) %>%
-    ## these are weird, meaningless records that are either embedded in the raw data
-    ## or that are accidentally created as rows when data are read-in from file
-    dplyr::filter(!stringr::str_detect(disaster_number_sba_physical, "Business Data Only|United States Small Business"))
-
+      business_loans %>% dplyr::mutate(loan_type = "business"),
+      home_loans %>% dplyr::mutate(loan_type = "residential")) %>%
+    suppressWarnings({dplyr::mutate(
+      ## state codes should be characters, not numbers (e.g., "AL")
+      damaged_property_state_code = dplyr::if_else(!is.na(as.numeric(damaged_property_state_code)), NA, damaged_property_state_code),
+      damaged_property_zip_code = dplyr::case_when(
+        ## sometimes these are represented as three digits in the raw data, but in PR, all zip codes are prefixed with "00", so padding is safe/correct
+        damaged_property_state_code == "PR" ~ stringr::str_pad(damaged_property_zip_code, width = 5, side = "left", pad = "0"),
+        ## in the raw data, these columns are transposed in some cases; as.numeric(city) should be NA if city is accurate, so this is safe  
+        is.na(as.numeric(damaged_property_zip_code)) ~ as.numeric(damaged_property_city_name) %>% stringr::str_pad(width = 5, side = "left", pad = "0"),
+        TRUE ~ damaged_property_zip_code),
+      damaged_property_state_code = dplyr::case_when(
+        ## we infer state codes from disaster codes, where state codes are prefixed on the disaster number
+        is.na(damaged_property_state_code) & stringr::str_detect(sba_disaster_number, "^[A-Z]{2}-") ~ stringr::str_sub(sba_disaster_number, 1, 2),
+        is.na(damaged_property_state_code) & stringr::str_detect(disaster_number_fema, "^[A-Z]{2}-") ~ stringr::str_sub(disaster_number_fema, 1, 2),
+        ## another transposition issue; we only use the city name when it is two, uppercase characters
+        is.na(damaged_property_state_code) & stringr::str_detect(damaged_property_city_name, "^[A-Z]{2}$") ~ damaged_property_city_name,
+        TRUE ~ damaged_property_state_code),
+      damaged_property_city_name = dplyr::case_when(
+        ## yet another raw data transposition issue
+        stringr::str_detect(damaged_property_city_name, "^[A-Z]{2}$") & !stringr::str_detect(sba_disaster_number, "-") ~ sba_disaster_number,
+        TRUE ~ damaged_property_city_name))}) %>%
+    dplyr::filter(
+      ## these are weird, meaningless records that are either embedded in the raw data
+      ## or that are accidentally created as rows when data are read-in from file
+      !stringr::str_detect(disaster_number_sba_physical, "Business Data Only|United States Small Business|Home Data Only"))
+
   return(result)
 }
 

diff --git a/man/download_openfema_datasets.Rd b/man/download_openfema_datasets.Rd
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,4 @@ docs @@
     inst/doc
     /data
     /temporary-scripts
+    /cache