diff --git a/DESCRIPTION b/DESCRIPTION index c688f60..32f88b2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -20,6 +20,7 @@ Imports: rvest, xml2 Suggests: + connections, dm, re2, ggplot2, diff --git a/R/connect.R b/R/connect.R index 1905fe1..374b61f 100644 --- a/R/connect.R +++ b/R/connect.R @@ -7,18 +7,33 @@ #' `.Renviron`. #' @param read_only Open in read-only mode. Default `TRUE` for analysis; #' use `FALSE` only when rebuilding via [setup_baseball_db()]. +#' @param connections_pane If `TRUE`, registers the connection with the RStudio +#' Connections pane via the `connections` package (must be installed). The +#' returned object is still a standard `DBIConnection` and works identically +#' with all DBI and `db_query()` calls; the only difference is that RStudio +#' will display it in the Connections pane with a browse/disconnect button. +#' Silently falls back to a plain `DBI::dbConnect()` when `connections` is +#' not installed. #' -#' @return A `DBIConnection` object. Close with +#' @return A `DBIConnection` object (or a `connections`-wrapped equivalent when +#' `connections_pane = TRUE`). Close with #' `DBI::dbDisconnect(con, shutdown = TRUE)`. #' @export #' #' @examples #' \dontrun{ +#' # Plain connection #' con <- connect_baseball_db() #' DBI::dbListTables(con) #' DBI::dbDisconnect(con, shutdown = TRUE) +#' +#' # Visible in RStudio Connections pane +#' con <- connect_baseball_db(connections_pane = TRUE) +#' DBI::dbListTables(con) +#' DBI::dbDisconnect(con, shutdown = TRUE) #' } -connect_baseball_db <- function(dbdir = NULL, read_only = TRUE) { +connect_baseball_db <- function(dbdir = NULL, read_only = TRUE, + connections_pane = FALSE) { if (is.null(dbdir)) { dbdir <- Sys.getenv( "LAHMANS_DBDIR", @@ -28,5 +43,16 @@ connect_baseball_db <- function(dbdir = NULL, read_only = TRUE) { if (!file.exists(dbdir)) { stop(dbdir, " not found. Run setup_baseball_db() first.") } - DBI::dbConnect(duckdb::duckdb(), dbdir = dbdir, read_only = read_only) + + if (connections_pane && requireNamespace("connections", quietly = TRUE)) { + connections::connection_open(duckdb::duckdb(), + dbdir = dbdir, + read_only = read_only) + } else { + if (connections_pane) { + message("Package 'connections' not installed; ", + "using plain DBI::dbConnect() instead.") + } + DBI::dbConnect(duckdb::duckdb(), dbdir = dbdir, read_only = read_only) + } } diff --git a/R/setup_db.R b/R/setup_db.R index 2c2afe4..de8bf26 100644 --- a/R/setup_db.R +++ b/R/setup_db.R @@ -1,17 +1,25 @@ #' Build the baseball DuckDB database #' -#' Writes all Lahman package tables plus scraped USA Today salary data into a -#' persistent DuckDB file, then creates a `SalariesAll` view that unions both -#' salary sources on a common schema. +#' Writes all Lahman package tables plus scraped salary data into a persistent +#' DuckDB file, then creates a `SalariesAll` view that unions all salary +#' sources on a common schema: +#' - **Lahman** (`Salaries`): authoritative 1985–2016 +#' - **Spotrac** (`SalariesSpotrac`): player-level actuals 2017–2021 +#' - **USA Today** (`SalariesUSAToday`): player-level actuals 2022–2025 #' #' @param dbdir Path for the output `baseball.duckdb` file. Defaults to the #' value of the `LAHMANS_DBDIR` environment variable if set, otherwise #' `~/Documents/Data/baseball/baseball.duckdb`. #' @param sal_file Path to the combined USA Today salary CSV produced by #' [scrape_salaries()]. When `NULL` (default), looks for -#' `salaries_*_with_playerID.csv` in the same directory as `dbdir`. USA -#' Today data is not bundled with the package — users must run +#' `salaries_*_with_playerID.csv` (non-Spotrac) in the same directory as +#' `dbdir`. USA Today data is not bundled — users must run #' [scrape_salaries()] to obtain it. +#' @param spotrac_file Path to the combined Spotrac salary CSV produced by +#' `data-raw/salaries.R`. When `NULL` (default), looks for +#' `salaries_spotrac_*_with_playerID.csv` in the same directory as `dbdir`. +#' Spotrac data is not bundled — users must run `data-raw/salaries.R` to +#' obtain it. #' @param overwrite If `TRUE`, drop and recreate existing tables. Default #' `FALSE` aborts if the file already exists. #' @@ -22,9 +30,10 @@ #' \dontrun{ #' setup_baseball_db() #' } -setup_baseball_db <- function(dbdir = NULL, - sal_file = NULL, - overwrite = FALSE) { +setup_baseball_db <- function(dbdir = NULL, + sal_file = NULL, + spotrac_file = NULL, + overwrite = FALSE) { if (is.null(dbdir)) { dbdir <- Sys.getenv( "LAHMANS_DBDIR", @@ -51,35 +60,92 @@ setup_baseball_db <- function(dbdir = NULL, message(sprintf(" %-25s %d rows", nm, nrow(dt))) })) - # -- USA Today scraped salaries ----------------------------------------------- + # -- Spotrac scraped salaries (2017-2021) ------------------------------------- + if (is.null(spotrac_file)) { + data_dir <- dirname(dbdir) + candidates <- list.files(data_dir, + pattern = "salaries_spotrac_.*with_playerID\\.csv$", + full.names = TRUE) + spotrac_file <- if (length(candidates)) candidates[[1L]] else "" + } + has_spotrac <- nzchar(spotrac_file) && file.exists(spotrac_file) + if (!has_spotrac) { + message(" Spotrac salary file not found -- SalariesSpotrac will not be loaded.\n", + " Run data-raw/salaries.R then rerun setup_baseball_db(spotrac_file = ).") + } else { + sp <- data.table::fread(spotrac_file) + DBI::dbWriteTable(con, "SalariesSpotrac", sp, overwrite = TRUE) + message(sprintf(" %-25s %d rows", "SalariesSpotrac", nrow(sp))) + } + + # -- USA Today scraped salaries (2022+) --------------------------------------- if (is.null(sal_file)) { data_dir <- dirname(dbdir) candidates <- list.files(data_dir, pattern = "salaries.*with_playerID\\.csv$", full.names = TRUE) + candidates <- candidates[!grepl("spotrac", candidates, fixed = TRUE)] sal_file <- if (length(candidates)) candidates[[1L]] else "" } - if (!nzchar(sal_file) || !file.exists(sal_file)) { - warning("Scraped salary file not found -- SalariesUSAToday will not be loaded.\n", + has_usatoday <- nzchar(sal_file) && file.exists(sal_file) + if (!has_usatoday) { + warning("USA Today salary file not found -- SalariesUSAToday will not be loaded.\n", "Run scrape_salaries() then rerun setup_baseball_db(sal_file = ).") } else { sal <- data.table::fread(sal_file) sal[, average_annual := as.numeric(gsub("[$,]", "", average_annual))] DBI::dbWriteTable(con, "SalariesUSAToday", sal, overwrite = TRUE) message(sprintf(" %-25s %d rows", "SalariesUSAToday", nrow(sal))) + } - # -- SalariesAll view: Lahman (<=2016) + USA Today expanded with imputed years -- - DBI::dbExecute(con, " - CREATE OR REPLACE VIEW SalariesAll AS - WITH + # -- SalariesAll view: union all three salary sources ------------------------- + # Only build the view if at least one supplemental source loaded + if (has_spotrac || has_usatoday) { + # Normalise USA Today full-name team strings to Lahman 3-letter teamIDs. + usa_team_case <- " + CASE team + WHEN 'Angels' THEN 'LAA' WHEN 'L.A. Angels' THEN 'LAA' + WHEN 'Arizona' THEN 'ARI' WHEN 'Diamondbacks' THEN 'ARI' + WHEN 'Astros' THEN 'HOU' WHEN 'Houston' THEN 'HOU' + WHEN 'Athletics' THEN 'OAK' WHEN 'Oakland' THEN 'OAK' + WHEN 'Atlanta' THEN 'ATL' WHEN 'Braves' THEN 'ATL' + WHEN 'Baltimore' THEN 'BAL' WHEN 'Orioles' THEN 'BAL' + WHEN 'Blue Jays' THEN 'TOR' WHEN 'Toronto' THEN 'TOR' + WHEN 'Boston' THEN 'BOS' WHEN 'Red Sox' THEN 'BOS' + WHEN 'Brewers' THEN 'MIL' WHEN 'Milwaukee' THEN 'MIL' + WHEN 'Cardinals' THEN 'SLN' WHEN 'St. Louis' THEN 'SLN' + WHEN 'Chi. Cubs' THEN 'CHN' WHEN 'Chicago Cubs' THEN 'CHN' + WHEN 'Cubs' THEN 'CHN' + WHEN 'Chic. White Sox' THEN 'CHA' WHEN 'Chicago White Sox' THEN 'CHA' + WHEN 'White Sox' THEN 'CHA' + WHEN 'Cincinnati' THEN 'CIN' WHEN 'Reds' THEN 'CIN' + WHEN 'Cleveland' THEN 'CLE' WHEN 'Guardians' THEN 'CLE' + WHEN 'Colorado' THEN 'COL' WHEN 'Rockies' THEN 'COL' + WHEN 'Detroit' THEN 'DET' WHEN 'Tigers' THEN 'DET' + WHEN 'Dodgers' THEN 'LAN' WHEN 'L.A. Dodgers' THEN 'LAN' + WHEN 'Giants' THEN 'SFN' WHEN 'San Francisco' THEN 'SFN' + WHEN 'Kansas City' THEN 'KCA' WHEN 'Royals' THEN 'KCA' + WHEN 'Mariners' THEN 'SEA' WHEN 'Seattle' THEN 'SEA' + WHEN 'Marlins' THEN 'MIA' WHEN 'Miami' THEN 'MIA' + WHEN 'Mets' THEN 'NYN' WHEN 'N.Y. Mets' THEN 'NYN' + WHEN 'Minnesota' THEN 'MIN' WHEN 'Twins' THEN 'MIN' + WHEN 'Nationals' THEN 'WAS' WHEN 'Washington' THEN 'WAS' + WHEN 'N.Y. Yankees' THEN 'NYA' WHEN 'Yankees' THEN 'NYA' + WHEN 'Philadelphia' THEN 'PHI' WHEN 'Phillies' THEN 'PHI' + WHEN 'Pittsburgh' THEN 'PIT' WHEN 'Pirates' THEN 'PIT' + WHEN 'Rangers' THEN 'TEX' WHEN 'Texas' THEN 'TEX' + WHEN 'Rays' THEN 'TBA' WHEN 'Tampa Bay' THEN 'TBA' + WHEN 'San Diego' THEN 'SDN' WHEN 'Padres' THEN 'SDN' + ELSE team + END" + usatoday_cte <- if (has_usatoday) paste0(" -- Parse each USA Today row: clean AAV and extract contract start/end year. -- Handles patterns: 'N (YYYY-YY)', 'NN (YYYY-YY)', 'N(YYYY-YY)'. -- Rows with NULL years (1-year deals) get NULL c_start/c_end and pass -- through as actual records via the FULL JOIN below. usa_parsed AS ( SELECT - playerID, - team AS teamID, + playerID,", usa_team_case, " AS teamID, salary::DOUBLE AS salary, yearID, average_annual::DOUBLE AS aav, @@ -96,17 +162,12 @@ setup_baseball_db <- function(dbdir = NULL, FROM SalariesUSAToday WHERE playerID IS NOT NULL ), - - -- One actual record per player-season (handles any duplicate scrapes). actual AS ( SELECT DISTINCT ON (playerID, yearID) playerID, teamID, salary, yearID FROM usa_parsed ORDER BY playerID, yearID ), - - -- One contract per player + contract span; most-recent scrape's AAV wins - -- when the same contract appears in multiple scraped years. contracts AS ( SELECT DISTINCT ON (playerID, c_start, c_end) playerID, teamID, aav, c_start, c_end @@ -117,20 +178,11 @@ setup_baseball_db <- function(dbdir = NULL, AND aav IS NOT NULL ORDER BY playerID, c_start, c_end, yearID DESC ), - - -- Expand each contract to one row per year in the contract range. contract_years AS ( SELECT c.playerID, c.teamID, c.aav, gs::INTEGER AS yearID FROM contracts c, LATERAL (SELECT unnest(generate_series(c.c_start, c.c_end)) AS gs) t ), - - -- Merge contract-year grid with actual records. - -- * Matching rows -> use actual salary (is_actual = TRUE) - -- * Contract only -> impute with AAV (is_actual = FALSE) - -- * Actual only -> 1-year deals / NULL years pass through as-is - -- DISTINCT ON deduplicates the rare case where overlapping contracts - -- produce two rows for the same player-season; actual wins. usa_expanded AS ( SELECT DISTINCT ON (playerID, yearID) COALESCE(a.playerID, cy.playerID)::VARCHAR AS playerID, @@ -142,24 +194,71 @@ setup_baseball_db <- function(dbdir = NULL, FULL JOIN actual a ON cy.playerID = a.playerID AND cy.yearID = a.yearID ORDER BY playerID, yearID, is_actual DESC - ) - - -- Lahman (authoritative through 2016) - SELECT playerID, yearID, teamID, lgID, - salary::DOUBLE AS salary, - 'lahman' AS source, - TRUE AS is_actual - FROM Salaries + )") else "" + usatoday_union <- if (has_usatoday) " UNION ALL - - -- USA Today: actual salaries (2022-2025) + AAV-imputed contract years + -- USA Today: actual salaries (2022+) + AAV-imputed contract years SELECT playerID, yearID, teamID, NULL AS lgID, salary, 'usatoday' AS source, is_actual - FROM usa_expanded - ") + FROM usa_expanded" else "" + + # Normalise Spotrac abbreviations that differ from Lahman teamIDs. + spotrac_team_case <- " + CASE team + WHEN 'CHC' THEN 'CHN' WHEN 'CHW' THEN 'CHA' + WHEN 'KC' THEN 'KCA' WHEN 'LAD' THEN 'LAN' + WHEN 'NYM' THEN 'NYN' WHEN 'NYY' THEN 'NYA' + WHEN 'SD' THEN 'SDN' WHEN 'SF' THEN 'SFN' + WHEN 'STL' THEN 'SLN' WHEN 'TB' THEN 'TBA' + WHEN 'WSH' THEN 'WAS' + ELSE team + END" + + spotrac_union <- if (has_spotrac) paste0(" + UNION ALL + -- Spotrac: actual player salaries 2017-2021, filtered to MLB-contract players + -- (salary >= MLB minimum for that year) for consistency with Lahman. + -- Minor leaguers on 40-man rosters are excluded. + -- MLB minimums: 2017=$535K, 2018=$545K, 2019=$555K, 2020=$208K(prorated 60g), 2021=$570.5K + SELECT playerID::VARCHAR, + yearID::INTEGER,", spotrac_team_case, "::VARCHAR AS teamID, + NULL::VARCHAR AS lgID, + salary::DOUBLE, + 'spotrac' AS source, + TRUE AS is_actual + FROM SalariesSpotrac + WHERE playerID IS NOT NULL + AND salary >= CASE yearID + WHEN 2017 THEN 535000 + WHEN 2018 THEN 545000 + WHEN 2019 THEN 555000 + WHEN 2020 THEN 208000 + WHEN 2021 THEN 570500 + ELSE 500000 + END") else "" + + # Comma after last CTE only if usa CTEs are present + cte_comma <- if (has_usatoday) "," else "" + + DBI::dbExecute(con, paste0(" + CREATE OR REPLACE VIEW SalariesAll AS + WITH + -- placeholder CTE so the WITH clause is always valid even with no usa CTEs + _dummy AS (SELECT 1 AS x)", cte_comma, + usatoday_cte, " + + -- Lahman (authoritative through 2016) + SELECT playerID, yearID, teamID, lgID, + salary::DOUBLE AS salary, + 'lahman' AS source, + TRUE AS is_actual + FROM Salaries", + spotrac_union, + usatoday_union + )) message(sprintf(" %-25s (view)", "SalariesAll")) } diff --git a/README.md b/README.md index 23a8e72..f35b76f 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,26 @@ The design choice matters at scale: DuckDB executes columnar SQL directly on the file, so aggregations over 150 years of play-by-play data that would choke an in-memory data frame run in milliseconds. +## Data model + +20 Lahman tables loaded into DuckDB, colour-coded by functional group. +Arrows show primary-key → foreign-key relationships. + +![lahmanTools schema](man/figures/lahmanTools_schema.svg) + +| Colour | Group | Tables | +|--------|-------|--------| +| 🔵 Blue | Identity spine | `People`, `Teams`, `TeamsFranchises` | +| 🟢 Green | Regular-season | `Batting`, `Pitching`, `Fielding`, `Appearances` | +| 🟣 Purple | Postseason | `BattingPost`, `PitchingPost`, `FieldingPost` | +| 🩷 Pink | Salary | `Salaries` | +| 🟠 Orange | Honours | `AwardsPlayers`, `AllstarFull`, `HallOfFame` | +| 🩵 Teal | Management | `Managers` | +| 🤎 Brown | College | `CollegePlaying`, `Schools` | +| ⚫ Grey | Lookups | `Parks`, `HomeGames`, `SeriesPost` | + +To regenerate after schema changes: `Rscript analysis/schema_dm.R` (requires `dm`, `DiagrammeR`, `DiagrammeRsvg`). + ## Requirements - R ≥ 4.1.0 @@ -126,26 +146,6 @@ db_query(con, " | `SalariesAll` | `Salaries`, `SalariesUSAToday` | Lahman (≤ 2016) + USA Today (2017+); AAV imputation for multi-year contracts | | `TeamPayroll` | `SalariesAll` | Total and per-position payroll by team-season | -## Data model - -20 Lahman tables loaded into DuckDB, colour-coded by functional group. -Arrows show primary-key → foreign-key relationships. - -![lahmanTools schema](man/figures/lahmanTools_schema.svg) - -| Colour | Group | Tables | -|--------|-------|--------| -| 🔵 Blue | Identity spine | `People`, `Teams`, `TeamsFranchises` | -| 🟢 Green | Regular-season | `Batting`, `Pitching`, `Fielding`, `Appearances` | -| 🟣 Purple | Postseason | `BattingPost`, `PitchingPost`, `FieldingPost` | -| 🩷 Pink | Salary | `Salaries` | -| 🟠 Orange | Honours | `AwardsPlayers`, `AllstarFull`, `HallOfFame` | -| 🩵 Teal | Management | `Managers` | -| 🤎 Brown | College | `CollegePlaying`, `Schools` | -| ⚫ Grey | Lookups | `Parks`, `HomeGames`, `SeriesPost` | - -To regenerate after schema changes: `Rscript analysis/schema_dm.R` (requires `dm`, `DiagrammeR`, `DiagrammeRsvg`). - ## Package structure ``` diff --git a/data-raw/salaries.R b/data-raw/salaries.R index ff3af3f..ce38789 100644 --- a/data-raw/salaries.R +++ b/data-raw/salaries.R @@ -11,82 +11,132 @@ # # The scraped data (CSV output) must NOT be committed to version control # or redistributed. It is excluded via .gitignore. +# +# NOTE: This scraper uses an AJAX POST endpoint that returns all ~1500 players +# in a single request per year. The old CSS selectors (.player-name, .rank-position) +# are obsolete as of Spotrac's 2023 redesign. # ----------------------------------------------------------------------------- library(rvest) +library(httr2) library(data.table) -library(stringr) -years <- c(2017:2022) +output_dir <- "mlb_salaries" +dir.create(output_dir, showWarnings = FALSE) -for (year in years) { - print(year) - url <- paste0("https://www.spotrac.com/mlb/rankings/", year, "/salary/") - Sys.sleep(3 + runif(1, 0, 2)) # ~3-5 second polite delay between years - html <- read_html(url) - - players <- html %>% - html_nodes(".player-name") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - positions <- html %>% - html_nodes(".rank-position") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - ages <- html %>% - html_nodes(".center:nth-child(4)") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - bats <- html %>% - html_nodes(".center:nth-child(5)") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - throws <- html %>% - html_nodes(".center:nth-child(6)") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - salaries <- html %>% - html_nodes(".right") %>% - html_text() %>% - str_replace_all("\n", "") %>% - str_trim() - - salaries_table <- - data.frame( - year = year, - player = players, - position = positions, - age = ages, - bats = bats, - throws = throws, - salary = salaries +years <- 2017:2021 # USA Today covers 2022+; Spotrac fills the gap + +# -- Parse one year's AJAX HTML ----------------------------------------------- +parse_spotrac_table <- function(html, year) { + rows <- html_elements(html, "li.list-group-item.d-flex") + if (length(rows) == 0L) return(NULL) + + result <- data.table::rbindlist(lapply(rows, function(li) { + name_node <- html_element(li, "div.link a") + if (is.na(name_node)) return(NULL) + + player <- html_text(name_node, trim = TRUE) + sm_text <- html_text(html_element(li, "small"), trim = TRUE) + # sm_text looks like "LAA, CF" — split on ", " + tp <- strsplit(sm_text, ",\\s*", perl = TRUE)[[1L]] + team <- if (length(tp) >= 1L) trimws(tp[[1L]]) else NA_character_ + pos <- if (length(tp) >= 2L) trimws(tp[[2L]]) else NA_character_ + + sal_raw <- html_text(html_element(li, "span.medium"), trim = TRUE) + salary <- suppressWarnings( + as.numeric(gsub("[$,\\s]", "", sal_raw, perl = TRUE)) + ) + + data.table::data.table( + yearID = as.integer(year), + player = player, + team = team, + position = pos, + salary = salary ) - if (length(players) == length(positions) && - length(positions) == length(ages) && - length(ages) == length(bats) && - length(bats) == length(throws) && - length(throws) == length(salaries)) { - salaries_table <- - data.frame( - year = year, - player = players, - position = positions, - age = ages, - bats = bats, - throws = throws, - salary = salaries - ) - salary_list <- - append(salary_list, list(salaries_table)) + }), fill = TRUE) + + # Drop rows where salary parsed to NA (team-total summary rows at bottom) + result[!is.na(salary) & !is.na(player)] +} + +# -- Scrape year by year ------------------------------------------------------- +for (year in years) { + out_file <- file.path(output_dir, paste0("salaries_spotrac_", year, ".csv")) + if (file.exists(out_file)) { + message("Skipping ", year, " -- already saved.") + next + } + + message("Scraping Spotrac ", year, "...") + Sys.sleep(3 + runif(1L, 0, 2)) # polite delay + + url <- paste0( + "https://www.spotrac.com/mlb/rankings/player/_/year/", year, "/sort/cash_total" + ) + + resp <- tryCatch( + request(url) |> + req_method("POST") |> + req_body_raw("ajax=table", "application/x-www-form-urlencoded") |> + req_headers(`User-Agent` = "Mozilla/5.0") |> + req_perform(), + error = function(e) { message(" ERROR: ", conditionMessage(e)); NULL } + ) + + if (is.null(resp) || resp_status(resp) != 200L) { + warning("No response for year ", year) + next + } + + html <- resp_body_html(resp) + yr_data <- parse_spotrac_table(html, year) + + if (!is.null(yr_data) && nrow(yr_data) > 0L) { + data.table::fwrite(yr_data, out_file) + message(sprintf(" Saved %d players for %s -> %s", nrow(yr_data), year, out_file)) + } else { + warning("No data parsed for year ", year) } } + +# -- Combine and join to Lahman playerID -------------------------------------- +spotrac_files <- list.files(output_dir, + pattern = "salaries_spotrac_\\d{4}\\.csv", + full.names = TRUE) + +if (length(spotrac_files) == 0L) stop("No Spotrac CSV files found in ", output_dir) + +all_sal <- data.table::rbindlist(lapply(spotrac_files, data.table::fread), fill = TRUE) + +# Reformat player column to "Last, First" to match Lahman People format +# Spotrac stores as "First Last" — reverse the order +name_parts <- strsplit(all_sal$player, "\\s+", perl = TRUE) +all_sal[, player_lahman := vapply(name_parts, function(p) { + if (length(p) < 2L) return(p[[1L]]) + paste0(p[[length(p)]], ", ", paste(p[-length(p)], collapse = " ")) +}, character(1L))] + +people <- data.table::as.data.table(Lahman::People) +people[, player_lahman := paste0(nameLast, ", ", nameFirst)] + +sal_linked <- merge( + all_sal, people[, .(playerID, player_lahman)], + by = "player_lahman", all.x = TRUE +) + +match_pct <- mean(!is.na(sal_linked$playerID)) * 100 +message(sprintf("Matched: %.1f%% of %d rows", match_pct, nrow(sal_linked))) + +yr_range <- range(sal_linked$yearID, na.rm = TRUE) +out_combined <- file.path( + output_dir, + sprintf("salaries_spotrac_%d_%d_with_playerID.csv", yr_range[[1L]], yr_range[[2L]]) +) +data.table::fwrite(sal_linked[, player_lahman := NULL], out_combined) +data.table::fwrite( + unique(sal_linked[is.na(playerID), .(player)]), + file.path(output_dir, "unmatched_spotrac.csv") +) + +message("Done. Combined file: ", out_combined) diff --git a/man/connect_baseball_db.Rd b/man/connect_baseball_db.Rd index 93e9d00..dc14299 100644 --- a/man/connect_baseball_db.Rd +++ b/man/connect_baseball_db.Rd @@ -4,7 +4,7 @@ \alias{connect_baseball_db} \title{Open a connection to the baseball DuckDB database} \usage{ -connect_baseball_db(dbdir = NULL, read_only = TRUE) +connect_baseball_db(dbdir = NULL, read_only = TRUE, connections_pane = FALSE) } \arguments{ \item{dbdir}{Path to the \code{baseball.duckdb} file. Defaults to the value of @@ -15,9 +15,18 @@ the \code{LAHMANS_DBDIR} environment variable if set, otherwise \item{read_only}{Open in read-only mode. Default \code{TRUE} for analysis; use \code{FALSE} only when rebuilding via \code{\link[=setup_baseball_db]{setup_baseball_db()}}.} + +\item{connections_pane}{If \code{TRUE}, registers the connection with the RStudio +Connections pane via the \code{connections} package (must be installed). The +returned object is still a standard \code{DBIConnection} and works identically +with all DBI and \code{db_query()} calls; the only difference is that RStudio +will display it in the Connections pane with a browse/disconnect button. +Silently falls back to a plain \code{DBI::dbConnect()} when \code{connections} is +not installed.} } \value{ -A \code{DBIConnection} object. Close with +A \code{DBIConnection} object (or a \code{connections}-wrapped equivalent when +\code{connections_pane = TRUE}). Close with \code{DBI::dbDisconnect(con, shutdown = TRUE)}. } \description{ @@ -25,8 +34,14 @@ Open a connection to the baseball DuckDB database } \examples{ \dontrun{ +# Plain connection con <- connect_baseball_db() DBI::dbListTables(con) DBI::dbDisconnect(con, shutdown = TRUE) + +# Visible in RStudio Connections pane +con <- connect_baseball_db(connections_pane = TRUE) +DBI::dbListTables(con) +DBI::dbDisconnect(con, shutdown = TRUE) } } diff --git a/tests/testthat/test-setup.R b/tests/testthat/test-setup.R index 03124a9..14362e9 100644 --- a/tests/testthat/test-setup.R +++ b/tests/testthat/test-setup.R @@ -46,3 +46,197 @@ test_that("setup_baseball_db() errors if file exists and overwrite = FALSE", { "already exists" ) }) + +# ── helpers ─────────────────────────────────────────────────────────────────── + +# Write a minimal Spotrac CSV and return its path. +make_spotrac_csv <- function(rows) { + f <- tempfile(fileext = ".csv") + data.table::fwrite(as.data.table(rows), f) + f +} + +# Write a minimal USA Today CSV and return its path. +make_usatoday_csv <- function(rows) { + f <- tempfile(fileext = ".csv") + data.table::fwrite(as.data.table(rows), f) + f +} + +# ── Spotrac team-ID normalisation ───────────────────────────────────────────── + +test_that("SalariesAll normalises Spotrac team codes to Lahman teamIDs", { + skip_on_ci() + skip_if_not_installed("Lahman") + + # Spotrac uses non-Lahman abbreviations for several teams + spotrac_rows <- list( + yearID = c(2017L, 2018L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L, 2019L), + player = c("P1", "P2", "P3", "P4", "P5", "P6", "P7", "P8", "P9", "P10", "P11"), + team = c("CHC", "CHW", "KC", "LAD", "NYM", "NYY", "SD", "SF", "STL", "TB", "WSH"), + position = rep("SP", 11), + salary = rep(600000, 11), # above 2019 minimum ($555K) + playerID = paste0("testP", seq_len(11)) + ) + + sp_file <- make_spotrac_csv(spotrac_rows) + on.exit(unlink(sp_file), add = TRUE) + + db_path <- tempfile(fileext = ".duckdb") + on.exit(unlink(db_path), add = TRUE) + + suppressWarnings( + setup_baseball_db(dbdir = db_path, spotrac_file = sp_file, overwrite = TRUE) + ) + + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path, read_only = TRUE) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + teams_in_view <- db_query(con, + "SELECT DISTINCT teamID FROM SalariesAll WHERE source = 'spotrac' ORDER BY teamID" + )$teamID + + # All remapped to Lahman codes + expect_true("CHN" %in% teams_in_view) # CHC → CHN + expect_true("CHA" %in% teams_in_view) # CHW → CHA + expect_true("KCA" %in% teams_in_view) # KC → KCA + expect_true("LAN" %in% teams_in_view) # LAD → LAN + expect_true("NYN" %in% teams_in_view) # NYM → NYN + expect_true("NYA" %in% teams_in_view) # NYY → NYA + expect_true("SDN" %in% teams_in_view) # SD → SDN + expect_true("SFN" %in% teams_in_view) # SF → SFN + expect_true("SLN" %in% teams_in_view) # STL → SLN + expect_true("TBA" %in% teams_in_view) # TB → TBA + expect_true("WAS" %in% teams_in_view) # WSH → WAS + + # Original Spotrac codes must NOT appear + expect_false(any(c("CHC","CHW","KC","LAD","NYM","NYY","SD","SF","STL","TB","WSH") + %in% teams_in_view)) +}) + +test_that("SalariesAll Spotrac branch excludes sub-minimum salary players", { + skip_on_ci() + skip_if_not_installed("Lahman") + + sp_file <- make_spotrac_csv(list( + yearID = c(2019L, 2019L, 2019L), + player = c("Below", "AtMin", "Above"), + team = c("NYY", "NYY", "NYY"), + position = c("RP", "SP", "1B"), + salary = c(200000, 555000, 1000000), + playerID = c("subP01", "minP01", "mlbP01") + )) + on.exit(unlink(sp_file), add = TRUE) + + db_path <- tempfile(fileext = ".duckdb") + on.exit(unlink(db_path), add = TRUE) + + suppressWarnings( + setup_baseball_db(dbdir = db_path, spotrac_file = sp_file, overwrite = TRUE) + ) + + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path, read_only = TRUE) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + ids <- db_query(con, + "SELECT playerID FROM SalariesAll WHERE source = 'spotrac' ORDER BY playerID" + )$playerID + + expect_false("subP01" %in% ids) # $200K — below 2019 minimum + expect_true ("minP01" %in% ids) # exactly at minimum + expect_true ("mlbP01" %in% ids) # comfortably above minimum +}) + +# ── USA Today team-name normalisation ───────────────────────────────────────── + +test_that("SalariesAll normalises USA Today team names to Lahman teamIDs", { + skip_on_ci() + skip_if_not_installed("Lahman") + + # Representative sample of USA Today name variants + usa_rows <- list( + playerID = paste0("usaP", seq_len(14)), + yearID = rep(2023L, 14), + team = c("Cubs", "White Sox", "Dodgers", "Yankees", "Mets", + "Royals", "Cardinals", "Rays", "Guardians", + "N.Y. Yankees", "L.A. Dodgers", "Chi. Cubs", + "Chic. White Sox", "Padres"), + salary = rep(1e6, 14), + average_annual = rep(1e6, 14), + years = c("1 (2023-24)", rep(NA_character_, 13)) + ) + + usa_file <- make_usatoday_csv(usa_rows) + on.exit(unlink(usa_file), add = TRUE) + + db_path <- tempfile(fileext = ".duckdb") + on.exit(unlink(db_path), add = TRUE) + + suppressWarnings( + setup_baseball_db(dbdir = db_path, sal_file = usa_file, overwrite = TRUE) + ) + + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path, read_only = TRUE) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + teams_in_view <- db_query(con, + "SELECT DISTINCT teamID FROM SalariesAll WHERE source = 'usatoday' ORDER BY teamID" + )$teamID + + expect_true("CHN" %in% teams_in_view) # Cubs / Chi. Cubs → CHN + expect_true("CHA" %in% teams_in_view) # White Sox / Chic. White Sox → CHA + expect_true("LAN" %in% teams_in_view) # Dodgers / L.A. Dodgers → LAN + expect_true("NYA" %in% teams_in_view) # Yankees / N.Y. Yankees → NYA + expect_true("NYN" %in% teams_in_view) # Mets → NYN + expect_true("KCA" %in% teams_in_view) # Royals → KCA + expect_true("SLN" %in% teams_in_view) # Cardinals → SLN + expect_true("TBA" %in% teams_in_view) # Rays → TBA + expect_true("CLE" %in% teams_in_view) # Guardians → CLE + expect_true("SDN" %in% teams_in_view) # Padres → SDN + + # Original name-style strings must NOT appear + raw_names <- c("Cubs","White Sox","Dodgers","Yankees","Mets","Royals", + "Cardinals","Rays","Guardians","N.Y. Yankees","L.A. Dodgers", + "Chi. Cubs","Chic. White Sox","Padres") + expect_false(any(raw_names %in% teams_in_view)) +}) + +# ── Full-range coverage: SalariesAll joins to Teams ─────────────────────────── + +test_that("SalariesAll teamIDs join cleanly to Teams for 2017-2023", { + skip_on_ci() + skip_if_not_installed("Lahman") + + # Build a Spotrac CSV covering the four teams whose codes differ most + sp_file <- make_spotrac_csv(list( + yearID = c(2019L, 2019L, 2019L, 2019L), + player = c("P1", "P2", "P3", "P4"), + team = c("CHC", "LAD", "NYM", "STL"), + position = rep("SP", 4), + salary = rep(600000, 4), + playerID = paste0("jnTest", seq_len(4)) + )) + on.exit(unlink(sp_file), add = TRUE) + + db_path <- tempfile(fileext = ".duckdb") + on.exit(unlink(db_path), add = TRUE) + + suppressWarnings( + setup_baseball_db(dbdir = db_path, spotrac_file = sp_file, overwrite = TRUE) + ) + + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = db_path, read_only = TRUE) + on.exit(DBI::dbDisconnect(con, shutdown = TRUE), add = TRUE) + + # An unmatched teamID produces zero rows in the join + unmatched <- db_query(con, " + SELECT DISTINCT s.teamID + FROM SalariesAll s + WHERE s.source = 'spotrac' + AND s.teamID NOT IN (SELECT DISTINCT teamID FROM Teams) + ") + + expect_equal(nrow(unmatched), 0L, + info = paste("Unmatched Spotrac teamIDs:", paste(unmatched$teamID, collapse = ", ")) + ) +})