diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md index 0b55bb6..e983ed1 100644 --- a/.github/copilot-instructions.md +++ b/.github/copilot-instructions.md @@ -81,6 +81,20 @@ rsync -a /tmp/lahmans-git-work/.git/objects/ $PROJ/.git/objects/ `git checkout` in the project dir will fail — files are correct but the local branch pointer may lag. +## Interactive R Sessions (Analysis Development) + +When developing analysis scripts or iterating on charts, use an **interactive R session** instead of re-running the full script each time: + +1. Start R in async mode: `bash mode="async" command="R --no-save"` +2. Source shared setup (DB connection, libraries) once +3. Send individual code blocks via `write_bash` to iterate on specific charts or queries +4. Use the `view` tool on saved PNG files to inspect chart output visually +5. Only assemble the final `.R` script once the individual pieces are working + +This avoids the 60-90 second penalty of re-running a full analysis script on every change and enables tight visual feedback loops. + +**DuckDB CLI for ad-hoc queries:** Use `duckdb ~/Documents/Data/baseball/baseball.duckdb` for quick schema checks (`DESCRIBE`, `SUMMARIZE`) rather than writing throwaway R code. + ## R CMD Check - Non-ASCII characters (em-dashes, box-drawing) in R source cause WARNING — use ASCII `--`. diff --git a/.gitignore b/.gitignore index c8a99be..9fb0da2 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ tests/testthat/_snaps/ # Copilot CLI config (environment-specific, not for contributors) .github/lsp.json .copilot/mcp-config.json +.mcp.json # Old scratch notebooks (superseded by analysis/) inst/notebooks/ diff --git a/AGENTS.md b/AGENTS.md index 6b59da8..82bbb09 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -84,6 +84,20 @@ rsync -a /tmp/lahmans-git-work/.git/objects/ $PROJ/.git/objects/ `git checkout` in the project dir will fail — files are correct but the local branch pointer may lag. +## Interactive R Sessions (Analysis Development) + +When developing analysis scripts or iterating on charts, use an **interactive R session** instead of re-running the full script each time: + +1. Start R in async mode: `bash mode="async" command="R --no-save"` +2. Source shared setup (DB connection, libraries) once +3. Send individual code blocks via `write_bash` to iterate on specific charts or queries +4. Use the `view` tool on saved PNG files to inspect chart output visually +5. Only assemble the final `.R` script once the individual pieces are working + +This avoids the 60-90 second penalty of re-running a full analysis script on every change and enables tight visual feedback loops. + +**DuckDB CLI for ad-hoc queries:** Use `duckdb ~/Documents/Data/baseball/baseball.duckdb` for quick schema checks (`DESCRIBE`, `SUMMARIZE`) rather than writing throwaway R code. + ## R CMD Check - Non-ASCII characters (em-dashes, box-drawing) in R source cause WARNING — use ASCII `--`. diff --git a/DESCRIPTION b/DESCRIPTION index 160fd23..6ac8cb3 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,16 @@ Package: lahmanTools Title: Baseball Analytics with Lahman and DuckDB -Version: 0.1.0 +Version: 0.2.0 Authors@R: person("David", "Lucey", role = c("aut", "cre"), email = "david@example.com") -Description: Provides a persistent DuckDB database populated with all Lahman - baseball tables and supplemental MLB salary data scraped from USA Today - (2017+). Includes helpers to connect, rebuild, and extend the database. +Description: Loads all Sean Lahman baseball tables (1871-2025) into a persistent + DuckDB database and exposes pre-built sabermetric SQL views (BattingStats, + PitchingStats, SalaryPerWAR, etc.). Optionally extends salary coverage to + 2017-2025 from USA Today and Spotrac, and supplements with FanGraphs WAR + (1985+) and Chadwick Bureau player ID crosswalk via the baseballr package. + Includes write_mcp_config() to connect the database to GitHub Copilot CLI + or Claude via a local DuckDB MCP server. No third-party data is bundled; + all supplemental data is fetched at runtime. License: MIT + file LICENSE Encoding: UTF-8 Depends: R (>= 4.1.0) diff --git a/NAMESPACE b/NAMESPACE index 0005da8..f4e0000 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -5,8 +5,15 @@ export(connect_baseball_db) export(create_stats_views) export(db_query) export(dt_factors_to_char) +export(load_chadwick_ids) +export(load_fangraphs_war) +export(load_statcast) +export(match_player_ids) +export(normalise_player_name) export(scrape_salaries) export(setup_baseball_db) +export(team_name_map) +export(write_mcp_config) importFrom(data.table,":=") importFrom(data.table,.SD) importFrom(data.table,as.data.table) diff --git a/NEWS.md b/NEWS.md index 47384d6..d98414d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,54 @@ -# lahmanTools (development version) +# lahmanTools 0.2.0 + +## New features + +* Three new runtime data loaders in `R/loaders.R`: + - `load_chadwick_ids(con)` -- downloads the Chadwick Bureau player ID + crosswalk via `baseballr` and writes it as `ChadwickIDs` to DuckDB. + Creates `PlayerIDs` view joining Lahman `playerID` to MLBAM, FanGraphs, + Retrosheet and Baseball Reference IDs. Licensed ODC-BY 1.0 (attribution + required). + - `load_fangraphs_war(con, years)` -- fetches FanGraphs batter and pitcher + WAR leaderboards (batting 1871+, pitching 1985+) and creates `PlayerWAR` + and `SalaryPerWAR` views. Requires `ChadwickIDs` for the FanGraphs-to-Lahman + join. `SalaryPerWAR` includes a `war_reliable` flag (TRUE for all rows in + the salary era 1985+; retained for backward compatibility). + - `load_statcast(con, years)` -- fetches Baseball Savant pitch-level data + (2015+ only, ~700 MB/season) and creates `StatcastSeason` batter aggregates + (exit velocity, launch angle, hard-hit rate, xBA, xwOBA). + +* `setup_baseball_db()` gains three new parameters: + - `load_chadwick = FALSE` -- pass `TRUE` to load the Chadwick crosswalk + during initial database build. + - `load_war = FALSE` -- pass `TRUE` to also fetch FanGraphs WAR (implies + `load_chadwick`). + - `war_years = 1985:2025` -- seasons to fetch for WAR data. + +* `baseballr` added to `Suggests`; required only by the three new loaders. + +* `write_mcp_config()` -- generates the JSON config entry needed to connect + GitHub Copilot CLI or Claude Code to `baseball.duckdb` via a local DuckDB + MCP server. Resolves `~` to an absolute path (required by Python-based MCP + servers), merges into an existing config without clobbering other server + entries, and always enforces `--readonly`. Defaults to `dry_run = TRUE` so + nothing is written until the user opts in. + +* Three new analytical views created by `create_stats_views()` / `setup_baseball_db()`: + - `PlayerAcquisitionType` -- one row per player-team; `acq_type` column + classifies as `homegrown` (debut year = first year with team), + `young_acq` (arrived post-debut, age < 26), or `veteran_acq`. + Eliminates the repeated 3-CTE acquisition-classification pattern in + analysis queries. + - `LeagueMedianSalary` -- `med_sal`, `avg_sal`, `n_players` by season from + `SalariesAll`. Use `salary / med_sal` for relative-salary normalisation. + - `TeamPayroll` -- `total_salary`, `n_players`, `median_salary`, `max_salary` + by team-season from `SalariesAll`. Was documented in README but missing + from the code; now implemented. + +* `era_label(yr)` SQL macro registered by `create_stats_views()`. Replaces + the repeated `CASE WHEN yearID <= 2002 THEN 'Pre-Moneyball' ...` block in + every analysis query. Returns `'Pre-Moneyball'`, `'Moneyball'`, `'Big Data'`, + or `NULL` for years outside 1998-present. ## New features diff --git a/R/globals.R b/R/globals.R index d358602..3799276 100644 --- a/R/globals.R +++ b/R/globals.R @@ -6,8 +6,11 @@ utils::globalVariables(c( # scrape.R "salary", "average_annual", "player", "playerID", "yearID", # setup_db.R (also average_annual, playerID) - # People columns used in scrape.R + # People columns used in scrape.R / match_player_ids "nameLast", "nameFirst", + # match_player_ids internal columns + "player_exact", "player_norm", "debut_year", "final_year", + ".row_idx", "n_matches", ".match_teamID", "last_norm", "first_init", "n", # utils.R (dt_factors_to_char) "factor_cols", # loaders.R -- Chadwick register column references diff --git a/R/loaders.R b/R/loaders.R index c446a11..62b4612 100644 --- a/R/loaders.R +++ b/R/loaders.R @@ -75,13 +75,8 @@ create_war_views_ <- function(con) { # SalaryPerWAR: dollars per WAR by player-season. # - # war_reliable flag: - # FanGraphs pitching WAR is only available from 2002 onward. A pitcher - # with salary data before 2002 will have near-zero total_war (batting - # contribution only), making dollars_per_war badly wrong for that row. - # war_reliable = FALSE when the player had pitching appearances AND - # yearID < 2002. Filter WHERE war_reliable = TRUE for clean analysis. - # Batting WAR is reliable for all seasons 1985+. + # war_reliable flag: kept for backward compatibility; now always TRUE + # since FanGraphs pitching WAR covers the full salary era (1985+). DBI::dbExecute(con, " CREATE OR REPLACE VIEW SalaryPerWAR AS WITH pitcher_seasons AS ( @@ -100,7 +95,7 @@ create_war_views_ <- function(con) { w.total_war, s.salary / NULLIF(w.total_war, 0) AS dollars_per_war, era_label(s.yearID) AS era, - NOT (ps.playerID IS NOT NULL AND s.yearID < 2002) AS war_reliable + NOT (ps.playerID IS NOT NULL AND s.yearID < 1985) AS war_reliable FROM SalariesAll s JOIN PlayerWAR w USING (playerID, yearID) LEFT JOIN pitcher_seasons ps USING (playerID, yearID) @@ -265,7 +260,7 @@ load_fangraphs_war <- function(con, years = 1985:2025, overwrite = FALSE) { bat <- data.table::rbindlist(Filter(Negate(is.null), bat_list), fill = TRUE) message(sprintf("Fetching FanGraphs pitching WAR %d-%d...", start_yr, end_yr)) - pit_list <- lapply(years[years >= 2002L], function(yr) { + pit_list <- lapply(years, function(yr) { tryCatch({ d <- data.table::as.data.table( baseballr::fg_pitch_leaders(startseason = yr, endseason = yr, qual = 0) @@ -280,7 +275,7 @@ load_fangraphs_war <- function(con, years = 1985:2025, overwrite = FALSE) { pit <- data.table::rbindlist(Filter(Negate(is.null), pit_list), fill = TRUE) if (nrow(bat) == 0L) stop("No FanGraphs batting WAR data retrieved.") - if (nrow(pit) == 0L) warning("No FanGraphs pitching WAR data retrieved (pitching WAR only available 2002+).") + if (nrow(pit) == 0L) warning("No FanGraphs pitching WAR data retrieved.") DBI::dbWriteTable(con, "FangraphsBattingWAR", bat, overwrite = overwrite) DBI::dbWriteTable(con, "FangraphsPitchingWAR", pit, overwrite = overwrite) diff --git a/R/scrape.R b/R/scrape.R index c095290..6f21023 100644 --- a/R/scrape.R +++ b/R/scrape.R @@ -91,21 +91,18 @@ scrape_salaries <- function(years = 2017:2025, # -- Join to Lahman playerID -------------------------------------------------- people <- data.table::as.data.table(Lahman::People) - people[, player := paste0(nameLast, ", ", nameFirst)] + match_player_ids(all_salaries, people) - sal_linked <- merge(all_salaries, people[, .(playerID, player)], - by = "player", all.x = TRUE) + match_pct <- mean(!is.na(all_salaries$playerID)) * 100 + message(sprintf("Final match rate: %.1f%% of %d rows", match_pct, nrow(all_salaries))) - match_pct <- mean(!is.na(sal_linked$playerID)) * 100 - message(sprintf("Matched: %.1f%% of %d rows", match_pct, nrow(sal_linked))) - - yr_range <- range(sal_linked$yearID, na.rm = TRUE) + yr_range <- range(all_salaries$yearID, na.rm = TRUE) out_combined <- file.path( output_dir, sprintf("salaries_%d_%d_with_playerID.csv", yr_range[1], yr_range[2]) ) - data.table::fwrite(sal_linked, out_combined) - data.table::fwrite(unique(sal_linked[is.na(playerID), .(player)]), + data.table::fwrite(all_salaries, out_combined) + data.table::fwrite(unique(all_salaries[is.na(playerID), .(player)]), file.path(output_dir, "unmatched_players.csv")) message("Done. Combined file: ", out_combined) diff --git a/R/setup_db.R b/R/setup_db.R index 8be8199..903a686 100644 --- a/R/setup_db.R +++ b/R/setup_db.R @@ -12,9 +12,8 @@ #' and creates the `PlayerIDs` view (ODC-BY 1.0 licensed; safe to use locally). #' - `load_war = TRUE` additionally fetches FanGraphs WAR leaderboards and #' creates the `PlayerWAR` and `SalaryPerWAR` views. Implies -#' `load_chadwick = TRUE`. Pitching WAR is available from FanGraphs from -#' 2002 onward only; `SalaryPerWAR` includes a `war_reliable` flag to mark -#' pre-2002 pitcher rows where WAR values are incomplete. +#' `load_chadwick = TRUE`. Both batting and pitching WAR are available +#' from FanGraphs for the full salary era (1985+). #' #' @param dbdir Path for the output `baseball.duckdb` file. Defaults to the #' value of the `LAHMANS_DBDIR` environment variable if set, otherwise @@ -38,8 +37,7 @@ #' `PlayerWAR` and `SalaryPerWAR` views. Implies `load_chadwick = TRUE`. #' Requires an internet connection and \pkg{baseballr}. Default `FALSE`. #' @param war_years Integer vector of seasons to fetch for WAR data. -#' Defaults to `1985:2025` (full salary era). Pitching WAR before 2002 is -#' not available from FanGraphs; see `SalaryPerWAR.war_reliable`. +#' Defaults to `1985:2025` (full salary era). #' #' @return Invisibly returns `dbdir`. #' @export diff --git a/R/utils.R b/R/utils.R index 7b181b2..df3f39c 100644 --- a/R/utils.R +++ b/R/utils.R @@ -63,3 +63,311 @@ clean_names <- function(x) { db_query <- function(con, sql, ...) { data.table::as.data.table(DBI::dbGetQuery(con, sql, ...)) } + +#' Map common team display names to Lahman teamID codes +#' +#' Returns a `data.table` with columns `team_name` and `teamID`. +#' Covers all 30 current franchises with common aliases used by +#' USA Today, Spotrac, and other public salary sources. +#' +#' @return A `data.table` with two character columns. +#' @export +team_name_map <- function() { + # Each franchise: city names, nicknames, abbreviations + aliases <- list( + ARI = c("Arizona", "Diamondbacks", "D-backs", "ARI"), + ATL = c("Atlanta", "Braves", "ATL"), + BAL = c("Baltimore", "Orioles", "BAL"), + BOS = c("Boston", "Red Sox", "BOS"), + CHN = c("Chi. Cubs", "Chicago Cubs", "Cubs", "CHC"), + CHA = c("Chic. White Sox", "Chicago White Sox", "White Sox", "CHW", "CWS"), + CIN = c("Cincinnati", "Reds", "CIN"), + CLE = c("Cleveland", "Guardians", "Indians", "CLE"), + COL = c("Colorado", "Rockies", "COL"), + DET = c("Detroit", "Tigers", "DET"), + HOU = c("Houston", "Astros", "HOU"), + KCA = c("Kansas City", "Royals", "KC", "KCR"), + LAA = c("L.A. Angels", "Los Angeles Angels", "Angels", "Anaheim", "LAA"), + LAN = c("L.A. Dodgers", "Los Angeles Dodgers", "Dodgers", "LAD"), + MIA = c("Miami", "Marlins", "MIA"), + MIL = c("Milwaukee", "Brewers", "MIL"), + MIN = c("Minnesota", "Twins", "MIN"), + NYN = c("N.Y. Mets", "New York Mets", "Mets", "NYM"), + NYA = c("N.Y. Yankees", "New York Yankees", "Yankees", "NYY"), + OAK = c("Oakland", "Athletics", "A's", "OAK"), + ATH = c("Sacramento"), + PHI = c("Philadelphia", "Phillies", "PHI"), + PIT = c("Pittsburgh", "Pirates", "PIT"), + SDN = c("San Diego", "Padres", "SD", "SDP"), + SFN = c("San Francisco", "Giants", "SF", "SFG"), + SEA = c("Seattle", "Mariners", "SEA"), + SLN = c("St. Louis", "Cardinals", "STL"), + TBA = c("Tampa Bay", "Rays", "TB", "TBR"), + TEX = c("Texas", "Rangers", "TEX"), + TOR = c("Toronto", "Blue Jays", "TOR"), + WAS = c("Washington", "Nationals", "WSH", "WSN") + ) + rows <- lapply(names(aliases), function(tid) { + data.table::data.table(team_name = aliases[[tid]], teamID = tid) + }) + data.table::rbindlist(rows) +} + +#' Normalise a player name for fuzzy matching +#' +#' Strips suffixes (Jr., Sr., II, III, IV), injury markers (*), accents, +#' punctuation in initials (J.D. -> J D), apostrophes, and extra whitespace. +#' Returns lowercase "last, first" form suitable for exact-match joining. +#' +#' @param x Character vector of player names in "Last, First" format. +#' +#' @return Character vector the same length as \code{x}, normalised. +#' @export +#' +#' @examples +#' normalise_player_name(c("Acuna Jr., Ronald", "Martinez, JD", "Harper, Bryce*")) +#' # [1] "acuna, ronald" "martinez, j d" "harper, bryce" +normalise_player_name <- function(x) { + x <- gsub("\\*", "", x) # injury marker + x <- gsub("\\b(Jr\\.?|Sr\\.?|II|III|IV)\\b", "", x) # suffixes + # Fix UTF-8 mojibake (e.g. Spotrac "Canó" -> "Canó") + x <- vapply(x, function(s) { + if (!grepl("\u00c3", s, fixed = TRUE)) return(s) + tryCatch({ + raw <- iconv(s, from = "UTF-8", to = "latin1", toRaw = TRUE)[[1L]] + result <- rawToChar(raw) + Encoding(result) <- "UTF-8" + if (validUTF8(result)) result else s + }, error = function(e) s) + }, character(1L), USE.NAMES = FALSE) + x <- iconv(x, from = "UTF-8", to = "ASCII//TRANSLIT") # accents + x <- gsub("[\u2018\u2019\u0027]", "", x) # straight + smart apostrophes + # Expand bare initials: "Martinez, JD" -> "Martinez, J D" (only after comma) + x <- gsub(",\\s*([A-Z])([A-Z])(?=[^a-z]|$)", ", \\1 \\2", x, perl = TRUE) + x <- gsub("\\.", " ", x) # J.D. -> J D + x <- gsub("[^[:alnum:], ]", "", x) # other punctuation + x <- gsub("\\s+", " ", trimws(x)) # collapse whitespace + x <- gsub("\\s+,", ",", x) # space before comma + x <- gsub(",\\s+", ", ", x) # normalise comma spacing + tolower(x) +} + +#' Match salary data to Lahman playerIDs via multi-pass name matching +#' +#' Performs progressive matching from strict to fuzzy: +#' \enumerate{ +#' \item Exact "Last, First" match (unique names only) +#' \item Normalised names (strips accents, suffixes, punctuation, mojibake) +#' \item Normalised name + active-year filter for ambiguous names +#' \item Team-constrained: last name within team-year roster (if \code{team} +#' or \code{teamID} column present). This is the big-picture win -- +#' constraining to ~50 roster spots resolves nicknames, formal names, +#' and most ambiguous names without complex normalization. +#' } +#' +#' @param sal_dt A `data.table` with a `player` column in "Last, First" format +#' and a `yearID` column. Optionally a `team` (display name) or `teamID` +#' (Lahman code) column for roster-constrained matching. +#' @param people_dt A `data.table` from `Lahman::People` with at least +#' `playerID`, `nameFirst`, `nameLast`, `debut`, `finalGame`. +#' @param roster_dt Optional `data.table` with `playerID`, `yearID`, `teamID` +#' columns (e.g., from Appearances). If NULL, built automatically from +#' Lahman::Batting + Lahman::Pitching when team info is available. +#' +#' @return \code{sal_dt} with a `playerID` column filled where matches succeed. +#' Modified by reference; also returned invisibly. +#' @export +match_player_ids <- function(sal_dt, people_dt, roster_dt = NULL) { + stopifnot( + data.table::is.data.table(sal_dt), + data.table::is.data.table(people_dt), + "player" %in% names(sal_dt), + "yearID" %in% names(sal_dt), + all(c("playerID", "nameFirst", "nameLast") %in% names(people_dt)) + ) + + people <- data.table::copy(people_dt) + + # Build exact-match key: "Last, First" + people[, player_exact := paste0(nameLast, ", ", nameFirst)] + people[, player_norm := normalise_player_name(player_exact)] + + # Derive active range (generous +/- 1 year) + people[, debut_year := as.integer(substr(as.character(debut), 1L, 4L))] + people[, final_year := as.integer(substr(as.character(finalGame), 1L, 4L))] + people[is.na(final_year), final_year := 2099L] + people[is.na(debut_year), debut_year := 1800L] + + if (!"playerID" %in% names(sal_dt)) sal_dt[, playerID := NA_character_] + + # --- Pass 1: Exact match on "Last, First" --- + unmatched_idx <- which(is.na(sal_dt$playerID)) + msg_pass1 <- 0L + if (length(unmatched_idx)) { + exact_lookup <- people[, .(playerID, player_exact)] + exact_lookup <- exact_lookup[, .SD[.N == 1L], by = player_exact] + m1 <- data.table::data.table( + player = sal_dt$player[unmatched_idx], + .row_idx = unmatched_idx + ) + m1 <- merge(m1, exact_lookup, by.x = "player", by.y = "player_exact", + all.x = TRUE, sort = FALSE) + matched <- !is.na(m1$playerID) + if (any(matched)) { + data.table::set(sal_dt, i = m1$.row_idx[matched], j = "playerID", + value = m1$playerID[matched]) + } + msg_pass1 <- sum(matched) + } + + # --- Pass 2: Normalised name match --- + unmatched_idx <- which(is.na(sal_dt$playerID)) + msg_pass2 <- 0L + if (length(unmatched_idx)) { + norm_lookup <- people[, .(playerID, player_norm)] + norm_lookup <- norm_lookup[, .SD[.N == 1L], by = player_norm] + m2 <- data.table::data.table( + player = sal_dt$player[unmatched_idx], + .row_idx = unmatched_idx + ) + m2[, player_norm := normalise_player_name(player)] + m2 <- merge(m2, norm_lookup, by = "player_norm", all.x = TRUE, sort = FALSE) + matched <- !is.na(m2$playerID) + if (any(matched)) { + data.table::set(sal_dt, i = m2$.row_idx[matched], j = "playerID", + value = m2$playerID[matched]) + } + msg_pass2 <- sum(matched) + } + + # --- Pass 3: Normalised name + active-year disambiguation --- + unmatched_idx <- which(is.na(sal_dt$playerID)) + msg_pass3 <- 0L + if (length(unmatched_idx)) { + m3 <- data.table::data.table( + player = sal_dt$player[unmatched_idx], + yearID = sal_dt$yearID[unmatched_idx], + .row_idx = unmatched_idx + ) + m3[, player_norm := normalise_player_name(player)] + all_norm <- people[, .(playerID, player_norm, debut_year, final_year)] + m3_joined <- merge(m3, all_norm, by = "player_norm", all.x = TRUE, + allow.cartesian = TRUE, sort = FALSE) + m3_joined <- m3_joined[!is.na(playerID) & + yearID >= debut_year - 1L & + yearID <= final_year + 1L] + m3_joined[, n_matches := .N, by = .row_idx] + m3_unique <- m3_joined[n_matches == 1L] + if (nrow(m3_unique)) { + data.table::set(sal_dt, i = m3_unique$.row_idx, j = "playerID", + value = m3_unique$playerID) + } + msg_pass3 <- nrow(m3_unique) + } + + # --- Pass 4: Team-constrained last-name + first-initial matching --- + # This is the power move: within a team-year roster of ~50 players, + # last-name alone resolves 96.4% and last+initial resolves 99.6%. + # Handles nicknames, formal names, and ambiguous names in one pass. + has_team <- "teamID" %in% names(sal_dt) + has_team_name <- "team" %in% names(sal_dt) + msg_pass4 <- 0L + + unmatched_idx <- which(is.na(sal_dt$playerID)) + if (length(unmatched_idx) && (has_team || has_team_name)) { + # Map team display names to Lahman teamIDs if needed + if (!has_team && has_team_name) { + tmap <- team_name_map() + sal_dt[tmap, .match_teamID := i.teamID, on = .(team = team_name)] + } else { + sal_dt[, .match_teamID := teamID] + } + + # Build roster if not provided + if (is.null(roster_dt)) { + roster_dt <- tryCatch({ + bat <- data.table::as.data.table(Lahman::Batting) + pit <- data.table::as.data.table(Lahman::Pitching) + unique(rbind( + bat[, .(playerID, yearID, teamID)], + pit[, .(playerID, yearID, teamID)] + )) + }, error = function(e) NULL) + } + + if (!is.null(roster_dt)) { + # Build roster lookup with normalised last name + first initial + rost <- merge(roster_dt, people[, .(playerID, nameLast, nameFirst)], + by = "playerID") + rost[, last_norm := tolower(iconv(nameLast, to = "ASCII//TRANSLIT"))] + rost[, last_norm := gsub("[^a-z]", "", last_norm)] + rost[, first_init := substr(tolower(iconv(nameFirst, to = "ASCII//TRANSLIT")), 1L, 1L)] + + # Prepare unmatched salary rows + unmatched_idx <- which(is.na(sal_dt$playerID) & !is.na(sal_dt$.match_teamID)) + if (length(unmatched_idx)) { + m4 <- data.table::data.table( + player = sal_dt$player[unmatched_idx], + yearID = sal_dt$yearID[unmatched_idx], + .match_teamID = sal_dt$.match_teamID[unmatched_idx], + .row_idx = unmatched_idx + ) + m4[, last_norm := sub(",.*", "", normalise_player_name(player))] + m4[, first_init := substr(sub(".*,\\s*", "", normalise_player_name(player)), 1L, 1L)] + + # 4a: team + year + last name (unique within team) + m4a <- merge(m4, rost[, .(playerID, yearID, teamID, last_norm)], + by.x = c("yearID", ".match_teamID", "last_norm"), + by.y = c("yearID", "teamID", "last_norm"), + all.x = TRUE, allow.cartesian = TRUE, sort = FALSE) + m4a[, n := .N, by = .row_idx] + m4a_ok <- m4a[n == 1L & !is.na(playerID)] + if (nrow(m4a_ok)) { + data.table::set(sal_dt, i = m4a_ok$.row_idx, j = "playerID", + value = m4a_ok$playerID) + msg_pass4 <- msg_pass4 + nrow(m4a_ok) + } + + # 4b: team + year + last name + first initial (for same-lastname teammates) + unmatched_idx2 <- which(is.na(sal_dt$playerID) & !is.na(sal_dt$.match_teamID)) + if (length(unmatched_idx2)) { + m4b <- data.table::data.table( + player = sal_dt$player[unmatched_idx2], + yearID = sal_dt$yearID[unmatched_idx2], + .match_teamID = sal_dt$.match_teamID[unmatched_idx2], + .row_idx = unmatched_idx2 + ) + m4b[, last_norm := sub(",.*", "", normalise_player_name(player))] + m4b[, first_init := substr(sub(".*,\\s*", "", normalise_player_name(player)), 1L, 1L)] + + m4b <- merge(m4b, rost[, .(playerID, yearID, teamID, last_norm, first_init)], + by.x = c("yearID", ".match_teamID", "last_norm", "first_init"), + by.y = c("yearID", "teamID", "last_norm", "first_init"), + all.x = TRUE, allow.cartesian = TRUE, sort = FALSE) + m4b[, n := .N, by = .row_idx] + m4b_ok <- m4b[n == 1L & !is.na(playerID)] + if (nrow(m4b_ok)) { + data.table::set(sal_dt, i = m4b_ok$.row_idx, j = "playerID", + value = m4b_ok$playerID) + msg_pass4 <- msg_pass4 + nrow(m4b_ok) + } + } + } + } + + # Clean up temp column + if (".match_teamID" %in% names(sal_dt)) { + sal_dt[, .match_teamID := NULL] + } + } + + total <- nrow(sal_dt) + matched_total <- sum(!is.na(sal_dt$playerID)) + message(sprintf( + "match_player_ids: %d/%d matched (%.1f%%). Pass1(exact)=%d, Pass2(norm)=%d, Pass3(year)=%d, Pass4(team)=%d", + matched_total, total, 100 * matched_total / total, + msg_pass1, msg_pass2, msg_pass3, msg_pass4 + )) + + invisible(sal_dt) +} diff --git a/README.md b/README.md index 16cebfa..2b63f67 100644 --- a/README.md +++ b/README.md @@ -4,9 +4,9 @@ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) [![Data: CC BY-SA 3.0](https://img.shields.io/badge/Data-CC%20BY--SA%203.0-blue.svg)](https://creativecommons.org/licenses/by-sa/3.0/) -`lahmanTools` loads the full [Lahman](https://cran.r-project.org/package=Lahman) baseball database (1871–2025) into a persistent, file-backed **DuckDB** instance and exposes pre-built sabermetric SQL views. Analysis runs via `data.table` and plain SQL — no tidyverse dependency, no loading 27 tables into memory. +`lahmanTools` loads the [Lahman](https://cran.r-project.org/package=Lahman) baseball database (1871–2025) into a persistent **DuckDB** instance and supplements it with salary data through 2025 (via Spotrac and USA Today) and FanGraphs WAR back to 1985. Pre-built SQL views handle the common sabermetric patterns — OPS, FIP, salary-per-WAR, team payroll, acquisition type. Connect the database to **GitHub Copilot CLI** or **Claude** via the included MCP server config and query 150 years of baseball in plain English. -The design choice matters at scale: DuckDB executes columnar SQL directly on the file, so aggregations over 150 years of play-by-play data that would choke an in-memory data frame run in milliseconds. +Analysis in R runs via `data.table` and plain SQL — no tidyverse dependency, no loading 30+ tables into memory. DuckDB executes columnar SQL directly on the file, so aggregations across the full history run in milliseconds. ## Data model @@ -30,7 +30,7 @@ To regenerate after schema changes: `Rscript analysis/schema_dm.R` (requires `dm ### Derived views and macros -Eight views and one scalar macro are created by `setup_baseball_db()`. +Ten views and one scalar macro are created by `setup_baseball_db()`. Query them directly via SQL — no R wrangling required for the common patterns. **Per-player stats views** (one row per player-year-stint-team): @@ -42,6 +42,14 @@ Query them directly via SQL — no R wrangling required for the common patterns. | `FieldingStats` | `Fielding` | FPCT, RF/9, RF/G by position | | `SalariesAll` | `Salaries`, `SalariesSpotrac`, `SalariesUSAToday` | Lahman (1985-2016) + Spotrac (2017-2021) + USA Today (2022-2025); filter `is_actual = TRUE` for confirmed figures | +**WAR and salary efficiency views** (require `load_war = TRUE`; see [Setup](#setup)): + +| View | Description | +|------|-------------| +| `PlayerIDs` | Lahman `playerID` joined to MLBAM, FanGraphs, Retrosheet, and BBREF IDs via Chadwick crosswalk | +| `PlayerWAR` | `bat_war` + `pit_war` + `total_war` per player-season (1985+) | +| `SalaryPerWAR` | `dollars_per_war` by player-season with `era` label | + **Analytical views** (pre-built patterns for multi-era salary analysis): | View | Description | @@ -108,22 +116,20 @@ at runtime to your local database — no data is bundled with the package: setup_baseball_db(load_war = TRUE, overwrite = TRUE) ``` -This adds three supplemental tables and two derived views: +This adds three supplemental tables and three derived views: | Added | Type | Description | |-------|------|-------------| | `ChadwickIDs` | Table | Chadwick Bureau player ID crosswalk (ODC-BY 1.0) | | `FangraphsBattingWAR` | Table | FanGraphs batter WAR leaderboard (1871–present) | -| `FangraphsPitchingWAR` | Table | FanGraphs pitcher WAR leaderboard (2002–present) | +| `FangraphsPitchingWAR` | Table | FanGraphs pitcher WAR leaderboard (1985–present) | | `PlayerIDs` | View | Lahman `playerID` joined to MLBAM, FanGraphs, Retrosheet, and BBREF IDs | | `PlayerWAR` | View | `bat_war` + `pit_war` + `total_war` per player-season | -| `SalaryPerWAR` | View | `dollars_per_war` by player-season; includes `war_reliable` flag | +| `SalaryPerWAR` | View | `dollars_per_war` by player-season with `era` label | -> **`war_reliable` flag:** FanGraphs pitching WAR is only available from 2002 onward. -> Pre-2002 pitcher rows in `SalaryPerWAR` will have near-zero `total_war` (batting -> contribution only), making `dollars_per_war` misleading. Filter -> `WHERE war_reliable = TRUE` for trustworthy analysis. Batting WAR is reliable for -> all seasons 1985+. +> FanGraphs WAR now covers batting and pitching back to 1985, so `war_reliable` +> is TRUE for all rows in the salary era. The flag is retained for backward +> compatibility. Loaders can also be run independently on an existing open connection: @@ -143,7 +149,7 @@ library(lahmanTools) con <- connect_baseball_db() # read-only by default on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) -DBI::dbListTables(con) # all 27 Lahman tables + 8 views (more with load_war) +DBI::dbListTables(con) # 27+ Lahman tables + 10 views (with load_war) ``` ### Example: does an elite strikeout rotation pay off? @@ -274,6 +280,7 @@ R/ stats_views.R # create_stats_views() — register sabermetric SQL views loaders.R # load_chadwick_ids(), load_fangraphs_war(), load_statcast() scrape.R # scrape_salaries() — fetch USA Today salary data + mcp_config.R # write_mcp_config() — generate MCP server config for AI tools utils.R # db_query(), dt_factors_to_char(), clean_names() globals.R # globalVariables() declarations ``` @@ -288,11 +295,23 @@ and must not be redistributed. ## Attribution -Baseball statistics provided by [Sean Lahman](http://www.seanlahman.com/) -via the `Lahman` R package, licensed under -[CC BY-SA 3.0](https://creativecommons.org/licenses/by-sa/3.0/). -Any derivative work must carry the same attribution and license. +`lahmanTools` is a tooling package — it does not bundle third-party data. +All data is fetched at runtime from the sources below. When you publish +analysis that uses these datasets, your attribution obligations depend on +the source license. + +| Source | License | Obligation | +|--------|---------|------------| +| [Sean Lahman Baseball Database](http://www.seanlahman.com/) | [CC BY-SA 3.0](https://creativecommons.org/licenses/by-sa/3.0/) | Credit Sean Lahman and carry the same license in any derivative work. | +| [Chadwick Baseball Bureau Register](https://github.com/chadwickbureau/register) | [ODC-BY 1.0](https://opendatacommons.org/licenses/by/1.0/) | Credit the Chadwick Baseball Bureau when publishing work that uses the player ID crosswalk. | +| [FanGraphs WAR Leaderboards](https://www.fangraphs.com) | Copyright FanGraphs | Do not redistribute the fetched data. | +| [Baseball Savant / Statcast](https://baseballsavant.mlb.com/) | Copyright MLB Advanced Media | Do not redistribute the fetched data. | +| USA Today / Spotrac salary data | Proprietary — ToS applies | See [`data-raw/README.md`](data-raw/README.md). Do not redistribute. | + +FanGraphs, Chadwick, and Statcast data are fetched via the +[`baseballr`](https://billpetti.github.io/baseballr/) package +(MIT, Bill Petti and contributors). ## License -MIT © David Lucey · Baseball data: CC BY-SA 3.0 Sean Lahman +MIT © David Lucey diff --git a/data-raw/salaries.R b/data-raw/salaries.R index ce38789..b361d3b 100644 --- a/data-raw/salaries.R +++ b/data-raw/salaries.R @@ -110,32 +110,30 @@ if (length(spotrac_files) == 0L) stop("No Spotrac CSV files found in ", output_d all_sal <- data.table::rbindlist(lapply(spotrac_files, data.table::fread), fill = TRUE) # Reformat player column to "Last, First" to match Lahman People format -# Spotrac stores as "First Last" — reverse the order +# Spotrac stores as "First Last" -- reverse the order +# Strip suffixes (Jr., Sr., II, III) BEFORE reversing to avoid "Jr., Jackie Bradley" +suffix_pat <- "\\s+(Jr\\.?|Sr\\.?|II|III|IV)$" +all_sal[, player := gsub(suffix_pat, "", player)] name_parts <- strsplit(all_sal$player, "\\s+", perl = TRUE) -all_sal[, player_lahman := vapply(name_parts, function(p) { +all_sal[, player := vapply(name_parts, function(p) { if (length(p) < 2L) return(p[[1L]]) paste0(p[[length(p)]], ", ", paste(p[-length(p)], collapse = " ")) }, character(1L))] people <- data.table::as.data.table(Lahman::People) -people[, player_lahman := paste0(nameLast, ", ", nameFirst)] +match_player_ids(all_sal, people) -sal_linked <- merge( - all_sal, people[, .(playerID, player_lahman)], - by = "player_lahman", all.x = TRUE -) - -match_pct <- mean(!is.na(sal_linked$playerID)) * 100 -message(sprintf("Matched: %.1f%% of %d rows", match_pct, nrow(sal_linked))) +match_pct <- mean(!is.na(all_sal$playerID)) * 100 +message(sprintf("Final match rate: %.1f%% of %d rows", match_pct, nrow(all_sal))) -yr_range <- range(sal_linked$yearID, na.rm = TRUE) +yr_range <- range(all_sal$yearID, na.rm = TRUE) out_combined <- file.path( output_dir, sprintf("salaries_spotrac_%d_%d_with_playerID.csv", yr_range[[1L]], yr_range[[2L]]) ) -data.table::fwrite(sal_linked[, player_lahman := NULL], out_combined) +data.table::fwrite(all_sal, out_combined) data.table::fwrite( - unique(sal_linked[is.na(playerID), .(player)]), + unique(all_sal[is.na(playerID), .(player)]), file.path(output_dir, "unmatched_spotrac.csv") ) diff --git a/man/create_stats_views.Rd b/man/create_stats_views.Rd index f03b30f..8f56d84 100644 --- a/man/create_stats_views.Rd +++ b/man/create_stats_views.Rd @@ -13,11 +13,11 @@ create_stats_views(con) Invisibly returns \code{con}. } \description{ -Adds three views that extend the raw Lahman tables with derived rate -statistics. The raw tables are never modified. +Adds views and a scalar SQL macro that extend the raw Lahman tables with +derived statistics. The raw tables are never modified. } \details{ -\tabular{lll}{ +\strong{Per-player views} (one row per player-year-stint-team):\tabular{lll}{ View \tab Base table \tab Key metrics added \cr \code{BattingStats} \tab \code{Batting} \tab PA, AVG, OBP, SLG, OPS, ISO, BABIP, BB\%, K\% \cr \code{PitchingStats} \tab \code{Pitching} \tab IP, WHIP, K/9, BB/9, HR/9, H/9, K/BB, FIP, FIP_constant, Win\% \cr @@ -25,6 +25,30 @@ statistics. The raw tables are never modified. } +\strong{Analytical views} (pre-built patterns used across analysis queries):\tabular{lll}{ + View \tab Base tables \tab Description \cr + \code{PlayerAcquisitionType} \tab \code{Batting}, \code{Pitching}, \code{People} \tab One row per player-team; classifies as \code{homegrown}, \code{young_acq}, or \code{veteran_acq} \cr + \code{LeagueMedianSalary} \tab \code{SalariesAll} \tab League-wide median and mean salary by season; use for relative-salary normalisation \cr + \code{TeamPayroll} \tab \code{SalariesAll} \tab Total payroll, player count, median and max salary by team-season \cr +} + + +\strong{Scalar macro:}\tabular{lll}{ + Macro \tab Argument \tab Returns \cr + \code{era_label(yr)} \tab \code{INTEGER} year \tab \code{'Pre-Moneyball'} (1998-2002), \code{'Moneyball'} (2003-2011), \code{'Big Data'} (2012+), or \code{NULL} \cr +} + + +Use \code{era_label(yearID)} in any SQL query instead of repeating the \code{CASE} +block. Example: \verb{SELECT era_label(yearID) AS era, ... FROM BattingStats}. + +\strong{Acquisition type} (\code{PlayerAcquisitionType.acq_type}): +\itemize{ +\item \code{homegrown} — player's first MLB season equals first season with this team +\item \code{young_acq} — joined team after MLB debut, age on arrival < 26 +\item \code{veteran_acq} — joined team after MLB debut, age on arrival >= 26 +} + \strong{FIP constant} is derived per \code{yearID + lgID} by aggregating the \code{Teams} table (\code{lgERA - (13*lgHR + 3*lgBB - 2*lgSO) / lgIP}), so it correctly adjusts for era and league scoring environment. Falls back to 3.10 only diff --git a/man/load_chadwick_ids.Rd b/man/load_chadwick_ids.Rd new file mode 100644 index 0000000..724aeef --- /dev/null +++ b/man/load_chadwick_ids.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loaders.R +\name{load_chadwick_ids} +\alias{load_chadwick_ids} +\title{Load Chadwick Bureau player ID crosswalk} +\usage{ +load_chadwick_ids(con, overwrite = FALSE) +} +\arguments{ +\item{con}{A writable \code{DBIConnection} to the baseball DuckDB database.} + +\item{overwrite}{Logical. Drop and recreate the table if it already +exists. Default \code{FALSE} leaves an existing table untouched.} +} +\value{ +Invisibly returns \code{con}. +} +\description{ +Downloads the Chadwick Bureau persons register via \pkg{baseballr} and +writes it to a \code{ChadwickIDs} table in \code{con}. Creates a \code{PlayerIDs} view +that joins Chadwick IDs to the Lahman \code{People} table so every player has +MLB Advanced Media (MLBAM), FanGraphs, Retrosheet and Baseball Reference IDs +alongside their Lahman \code{playerID}. +} +\details{ +\strong{Attribution:} Chadwick Baseball Bureau persons register, +\url{https://github.com/chadwickbureau/register}, +licensed under the Open Data Commons Attribution License (ODC-BY 1.0). +} +\examples{ +\dontrun{ +con <- connect_baseball_db(read_only = FALSE) +load_chadwick_ids(con) +DBI::dbDisconnect(con, shutdown = TRUE) +} +} diff --git a/man/load_fangraphs_war.Rd b/man/load_fangraphs_war.Rd new file mode 100644 index 0000000..19dfb98 --- /dev/null +++ b/man/load_fangraphs_war.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loaders.R +\name{load_fangraphs_war} +\alias{load_fangraphs_war} +\title{Load FanGraphs WAR data} +\usage{ +load_fangraphs_war(con, years = 1985:2025, overwrite = FALSE) +} +\arguments{ +\item{con}{A writable \code{DBIConnection} to the baseball DuckDB database.} + +\item{years}{Integer vector of seasons to fetch. Defaults to \code{1985:2025} +(aligns with Lahman \code{Salaries} coverage).} + +\item{overwrite}{Logical. Drop and recreate existing tables. Default +\code{FALSE}.} +} +\value{ +Invisibly returns \code{con}. +} +\description{ +Fetches Wins Above Replacement leaderboard data from FanGraphs via +\pkg{baseballr} for the requested seasons, writes batters to +\code{FangraphsBattingWAR} and pitchers to \code{FangraphsPitchingWAR}, then creates +two derived views: +} +\details{ +\itemize{ +\item \strong{\code{PlayerWAR}} -- one row per player-season with \code{bat_war}, \code{pit_war}, +\code{total_war}, joined to Lahman \code{playerID} via Chadwick. +\item \strong{\code{SalaryPerWAR}} -- joins \code{PlayerWAR} to \code{SalariesAll}; reports +\code{salary}, \code{total_war}, \code{dollars_per_war}, and \code{era_label}. +} + +\strong{Prerequisites:} \code{\link[=load_chadwick_ids]{load_chadwick_ids()}} must be run first (the join to +Lahman \code{playerID} routes through \code{ChadwickIDs}). + +\strong{Data note:} FanGraphs data is copyright FanGraphs. This function +performs a runtime fetch to your local database only. Do not redistribute +the fetched data. +} +\examples{ +\dontrun{ +con <- connect_baseball_db(read_only = FALSE) +load_chadwick_ids(con) +load_fangraphs_war(con, years = 2010:2025) +DBI::dbDisconnect(con, shutdown = TRUE) +} +} diff --git a/man/load_statcast.Rd b/man/load_statcast.Rd new file mode 100644 index 0000000..884ccaa --- /dev/null +++ b/man/load_statcast.Rd @@ -0,0 +1,48 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loaders.R +\name{load_statcast} +\alias{load_statcast} +\title{Load Statcast pitch-level data} +\usage{ +load_statcast(con, years, game_type = "R", overwrite = FALSE) +} +\arguments{ +\item{con}{A writable \code{DBIConnection} to the baseball DuckDB database.} + +\item{years}{Integer vector of seasons to fetch (2015 or later required).} + +\item{game_type}{One of \code{"R"} (regular season, default), \code{"P"} +(postseason), or \code{"S"} (spring training).} + +\item{overwrite}{Logical. If \code{TRUE}, drop and recreate +\code{StatcastPitches} before loading the first year. If \code{FALSE} (default), +append new seasons to any existing data.} +} +\value{ +Invisibly returns \code{con}. +} +\description{ +Fetches Baseball Savant pitch-level data via \pkg{baseballr} for each +requested season, appends to a \code{StatcastPitches} table, and creates a +\code{StatcastSeason} view with batter-season aggregates (exit velocity, launch +angle, hard-hit rate, xBA, xwOBA). +} +\details{ +\code{StatcastSeason.mlbam_id} maps to \code{PlayerIDs.mlbam_id} -- join those two +views to attach Lahman \code{playerID} and enable cross-dataset analysis. + +\strong{Data note:} Statcast data is copyright MLB Advanced Media (MLBAM). +This function performs a runtime fetch to your local database only. +Do not redistribute the fetched data. + +Pitch-level data is large -- roughly 700 MB per season uncompressed. +Load one year at a time and allow DuckDB to handle compression on disk. +Statcast data is only available from 2015 onward. +} +\examples{ +\dontrun{ +con <- connect_baseball_db(read_only = FALSE) +load_statcast(con, years = 2023) +DBI::dbDisconnect(con, shutdown = TRUE) +} +} diff --git a/man/match_player_ids.Rd b/man/match_player_ids.Rd new file mode 100644 index 0000000..defce32 --- /dev/null +++ b/man/match_player_ids.Rd @@ -0,0 +1,36 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{match_player_ids} +\alias{match_player_ids} +\title{Match salary data to Lahman playerIDs via multi-pass name matching} +\usage{ +match_player_ids(sal_dt, people_dt, roster_dt = NULL) +} +\arguments{ +\item{sal_dt}{A \code{data.table} with a \code{player} column in "Last, First" format +and a \code{yearID} column. Optionally a \code{team} (display name) or \code{teamID} +(Lahman code) column for roster-constrained matching.} + +\item{people_dt}{A \code{data.table} from \code{Lahman::People} with at least +\code{playerID}, \code{nameFirst}, \code{nameLast}, \code{debut}, \code{finalGame}.} + +\item{roster_dt}{Optional \code{data.table} with \code{playerID}, \code{yearID}, \code{teamID} +columns (e.g., from Appearances). If NULL, built automatically from +Lahman::Batting + Lahman::Pitching when team info is available.} +} +\value{ +\code{sal_dt} with a \code{playerID} column filled where matches succeed. +Modified by reference; also returned invisibly. +} +\description{ +Performs progressive matching from strict to fuzzy: +\enumerate{ +\item Exact "Last, First" match (unique names only) +\item Normalised names (strips accents, suffixes, punctuation, mojibake) +\item Normalised name + active-year filter for ambiguous names +\item Team-constrained: last name within team-year roster (if \code{team} +or \code{teamID} column present). This is the big-picture win -- +constraining to ~50 roster spots resolves nicknames, formal names, +and most ambiguous names without complex normalization. +} +} diff --git a/man/normalise_player_name.Rd b/man/normalise_player_name.Rd new file mode 100644 index 0000000..a1d4e2d --- /dev/null +++ b/man/normalise_player_name.Rd @@ -0,0 +1,23 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{normalise_player_name} +\alias{normalise_player_name} +\title{Normalise a player name for fuzzy matching} +\usage{ +normalise_player_name(x) +} +\arguments{ +\item{x}{Character vector of player names in "Last, First" format.} +} +\value{ +Character vector the same length as \code{x}, normalised. +} +\description{ +Strips suffixes (Jr., Sr., II, III, IV), injury markers (*), accents, +punctuation in initials (J.D. -> J D), apostrophes, and extra whitespace. +Returns lowercase "last, first" form suitable for exact-match joining. +} +\examples{ +normalise_player_name(c("Acuna Jr., Ronald", "Martinez, JD", "Harper, Bryce*")) +# [1] "acuna, ronald" "martinez, j d" "harper, bryce" +} diff --git a/man/setup_baseball_db.Rd b/man/setup_baseball_db.Rd index d628d48..5743335 100644 --- a/man/setup_baseball_db.Rd +++ b/man/setup_baseball_db.Rd @@ -4,7 +4,15 @@ \alias{setup_baseball_db} \title{Build the baseball DuckDB database} \usage{ -setup_baseball_db(dbdir = NULL, sal_file = NULL, overwrite = FALSE) +setup_baseball_db( + dbdir = NULL, + sal_file = NULL, + spotrac_file = NULL, + overwrite = FALSE, + load_chadwick = FALSE, + load_war = FALSE, + war_years = 1985:2025 +) } \arguments{ \item{dbdir}{Path for the output \code{baseball.duckdb} file. Defaults to the @@ -13,23 +21,60 @@ value of the \code{LAHMANS_DBDIR} environment variable if set, otherwise \item{sal_file}{Path to the combined USA Today salary CSV produced by \code{\link[=scrape_salaries]{scrape_salaries()}}. When \code{NULL} (default), looks for -\verb{salaries_*_with_playerID.csv} in the same directory as \code{dbdir}. USA -Today data is not bundled with the package — users must run +\verb{salaries_*_with_playerID.csv} (non-Spotrac) in the same directory as +\code{dbdir}. USA Today data is not bundled -- users must run \code{\link[=scrape_salaries]{scrape_salaries()}} to obtain it.} +\item{spotrac_file}{Path to the combined Spotrac salary CSV produced by +\code{data-raw/salaries.R}. When \code{NULL} (default), looks for +\verb{salaries_spotrac_*_with_playerID.csv} in the same directory as \code{dbdir}. +Spotrac data is not bundled -- users must run \code{data-raw/salaries.R} to +obtain it.} + \item{overwrite}{If \code{TRUE}, drop and recreate existing tables. Default \code{FALSE} aborts if the file already exists.} + +\item{load_chadwick}{If \code{TRUE}, download the Chadwick Bureau player ID +crosswalk via \pkg{baseballr} and create the \code{PlayerIDs} view. +Requires an internet connection and \pkg{baseballr}. Default \code{FALSE}.} + +\item{load_war}{If \code{TRUE}, fetch FanGraphs WAR leaderboards and create +\code{PlayerWAR} and \code{SalaryPerWAR} views. Implies \code{load_chadwick = TRUE}. +Requires an internet connection and \pkg{baseballr}. Default \code{FALSE}.} + +\item{war_years}{Integer vector of seasons to fetch for WAR data. +Defaults to \code{1985:2025} (full salary era).} } \value{ Invisibly returns \code{dbdir}. } \description{ -Writes all Lahman package tables plus scraped USA Today salary data into a -persistent DuckDB file, then creates a \code{SalariesAll} view that unions both -salary sources on a common schema. +Writes all Lahman package tables plus scraped salary data into a persistent +DuckDB file, then creates a \code{SalariesAll} view that unions all salary +sources on a common schema: +\itemize{ +\item \strong{Lahman} (\code{Salaries}): authoritative 1985–2016 +\item \strong{Spotrac} (\code{SalariesSpotrac}): player-level actuals 2017–2021 +\item \strong{USA Today} (\code{SalariesUSAToday}): player-level actuals 2022–2025 +} +} +\details{ +Optionally fetches supplemental data via \pkg{baseballr}: +\itemize{ +\item \code{load_chadwick = TRUE} downloads the Chadwick Bureau player ID crosswalk +and creates the \code{PlayerIDs} view (ODC-BY 1.0 licensed; safe to use locally). +\item \code{load_war = TRUE} additionally fetches FanGraphs WAR leaderboards and +creates the \code{PlayerWAR} and \code{SalaryPerWAR} views. Implies +\code{load_chadwick = TRUE}. Both batting and pitching WAR are available +from FanGraphs for the full salary era (1985+). +} } \examples{ \dontrun{ +# Lahman only setup_baseball_db() + +# With full WAR coverage (requires baseballr and internet) +setup_baseball_db(load_war = TRUE, overwrite = TRUE) } } diff --git a/man/team_name_map.Rd b/man/team_name_map.Rd new file mode 100644 index 0000000..2a5e770 --- /dev/null +++ b/man/team_name_map.Rd @@ -0,0 +1,16 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/utils.R +\name{team_name_map} +\alias{team_name_map} +\title{Map common team display names to Lahman teamID codes} +\usage{ +team_name_map() +} +\value{ +A \code{data.table} with two character columns. +} +\description{ +Returns a \code{data.table} with columns \code{team_name} and \code{teamID}. +Covers all 30 current franchises with common aliases used by +USA Today, Spotrac, and other public salary sources. +} diff --git a/man/write_mcp_config.Rd b/man/write_mcp_config.Rd new file mode 100644 index 0000000..d5be3b3 --- /dev/null +++ b/man/write_mcp_config.Rd @@ -0,0 +1,61 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/mcp_config.R +\name{write_mcp_config} +\alias{write_mcp_config} +\title{Generate or write an MCP server config for baseball.duckdb} +\usage{ +write_mcp_config( + dbdir = NULL, + binary = Sys.which("duckdb-mcp-server"), + config_path = path.expand("~/.copilot/mcp-config.json"), + dry_run = TRUE +) +} +\arguments{ +\item{dbdir}{Path to \code{baseball.duckdb}. Defaults to the \code{LAHMANS_DBDIR} +environment variable, then \verb{~/Documents/Data/baseball/baseball.duckdb}.} + +\item{binary}{Full path to the \code{duckdb-mcp-server} binary. Defaults to +\code{Sys.which("duckdb-mcp-server")}. Install with +\verb{uv tool install duckdb-mcp-server}.} + +\item{config_path}{Path to write the MCP config JSON. Defaults to +\verb{~/.copilot/mcp-config.json} (read by GitHub Copilot CLI).} + +\item{dry_run}{If \code{TRUE} (default), prints the JSON that would be written +without touching any files. Set \code{FALSE} to write.} +} +\value{ +Invisibly returns \code{config_path} when written, or \code{NULL} in dry-run +mode or when the binary is not found. Called for its side effects. +} +\description{ +Writes (or previews) the JSON entry needed to expose \code{baseball.duckdb} as a +local \href{https://github.com/alexmacy/duckdb-mcp-server}{DuckDB MCP server} for +AI tools such as GitHub Copilot CLI and Claude Code. +} +\details{ +The main pain point this solves: Python-based MCP servers do \strong{not} expand +\code{~} in path arguments, so the database path must be absolute. This function +resolves \code{dbdir} to a full path before writing. + +When \code{config_path} already exists, only the \code{"baseball"} key is updated; +all other server entries are preserved. \code{--readonly} is always included in +the server args -- omitting it would allow an AI agent to modify or drop +tables. +} +\examples{ +\dontrun{ +# Preview first -- nothing is written +write_mcp_config() + +# Write when satisfied with the output +write_mcp_config(dry_run = FALSE) + +# Custom paths (e.g. if DB lives elsewhere) +write_mcp_config(dbdir = "/data/baseball/baseball.duckdb", dry_run = FALSE) +} +} +\seealso{ +\code{\link[=setup_baseball_db]{setup_baseball_db()}}, \code{\link[=connect_baseball_db]{connect_baseball_db()}} +} diff --git a/tests/testthat/test-connect.R b/tests/testthat/test-connect.R index 906162e..f6e3f5c 100644 --- a/tests/testthat/test-connect.R +++ b/tests/testthat/test-connect.R @@ -165,3 +165,168 @@ test_that("TeamPayroll and LeagueMedianSalary have expected columns", { expect_true(all(c("yearID", "med_sal", "avg_sal", "n_players") %in% names(lms))) expect_equal(lms$n_players, 3L) }) + + +# --- Stat formula verification ------------------------------------------------ + +test_that("BattingStats computes correct rate stats", { + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + stub_all_tables(con) + + # Known inputs: 500 AB, 140 H, 30 2B, 5 3B, 20 HR, 60 BB, 100 SO, + # 4 HBP, 3 SH, 2 SF, 5 IBB, 10 GIDP + DBI::dbExecute(con, " + INSERT INTO Batting VALUES + ('testbat', 2023, 1, 'NYA', 'AL', 150, 500, 80, 140, + 30, 5, 20, 75, 10, 3, 60, 100, 5, 4, 3, 2, 10)") + + create_stats_views(con) + bs <- db_query(con, "SELECT * FROM BattingStats WHERE playerID = 'testbat'") + + # PA = AB + BB + HBP + SF + SH = 500 + 60 + 4 + 2 + 3 = 569 + expect_equal(bs$PA, 569L) + + # AVG = H / AB = 140 / 500 = 0.280 + expect_equal(bs$AVG, 0.280, tolerance = 1e-6) + + # OBP = (H + BB + HBP) / (AB + BB + HBP + SF) = 204 / 566 + expect_equal(bs$OBP, 204 / 566, tolerance = 1e-6) + + + # SLG = (H + X2B + 2*X3B + 3*HR) / AB = (140+30+10+60) / 500 = 0.480 + expect_equal(bs$SLG, 0.480, tolerance = 1e-6) + + # OPS = OBP + SLG + expect_equal(bs$OPS, 204 / 566 + 0.480, tolerance = 1e-6) + + # ISO = (X2B + 2*X3B + 3*HR) / AB = (30+10+60) / 500 = 0.200 + expect_equal(bs$ISO, 0.200, tolerance = 1e-6) + + # BABIP = (H - HR) / (AB - SO - HR + SF) = 120 / 382 + expect_equal(bs$BABIP, 120 / 382, tolerance = 1e-6) + + # BB% = BB / PA = 60 / 569 + expect_equal(bs$BB_pct, 60 / 569, tolerance = 1e-6) + + # K% = SO / PA = 100 / 569 + expect_equal(bs$K_pct, 100 / 569, tolerance = 1e-6) +}) + +test_that("BattingStats returns NULL for zero-AB player", { + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + stub_all_tables(con) + + # Pitcher with 0 AB, 1 BB (only walked once) + DBI::dbExecute(con, " + INSERT INTO Batting VALUES + ('zeroab', 2023, 1, 'NYA', 'AL', 5, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0)") + + create_stats_views(con) + bs <- db_query(con, "SELECT * FROM BattingStats WHERE playerID = 'zeroab'") + + expect_true(is.na(bs$AVG)) + expect_true(is.na(bs$SLG)) + expect_equal(bs$PA, 1L) # 0 + 1 + 0 + 0 + 0 +}) + +test_that("PitchingStats computes correct rate stats with FIP", { + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + stub_all_tables(con) + + # League totals for FIP constant calculation + DBI::dbExecute(con, " + INSERT INTO Teams (yearID, lgID, teamID, IPouts, HRA, BBA, SOA, ER) + VALUES (2023, 'AL', 'T1', 65000, 2250, 7000, 20000, 9000), + (2023, 'AL', 'T2', 65000, 2250, 7000, 20000, 9000)") + + # Player: 200 IP (600 IPouts), 180 H, 75 ER, 20 HR, 55 BB, 190 SO, 14W-8L, 8 HBP + DBI::dbExecute(con, " + INSERT INTO Pitching VALUES + ('testpit', 2023, 1, 'T1', 'AL', + 14, 8, 32, 32, 2, 1, 0, 600, 180, 75, + 20, 55, 190, 3.38, 0.243, + 3, 5, 8, 1, 820, 0, 90, 2, 3, 8)") + + create_stats_views(con) + ps <- db_query(con, "SELECT * FROM PitchingStats WHERE playerID = 'testpit'") + + # IP = IPouts / 3 = 200.0 + expect_equal(ps$IP, 200.0, tolerance = 1e-6) + + # WHIP = (BB + H) * 3 / IPouts = (55+180)*3/600 = 1.175 + expect_equal(ps$WHIP, 1.175, tolerance = 1e-6) + + # K/9 = SO * 27 / IPouts = 190*27/600 = 8.55 + expect_equal(ps$K_9, 8.55, tolerance = 1e-6) + + # BB/9 = BB * 27 / IPouts = 55*27/600 = 2.475 + expect_equal(ps$BB_9, 2.475, tolerance = 1e-6) + + # HR/9 = HR * 27 / IPouts = 20*27/600 = 0.9 + expect_equal(ps$HR_9, 0.9, tolerance = 1e-6) + + # K/BB = SO / BB = 190/55 + expect_equal(ps$K_BB, 190 / 55, tolerance = 1e-6) + + # Win% = W / (W+L) = 14/22 + expect_equal(ps$Win_pct, 14 / 22, tolerance = 1e-6) + + # FIP = (13*HR + 3*(BB+HBP) - 2*SO) / IP + FIP_constant + # lg_IPouts = 130000, lg_HR = 4500, lg_BB = 14000, lg_SO = 40000, lg_ER = 18000 + lg_ERA <- 18000 * 27.0 / 130000 + fip_c <- lg_ERA - (13.0 * 4500 + 3.0 * 14000 - 2.0 * 40000) / (130000 / 3.0) + expected_fip <- (13.0 * 20 + 3.0 * (55 + 8) - 2.0 * 190) / 200.0 + fip_c + expect_equal(ps$FIP, expected_fip, tolerance = 1e-4) + expect_equal(ps$FIP_constant, fip_c, tolerance = 1e-4) +}) + +test_that("PitchingStats returns NULL for zero-IPouts pitcher", { + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + stub_all_tables(con) + + DBI::dbExecute(con, " + INSERT INTO Pitching (playerID, yearID, stint, teamID, lgID, + W, L, G, GS, CG, SHO, SV, IPouts, H, ER, HR, BB, SO, ERA, BAOpp, + IBB, WP, HBP, BK, BFP, GF, R, SH, SF, GIDP) + VALUES ('zeroip', 2023, 1, 'NYA', 'AL', + 0, 0, 1, 0, 0, 0, 0, 0, 2, 3, 1, 1, 0, NULL, NULL, + 0, 0, 0, 0, 4, 0, 3, 0, 0, 0)") + + create_stats_views(con) + ps <- db_query(con, "SELECT * FROM PitchingStats WHERE playerID = 'zeroip'") + + expect_true(is.na(ps$WHIP)) + expect_true(is.na(ps$K_9)) + expect_true(is.na(ps$FIP)) + expect_equal(ps$IP, 0.0) +}) + +test_that("FieldingStats computes correct metrics", { + con <- DBI::dbConnect(duckdb::duckdb(), dbdir = ":memory:") + on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) + stub_all_tables(con) + + # Known: PO=300, A=150, E=10, G=150, InnOuts=3600 + DBI::dbExecute(con, " + INSERT INTO Fielding VALUES + ('testfld', 2023, 1, 'NYA', 'AL', 'SS', 150, 148, + 3600, 300, 150, 10, 40, + NULL, NULL, NULL, NULL, NULL)") + + create_stats_views(con) + fs <- db_query(con, "SELECT * FROM FieldingStats WHERE playerID = 'testfld'") + + # FPCT = (PO + A) / (PO + A + E) = 450 / 460 + expect_equal(fs$FPCT, 450 / 460, tolerance = 1e-6) + + # RF/9 = (PO + A) * 27 / InnOuts = 450 * 27 / 3600 = 3.375 + expect_equal(fs$RF_9, 3.375, tolerance = 1e-6) + + # RF/G = (PO + A) / G = 450 / 150 = 3.0 + expect_equal(fs$RF_G, 3.0, tolerance = 1e-6) +}) diff --git a/tests/testthat/test-loaders.R b/tests/testthat/test-loaders.R index dd50bb2..bddb045 100644 --- a/tests/testthat/test-loaders.R +++ b/tests/testthat/test-loaders.R @@ -141,13 +141,13 @@ test_that("war_reliable is TRUE for position player in any era", { expect_true(res$war_reliable[1L]) }) -test_that("war_reliable is FALSE for pitcher season before 2002", { +test_that("war_reliable is FALSE for pitcher season before 1985", { con <- DBI::dbConnect(duckdb::duckdb(), ":memory:") on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) stub_war_tables(con) - insert_batter_season(con, "smithpi01", "2002", 1999L, bat_war = 0.1) - # Mark as pitcher (has pitching appearances in 1999) - DBI::dbExecute(con, "INSERT INTO Pitching VALUES ('smithpi01', 1999, 30)") + insert_batter_season(con, "smithpi01", "2002", 1984L, bat_war = 0.1) + # Mark as pitcher (has pitching appearances in 1984) + DBI::dbExecute(con, "INSERT INTO Pitching VALUES ('smithpi01', 1984, 30)") lahmanTools:::create_war_views_(con) res <- DBI::dbGetQuery(con, "SELECT war_reliable FROM SalaryPerWAR") @@ -155,7 +155,7 @@ test_that("war_reliable is FALSE for pitcher season before 2002", { expect_false(res$war_reliable[1L]) }) -test_that("war_reliable is TRUE for pitcher season 2002 or later", { +test_that("war_reliable is TRUE for pitcher season 1985 or later", { con <- DBI::dbConnect(duckdb::duckdb(), ":memory:") on.exit(DBI::dbDisconnect(con, shutdown = TRUE)) stub_war_tables(con) diff --git a/tests/testthat/test-scrape.R b/tests/testthat/test-scrape.R new file mode 100644 index 0000000..e775c37 --- /dev/null +++ b/tests/testthat/test-scrape.R @@ -0,0 +1,22 @@ +# --- scrape_salaries input validation ----------------------------------------- + +test_that("scrape_salaries() rejects unknown year slugs", { + expect_error( + scrape_salaries(years = 2030), + "No URL slug defined" + ) +}) + +test_that("scrape_salaries() rejects mixed known and unknown years", { + expect_error( + scrape_salaries(years = c(2024, 2030)), + "2030" + ) +}) + +test_that("scrape_salaries() error message includes the bad year", { + expect_error( + scrape_salaries(years = c(2016, 2026)), + "2016.*2026|2026.*2016" + ) +}) diff --git a/tests/testthat/test-utils.R b/tests/testthat/test-utils.R index 58ebfa1..535df4e 100644 --- a/tests/testthat/test-utils.R +++ b/tests/testthat/test-utils.R @@ -59,3 +59,258 @@ test_that("db_query passes extra arguments to dbGetQuery", { result <- db_query(con, "SELECT * FROM nums ORDER BY n") expect_equal(nrow(result), 100L) }) + + +# --- normalise_player_name --------------------------------------------------- + +test_that("normalise_player_name strips asterisks", { + expect_equal(normalise_player_name("Harper, Bryce*"), "harper, bryce") +}) + +test_that("normalise_player_name strips suffixes", { + expect_equal(normalise_player_name("Acuna Jr., Ronald"), "acuna, ronald") + expect_equal(normalise_player_name("Guerrero Sr., Vladimir"), "guerrero, vladimir") + expect_equal(normalise_player_name("Smith III, John"), "smith, john") +}) + +test_that("normalise_player_name transliterates accents", { + expect_equal(normalise_player_name("Acu\u00f1a, Ronald"), "acuna, ronald") +}) + +test_that("normalise_player_name fixes UTF-8 mojibake", { + # "ó" is the mojibake for "ó" (UTF-8 bytes read as Latin-1) + mojibake <- "Can\u00c3\u00b3, Robinson" + expect_equal(normalise_player_name(mojibake), "cano, robinson") +}) + +test_that("normalise_player_name normalises initials", { + expect_equal(normalise_player_name("Martinez, J.D."), "martinez, j d") + expect_equal(normalise_player_name("Martinez, JD"), "martinez, j d") + expect_equal(normalise_player_name("Realmuto, JT"), "realmuto, j t") +}) + +test_that("normalise_player_name strips apostrophes", { + expect_equal(normalise_player_name("d'Arnaud, Travis"), "darnaud, travis") +}) + +test_that("normalise_player_name handles vectors", { + input <- c("Harper, Bryce*", "Acuna Jr., Ronald", "Smith, John") + result <- normalise_player_name(input) + expect_length(result, 3L) + expect_equal(result, c("harper, bryce", "acuna, ronald", "smith, john")) +}) + + +# --- match_player_ids -------------------------------------------------------- + +# Helper to build a minimal People data.table for testing +make_test_people <- function() { + data.table::data.table( + playerID = c("harpebr03", "acunaro01", "martij06", + "darntra01", "smithjo99"), + nameFirst = c("Bryce", "Ronald", "J. D.", + "Travis", "John"), + nameLast = c("Harper", "Acu\u00f1a", "Martinez", + "d'Arnaud", "Smith"), + debut = c("2012-04-28", "2018-04-25", "2011-08-11", + "2013-04-26", "2020-07-24"), + finalGame = c(NA, NA, NA, + "2024-09-29", NA) + ) +} + +test_that("match_player_ids Pass 1: exact match works", { + people <- make_test_people() + sal <- data.table::data.table( + player = "Smith, John", + yearID = 2022L + ) + match_player_ids(sal, people) + expect_equal(sal$playerID, "smithjo99") +}) + +test_that("match_player_ids Pass 2: normalised match catches suffixes + accents", { + people <- make_test_people() + sal <- data.table::data.table( + player = c("Acuna Jr., Ronald", "d'Arnaud, Travis"), + yearID = c(2023L, 2022L) + ) + match_player_ids(sal, people) + expect_equal(sal$playerID, c("acunaro01", "darntra01")) +}) + +test_that("match_player_ids Pass 2: asterisks stripped", { + people <- make_test_people() + sal <- data.table::data.table( + player = "Harper, Bryce*", + yearID = 2023L + ) + match_player_ids(sal, people) + expect_equal(sal$playerID, "harpebr03") +}) + +test_that("match_player_ids leaves truly unmatched as NA", { + people <- make_test_people() + sal <- data.table::data.table( + player = "Nonexistent, Player", + yearID = 2023L + ) + match_player_ids(sal, people) + expect_true(is.na(sal$playerID)) +}) + +test_that("match_player_ids Pass 3: disambiguates by year", { + # Two people with the same name but different eras + people <- data.table::data.table( + playerID = c("johnjr01", "johnjr02"), + nameFirst = c("Junior", "Junior"), + nameLast = c("Johnson", "Johnson"), + debut = c("1990-04-01", "2018-04-01"), + finalGame = c("2005-09-30", NA) + ) + sal <- data.table::data.table( + player = "Johnson, Junior", + yearID = 2022L + ) + match_player_ids(sal, people) + expect_equal(sal$playerID, "johnjr02") +}) + + +# --- match_player_ids Pass 4: team-constrained -------------------------------- + +test_that("match_player_ids Pass 4a: resolves by team + last name", { + people <- data.table::data.table( + playerID = c("garcica01", "garcica02", "smithjo01"), + nameFirst = c("Carlos", "Carlos", "John"), + nameLast = c("Garcia", "Garcia", "Smith"), + debut = c("2018-04-01", "2019-04-01", "2015-04-01"), + finalGame = c(NA, NA, NA) + ) + # garcica01 is on NYN, garcica02 is on HOU + roster <- data.table::data.table( + playerID = c("garcica01", "garcica02", "smithjo01"), + yearID = c(2023L, 2023L, 2023L), + teamID = c("NYN", "HOU", "NYN") + ) + sal <- data.table::data.table( + player = "Garcia, Carlos", + yearID = 2023L, + team = "N.Y. Mets" + ) + match_player_ids(sal, people, roster_dt = roster) + expect_equal(sal$playerID, "garcica01") +}) + +test_that("match_player_ids Pass 4b: disambiguates same-lastname teammates by initial", { + people <- data.table::data.table( + playerID = c("smithal01", "smithbo01"), + nameFirst = c("Alex", "Bob"), + nameLast = c("Smith", "Smith"), + debut = c("2020-04-01", "2019-04-01"), + finalGame = c(NA, NA) + ) + roster <- data.table::data.table( + playerID = c("smithal01", "smithbo01"), + yearID = c(2023L, 2023L), + teamID = c("NYA", "NYA") + ) + sal <- data.table::data.table( + player = "Smith, Alex", + yearID = 2023L, + team = "N.Y. Yankees" + ) + match_player_ids(sal, people, roster_dt = roster) + expect_equal(sal$playerID, "smithal01") +}) + +test_that("match_player_ids Pass 4: uses teamID column when present", { + people <- data.table::data.table( + playerID = c("jonesad01", "jonesad02"), + nameFirst = c("Adam", "Adam"), + nameLast = c("Jones", "Jones"), + debut = c("2016-04-01", "2020-04-01"), + finalGame = c(NA, NA) + ) + roster <- data.table::data.table( + playerID = c("jonesad01", "jonesad02"), + yearID = c(2023L, 2023L), + teamID = c("BAL", "SFN") + ) + # When teamID is already present, should skip team_name_map() lookup + sal <- data.table::data.table( + player = "Jones, Adam", + yearID = 2023L, + teamID = "SFN" + ) + match_player_ids(sal, people, roster_dt = roster) + expect_equal(sal$playerID, "jonesad02") +}) + +test_that("match_player_ids Pass 4: leaves unmatched when player not on team roster", { + # Two active John Does -- ambiguous in Pass 1-3 + # Neither is on NYN roster, so Pass 4 also fails + people <- data.table::data.table( + playerID = c("doejn01", "doejn02"), + nameFirst = c("John", "John"), + nameLast = c("Doe", "Doe"), + debut = c("2015-04-01", "2018-04-01"), + finalGame = c(NA, NA) + ) + roster <- data.table::data.table( + playerID = c("doejn01", "doejn02"), + yearID = c(2023L, 2023L), + teamID = c("BOS", "HOU") + ) + sal <- data.table::data.table( + player = "Doe, John", + yearID = 2023L, + team = "N.Y. Mets" + ) + match_player_ids(sal, people, roster_dt = roster) + expect_true(is.na(sal$playerID)) +}) + + +# --- team_name_map ----------------------------------------------------------- + +test_that("team_name_map returns expected structure", { + tmap <- team_name_map() + expect_s3_class(tmap, "data.table") + expect_true(all(c("team_name", "teamID") %in% names(tmap))) + expect_true(nrow(tmap) > 0L) +}) + +test_that("team_name_map covers all 30 current MLB franchises", { + tmap <- team_name_map() + current_30 <- c("ARI", "ATL", "BAL", "BOS", "CHN", "CHA", "CIN", "CLE", + "COL", "DET", "HOU", "KCA", "LAA", "LAN", "MIA", "MIL", + "MIN", "NYN", "NYA", "OAK", "PHI", "PIT", "SDN", "SFN", + "SEA", "SLN", "TBA", "TEX", "TOR", "WAS") + for (tid in current_30) { + expect_true(tid %in% tmap$teamID, + label = paste("missing franchise:", tid)) + } +}) + +test_that("team_name_map has no duplicate team_name entries", { + tmap <- team_name_map() + dupes <- tmap$team_name[duplicated(tmap$team_name)] + expect_true(length(dupes) == 0L, + label = paste("duplicate aliases:", toString(dupes))) +}) + +test_that("team_name_map maps common abbreviations correctly", { + tmap <- team_name_map() + expect_equal(tmap[team_name == "NYM", teamID], "NYN") + expect_equal(tmap[team_name == "NYY", teamID], "NYA") + expect_equal(tmap[team_name == "CHC", teamID], "CHN") + expect_equal(tmap[team_name == "CHW", teamID], "CHA") + expect_equal(tmap[team_name == "LAD", teamID], "LAN") + expect_equal(tmap[team_name == "STL", teamID], "SLN") + expect_equal(tmap[team_name == "KC", teamID], "KCA") + expect_equal(tmap[team_name == "TB", teamID], "TBA") + expect_equal(tmap[team_name == "SF", teamID], "SFN") + expect_equal(tmap[team_name == "SD", teamID], "SDN") + expect_equal(tmap[team_name == "WSH", teamID], "WAS") +})