luceydav · luceydav · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -20,6 +20,7 @@ Imports:
     rvest,
     xml2
 Suggests:
+    connections,
     dm,
     re2,
     ggplot2,

diff --git a/R/connect.R b/R/connect.R
@@ -7,18 +7,33 @@
 #'   `.Renviron`.
 #' @param read_only Open in read-only mode. Default `TRUE` for analysis;
 #'   use `FALSE` only when rebuilding via [setup_baseball_db()].
+#' @param connections_pane If `TRUE`, registers the connection with the RStudio
+#'   Connections pane via the `connections` package (must be installed). The
+#'   returned object is still a standard `DBIConnection` and works identically
+#'   with all DBI and `db_query()` calls; the only difference is that RStudio
+#'   will display it in the Connections pane with a browse/disconnect button.
+#'   Silently falls back to a plain `DBI::dbConnect()` when `connections` is
+#'   not installed.
 #'
-#' @return A `DBIConnection` object. Close with
+#' @return A `DBIConnection` object (or a `connections`-wrapped equivalent when
+#'   `connections_pane = TRUE`). Close with
 #'   `DBI::dbDisconnect(con, shutdown = TRUE)`.
 #' @export
 #'
 #' @examples
 #' \dontrun{
+#' # Plain connection
 #' con <- connect_baseball_db()
 #' DBI::dbListTables(con)
 #' DBI::dbDisconnect(con, shutdown = TRUE)
+#'
+#' # Visible in RStudio Connections pane
+#' con <- connect_baseball_db(connections_pane = TRUE)
+#' DBI::dbListTables(con)
+#' DBI::dbDisconnect(con, shutdown = TRUE)
 #' }
-connect_baseball_db <- function(dbdir = NULL, read_only = TRUE) {
+connect_baseball_db <- function(dbdir = NULL, read_only = TRUE,
+                                connections_pane = FALSE) {
   if (is.null(dbdir)) {
     dbdir <- Sys.getenv(
       "LAHMANS_DBDIR",
@@ -28,5 +43,16 @@ connect_baseball_db <- function(dbdir = NULL, read_only = TRUE) {
   if (!file.exists(dbdir)) {
     stop(dbdir, " not found. Run setup_baseball_db() first.")
   }
-  DBI::dbConnect(duckdb::duckdb(), dbdir = dbdir, read_only = read_only)
+
+  if (connections_pane && requireNamespace("connections", quietly = TRUE)) {
+    connections::connection_open(duckdb::duckdb(),
+                                 dbdir = dbdir,
+                                 read_only = read_only)
+  } else {
+    if (connections_pane) {
+      message("Package 'connections' not installed; ",
+              "using plain DBI::dbConnect() instead.")
+    }
+    DBI::dbConnect(duckdb::duckdb(), dbdir = dbdir, read_only = read_only)
+  }
 }
diff --git a/R/setup_db.R b/R/setup_db.R
@@ -1,17 +1,25 @@
 #' Build the baseball DuckDB database
 #'
-#' Writes all Lahman package tables plus scraped USA Today salary data into a
-#' persistent DuckDB file, then creates a `SalariesAll` view that unions both
-#' salary sources on a common schema.
+#' Writes all Lahman package tables plus scraped salary data into a persistent
+#' DuckDB file, then creates a `SalariesAll` view that unions all salary
+#' sources on a common schema:
+#' - **Lahman** (`Salaries`): authoritative 1985–2016
+#' - **Spotrac** (`SalariesSpotrac`): player-level actuals 2017–2021
+#' - **USA Today** (`SalariesUSAToday`): player-level actuals 2022–2025
 #'
 #' @param dbdir Path for the output `baseball.duckdb` file. Defaults to the
 #'   value of the `LAHMANS_DBDIR` environment variable if set, otherwise
 #'   `~/Documents/Data/baseball/baseball.duckdb`.
 #' @param sal_file Path to the combined USA Today salary CSV produced by
 #'   [scrape_salaries()]. When `NULL` (default), looks for
-#'   `salaries_*_with_playerID.csv` in the same directory as `dbdir`. USA
-#'   Today data is not bundled with the package — users must run
+#'   `salaries_*_with_playerID.csv` (non-Spotrac) in the same directory as
+#'   `dbdir`. USA Today data is not bundled — users must run
 #'   [scrape_salaries()] to obtain it.
+#' @param spotrac_file Path to the combined Spotrac salary CSV produced by
+#'   `data-raw/salaries.R`. When `NULL` (default), looks for
+#'   `salaries_spotrac_*_with_playerID.csv` in the same directory as `dbdir`.
+#'   Spotrac data is not bundled — users must run `data-raw/salaries.R` to
+#'   obtain it.
 #' @param overwrite If `TRUE`, drop and recreate existing tables. Default
 #'   `FALSE` aborts if the file already exists.
 #'
@@ -22,9 +30,10 @@
 #' \dontrun{
 #' setup_baseball_db()
 #' }
-setup_baseball_db <- function(dbdir    = NULL,
-                               sal_file = NULL,
-                               overwrite = FALSE) {
+setup_baseball_db <- function(dbdir         = NULL,
+                               sal_file      = NULL,
+                               spotrac_file  = NULL,
+                               overwrite     = FALSE) {
   if (is.null(dbdir)) {
     dbdir <- Sys.getenv(
       "LAHMANS_DBDIR",
@@ -51,35 +60,92 @@ setup_baseball_db <- function(dbdir    = NULL,
     message(sprintf("  %-25s %d rows", nm, nrow(dt)))
   }))
 
-  # -- USA Today scraped salaries -----------------------------------------------
+  # -- Spotrac scraped salaries (2017-2021) -------------------------------------
+  if (is.null(spotrac_file)) {
+    data_dir <- dirname(dbdir)
+    candidates <- list.files(data_dir,
+                             pattern = "salaries_spotrac_.*with_playerID\\.csv$",
+                             full.names = TRUE)
+    spotrac_file <- if (length(candidates)) candidates[[1L]] else ""
+  }
+  has_spotrac <- nzchar(spotrac_file) && file.exists(spotrac_file)
+  if (!has_spotrac) {
+    message("  Spotrac salary file not found -- SalariesSpotrac will not be loaded.\n",
+            "  Run data-raw/salaries.R then rerun setup_baseball_db(spotrac_file = <path>).")
+  } else {
+    sp <- data.table::fread(spotrac_file)
+    DBI::dbWriteTable(con, "SalariesSpotrac", sp, overwrite = TRUE)
+    message(sprintf("  %-25s %d rows", "SalariesSpotrac", nrow(sp)))
+  }
+
+  # -- USA Today scraped salaries (2022+) ---------------------------------------
   if (is.null(sal_file)) {
     data_dir <- dirname(dbdir)
     candidates <- list.files(data_dir, pattern = "salaries.*with_playerID\\.csv$",
                              full.names = TRUE)
+    candidates <- candidates[!grepl("spotrac", candidates, fixed = TRUE)]
     sal_file <- if (length(candidates)) candidates[[1L]] else ""
   }
-  if (!nzchar(sal_file) || !file.exists(sal_file)) {
-    warning("Scraped salary file not found -- SalariesUSAToday will not be loaded.\n",
+  has_usatoday <- nzchar(sal_file) && file.exists(sal_file)
+  if (!has_usatoday) {
+    warning("USA Today salary file not found -- SalariesUSAToday will not be loaded.\n",
             "Run scrape_salaries() then rerun setup_baseball_db(sal_file = <path>).")
   } else {
     sal <- data.table::fread(sal_file)
     sal[, average_annual := as.numeric(gsub("[$,]", "", average_annual))]
     DBI::dbWriteTable(con, "SalariesUSAToday", sal, overwrite = TRUE)
     message(sprintf("  %-25s %d rows", "SalariesUSAToday", nrow(sal)))
+  }
 
-    # -- SalariesAll view: Lahman (<=2016) + USA Today expanded with imputed years --
-    DBI::dbExecute(con, "
-      CREATE OR REPLACE VIEW SalariesAll AS
-      WITH
+  # -- SalariesAll view: union all three salary sources -------------------------
+  # Only build the view if at least one supplemental source loaded
+  if (has_spotrac || has_usatoday) {
+    # Normalise USA Today full-name team strings to Lahman 3-letter teamIDs.
+    usa_team_case <- "
+          CASE team
+            WHEN 'Angels'            THEN 'LAA'  WHEN 'L.A. Angels'       THEN 'LAA'
+            WHEN 'Arizona'           THEN 'ARI'  WHEN 'Diamondbacks'      THEN 'ARI'
+            WHEN 'Astros'            THEN 'HOU'  WHEN 'Houston'           THEN 'HOU'
+            WHEN 'Athletics'         THEN 'OAK'  WHEN 'Oakland'           THEN 'OAK'
+            WHEN 'Atlanta'           THEN 'ATL'  WHEN 'Braves'            THEN 'ATL'
+            WHEN 'Baltimore'         THEN 'BAL'  WHEN 'Orioles'           THEN 'BAL'
+            WHEN 'Blue Jays'         THEN 'TOR'  WHEN 'Toronto'           THEN 'TOR'
+            WHEN 'Boston'            THEN 'BOS'  WHEN 'Red Sox'           THEN 'BOS'
+            WHEN 'Brewers'           THEN 'MIL'  WHEN 'Milwaukee'         THEN 'MIL'
+            WHEN 'Cardinals'         THEN 'SLN'  WHEN 'St. Louis'         THEN 'SLN'
+            WHEN 'Chi. Cubs'         THEN 'CHN'  WHEN 'Chicago Cubs'      THEN 'CHN'
+            WHEN 'Cubs'              THEN 'CHN'
+            WHEN 'Chic. White Sox'   THEN 'CHA'  WHEN 'Chicago White Sox' THEN 'CHA'
+            WHEN 'White Sox'         THEN 'CHA'
+            WHEN 'Cincinnati'        THEN 'CIN'  WHEN 'Reds'              THEN 'CIN'
+            WHEN 'Cleveland'         THEN 'CLE'  WHEN 'Guardians'         THEN 'CLE'
+            WHEN 'Colorado'          THEN 'COL'  WHEN 'Rockies'           THEN 'COL'
+            WHEN 'Detroit'           THEN 'DET'  WHEN 'Tigers'            THEN 'DET'
+            WHEN 'Dodgers'           THEN 'LAN'  WHEN 'L.A. Dodgers'      THEN 'LAN'
+            WHEN 'Giants'            THEN 'SFN'  WHEN 'San Francisco'     THEN 'SFN'
+            WHEN 'Kansas City'       THEN 'KCA'  WHEN 'Royals'            THEN 'KCA'
+            WHEN 'Mariners'          THEN 'SEA'  WHEN 'Seattle'           THEN 'SEA'
+            WHEN 'Marlins'           THEN 'MIA'  WHEN 'Miami'             THEN 'MIA'
+            WHEN 'Mets'              THEN 'NYN'  WHEN 'N.Y. Mets'         THEN 'NYN'
+            WHEN 'Minnesota'         THEN 'MIN'  WHEN 'Twins'             THEN 'MIN'
+            WHEN 'Nationals'         THEN 'WAS'  WHEN 'Washington'        THEN 'WAS'
+            WHEN 'N.Y. Yankees'      THEN 'NYA'  WHEN 'Yankees'           THEN 'NYA'
+            WHEN 'Philadelphia'      THEN 'PHI'  WHEN 'Phillies'          THEN 'PHI'
+            WHEN 'Pittsburgh'        THEN 'PIT'  WHEN 'Pirates'           THEN 'PIT'
+            WHEN 'Rangers'           THEN 'TEX'  WHEN 'Texas'             THEN 'TEX'
+            WHEN 'Rays'              THEN 'TBA'  WHEN 'Tampa Bay'         THEN 'TBA'
+            WHEN 'San Diego'         THEN 'SDN'  WHEN 'Padres'            THEN 'SDN'
+            ELSE team
+          END"
 
+    usatoday_cte <- if (has_usatoday) paste0("
       -- Parse each USA Today row: clean AAV and extract contract start/end year.
       -- Handles patterns: 'N (YYYY-YY)', 'NN (YYYY-YY)', 'N(YYYY-YY)'.
       -- Rows with NULL years (1-year deals) get NULL c_start/c_end and pass
       -- through as actual records via the FULL JOIN below.
       usa_parsed AS (
         SELECT
-          playerID,
-          team                                                              AS teamID,
+          playerID,", usa_team_case, "                                        AS teamID,
           salary::DOUBLE                                                    AS salary,
           yearID,
           average_annual::DOUBLE                                            AS aav,
@@ -96,17 +162,12 @@ setup_baseball_db <- function(dbdir    = NULL,
         FROM SalariesUSAToday
         WHERE playerID IS NOT NULL
       ),
-
-      -- One actual record per player-season (handles any duplicate scrapes).
       actual AS (
         SELECT DISTINCT ON (playerID, yearID)
           playerID, teamID, salary, yearID
         FROM usa_parsed
         ORDER BY playerID, yearID
       ),
-
-      -- One contract per player + contract span; most-recent scrape's AAV wins
-      -- when the same contract appears in multiple scraped years.
       contracts AS (
         SELECT DISTINCT ON (playerID, c_start, c_end)
           playerID, teamID, aav, c_start, c_end
@@ -117,20 +178,11 @@ setup_baseball_db <- function(dbdir    = NULL,
           AND aav     IS NOT NULL
         ORDER BY playerID, c_start, c_end, yearID DESC
       ),
-
-      -- Expand each contract to one row per year in the contract range.
       contract_years AS (
         SELECT c.playerID, c.teamID, c.aav, gs::INTEGER AS yearID
         FROM contracts c,
           LATERAL (SELECT unnest(generate_series(c.c_start, c.c_end)) AS gs) t
       ),
-
-      -- Merge contract-year grid with actual records.
-      --   * Matching rows  -> use actual salary (is_actual = TRUE)
-      --   * Contract only  -> impute with AAV    (is_actual = FALSE)
-      --   * Actual only    -> 1-year deals / NULL years pass through as-is
-      -- DISTINCT ON deduplicates the rare case where overlapping contracts
-      -- produce two rows for the same player-season; actual wins.
       usa_expanded AS (
         SELECT DISTINCT ON (playerID, yearID)
           COALESCE(a.playerID, cy.playerID)::VARCHAR AS playerID,
@@ -142,24 +194,71 @@ setup_baseball_db <- function(dbdir    = NULL,
         FULL JOIN actual a
           ON cy.playerID = a.playerID AND cy.yearID = a.yearID
         ORDER BY playerID, yearID, is_actual DESC
-      )
-
-      -- Lahman (authoritative through 2016)
-      SELECT playerID, yearID, teamID, lgID,
-             salary::DOUBLE AS salary,
-             'lahman'       AS source,
-             TRUE           AS is_actual
-      FROM   Salaries
+      )") else ""
 
+    usatoday_union <- if (has_usatoday) "
       UNION ALL
-
-      -- USA Today: actual salaries (2022-2025) + AAV-imputed contract years
+      -- USA Today: actual salaries (2022+) + AAV-imputed contract years
       SELECT playerID, yearID, teamID, NULL AS lgID,
              salary,
              'usatoday'     AS source,
              is_actual
-      FROM   usa_expanded
-    ")
+      FROM   usa_expanded" else ""
+
+    # Normalise Spotrac abbreviations that differ from Lahman teamIDs.
+    spotrac_team_case <- "
+          CASE team
+            WHEN 'CHC' THEN 'CHN'  WHEN 'CHW' THEN 'CHA'
+            WHEN 'KC'  THEN 'KCA'  WHEN 'LAD' THEN 'LAN'
+            WHEN 'NYM' THEN 'NYN'  WHEN 'NYY' THEN 'NYA'
+            WHEN 'SD'  THEN 'SDN'  WHEN 'SF'  THEN 'SFN'
+            WHEN 'STL' THEN 'SLN'  WHEN 'TB'  THEN 'TBA'
+            WHEN 'WSH' THEN 'WAS'
+            ELSE team
+          END"
+
+    spotrac_union <- if (has_spotrac) paste0("
+      UNION ALL
+      -- Spotrac: actual player salaries 2017-2021, filtered to MLB-contract players
+      -- (salary >= MLB minimum for that year) for consistency with Lahman.
+      -- Minor leaguers on 40-man rosters are excluded.
+      -- MLB minimums: 2017=$535K, 2018=$545K, 2019=$555K, 2020=$208K(prorated 60g), 2021=$570.5K
+      SELECT playerID::VARCHAR,
+             yearID::INTEGER,", spotrac_team_case, "::VARCHAR  AS teamID,
+             NULL::VARCHAR    AS lgID,
+             salary::DOUBLE,
+             'spotrac'        AS source,
+             TRUE             AS is_actual
+      FROM   SalariesSpotrac
+      WHERE  playerID IS NOT NULL
+        AND  salary >= CASE yearID
+               WHEN 2017 THEN 535000
+               WHEN 2018 THEN 545000
+               WHEN 2019 THEN 555000
+               WHEN 2020 THEN 208000
+               WHEN 2021 THEN 570500
+               ELSE 500000
+             END") else ""
+
+    # Comma after last CTE only if usa CTEs are present
+    cte_comma <- if (has_usatoday) "," else ""
+
+    DBI::dbExecute(con, paste0("
+      CREATE OR REPLACE VIEW SalariesAll AS
+      WITH
+      -- placeholder CTE so the WITH clause is always valid even with no usa CTEs
+      _dummy AS (SELECT 1 AS x)", cte_comma,
+      usatoday_cte, "
+
+      -- Lahman (authoritative through 2016)
+      SELECT playerID, yearID, teamID, lgID,
+             salary::DOUBLE AS salary,
+             'lahman'       AS source,
+             TRUE           AS is_actual
+      FROM   Salaries",
+      spotrac_union,
+      usatoday_union
+    ))
     message(sprintf("  %-25s (view)", "SalariesAll"))
   }
 

diff --git a/README.md b/README.md
@@ -8,6 +8,26 @@
 
 The design choice matters at scale: DuckDB executes columnar SQL directly on the file, so aggregations over 150 years of play-by-play data that would choke an in-memory data frame run in milliseconds.
 
+## Data model
+
+20 Lahman tables loaded into DuckDB, colour-coded by functional group.
+Arrows show primary-key → foreign-key relationships.
+
+![lahmanTools schema](man/figures/lahmanTools_schema.svg)
+
+| Colour | Group | Tables |
+|--------|-------|--------|
+| 🔵 Blue | Identity spine | `People`, `Teams`, `TeamsFranchises` |
+| 🟢 Green | Regular-season | `Batting`, `Pitching`, `Fielding`, `Appearances` |
+| 🟣 Purple | Postseason | `BattingPost`, `PitchingPost`, `FieldingPost` |
+| 🩷 Pink | Salary | `Salaries` |
+| 🟠 Orange | Honours | `AwardsPlayers`, `AllstarFull`, `HallOfFame` |
+| 🩵 Teal | Management | `Managers` |
+| 🤎 Brown | College | `CollegePlaying`, `Schools` |
+| ⚫ Grey | Lookups | `Parks`, `HomeGames`, `SeriesPost` |
+
+To regenerate after schema changes: `Rscript analysis/schema_dm.R` (requires `dm`, `DiagrammeR`, `DiagrammeRsvg`).
+
 ## Requirements
 
 - R ≥ 4.1.0
@@ -126,26 +146,6 @@ db_query(con, "
 | `SalariesAll` | `Salaries`, `SalariesUSAToday` | Lahman (≤ 2016) + USA Today (2017+); AAV imputation for multi-year contracts |
 | `TeamPayroll` | `SalariesAll` | Total and per-position payroll by team-season |
 
-## Data model
-
-20 Lahman tables loaded into DuckDB, colour-coded by functional group.
-Arrows show primary-key → foreign-key relationships.
-
-![lahmanTools schema](man/figures/lahmanTools_schema.svg)
-
-| Colour | Group | Tables |
-|--------|-------|--------|
-| 🔵 Blue | Identity spine | `People`, `Teams`, `TeamsFranchises` |
-| 🟢 Green | Regular-season | `Batting`, `Pitching`, `Fielding`, `Appearances` |
-| 🟣 Purple | Postseason | `BattingPost`, `PitchingPost`, `FieldingPost` |
-| 🩷 Pink | Salary | `Salaries` |
-| 🟠 Orange | Honours | `AwardsPlayers`, `AllstarFull`, `HallOfFame` |
-| 🩵 Teal | Management | `Managers` |
-| 🤎 Brown | College | `CollegePlaying`, `Schools` |
-| ⚫ Grey | Lookups | `Parks`, `HomeGames`, `SeriesPost` |
-
-To regenerate after schema changes: `Rscript analysis/schema_dm.R` (requires `dm`, `DiagrammeR`, `DiagrammeRsvg`).
-
 ## Package structure
 
 ```
-Original file line number
+Diff line change
@@ Expand Up / @@ -20,6 +20,7 @@ Imports: @@
         rvest,
         xml2
     Suggests:
+        connections,
         dm,
         re2,
         ggplot2,
@@ Expand Down @@