From ef26f15e2ecd9379ad840c756fe72a00cb0652b7 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 06:28:14 +0000 Subject: [PATCH 1/6] =?UTF-8?q?GAM=20chapter=20(squashed)=20=E2=80=94=20cl?= =?UTF-8?q?oses=20#775?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Initial draft of a chapter on generalized additive models. Squashed from the original 69-commit history because the action's recursive submodule fetch was failing on commits that referenced a now-deleted latex-macros SHA. See the PR description for the chapter outline, new bib entries, and caveats. --- _quarto-book.yml | 1 + _quarto-website.yml | 3 + .../_exr-gams-applied.qmd | 160 ++++++++ .../_exr-gams-conceptual.qmd | 76 ++++ .../_sec_basis_functions.qmd | 96 +++++ .../_sec_gam_example.qmd | 209 ++++++++++ .../_sec_gam_fitting.qmd | 118 ++++++ .../_sec_gam_inference.qmd | 122 ++++++ .../generalized-additive-models/_sec_gams.qmd | 113 ++++++ .../generalized-additive-models/_sec_lab.qmd | 378 ++++++++++++++++++ .../_sec_local_regression.qmd | 97 +++++ .../_sec_method_comparison.qmd | 244 +++++++++++ .../_sec_motivation.qmd | 83 ++++ .../_sec_polynomial.qmd | 98 +++++ .../_sec_regression_splines.qmd | 161 ++++++++ .../_sec_smoothing_splines.qmd | 177 ++++++++ .../_sec_step_functions.qmd | 98 +++++ chapters/generalized-additive-models.qmd | 78 ++++ references.bib | 45 ++- 19 files changed, 2356 insertions(+), 1 deletion(-) create mode 100644 _subfiles/generalized-additive-models/_exr-gams-applied.qmd create mode 100644 _subfiles/generalized-additive-models/_exr-gams-conceptual.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_basis_functions.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_gam_example.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_gam_fitting.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_gam_inference.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_gams.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_lab.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_local_regression.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_method_comparison.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_motivation.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_polynomial.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_regression_splines.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_smoothing_splines.qmd create mode 100644 _subfiles/generalized-additive-models/_sec_step_functions.qmd create mode 100644 chapters/generalized-additive-models.qmd diff --git a/_quarto-book.yml b/_quarto-book.yml index 0910c98925..b8aad36c1b 100644 --- a/_quarto-book.yml +++ b/_quarto-book.yml @@ -29,6 +29,7 @@ book: - chapters/causal-inference.qmd - chapters/predictor-selection.qmd - chapters/intro-multilevel-models.qmd + - chapters/generalized-additive-models.qmd - part: chapters/time-to-event-models.qmd chapters: - chapters/intro-to-survival-analysis.qmd diff --git a/_quarto-website.yml b/_quarto-website.yml index 47d8ff5e11..3284c72e67 100644 --- a/_quarto-website.yml +++ b/_quarto-website.yml @@ -12,6 +12,7 @@ project: - chapters/causal-inference.qmd - chapters/predictor-selection.qmd - chapters/intro-multilevel-models.qmd + - chapters/generalized-additive-models.qmd - chapters/time-to-event-models.qmd - chapters/intro-to-survival-analysis.qmd - chapters/proportional-hazards-models.qmd @@ -77,6 +78,8 @@ website: href: chapters/predictor-selection.qmd - text: "Multilevel Models" href: chapters/intro-multilevel-models.qmd + - text: "Generalized Additive Models" + href: chapters/generalized-additive-models.qmd - text: "---" - text: "Time to Event Models" href: chapters/time-to-event-models.qmd diff --git a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd new file mode 100644 index 0000000000..09b313c94f --- /dev/null +++ b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd @@ -0,0 +1,160 @@ +::: notes +Applied exercises adapted from @james2021islr2e [§7.9] and +@hastie2009esl2e [§9.1]. +::: + +### Polynomial regression and step functions (ISLRv2 Applied 7) {#sec-exr-wage-poly} + +::::{#exr-wage-polynomial-cv} + +Using the `Wage` dataset from the `ISLR2` package [@james2021islr2e]: + +**(a)** Use 10-fold cross-validation to choose the polynomial degree $d^*$ for +predicting `wage` from `age`. Plot the CV errors for degrees 1 through 10. + +**(b)** Fit the degree-$d^*$ polynomial and plot it with a 95% confidence band. + +**(c)** Separately, choose the number of cut-points $K^*$ for a step-function +model using cross-validation. Plot the chosen step-function fit. + +:::: + + +::::{#sol-wage-polynomial-cv} + +::: {.panel-tabset} + +#### R + +```{r} +#| code-fold: true +#| eval: false +library(ISLR2) +library(boot) +library(ggplot2) +set.seed(1) +cv_poly <- vapply(1:10, function(d) { + cv.glm(Wage, glm(wage ~ poly(age, d), data = Wage), K = 10)$delta[1] +}, numeric(1)) +best_d <- which.min(cv_poly) +cat("CV-selected degree:", best_d, "\n") +plot(1:10, cv_poly, type = "b", xlab = "Degree", ylab = "CV MSE", + main = "Polynomial degree selection") +points(best_d, cv_poly[best_d], col = "red", pch = 19) +``` + +#### Python + +```{python} +#| eval: false +#| python.reticulate: false +import numpy as np +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression +from sklearn.pipeline import Pipeline +from sklearn.model_selection import cross_val_score +from ISLP import load_data +Wage = load_data("Wage") +X = Wage["age"].values.reshape(-1, 1) +y = Wage["wage"].values +cv_errs = [-cross_val_score( + Pipeline([("poly", PolynomialFeatures(d, include_bias=False)), + ("reg", LinearRegression())]), + X, y, cv=10, scoring="neg_mean_squared_error").mean() for d in range(1, 11)] +best_d = int(np.argmin(cv_errs)) + 1 +print(f"CV-selected degree: {best_d}") +``` + +::: + +:::: + + +### Backfitting algorithm (ESL §9.1.1 / ISLRv2 Applied 11) {#sec-exr-backfitting} + +::::{#exr-backfitting} + +The **backfitting algorithm** [@hastie2009esl2e, §9.1.1] fits an additive model +$Y = \a + \sum_j f_j(X_j) + \eps$ by iteratively smoothing partial residuals. + +For $p=2$ predictors with linear component smoothers $\hat f_j(X_j) = \hat\b_j X_j$: + +1. Initialise: $\hat\a = \bar{y}$, $\hat f_1 = \hat f_2 = 0$. +2. Repeat until convergence (Gauss–Seidel order — re-centre each + $\hat f_j$ immediately, so the centred $\hat f_1$ is used when + computing $r_2$): + a. $r_1 \leftarrow y - \hat\a - \hat f_2(x_2)$; regress $r_1$ on + $x_1$ to update $\hat f_1$; $\hat f_1 \leftarrow \hat f_1 - + \overline{\hat f_1}$. + b. $r_2 \leftarrow y - \hat\a - \hat f_1(x_1)$; regress $r_2$ on + $x_2$ to update $\hat f_2$; $\hat f_2 \leftarrow \hat f_2 - + \overline{\hat f_2}$. + +**(a)** Implement this on simulated data with known true coefficients. + +**(b)** Verify the coefficients match `lm(y ~ x1 + x2)`. + +:::: + + +::::{#sol-backfitting} + +::: {.panel-tabset} + +#### R + +```{r} +#| code-fold: true +#| eval: false +set.seed(1) +n <- 100 +x1 <- rnorm(n) +x2 <- rnorm(n) +y <- 2 + 3 * x1 - 1.5 * x2 + rnorm(n, sd = 0.5) +alpha_hat <- mean(y) +f1 <- f2 <- rep(0, n) +for (iter in seq_len(500)) { + r1 <- y - alpha_hat - f2 + b1 <- coef(lm(r1 ~ x1))[2] + f1 <- b1 * x1 + f1 <- f1 - mean(f1) + r2 <- y - alpha_hat - f1 + b2 <- coef(lm(r2 ~ x2))[2] + f2 <- b2 * x2 + f2 <- f2 - mean(f2) +} +lm_fit <- lm(y ~ x1 + x2) +cat(sprintf("Backfitting: b1=%.4f, b2=%.4f\n", b1, b2)) +cat(sprintf("lm(): b1=%.4f, b2=%.4f\n", + coef(lm_fit)[2], coef(lm_fit)[3])) +``` + +#### Python + +```{python} +#| eval: false +#| python.reticulate: false +import numpy as np +rng = np.random.default_rng(1) +n = 100; x1 = rng.standard_normal(n); x2 = rng.standard_normal(n) +y = 2 + 3*x1 - 1.5*x2 + rng.normal(0, 0.5, n) +alpha_hat = y.mean(); f1 = np.zeros(n); f2 = np.zeros(n) +for _ in range(500): + r1 = y - alpha_hat - f2 + b1 = float(np.cov(x1, r1)[0, 1] / np.var(x1, ddof=1)) + f1 = b1*x1 - (b1*x1).mean() + r2 = y - alpha_hat - f1 + b2 = float(np.cov(x2, r2)[0, 1] / np.var(x2, ddof=1)) + f2 = b2*x2 - (b2*x2).mean() +X_ols = np.column_stack([np.ones(n), x1, x2]) +b_ols = np.linalg.lstsq(X_ols, y, rcond=None)[0] +print(f"Backfitting: b1={b1:.4f}, b2={b2:.4f}") +print(f"OLS: b1={b_ols[1]:.4f}, b2={b_ols[2]:.4f}") +``` + +::: + +The coefficients agree, confirming backfitting converges to OLS when +all component smoothers are linear [@hastie2009esl2e, §9.1.1]. + +:::: \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd b/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd new file mode 100644 index 0000000000..59596d90f1 --- /dev/null +++ b/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd @@ -0,0 +1,76 @@ +::: notes +Exercises adapted from @james2021islr2e [§7.9] and @hastie2009esl2e [§9.1]. +::: + +### Piecewise polynomial continuity (ISLRv2 Conceptual 1) {#sec-exr-continuity} + +::::{#exr-spline-continuity} + +A cubic polynomial with one interior knot at $\xi$ uses the truncated power basis: + +$$f(X) = \b_0 + \b_1 X + \b_2 X^2 + \b_3 X^3 + \b_4 (X-\xi)_+^3$$ + +where $(u)_+ = \max(u,0)$. + +**(a)** Write $f(X)$ as two cubics: $f_1(X)$ for $X < \xi$ and $f_2(X)$ for $X \ge \xi$. + +**(b)** Show $f_1(\xi) = f_2(\xi)$ (continuity). + +**(c)** Show the first derivatives match at $\xi$. + +**(d)** Show the second derivatives match at $\xi$. + +**(e)** Do the third derivatives match? What order of smoothness does this cubic spline achieve? + +:::: + + +::::{#sol-spline-continuity} + +**(a)** For $X < \xi$: $(X-\xi)_+ = 0$, giving $f_1(X) = \b_0 + \b_1 X + \b_2 X^2 + \b_3 X^3$. +For $X \ge \xi$: $f_2(X) = f_1(X) + \b_4(X-\xi)^3$. + +**(b)** At $X=\xi$: $(X-\xi)_+^3 = 0$, so $f_2(\xi) = f_1(\xi)$. $\checkmark$ + +**(c)** $f_2^{\prime}(X) = f_1^{\prime}(X) + 3\b_4(X-\xi)_+^2$. +At $X=\xi$: $(X-\xi)_+^2 = 0$, so the derivatives agree. $\checkmark$ + +**(d)** $f_2^{\prime\prime}(X) = f_1^{\prime\prime}(X) + 6\b_4(X-\xi)_+$. +At $X=\xi$: $(X-\xi)_+ = 0$. $\checkmark$ + +**(e)** $f_1^{\prime\prime\prime} = 6\b_3$ and $f_2^{\prime\prime\prime} = 6\b_3 + 6\b_4$ for $X > \xi$. +These differ (unless $\b_4=0$). The truncated power basis gives a **cubic spline**: +continuous with continuous 1st and 2nd derivatives, but a jump in the 3rd derivative at each knot. + +:::: + + +### Smoothing spline penalty behavior (ISLRv2 Conceptual 2) {#sec-exr-smoothing-penalty} + +::::{#exr-smoothing-penalty} + +The smoothing spline minimizes + +$$\hat{g} = \arg\min_g \left\{ \sum_{i=1}^n (y_i - g(x_i))^2 + \l \int g^{\prime\prime}(t)^2 \, dt \right\}.$$ + +**(a)** What is the limiting form of $\hat{g}$ as $\l \to \infty$? + +**(b)** What is the limiting form as $\l \to 0$? + +**(c)** For $\l \to 0$, is the training RSS equal to zero? + +:::: + + +::::{#sol-smoothing-penalty} + +**(a)** As $\l \to \infty$ the roughness penalty dominates. The only $g$ with +$\int g^{\prime\prime}(t)^2\,dt = 0$ is a linear function $g(x) = a + bx$. +So $\hat{g}$ converges to the ordinary least-squares line. + +**(b)** As $\l \to 0$ there is no penalty; the minimizer interpolates every +training point: $g(x_i) = y_i$ for all $i$. + +**(c)** Yes. Interpolation means training RSS $= 0$ -- the overfit extreme. + +:::: diff --git a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd new file mode 100644 index 0000000000..14cfbab9f5 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd @@ -0,0 +1,96 @@ +::: notes +Adapted from @james2021islr2e [§7.3] and @wood2017generalized [§4]. +::: + +Polynomial regression and step-function regression look very +different, but they share a common structure: both replace $X$ with +a fixed set of **transformations** of $X$, then fit a linear +regression in those transformations. This shared structure is the +**basis function** framework, and it's the foundation for splines. + +::::{#def-basis-expansion} +#### Basis expansion + +A **basis expansion** of $X$ is a fixed (data-free) set of functions +$b_1(X), \ldots, b_K(X)$. The basis-expansion regression model +replaces $X$ with these transformations: + +$$y_i = \b_0 + \b_1 b_1(x_i) + \b_2 b_2(x_i) + \cdots + \b_K b_K(x_i) + \eps_i.$$ + +The basis functions $b_k(\cdot)$ are chosen ahead of time; the +coefficients $\b_k$ are estimated by least squares (or, more +generally, by maximum likelihood for a GLM with link +$g(\E{Y \mid X}) = \b_0 + \sum_k \b_k b_k(X)$). +:::: + +::::{#exm-basis-polynomial} +#### Polynomial regression as a basis expansion + +For polynomial regression of degree $d$ (@def-polynomial-regression), +the basis functions are the monomials + +$$b_k(X) = X^k, \quad k = 1, \ldots, d.$$ +:::: + +::::{#exm-basis-step} +#### Step-function regression as a basis expansion + +For step-function regression with cutpoints $c_1 < \cdots < c_K$ +(@def-step-function-regression), the basis functions are the bin +indicators + +$$b_k(X) = \indicp{c_{k-1} < X \le c_k}, \quad k = 1, \ldots, K+1.$$ +:::: + +## Why the basis-function abstraction matters + +Once we see polynomial and step-function regression as instances of +the same template, we can ask: *which other basis functions might be +worth trying?* Two answers turn out to be especially useful: + +- **Truncated power basis functions** — + $b_k(X) = (X - \xi_k)_+^d$ for chosen knots $\xi_k$. These give the + **regression spline** family, which enforces piecewise-polynomial + structure with continuity and smoothness constraints at the knots + (developed in the next section). +- **B-spline basis functions** — a numerically well-conditioned + alternative basis that spans the same space of piecewise + polynomials as the truncated power basis, but with locally + supported basis functions (each $b_k$ is nonzero only on a small + region of the $x$-axis). + +Both of these are still **linear** in the coefficients, so all of the +GLM machinery — likelihood, standard errors, Wald tests, AIC, etc. — +applies unchanged. The only thing that changes is the design matrix: +each row $i$ of the design matrix has entries +$1, b_1(x_i), \ldots, b_K(x_i)$ instead of $1, x_i$. + +## The general estimation recipe + +For a continuous response $Y$ and a basis expansion +$b_1(\cdot), \ldots, b_K(\cdot)$: + +1. **Build the design matrix.** Construct the $n \times (K+1)$ + matrix + + $$\mat{B} = \sbmat{ + 1 & b_1(x_1) & \cdots & b_K(x_1) \\ + 1 & b_1(x_2) & \cdots & b_K(x_2) \\ + \vdots & \vdots & \ddots & \vdots \\ + 1 & b_1(x_n) & \cdots & b_K(x_n) + }.$$ + +2. **Fit by least squares (or MLE for a GLM).** + The OLS estimator $\hat\b = (\tp{\mat{B}} \mat{B})^{-1} \tp{\mat{B}} \mat{y}$ + minimizes the residual sum of squares — exactly the linear + regression machinery from earlier chapters, just with a different + design matrix. + +3. **Plot the fitted curve.** For a dense grid of $x$ values, compute + $\hat y(x) = \hat\b_0 + \sum_k \hat\b_k b_k(x)$ and plot. Standard + errors and pointwise confidence bands come from the usual linear + model formulas applied to $\mat{B}$. + +The remaining design decision is **which basis functions to use** — +and that choice is what distinguishes regression splines from +smoothing splines from local regression from GAMs. \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd new file mode 100644 index 0000000000..223a0cd61f --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd @@ -0,0 +1,209 @@ +:::: {.content-hidden} +::: callout-warning +This section is illustrative only — the dataset is a built-in R +example, not an Epi 204 case study. Replace with a course-appropriate +exposure when the chapter is finalized. +::: +:::: + +We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting +diabetes status (`type`) from continuous risk factors. The exposures +of interest are body mass index (`bmi`), plasma glucose (`glu`), and +age (`age`). + +## Setup {.unnumbered} + +```r +library(MASS) # Pima.tr dataset +library(mgcv) # gam(), s() +library(ggplot2) +library(gratia) # tidy plots for gam fits (optional) + +data(Pima.tr) +str(Pima.tr) +# 'data.frame': 200 obs. of 8 variables: (sample values omitted; columns are) +# $ npreg: int number of pregnancies +# $ glu : int plasma glucose concentration (2 h OGTT) +# $ bp : int diastolic blood pressure (mm Hg) +# $ skin : int triceps skinfold thickness (mm) +# $ bmi : num body mass index (kg/m^2) +# $ ped : num diabetes pedigree function +# $ age : int age (years) +# $ type : Factor w/ 2 levels "No","Yes": diabetes diagnosis + +Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes") +``` + +## Linear baseline {.unnumbered} + +Start with an ordinary logistic GLM as a baseline: + +```r +m_lin <- glm( + diabetes ~ bmi + glu + age, + family = binomial(), data = Pima.tr +) +summary(m_lin) +``` + +Each coefficient is a log-odds increment per unit of the predictor — +a single number for the slope of risk across the entire range of +BMI, glucose, and age. That's a strong assumption, especially for +age, where diabetes risk is well known to be nonlinear. + +## GAM with smooth terms {.unnumbered} + +Now refit with smooths on each continuous predictor: + +```r +m_gam <- gam( + diabetes ~ s(bmi) + s(glu) + s(age), + family = binomial(), + data = Pima.tr, + method = "REML" +) +summary(m_gam) +``` + +The summary will report effective degrees of freedom for each +smooth. In a typical run on `Pima.tr` you'll see EDFs in the +1.5–4 range — modest nonlinearity but not extreme. + +## Reading the EDFs {.unnumbered} + +For each smooth, ask: + +- **EDF $\approx 1$**: the smooth has collapsed to a straight line. + No evidence the relationship is nonlinear. You can drop the + smoother and use a linear term without loss. +- **EDF in 2–5**: meaningfully nonlinear. The fitted curve is more + than just a line but doesn't have lots of bumps. +- **EDF $> 5$**: substantially nonlinear, possibly with multiple + inflections. Check whether the data actually support that wiggle + (look at the plot; check `gam.check()`'s `k-index`). + +If `gam.check(m_gam)` flags any smooth as having `edf` close to +`k'`, bump `k` up: + +```r +m_gam2 <- gam( + diabetes ~ s(bmi) + s(glu, k = 20) + s(age), + family = binomial(), + data = Pima.tr, + method = "REML" +) +``` + +## Plotting partial-effect curves {.unnumbered} + +```r +plot(m_gam, pages = 1, shade = TRUE, seWithMean = TRUE) +``` + +produces one panel per smooth, each showing the partial effect on +the log-odds scale with a 2-SE band. `seWithMean = TRUE` adds the +variance of the smooth's overall level (its centering offset) to +the pointwise standard errors. Without it, the band is the variance +of the *deviation* of the smooth around its mean; with it, the band +is the variance of the smooth itself — the right interpretation when +comparing partial-effect curves across predictors on a publication +figure. + +For a cleaner ggplot rendering, `gratia::draw()` is a one-line +alternative: + +```r +gratia::draw(m_gam) +``` + +## Translating to the response scale {.unnumbered} + +Partial-effect plots are on the **link scale** (log-odds for +binary). To plot predicted probability vs a single predictor, hold +the others at typical values, build the Wald CI on the **link** scale, +then back-transform with `plogis()`: + +```r +typical <- data.frame( + bmi = median(Pima.tr$bmi), + glu = median(Pima.tr$glu), + age = median(Pima.tr$age) +) +grid <- data.frame( + bmi = seq(min(Pima.tr$bmi), max(Pima.tr$bmi), length.out = 200), + glu = typical$glu, + age = typical$age +) +pred <- predict(m_gam, newdata = grid, + type = "link", se.fit = TRUE) +grid$prob <- plogis(pred$fit) +grid$lower <- plogis(pred$fit - 1.96 * pred$se.fit) +grid$upper <- plogis(pred$fit + 1.96 * pred$se.fit) + +ggplot(grid, aes(x = bmi, y = prob)) + + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) + + geom_line() + + labs(x = "BMI (kg/m²)", + y = "Predicted probability of diabetes", + caption = "Other predictors held at sample medians") + + theme_minimal() +``` + +We compute the CI on the link scale and back-transform because the +delta-method SE on `type = "response"` doesn't respect the [0, 1] +boundary — Wald intervals built from it can extend outside (0, 1) +near the tails of the curve and have non-symmetric coverage. Working +on the link scale and applying the monotone `plogis()` transformation +keeps the interval inside (0, 1) by construction and preserves +asymmetry around the boundaries (wider near 0 and 1, narrower near +the middle), which is the right qualitative shape for a probability. + +## Comparing GLM and GAM {.unnumbered} + +```r +# Refit the GAM with method = "ML" for AIC comparison. +# REML and ML likelihoods are on different scales; comparing AIC +# from glm() (always ML) with a REML-fitted gam() mixes criteria. +# Use select = TRUE to enable term-level smoothness selection (the +# null-space penalty), so a smooth that's really just a line — or +# zero — isn't penalized as more complex than the linear baseline. +m_gam_ml <- gam( + diabetes ~ s(bmi) + s(glu) + s(age), + family = binomial(), + data = Pima.tr, + method = "ML", + select = TRUE +) +AIC(m_lin, m_gam_ml) +``` + +For `Pima.tr`, the GAM typically improves AIC by a few units — +consistent with the modest EDFs above 1 we saw in the summary. +Whether that improvement is worth the added model complexity is a +judgment call: if the smooths all have EDF very close to 1, the GLM +is a parsimonious choice; if EDFs are clearly $> 1$ and the curve +plots show a substantively interesting shape (e.g. a J in BMI), the +GAM is the better report. + +The refit with `method = "ML"` is required only for the AIC +comparison; the REML fit `m_gam` above remains the preferred fit for +final inference and plotting [@wood2017generalized, §6.2]. + +## Reporting in a manuscript {.unnumbered} + +A typical methods-section sentence is: + +> Continuous predictors were modeled using thin-plate regression +> splines (the default basis for `mgcv::s()`) via penalized restricted +> maximum likelihood (`mgcv` *version*, R *version*; +> @wood2017generalized). The number of basis functions per predictor +> was set to 10 and reduced by penalization to the effective degrees +> of freedom shown in Table X. + +Replace *version* with the actual versions you used — these can +drift between drafts. + +Pair this with a table giving each smooth's EDF and approximate +p-value, plus a figure of the partial-effect curves on the +link scale (with the response-scale plot of the predictor of +primary interest as a supplementary figure). diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd new file mode 100644 index 0000000000..971694b45f --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd @@ -0,0 +1,118 @@ +::: notes +Adapted from @wood2017generalized [§6]. +::: + +The standard tool for fitting GAMs in R is the `mgcv` package +[@wood2017generalized]. We'll use `mgcv::gam()`, which fits GAMs by +**penalized likelihood** with automatic smoothness selection via +REML or GCV. + +## The `gam()` call + +The basic syntax mirrors `lm()`/`glm()`, with smoothers introduced by +`s()`: + +```r +library(mgcv) + +m <- gam( + y ~ s(age) + s(bmi) + sex + s(year, k = 10), + data = d, + family = binomial(), + method = "REML" +) +``` + +Reading the formula: + +- `s(age)`, `s(bmi)`, `s(year, k = 10)` are **smooth terms**. Each + introduces a basis expansion plus a smoothness penalty. The + default basis (`bs = "tp"`, thin-plate regression spline) is + almost always a good choice. `k = 10` requests a basis of + dimension up to 10 — large enough that the smoothness penalty, + not the basis, controls flexibility. +- `sex` is a **parametric term**, treated exactly as in a GLM. + +You can mix smooth and parametric terms freely. Parametric terms +contribute conventional regression coefficients (with $t$/$z$ +statistics in the model summary); smooth terms contribute fitted +curves with effective-degrees-of-freedom summaries. + +## Useful `s()` arguments + +| Argument | What it controls | When to change from default | +|:--|:--|:--| +| `bs` | basis type (`"tp"`, `"cr"`, `"ps"`, ...) | rarely; `"tp"` is the default and the safe choice | +| `k` | basis dimension (max EDF + 1) | when `gam.check()` flags `k-index < 1` | +| `by` | factor for per-level smooths | predictor-by-factor interactions | +| `m` | order of the penalty derivative | rarely; default penalizes 2nd derivative | +| `pc` | constraint point | when you need the smooth to pass through 0 at a chosen $x$ | + +## Smoothness selection: REML vs GCV + +`mgcv::gam()` picks the smoothness parameter(s) $\l$ automatically by +optimizing one of: + +- **REML** (restricted maximum likelihood). Treats the penalty as + arising from a normal prior on the spline coefficients and selects + $\l$ by the marginal likelihood. **Not the default** — `gam()` + defaults to `method = "GCV.Cp"`, so you have to pass + `method = "REML"` explicitly (as in the examples in this + section). Generally **more stable** than GCV, less prone to + undersmoothing. +- **ML** (marginal likelihood). Similar to REML but with a slightly + different effective sample size adjustment. +- **GCV** (generalized cross-validation). The default. Computes + faster than REML but can occasionally select a much wigglier + smooth than is justified by the data, especially when $n$ is + small or signal is weak. + +**Recommendation: use `method = "REML"`** for routine work. If +you're comparing GAMs to GLMs via AIC, also set `select = TRUE` to +turn on smoothness selection at the term level. `select = TRUE` +adds an extra null-space penalty to each smooth (separate from the +wiggliness penalty) that can shrink an *entire* smooth term to zero +— effectively functioning as variable selection +[@wood2017generalized, §5.4]. Without it, the ordinary wiggliness +penalty can only shrink a smooth down to a straight line, never to +identically zero; `select = TRUE` extends that floor all the way +down. This makes AIC comparisons more honest because the GAM is +then allowed to "discover" that a predictor's smooth shouldn't be +there at all. + +## Checking the fit + +After fitting, run `gam.check()` to verify: + +```r +gam.check(m) +``` + +Output includes: + +- A **convergence diagnostic** (REML/GCV optimization should converge + to a stationary point — usually it does without intervention). +- **Basis-dimension check** (`k'`, `edf`, `k-index`, `p-value`) for + each smooth. If `edf` is close to `k'`, the smooth is hitting + its maximum allowed flexibility — bump `k` up and refit. If + `k-index < 1` with a small p-value, the basis may be too small. +- **Residual diagnostics**: QQ plot, residuals vs. linear predictor, + histogram of residuals, observed vs. fitted. Same intuition as for + a GLM. + +## Plotting smooths + +```r +plot(m, pages = 1, shade = TRUE) +``` + +produces a panel of partial-effect plots — one per smooth term — +with a 2-SE pointwise confidence band. The y-axis is on the **linear +predictor scale** by default, centered to mean zero (the smooth's +contribution to $g(\E{Y})$ above its mean), not on the response +scale. + +For a publication-quality plot on the response scale, use +`predict(m, type = "response", se.fit = TRUE)` on a grid of +$X$ values with the other predictors held at typical values. + diff --git a/_subfiles/generalized-additive-models/_sec_gam_inference.qmd b/_subfiles/generalized-additive-models/_sec_gam_inference.qmd new file mode 100644 index 0000000000..aec9f5a8f1 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_gam_inference.qmd @@ -0,0 +1,122 @@ +::: notes +Adapted from @wood2017generalized [§6.10–6.12]. +::: + +GAM output from `summary()` looks similar to GLM output but has a +few quirks driven by the penalized fit. Here's what to read off it. + +## Reading `summary(gam_fit)` + +A `summary.gam` object splits the report into two sections: + +### Parametric coefficients + +These are the non-smooth terms in the formula — intercept, +categorical predictors, terms entered as `x` rather than `s(x)`. +Their output is identical to a GLM: + +``` +Parametric coefficients: + Estimate Std. Error z value Pr(>|z|) +(Intercept) -2.143 0.087 -24.63 < 2e-16 *** +sexFemale 0.342 0.061 5.61 2.0e-08 *** +``` + +Read these exactly as you would GLM coefficients. + +### Smooth terms + +``` +Approximate significance of smooth terms: + edf Ref.df Chi.sq p-value +s(age) 4.82 5.93 76.2 3.6e-14 *** +s(bmi) 2.13 2.78 18.4 2.4e-04 *** +``` + +Each column reports: + +- **`edf`** — effective degrees of freedom of the smooth, as defined + in @def-effective-degrees-of-freedom. An EDF of 1.0 means the smooth + has been penalized down to a straight line; higher values mean more + wiggliness. +- **`Ref.df`** — a reference degrees of freedom used in constructing + the p-value (slightly larger than EDF; the difference accounts for + the penalized-fit uncertainty in selecting $\l$). +- **`Chi.sq`** — the test statistic for "is this smooth different + from zero?". +- **`p-value`** — an *approximate* p-value for that null. The standard + caveat applies: the smoothing parameter was selected from the + data, so the test statistic doesn't have an exact $\chi^2$ + distribution — `mgcv` uses a careful approximation accounting for + the selection, but it's still an approximation [@wood2013pvalues]. + +### Adjusted $R^2$ and deviance explained + +For Gaussian GAMs, the summary reports an adjusted $R^2$; for GLM +families, **deviance explained** (the GLM analog of $R^2$). Same +interpretation as in the GLM chapters. + +## Confidence bands for smooth terms + +The pointwise CIs that `plot.gam()` draws come from the Bayesian +posterior distribution of the spline coefficients +[@wood2017generalized, §6.10]. They have the property: + +- They are **roughly the right Bayesian credible intervals** for the + smooth at each point. +- They are **roughly the right frequentist confidence bands** + "across the function" — i.e. averaged over $x$, the band has + approximately 95% coverage of the true $f(x)$. +- They **are not** pointwise frequentist CIs in the strict sense; + at any single $x$, coverage can be a few percentage points off. + +For routine plotting and reporting in epidemiology, these bands are +the right thing to display. If you need something stricter (e.g. +simultaneous bands valid across the entire $x$-range), posterior +simulation from the spline coefficients gives the right answer: +`gratia::smooth_samples()` (gratia >= 0.9) returns draws of the smooth +from the posterior on the spline coefficients (equivalently, you can +roll your own via `mgcv::rmvn()` with the fitted `Vp`). See +[@wood2017generalized, §6.10] for the underlying theory. +Note that `simulate(m_gam)` is a *different* tool — it returns +response-scale draws (posterior predictive samples of new +observations), not smooth-coefficient draws, so it's appropriate for +predictive checks but not for constructing simultaneous bands on the +smooth. + +## Comparing models + +Two GAMs can be compared in the usual ways: + +- **AIC**: `AIC(m1, m2)`. Lower is better. `mgcv` reports the AIC + with an effective-degrees-of-freedom correction that's + appropriate for the penalized fit. +- **Likelihood-ratio test**: `anova(m1, m2, test = "Chisq")`. Same + caveat as the smooth-term p-values — the smoothing parameter was + data-chosen, so the LRT is approximate. Use with the same care. +- **`gratia::compare_smooths()`** (from the companion `gratia` + package) — a more comprehensive comparison framework, including + visual overlays of smooths and difference plots. + +## When to prefer a GLM over a GAM + +GAMs are strictly more flexible than GLMs, but that flexibility costs +something: + +- **Communication.** A linear effect is easier to describe in a + table of coefficients than a curve. +- **Power.** With small $n$ and a smooth that's truly linear, the + GAM wastes degrees of freedom on penalty selection. The EDF will + drop toward 1, but the inference loses a little efficiency. +- **Reporting standards.** Some epidemiology journals still require + linear or quadratic exposure terms. + +A reasonable workflow: + +1. Plot the data + a LOESS smoother for an initial scatterplot + diagnostic. +2. Fit a GAM with `s(x)` for every continuous predictor where + nonlinearity is plausible. +3. Read the EDF: if EDF $\approx 1$, the linear form is fine — drop + `s()` and refit as a GLM for parsimony. +4. If EDF $> 1$, report the GAM with a curve plot. diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd new file mode 100644 index 0000000000..4be0d099e8 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_gams.qmd @@ -0,0 +1,113 @@ +::: notes +Adapted from @james2021islr2e [§7.7] and @wood2017generalized [§4–7]. +::: + +We're now ready to assemble the **generalized additive model**. The +idea is to take any of the nonlinear smoothers we've built up — +regression splines, smoothing splines, local regression — and let +**each predictor have its own smoother**, with the smoother +contributions added together on the link scale. + +## The GAM formulation + +::::{#def-gam} +#### Generalized additive model + +Let $Y$ be a response variable with conditional distribution from +an exponential family (Gaussian, binomial, Poisson, etc.), and let +$X = (X_1, \ldots, X_p)$ be predictors. A **generalized additive +model** (GAM) specifies + +$$g(\E{Y \mid X = x}) = \b_0 + f_1(x_1) + f_2(x_2) + \cdots + f_p(x_p),$$ + +where: + +- $g$ is a link function (identity for Gaussian, logit for binary, + log for Poisson — same as in a GLM). +- Each $f_j$ is a **smooth function** of the $j$-th predictor, + estimated from the data. +- $\b_0$ is an intercept (sometimes absorbed into one of the $f_j$'s + via a centering constraint). + +A "smooth function" in practice means a regression spline, +smoothing spline, or other smoother — anything that can be written +as a basis expansion $f_j(x_j) = \sum_k \b_{jk} b_{jk}(x_j)$ with +the coefficients $\b_{jk}$ estimated jointly across all predictors. +:::: + +::::{#exm-gam} +#### GAM for diabetes risk + +For the Pima diabetes data with continuous predictors BMI, plasma +glucose, and age, a GAM on the logit scale is + +$$\logitf{\Pr(\text{diabetes} = 1 \mid \text{bmi}, \text{glu}, \text{age})} += \b_0 + f_1(\text{bmi}) + f_2(\text{glu}) + f_3(\text{age}),$$ + +where each $f_j$ is a smoothing spline estimated by REML. The three +partial-effect curves can be plotted separately, revealing, for +example, whether diabetes risk rises linearly with age or accelerates +nonlinearly. +:::: + +The model class includes ordinary linear regression as a special +case (every $f_j$ is just $\b_j x_j$) and ordinary GLMs as another +special case (same — every $f_j$ linear, with the right link +function). The point of the GAM extension is to let *some* $f_j$ +be nonlinear while keeping the predictor contributions additive. + +## What "additive" buys you + +The additive structure makes GAMs strictly less flexible than fully +nonparametric models like + +$$g(\E{Y \mid X = x}) = f(x_1, x_2, \ldots, x_p),$$ + +which allow arbitrary interactions among predictors. In return, GAMs +gain: + +- **Interpretability per predictor.** Each $f_j$ can be plotted on its + own axis as the predicted shift in the linear predictor as $X_j$ + varies, holding the other predictors fixed. The same partial-effect + intuition that makes linear-model coefficients useful carries over. +- **Avoidance of the curse of dimensionality.** Fitting a smooth in + one dimension takes $O(n)$ data; fitting it in $p$ dimensions + jointly takes $O(n^p)$ data. Additivity is a structural assumption + that makes the problem tractable even with several predictors. +- **Direct extension of GLM workflow.** The link function $g$, + exponential-family likelihood, residual diagnostics, and AIC + comparisons all work essentially unchanged — GAMs are GLMs in + spline coordinates. + +## Adding interactions and tensor smooths + +The strict-additivity restriction can be relaxed when needed. The +two standard escape hatches are: + +- **Predictor-by-factor interactions.** A smooth that varies by + category — e.g. an age effect that's different for men and women + — is fit as $f_j(X_j) + f_{j,\text{by}}(X_j) \cdot \indicp{\text{sex} = M}$ + via `s(age, by = sex)` in `mgcv`. When `sex` is a factor, this + syntax requires an accompanying parametric main effect for + identifiability (the smooth is centered to mean zero in each + level, so the level-specific intercepts have to come from + somewhere else); the correct idiom is: + ```r + gam(y ~ sex + s(age, by = sex), data = d, method = "REML") + ``` + The result is still additive in the *factors*, but allows + separate smooth shapes within each level. +- **Tensor-product smooths.** A genuine two-dimensional smooth surface + in two continuous predictors — e.g. $f(\text{age}, \text{bmi})$ — + fit via `te(age, bmi)` in `mgcv`. The smoothing penalty is applied + separately to wiggliness in each marginal direction, so the smooth + doesn't assume the two predictors are on comparable scales — the + right choice when predictors have different units (e.g. age in years, + BMI in kg/m²). An isotropic 2D smooth (`s(age, bmi)`) would penalize + the same amount of "wiggliness per year of age" as per "unit of + BMI", which is rarely substantively defensible. + +For routine epidemiology work, the additive model is usually the +right starting point; tensor smooths come up mainly when interactions +between continuous predictors are scientifically central (e.g. +space–time models, age–calendar-time mortality surfaces). \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd new file mode 100644 index 0000000000..d45ef30668 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_lab.qmd @@ -0,0 +1,378 @@ +::: notes +Lab adapted from @james2021islr2e [§7.8] and @james2023islp [§7.8]. +::: + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-setup-r +#| eval: false +#| message: false +library(ISLR2) +library(splines) +library(mgcv) +library(ggplot2) +library(boot) +data(Wage) +head(Wage[, c("age", "wage", "education", "year")]) +``` + +#### Python +The smoothing-spline and GAM sections also require `pygam`, which is +not part of the standard ISLP/sklearn ecosystem — install with +`pip install pygam` before running those blocks. + +```{python} +#| eval: false +#| python.reticulate: false +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from ISLP import load_data +from sklearn.preprocessing import PolynomialFeatures +from sklearn.linear_model import LinearRegression, LogisticRegression +import statsmodels.api as sm +from patsy import dmatrix +# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f # noqa: E402 +Wage = load_data("Wage") +print(Wage[["age","wage","education","year"]].head()) +``` +::: + +## Polynomial regression {.unnumbered} + +Fit degree-1 through degree-5 polynomials, then use ANOVA to select degree [@james2021islr2e, §7.1]. + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-poly-r +#| eval: false +#| code-fold: true +fit1 <- lm(wage ~ poly(age, 1), data = Wage) +fit2 <- lm(wage ~ poly(age, 2), data = Wage) +fit3 <- lm(wage ~ poly(age, 3), data = Wage) +fit4 <- lm(wage ~ poly(age, 4), data = Wage) +fit5 <- lm(wage ~ poly(age, 5), data = Wage) +anova(fit1, fit2, fit3, fit4, fit5) +``` + +The ANOVA p-values suggest degree 3 or 4 is adequate. + +```{r} +#| label: lab-poly-plot-r +#| eval: false +#| code-fold: true +age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 200) +pred <- predict(fit4, newdata = data.frame(age = age_grid), se.fit = TRUE) +df_pred <- data.frame( + age = age_grid, + fit = pred$fit, + lo = pred$fit - 2 * pred$se.fit, + hi = pred$fit + 2 * pred$se.fit +) +ggplot(Wage, aes(age, wage)) + + geom_point(alpha = 0.2, size = 0.8) + + geom_ribbon(data = df_pred, aes(y = fit, ymin = lo, ymax = hi), + alpha = 0.3, fill = "steelblue") + + geom_line(data = df_pred, aes(y = fit), colour = "steelblue", linewidth = 1) + + labs(title = "Degree-4 polynomial fit", x = "Age", y = "Wage") +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +from scipy import stats +results = {} +for deg in range(1, 6): + X = np.column_stack([Wage["age"] ** k for k in range(1, deg + 1)]) + X = sm.add_constant(X) + results[deg] = sm.OLS(Wage["wage"], X).fit() + +# F-tests between consecutive nested models. compare_f_test() is +# called on the *unrestricted* (more flexible) model and given the +# restricted model as the argument. +for d in range(1, 5): + f, p, df_diff = results[d + 1].compare_f_test(results[d]) + print(f"Degree {d} vs {d+1}: F={f:.3f}, p={p:.4f}") +``` + +```{python} +#| eval: false +#| python.reticulate: false +age_grid = np.linspace(Wage["age"].min(), Wage["age"].max(), 200) +X_grid = sm.add_constant(np.column_stack([age_grid ** k for k in range(1, 5)])) +pred4 = results[4].get_prediction(X_grid) +pred_df = pred4.summary_frame(alpha=0.05) + +plt.figure(figsize=(7, 4)) +plt.scatter(Wage["age"], Wage["wage"], alpha=0.15, s=8) +plt.plot(age_grid, pred_df["mean"], color="steelblue", linewidth=2) +plt.fill_between(age_grid, pred_df["mean_ci_lower"], pred_df["mean_ci_upper"], + alpha=0.3, color="steelblue") +plt.title("Degree-4 polynomial fit"); plt.xlabel("Age"); plt.ylabel("Wage") +plt.tight_layout(); plt.show() +``` +::: + +## Step functions {.unnumbered} + +Cut `age` into four intervals and fit a piecewise-constant model [@james2021islr2e, §7.2]. + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-step-r +#| eval: false +#| code-fold: true +Wage$age_cut <- cut(Wage$age, 4) +fit_step <- lm(wage ~ age_cut, data = Wage) +summary(fit_step)$coef +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +Wage["age_cut"] = pd.cut(Wage["age"], bins=4) +step_dummies = pd.get_dummies(Wage["age_cut"], drop_first=True).astype(float) +X_step = sm.add_constant(step_dummies) +fit_step = sm.OLS(Wage["wage"], X_step).fit() +print(fit_step.summary().tables[1]) +``` +::: + +## Regression splines {.unnumbered} + +Fit a cubic B-spline with knots at ages 25, 40, and 60, and a natural cubic spline with the same knots [@james2021islr2e, §7.4]. + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-splines-r +#| eval: false +#| code-fold: true +# Redefine age_grid so the splines section runs in isolation +# (it's also defined in the poly-plot chunk above). +age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 200) +knots <- c(25, 40, 60) +fit_bs <- lm(wage ~ bs(age, knots = knots), data = Wage) +fit_ns <- lm(wage ~ ns(age, knots = knots), data = Wage) + +pred_bs <- predict(fit_bs, newdata = data.frame(age = age_grid), se.fit = TRUE) +pred_ns <- predict(fit_ns, newdata = data.frame(age = age_grid), se.fit = TRUE) + +df_sp <- data.frame( + age = rep(age_grid, 2), + fit = c(pred_bs$fit, pred_ns$fit), + lo = c(pred_bs$fit - 2*pred_bs$se.fit, pred_ns$fit - 2*pred_ns$se.fit), + hi = c(pred_bs$fit + 2*pred_bs$se.fit, pred_ns$fit + 2*pred_ns$se.fit), + type = rep(c("B-spline", "Natural spline"), each = 200) +) +ggplot(Wage, aes(age, wage)) + + geom_point(alpha = 0.15, size = 0.6) + + geom_ribbon(data = df_sp, aes(y = fit, ymin = lo, ymax = hi, fill = type), alpha = 0.25) + + geom_line(data = df_sp, aes(y = fit, colour = type), linewidth = 1) + + labs(x = "Age", y = "Wage", colour = NULL, fill = NULL) +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +bs_mat = dmatrix("bs(age, knots=(25,40,60), include_intercept=False)", + {"age": Wage["age"]}, return_type="dataframe") +ns_mat = dmatrix("cr(age, knots=(25,40,60))", + {"age": Wage["age"]}, return_type="dataframe") + +fit_bs_py = sm.OLS(Wage["wage"], bs_mat).fit() +fit_ns_py = sm.OLS(Wage["wage"], ns_mat).fit() + +bs_grid = dmatrix("bs(age, knots=(25,40,60), include_intercept=False)", + {"age": age_grid}, return_type="dataframe") +ns_grid = dmatrix("cr(age, knots=(25,40,60))", + {"age": age_grid}, return_type="dataframe") + +plt.figure(figsize=(7, 4)) +plt.scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6) +plt.plot(age_grid, fit_bs_py.predict(bs_grid), label="B-spline", linewidth=2) +plt.plot(age_grid, fit_ns_py.predict(ns_grid), label="Natural spline", linewidth=2) +plt.legend(); plt.xlabel("Age"); plt.ylabel("Wage") +plt.tight_layout(); plt.show() +``` +::: + +## Smoothing splines and local regression {.unnumbered} + +A smoothing spline selects the penalty parameter $\l$ by generalized cross-validation (GCV); LOESS uses a local polynomial fit [@james2021islr2e, §7.5–7.6]. + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-smooth-r +#| eval: false +#| code-fold: true +fit_ss_16 <- smooth.spline(Wage$age, Wage$wage, df = 16) +fit_ss_gcv <- smooth.spline(Wage$age, Wage$wage, cv = FALSE) # GCV +fit_lo <- loess(wage ~ age, data = Wage, span = 0.2) + +age_seq <- seq(min(Wage$age), max(Wage$age), length.out = 200) +df_sm <- data.frame( + age = rep(age_seq, 3), + fit = c(predict(fit_ss_16, age_seq)$y, + predict(fit_ss_gcv, age_seq)$y, + predict(fit_lo, data.frame(age = age_seq))), + type = rep(c("Smooth spline df=16", + paste0("Smooth spline GCV (df=", + round(fit_ss_gcv$df, 1), ")"), + "LOESS span=0.2"), + each = 200) +) +ggplot(Wage, aes(age, wage)) + + geom_point(alpha = 0.15, size = 0.6) + + geom_line(data = df_sm, aes(y = fit, colour = type), linewidth = 1) + + labs(x = "Age", y = "Wage", colour = NULL) +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +from pygam import LinearGAM, s as gam_s +from statsmodels.nonparametric.smoothers_lowess import lowess + +# Smoothing spline via pyGAM +gam_smooth = LinearGAM(gam_s(0, n_splines=25)).gridsearch( + Wage[["age"]].values, Wage["wage"].values) +XX = gam_smooth.generate_X_grid(term=0, n=200) +pdep, confi = gam_smooth.partial_dependence(term=0, X=XX, width=0.95) + +# LOESS +lo_fit = lowess(Wage["wage"], Wage["age"], frac=0.2) + +fig, axes = plt.subplots(1, 2, figsize=(12, 4)) +axes[0].scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6) +axes[0].plot(XX.ravel(), pdep, color="steelblue", linewidth=2, label="Smoothing spline (GCV)") +axes[0].fill_between(XX.ravel(), confi[:,0], confi[:,1], alpha=0.3) +axes[0].set_xlabel("Age"); axes[0].set_ylabel("Wage"); axes[0].legend() + +axes[1].scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6) +axes[1].plot(lo_fit[:,0], lo_fit[:,1], color="tomato", linewidth=2, label="LOESS span=0.2") +axes[1].set_xlabel("Age"); axes[1].set_ylabel("Wage"); axes[1].legend() +plt.tight_layout(); plt.show() +``` +::: + +## GAMs: continuous outcome {.unnumbered} + +Fit three nested GAMs for `wage` on the `Wage` data and compare with AIC [@james2021islr2e, §7.7]. +This lab follows the ISLRv2 code, which uses the `mgcv` default +`method = "GCV.Cp"` for smoothness selection; for new analyses prefer +`method = "REML"` (see @sec-smoothing-splines and @sec-gam-fitting). + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-gam-r +#| eval: false +#| code-fold: true +gam1 <- gam(wage ~ s(year, k = 4) + s(age, k = 5) + education, + data = Wage) +gam2 <- gam(wage ~ year + s(age, k = 5) + education, + data = Wage) +gam3 <- gam(wage ~ s(year, k = 4) + poly(age, 4) + education, + data = Wage) + +AIC(gam1, gam2, gam3) + +par(mfrow = c(1, 3)) +plot(gam1, shade = TRUE, seWithMean = TRUE, + main = "GAM1: s(year) + s(age) + education") +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +from pygam import LinearGAM, s as gam_s, f as gam_f + +edu_codes = Wage["education"].cat.codes.values +X_gam = np.column_stack([Wage["year"].values, + Wage["age"].values, + edu_codes]) +y_gam = Wage["wage"].values + +gam_py = LinearGAM( + gam_s(0, n_splines=4) + gam_s(1, n_splines=5) + gam_f(2) +).gridsearch(X_gam, y_gam) + +print("AIC:", gam_py.statistics_["AIC"]) + +fig, axes = plt.subplots(1, 3, figsize=(14, 4)) +titles = ["year", "age", "education"] +for i, ax in enumerate(axes): + XX = gam_py.generate_X_grid(term=i) + pdep, confi = gam_py.partial_dependence(term=i, X=XX, width=0.95) + ax.plot(XX[:, i], pdep, linewidth=2) + ax.fill_between(XX[:, i], confi[:,0], confi[:,1], alpha=0.3) + ax.set_xlabel(titles[i]); ax.set_ylabel("Partial effect") +plt.tight_layout(); plt.show() +``` +::: + +## GAMs: binary outcome {.unnumbered} + +Predict whether a worker earns more than \$250k using a logistic GAM. We subset to workers +with at least some college education, following @james2021islr2e [§7.7]. + +:::{.panel-tabset} +#### R +```{r} +#| label: lab-gam-logistic-r +#| eval: false +#| code-fold: true +Wage$high <- factor(ifelse(Wage$wage > 250, "High", "Low"), + levels = c("Low", "High")) + +fit_logit_gam <- gam( + high ~ s(year, k = 4) + s(age, k = 5) + education, + family = binomial, + data = Wage, + subset = (education != "1. < HS Grad") +) +summary(fit_logit_gam) + +par(mfrow = c(1, 3)) +plot(fit_logit_gam, shade = TRUE, seWithMean = TRUE, + trans = plogis, + main = "Logistic GAM (P(wage > 250k))") +``` +#### Python +```{python} +#| eval: false +#| python.reticulate: false +from pygam import LogisticGAM, s as gam_s, f as gam_f + +mask = Wage["education"] != "1. < HS Grad" +W2 = Wage[mask].copy() +W2["high"] = (W2["wage"] > 250).astype(int) +edu2 = W2["education"].cat.codes.values + +X_log = np.column_stack([W2["year"].values, + W2["age"].values, + edu2]) +y_log = W2["high"].values + +gam_log = LogisticGAM( + gam_s(0, n_splines=4) + gam_s(1, n_splines=5) + gam_f(2) +).gridsearch(X_log, y_log) + +print("Training accuracy:", (gam_log.predict(X_log) == y_log).mean()) + +fig, axes = plt.subplots(1, 3, figsize=(14, 4)) +titles = ["year", "age", "education"] +for i, ax in enumerate(axes): + XX = gam_log.generate_X_grid(term=i) + pdep, confi = gam_log.partial_dependence(term=i, X=XX, width=0.95) + ax.plot(XX[:, i], pdep, linewidth=2) + ax.fill_between(XX[:, i], confi[:,0], confi[:,1], alpha=0.3) + ax.set_xlabel(titles[i]); ax.set_ylabel("Partial effect (log-odds)") +plt.tight_layout(); plt.show() +``` +::: diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd new file mode 100644 index 0000000000..7339b628f6 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd @@ -0,0 +1,97 @@ +::: notes +Adapted from @james2021islr2e [§7.6]. +::: + +**Local regression**, also called **LOESS** (LOcally Estimated +Scatterplot Smoothing) or **LOWESS**, takes yet another approach to +nonlinear regression: instead of fitting a single global model with +flexible basis functions, it fits **many local models**, one for +each point on the $x$-axis where we want a prediction. + +## The local fit + +::::{#def-local-regression} +#### Local regression at $x_0$ + +To estimate the regression function $f(x_0)$ at a target point $x_0$: + +1. Choose a **bandwidth** $s \in (0, 1]$, also called the **span**. + Take the fraction $s$ of training observations with $x_i$ closest + to $x_0$ — this is the **local neighborhood**. +2. **Weight** the neighborhood observations using a kernel that + gives higher weight to points closer to $x_0$ and zero weight to + points outside the neighborhood. The tricube kernel + $K(u) = (1 - |u|^3)^3 \indicp{|u| \le 1}$ is the standard choice. +3. **Fit a low-degree polynomial** (constant, linear, or quadratic) + by weighted least squares to the neighborhood, using the kernel + weights. The fitted value at $x_0$ is the prediction + $\hat f(x_0)$. + +To get the entire fitted curve, repeat for a dense grid of $x_0$ +values and connect the resulting predictions. +:::: + +::::{#exm-local-regression} +#### LOESS fit on age--glucose + +To smooth the relationship between age and plasma glucose in +`MASS::Pima.tr` using LOESS with a span of 0.5: + +```r +m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5) +``` + +At age 40, the local neighborhood is the 50% of observations with +ages closest to 40. Each observation in that window is weighted by +the tricube kernel --- down-weighted as its age moves away from 40, +zero weight outside the window. The predicted glucose at age 40 is +the fitted value from the weighted least-squares linear regression +over that neighborhood. +:::: + +## Strengths + +- **Conceptually simple.** No basis functions, no penalty matrices. + Just "fit a line through the nearby points, and call that the + curve at this $x$." +- **Adaptive to local structure.** Regions with rapid change get + their own local polynomial, just like flat regions do. +- **Robust to outliers** in the standard `loess()` implementation, + which iteratively down-weights observations with large residuals. + +## Limitations + +- **No global model.** Local regression doesn't produce a closed-form + $\hat f(x)$ — predictions at any new $x$ are computed on demand by + re-running the local fit at that point, but there is no parametric + formula for the curve. Inference for the entire function (e.g. "is this exposure associated with + outcome at all") needs special tooling. +- **Hard to extend to many predictors.** With $p$ predictors, the + local neighborhood is in $p$-dimensional space, and the curse of + dimensionality bites hard: even for moderate $p$, "the nearest + fraction $s$ of the data" is geometrically nothing like local. LOESS + is rarely used with more than 2–4 predictors. +- **Bandwidth selection.** Just like the smoothing parameter $\l$ in + smoothing splines, $s$ has to be chosen. Cross-validation works + but is slower. + +## Local regression in R + +```r +m_loess <- loess(y ~ x, data = d, span = 0.5) +grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200)) +grid$fit <- predict(m_loess, newdata = grid) + +# Inside ggplot2, geom_smooth() with method = "loess" +# is the standard scatterplot smoother: +ggplot(d, aes(x = x, y = y)) + + geom_point() + + geom_smooth(method = "loess", span = 0.5) +``` + +For most modeling work in this chapter we'll prefer the +smoothing-spline approach via `mgcv::s()`, which produces a global +fit with full inferential machinery. LOESS is best thought of as a +**diagnostic smoother** — an unobtrusive way to overlay the local +mean of a scatterplot to check whether a parametric fit is missing +nonlinearity. \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_method_comparison.qmd b/_subfiles/generalized-additive-models/_sec_method_comparison.qmd new file mode 100644 index 0000000000..92a54bb248 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_method_comparison.qmd @@ -0,0 +1,244 @@ +::: notes +This section illustrates the methods of @sec-polynomial-regression +through @sec-local-regression on a shared synthetic dataset, so +the strengths and failure modes can be compared side-by-side. +::: + +```{r} +#| label: method-comparison-setup +#| include: false +set.seed(204) +library(ggplot2) + +n <- 200 +x_lim <- c(0, 2 * pi) +truth <- function(x) sin(x) + 0.3 * sin(3 * x) +demo_data <- data.frame( + x = runif(n, x_lim[1], x_lim[2]) +) +demo_data$y <- truth(demo_data$x) + rnorm(n, sd = 0.4) + +grid <- data.frame(x = seq(x_lim[1], x_lim[2], length.out = 400)) +grid$truth <- truth(grid$x) + +predict_with <- function(model) { + predict(model, newdata = grid) +} + +theme_set(theme_minimal()) +``` + +We generate $n = `r n`$ noisy observations from a smooth nonlinear +function + +$$f(x) = \sin(x) + 0.3 \sin(3 x), \quad x \in [0, 2\pi],$$ + +with additive Gaussian noise ($\s = 0.4$). The truth is bumpy +enough to expose differences between the methods but not so wild +that any one of them obviously breaks. + +```{r} +#| label: method-comparison-data-fits +#| include: false +# Fit each method on the same demo_data; predictions on `grid` for +# overlay. Comments on each fit explain the design choice. + +# (1) Linear baseline. +fit_lin <- lm(y ~ x, data = demo_data) +grid$linear <- predict_with(fit_lin) + +# (2) Quadratic polynomial. +fit_poly2 <- lm(y ~ poly(x, 2), data = demo_data) +grid$poly2 <- predict_with(fit_poly2) + +# (3) Degree-12 polynomial — deliberately high degree to surface +# Runge-style boundary swinging. +fit_poly12 <- lm(y ~ poly(x, 12), data = demo_data) +grid$poly12 <- predict_with(fit_poly12) + +# (4) Step function with 5 bins on equal-quantile cuts. Pin the +# outer breakpoints to x_lim so the grid endpoints don't fall +# outside breaks[1] / breaks[length(breaks)] — otherwise cut() +# returns NA at the boundaries, leaving the step line truncated. +breaks <- quantile(demo_data$x, probs = seq(0, 1, length.out = 6)) +breaks[c(1L, length(breaks))] <- x_lim +demo_data$bin <- cut(demo_data$x, breaks = breaks, + include.lowest = TRUE) +grid$bin <- cut(grid$x, breaks = breaks, include.lowest = TRUE) +fit_step <- lm(y ~ bin, data = demo_data) +grid$step <- predict_with(fit_step) + +# (5) Natural cubic spline, 6 df (knots at internal quantiles). +fit_ns <- lm(y ~ splines::ns(x, df = 6), data = demo_data) +grid$ns <- predict_with(fit_ns) + +# (6) Smoothing spline via mgcv::gam, REML smoothness selection. +fit_gam <- mgcv::gam( + y ~ s(x, bs = "tp", k = 20), + data = demo_data, + method = "REML" +) +grid$gam <- predict_with(fit_gam) + +# (7) LOESS, default span. +fit_loess <- loess(y ~ x, data = demo_data, span = 0.5) +grid$loess <- predict_with(fit_loess) +``` + +The fitted curves are shown in @fig-method-comparison-overlay. + +::::{#fig-method-comparison-overlay layout-ncol="2"} + +:::{#fig-method-comparison-poly} + +```{r} +#| label: method-comparison-poly-plot +#| code-fold: true +ggplot(grid, aes(x = x)) + + geom_point( + data = demo_data, aes(y = y), + alpha = 0.3, size = 0.8 + ) + + geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) + + geom_line(aes(y = linear, color = "Linear"), + linewidth = 0.7, linetype = "longdash") + + geom_line(aes(y = poly2, color = "Quadratic"), linewidth = 0.7) + + geom_line(aes(y = poly12, color = "Degree 12"), + linewidth = 0.7) + + scale_color_manual( + name = NULL, + values = c("Truth" = "black", "Linear" = "grey50", + "Quadratic" = "tomato", "Degree 12" = "steelblue") + ) + + labs(x = "x", y = "y") + + theme(legend.position = "bottom") +``` + +Polynomial fits. Quadratic underfits the central inflection; the +degree-12 polynomial chases the noise and swings wildly near the +boundaries (Runge's phenomenon). +::: + +:::{#fig-method-comparison-step} + +```{r} +#| label: method-comparison-step-plot +#| code-fold: true +ggplot(grid, aes(x = x)) + + geom_point( + data = demo_data, aes(y = y), + alpha = 0.3, size = 0.8 + ) + + geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) + + geom_step(aes(y = step, color = "Step (5 bins)"), + linewidth = 0.7) + + scale_color_manual( + name = NULL, + values = c("Truth" = "black", "Step (5 bins)" = "tomato") + ) + + labs(x = "x", y = "y") + + theme(legend.position = "bottom") +``` + +Step function with 5 quantile-spaced bins. Captures the broad shape +but is discontinuous at each cutpoint — almost never the right +model for a continuous exposure. +::: + +:::{#fig-method-comparison-splines} + +```{r} +#| label: method-comparison-splines-plot +#| code-fold: true +ggplot(grid, aes(x = x)) + + geom_point( + data = demo_data, aes(y = y), + alpha = 0.3, size = 0.8 + ) + + geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) + + geom_line( + aes(y = ns, color = "Natural cubic spline (df = 6)"), + linewidth = 0.7 + ) + + geom_line( + aes(y = gam, color = "Smoothing spline (REML, k = 20)"), + linewidth = 0.7 + ) + + scale_color_manual( + name = NULL, + values = c( + "Truth" = "black", + "Natural cubic spline (df = 6)" = "tomato", + "Smoothing spline (REML, k = 20)" = "steelblue" + ) + ) + + labs(x = "x", y = "y") + + theme(legend.position = "bottom") +``` + +Splines. Both spline fits track the truth closely without the +boundary swings of the degree-12 polynomial. The smoothing spline +(REML-selected $\l$) has slightly less peak-to-peak amplitude +because the penalty shrinks toward a straight line. +::: + +:::{#fig-method-comparison-loess} + +```{r} +#| label: method-comparison-loess-plot +#| code-fold: true +ggplot(grid, aes(x = x)) + + geom_point( + data = demo_data, aes(y = y), + alpha = 0.3, size = 0.8 + ) + + geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) + + geom_line(aes(y = loess, color = "LOESS (span = 0.5)"), + linewidth = 0.7) + + scale_color_manual( + name = NULL, + values = c("Truth" = "black", "LOESS (span = 0.5)" = "tomato") + ) + + labs(x = "x", y = "y") + + theme(legend.position = "bottom") +``` + +LOESS with the default 50% span. Similar shape to the splines but +without a global closed-form parametric representation — the curve +is computed on demand at any evaluation point rather than stored as +a parametric formula. +::: + +Comparison of nonlinear regression methods on synthetic data +$y = \sin(x) + 0.3\sin(3x) + \eps$, $\eps \sim \ndist{0, 0.4^2}$, +$n = `r n`$. Splines (both regression and smoothing) and LOESS +track the truth closely; quadratic polynomial underfits the multi- +modal structure; degree-12 polynomial overfits and swings at +boundaries. +:::: + +## Takeaways + +- **Polynomials beyond degree 3–4 are usually a bad idea.** The + degree-12 fit is a clean illustration: it's interpolating noise in + the middle of the data and swinging beyond the data range. With 13 + parameters (intercept + 12 polynomial terms) it has more flexibility + than the natural cubic spline (`df = 6`), yet uses that flexibility + on noise rather than on the curve's shape — a reminder that global + polynomials concentrate their degrees of freedom in the wrong + places. +- **Splines and LOESS are essentially indistinguishable on this + example.** Both produce smooth curves close to the truth. The + practical differences are in how easily they extend to multi- + predictor settings (splines do, LOESS doesn't beyond 2-D) and in + how inference works (splines have full Wald/Bayesian machinery, + LOESS doesn't). +- **REML-selected smoothing spline doesn't require a degrees-of- + freedom choice.** The `df = 6` for the natural cubic spline was a + modeler choice; the REML smoothing spline picked its own + smoothness from the data. +- **Step functions are for categorization, not smoothing.** The 5- + bin step function is the worst fit on a smooth-truth dataset, but + on real epidemiologic data where bins have substantive meaning + (clinical BMI categories, age groups), the interpretability + benefit can outweigh the within-bin information loss. diff --git a/_subfiles/generalized-additive-models/_sec_motivation.qmd b/_subfiles/generalized-additive-models/_sec_motivation.qmd new file mode 100644 index 0000000000..46f4a0b785 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_motivation.qmd @@ -0,0 +1,83 @@ +::: notes +Adapted from @james2021islr2e [§7] and @wood2017generalized [§1]. +::: + +So far the regression models in this book have assumed that each +continuous predictor enters the linear predictor through a single +coefficient: + +$$g(\E{Y \mid X = x}) = \b_0 + \b_1 x_1 + \b_2 x_2 + \cdots + \b_p x_p$$ + +This forces the effect of every predictor to be **monotone**: if +$\b_j > 0$, raising $x_j$ always increases the predicted response on +the link scale; if $\b_j < 0$, it always decreases it. Many real +epidemiologic exposures don't behave that way. + +::::{#exm-age-mortality} +#### Age and all-cause mortality + +All-cause mortality is **U-shaped** in age: high in infancy, low through +young adulthood, then rising steeply in old age [@me4]. A linear model in +$\text{age}$ would average those regimes into a single slope, badly +misrepresenting risk at both ends. +:::: + +::::{#exm-bmi-mortality} +#### BMI and mortality + +BMI–mortality curves are **J-** or **U-shaped**: very low and very high +BMI both carry excess risk, with the minimum somewhere in the 22–25 +kg/m² range [@aune2016bmi]. +A linear or even quadratic fit misses the asymmetry between the +underweight and obese tails. +:::: + +::::{#exm-temperature-mortality} +#### Outdoor temperature and daily mortality + +Time-series studies of mortality and daily temperature consistently +show a **smooth nonlinear "bathtub"** relationship: both very cold +and very hot days raise short-term mortality risk, with a +location-specific optimum in between [@gasparrini2015mortality]. The +optimum and the steepness of the cold/heat slopes vary by city, +season, and population — features that no single linear term can +capture. +:::: + +The pattern in @exm-age-mortality, @exm-bmi-mortality, and +@exm-temperature-mortality is general: many epidemiologic exposures +have an effect on outcome that is **smooth but not monotone**. We need +a regression framework that lets the response depend on a continuous +predictor through an arbitrary smooth function, while keeping the +GLM machinery (link function, likelihood, inference) we have already +built up. + +**Generalized additive models** (GAMs) provide exactly this. A GAM +replaces the linear predictor + +$$g(\E{Y \mid X = x}) = \b_0 + \sum_{j=1}^p \b_j x_j$$ + +with an **additive** sum of smooth functions + +$$g(\E{Y \mid X = x}) = \b_0 + \sum_{j=1}^p f_j(x_j),$$ + +where each $f_j$ is estimated nonparametrically from the data. The +linear predictor remains additive across predictors (so each +predictor's contribution is still interpretable on its own), but the +shape of each $f_j$ is no longer constrained to be linear. + +The rest of this chapter builds up to GAMs in stages: + +1. **Polynomial regression** and **step functions** — the simplest + ways to introduce nonlinearity, useful but limited. +2. **Basis functions** — the general framework that polynomial and + step expansions both fit into, and the route to splines. +3. **Regression splines** and **smoothing splines** — the smoothers + most commonly used inside GAMs. +4. **Local regression (LOESS)** — an alternative smoother family. +5. **GAMs** — additive combinations of any of the above smoothers, + fit via penalized likelihood with automatic smoothness selection. + +We'll fit GAMs in R using the `mgcv` package [@wood2017generalized], +which implements the modern penalized-regression approach with +automatic smoothness selection via REML or GCV. \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_polynomial.qmd b/_subfiles/generalized-additive-models/_sec_polynomial.qmd new file mode 100644 index 0000000000..2ad8332b48 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_polynomial.qmd @@ -0,0 +1,98 @@ +::: notes +Adapted from @james2021islr2e [§7.1]. +::: + +The simplest way to allow a nonlinear effect of $X$ is to extend the +linear regression model by including powers of $X$ as additional +predictors: + +::::{#def-polynomial-regression} +#### Polynomial regression + +For a continuous predictor $X$ and degree $d \ge 1$, the +**polynomial regression model** is + +$$y_i = \b_0 + \b_1 x_i + \b_2 x_i^2 + \cdots + \b_d x_i^d + \eps_i$$ + +with $\eps_i \siid \ndist{0, \s^2}$. + +This is still a *linear* regression model in the coefficients +$\b_0, \ldots, \b_d$ — the nonlinearity lives in the deterministic +transformations $x_i^k$ of the predictor, not in the relationship +between coefficients and response. So everything we know about +linear-regression estimation, inference, and prediction (least squares, +standard errors, confidence intervals) applies directly. +:::: + +::::{#exm-polynomial-quadratic} +#### Quadratic fit to a U-shaped exposure + +For BMI–mortality, a degree-2 polynomial + +$$\logitf{\Pr(Y = 1 \mid \text{BMI} = b)} = \b_0 + \b_1 b + \b_2 b^2$$ + +can capture a single-minimum U-shape, with the minimum-risk BMI at +$-\b_1 / (2 \b_2)$ (assuming $\b_2 > 0$, i.e., the parabola opens +upward). A quadratic is the lowest-degree polynomial that +admits a turning point, so it's a natural first step beyond a linear +fit. +:::: + +## Limitations of polynomial regression + +Polynomials are convenient but have well-known problems: + +- **Global behavior.** A single $\b_k$ controls the function's + behavior everywhere on the $x$-axis. A polynomial that fits the + middle of the data well often **swings wildly** at the extremes. + The classical name for this in the *interpolation* setting is + **Runge's phenomenon**; for high-degree least-squares regression + polynomials the analogous oscillation comes from boundary + instability and ill-conditioning of the design matrix, but the + qualitative warning is the same. +- **No local control.** If the true relationship is bumpy in one + region and flat in another, no single polynomial fits both well. +- **Degree selection is unstable.** Adding one more degree changes + every fitted value, not just the local fit. Standard model + selection criteria (AIC, cross-validation) can recommend very + different degrees from one dataset to the next. +- **Extrapolation explodes.** Outside the observed range of $x$, the + highest-degree term dominates: a degree-4 fit predicts that + log-odds grows like $x^4$ at extreme exposures, which is essentially + never the right scientific model. + +A common rule of thumb is to use polynomials of degree at most 3 or 4, +and only when the underlying mechanism plausibly produces that shape. +For genuinely flexible nonlinear effects, the **basis-function** and +**spline** approaches developed later in this chapter are usually +better choices. + +## Polynomial regression in R + +In R, polynomial regression is fit by including `poly(x, degree)` in +the formula: + +```r +# Naive polynomial: raw powers x, x^2, x^3 +lm(y ~ x + I(x^2) + I(x^3), data = d) + +# Orthogonal polynomial: same fitted values, better-conditioned design +lm(y ~ poly(x, 3), data = d) +``` + +The two parameterizations produce identical fitted values $\hat y_i$ +and identical hypothesis tests of "is the polynomial nonzero", but +differ in the interpretation of individual $\hat\b_k$ coefficients: + +- `x + I(x^2) + I(x^3)` uses **raw powers**. Coefficients are tightly + correlated (the columns of the design matrix are not orthogonal), + so the individual $t$-statistic on $\hat\b_2$, say, is hard to + interpret on its own. +- `poly(x, 3)` uses an **orthogonal basis** that decorrelates the + columns. Individual $t$-statistics now test the marginal + contribution of each degree given the lower-degree terms, which is + usually what we want for "does this degree improve the fit". + +For prediction (the fitted curve), both parameterizations are +equivalent — but the orthogonal form has better numerical +conditioning and is the default we recommend. \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd new file mode 100644 index 0000000000..4c8716fa36 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd @@ -0,0 +1,161 @@ +::: notes +Adapted from @james2021islr2e [§7.4] and @wood2017generalized [§5]. +::: + +A **regression spline** is a piecewise polynomial that's constrained +to be smooth at the points where the pieces meet ("knots"). The +result is much more flexible than a single global polynomial but +much smoother than a step function. + +## Piecewise polynomials and knots + +::::{#def-piecewise-polynomial} +#### Piecewise polynomial + +Choose **knots** $\xi_1 < \xi_2 < \cdots < \xi_K$ on the support of +$X$. A **piecewise polynomial of degree $d$** with knots +$\xi_1, \ldots, \xi_K$ is a function that is a polynomial of degree +$d$ on each of the intervals +$(-\infty, \xi_1], (\xi_1, \xi_2], \ldots, (\xi_K, \infty)$ — but +the polynomials in different intervals can have different +coefficients. +:::: + +::::{#exm-piecewise-polynomial} +#### Unconstrained piecewise polynomial parameter count + +Without any continuity constraint, an unconstrained piecewise +polynomial has $(K+1)(d+1)$ free coefficients ($d+1$ per piece, $K+1$ +pieces) and can have **jumps and corners** at every knot — basically +$K+1$ independent polynomial fits glued side by side. That's almost +always *too* flexible. +:::: + +## Continuity and smoothness constraints + +Splines pay for their flexibility by adding **smoothness +constraints** at the knots. + +::::{#def-spline} +#### Spline + +A **spline of degree $d$** with knots $\xi_1, \ldots, \xi_K$ is a +piecewise polynomial of degree $d$ that is continuous, has +continuous first derivative, ..., has continuous $(d-1)$-th +derivative at every knot. +:::: + +::::{#exm-spline} +#### Effective parameter count for a cubic spline + +Each continuity constraint removes one degree of freedom. A +piecewise-cubic ($d = 3$) with $K$ knots has $(K+1) \cdot 4 = 4K + 4$ +unconstrained parameters. Requiring continuity of the function and +its first two derivatives at each of the $K$ knots imposes $3K$ +constraints, leaving $4K + 4 - 3K = K + 4$ effective parameters. +:::: + +The most common choice in practice is $d = 3$ — the **cubic +spline** — because human eyes can't see discontinuities in third or +higher derivatives, so a cubic spline already looks "smooth" to the +viewer. + +## Truncated power basis + +One concrete way to construct a degree-$d$ spline with knots +$\xi_1, \ldots, \xi_K$ is the **truncated power basis**: + +$$\b_0 + \b_1 X + \b_2 X^2 + \cdots + \b_d X^d ++ \sum_{k=1}^K \b_{d+k} (X - \xi_k)_+^d,$$ + +where $(u)_+ = \max(u, 0)$. The first $d+1$ terms are a global +polynomial; each truncated-power term $(X - \xi_k)_+^d$ adds a "kink" +of order $d$ that turns on at knot $\xi_k$. The basis has $K + d + 1$ +columns — one for the intercept, $d$ for the global polynomial, and +$K$ for the knot-activated terms — matching the $K + 4$ effective +parameters of a cubic spline (when $d = 3$). + +This basis is conceptually clean — every basis function has an +obvious geometric meaning — but **numerically poorly conditioned** +when knots are close together or far from zero. The design matrix +$\tp{\mat{B}} \mat{B}$ can be nearly singular, so OLS +estimates are unstable. For numerical work, the **B-spline basis** +(equivalent space of splines, locally supported basis functions) is +the standard choice. R's `splines::bs()` returns a B-spline basis. + +## Natural cubic splines + +Cubic splines often **misbehave at the boundary**: with no data +beyond the outermost knots, the cubic terms can swing wildly past the +last observed $x$. The **natural cubic spline** fixes this by adding +four more constraints — the function is required to be **linear** +beyond the outermost knots, which is equivalent to forcing the +second and third derivatives to vanish at each boundary knot: + +$$f''(\xi_1) = f'''(\xi_1) = f''(\xi_K) = f'''(\xi_K) = 0.$$ + +These four boundary constraints (linearity past each end of the +data) reduce the effective parameter count from $K + 4$ to $K$. The +fit is identical to a cubic spline in the *interior* but is forced +to be linear in the tails — almost always a better-behaved +extrapolation than an unconstrained cubic. + +R provides natural cubic splines via `splines::ns()`. For routine +work, **natural cubic splines are the default recommendation** over +unconstrained cubic splines. + +## Knot placement + +A spline fit depends on three choices: + +1. **Number of knots** $K$. More knots = more flexibility = more risk + of overfitting. Common practice: pick $K$ so that the resulting + degrees of freedom is in the 4–8 range for most exposures, or + tune $K$ by cross-validation. +2. **Location of knots** $\xi_1, \ldots, \xi_K$. The standard + recommendation is to place knots at **quantiles of the $X$ + distribution** (e.g. for $K = 4$: at the 20th, 40th, 60th, 80th + percentiles). This puts more knots where there's more data — the + exact opposite of what equally-spaced knots would do. +3. **Boundary handling** (cubic vs natural cubic). + +The `df` argument to `ns()` or `bs()` automates (1) and (2): R +chooses knots at the appropriate sample quantiles to produce a basis +of the requested dimension. + +## Regression splines in R + +```r +library(splines) + +# Natural cubic spline with 4 degrees of freedom +# (knots auto-placed at quantiles) +m_ns <- lm(y ~ ns(x, df = 4), data = d) + +# Cubic B-spline basis with 6 degrees of freedom +m_bs <- lm(y ~ bs(x, df = 6), data = d) + +# In a GLM, just substitute glm() and a family +m_glm <- glm(y ~ ns(x, df = 4), + data = d, family = binomial()) +``` + +Both `ns()` and `bs()` return numeric matrices that slot directly +into `lm()`/`glm()` formulas. The `df` argument is the **exact** +degrees of freedom (basis dimension), in contrast to the `k` +argument of `mgcv::s()` introduced later in this chapter, which is +an **upper bound** that REML penalization can reduce to the actual +effective degrees of freedom. With `splines::ns(df = 4)` you always +get 4 df; with `mgcv::s(x, k = 20)` you might end up with EDF 3 or +EDF 15 depending on the data. + +The fitted curve and pointwise CIs come from `predict()` with +`se.fit = TRUE` in the usual way: + +```r +grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200)) +pred <- predict(m_ns, newdata = grid, se.fit = TRUE) +grid$fit <- pred$fit +grid$lower <- pred$fit - 1.96 * pred$se.fit +grid$upper <- pred$fit + 1.96 * pred$se.fit +``` \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd new file mode 100644 index 0000000000..6019114d2d --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd @@ -0,0 +1,177 @@ +::: notes +Adapted from @james2021islr2e [§7.5] and @wood2017generalized [§5–6]. +::: + +Regression splines force the modeler to make several choices — +basis type, number of knots, knot locations — that affect the +result. **Smoothing splines** sidestep that decision tree by taking +a different route: they place a knot at every data point and then +**penalize wiggliness** to control overfitting. + +## The penalized fit + +::::{#def-smoothing-spline} +#### Smoothing spline (penalized least squares) + +Among all twice-differentiable functions $g$, the **smoothing +spline with smoothing parameter $\l \ge 0$** is the minimizer of + +$$\underbrace{\sum_{i=1}^n (y_i - g(x_i))^2}_{\text{fit to data}} ++ \l \underbrace{\int g''(t)^2 \, dt}_{\text{penalty on wiggliness}}.$$ + +The first term rewards close fit to the data; the second term +penalizes large second derivatives, i.e. fast changes in slope. +The trade-off is controlled by $\l$: + +- $\l \to 0$: the function passes through every data point + ($g(x_i) = y_i$). No penalty for wiggling — pure interpolation, + badly overfits. +- $\l \to \infty$: the integrated squared second derivative is + forced to zero, so $g$ is a straight line. Maximum smoothness — + reduces to ordinary linear regression. +- Intermediate $\l$: a smooth nonparametric fit. + +A foundational result is that the minimizer is always a **natural +cubic spline with knots at every unique $x_i$** +[@wood2017generalized, §5.1; @james2021islr2e, §7.5]. So even though +the problem is defined over all twice-differentiable functions, the +answer lives in a finite-dimensional space we can compute with. +:::: + +::::{#exm-smoothing-spline} +#### Effect of the smoothing parameter on a fitted curve + +Consider fitting a smoothing spline to a scatterplot of 200 +observations from a U-shaped exposure--outcome relationship. + +- With $\l$ very small (near zero), the fitted curve passes through + every point --- it wiggles through the noise and overfits. +- With $\l$ very large, the fitted curve is a straight line --- + it misses the U-shape entirely. +- At the GCV/REML-optimal $\hat\l$, the effective degrees of freedom + lies somewhere between 2 (linear) and $n$ (interpolation), + capturing the U-shape without chasing individual data points. + +The optimal $\hat\l$ is selected automatically by +`smooth.spline(x, y)` (the default `cv = FALSE` uses GCV; pass +`cv = TRUE` to switch to exact leave-one-out CV) or by +`mgcv::gam(y ~ s(x), method = "REML")`. +:::: + +## Effective degrees of freedom + +The smoothing spline fit can be written as +$\hat{\mat{y}} = \mat{S}_\l \mat{y}$ +for a smoother matrix $\mat{S}_\l$ that depends on $\l$ and the +$x_i$ values. Because the function is penalized rather than +projected onto a low-dimensional subspace, the usual "number of +parameters" count doesn't apply. + +The standard replacement is the **effective degrees of freedom** +(EDF): + +::::{#def-effective-degrees-of-freedom} +#### Effective degrees of freedom + +$$\text{EDF}(\l) = \text{tr}(\mat{S}_\l) += \sum_{i=1}^n \frac{\partial \hat y_i}{\partial y_i}.$$ + +The EDF interpolates smoothly between two endpoints. For the **full +smoothing-spline model** (intercept + smooth term), EDF ranges from: + +- $\text{EDF} \approx 2$ when $\l$ is large (model is essentially linear, + i.e. 2 parameters: intercept + slope). +- $\text{EDF} \approx n$ when $\l \to 0$ (model interpolates every data + point). + +When reading `mgcv::summary.gam()` output later in this chapter, the +column labeled `edf` reports a **per-smooth EDF that excludes the +global model intercept**, so a smooth that has been penalized down +to a straight line shows up as EDF $\approx 1$ — *not* 2. The two +conventions are consistent (the intercept is counted once, outside +the per-smooth column); they just split the parameter count +differently. An EDF of 1 in `summary()` output therefore means +"this smooth has collapsed to a line"; 5–8 means a moderately +wiggly curve; larger EDFs indicate more local features. +:::: + +## Choosing $\l$ automatically + +We don't pick $\l$ by hand — we pick it from the data using one +of: + +- **Cross-validation** (typically leave-one-out for smoothing splines, + which has a closed-form formula avoiding an actual leave-one-out + loop). +- **Generalized cross-validation** (GCV), a fast computational + approximation to LOO-CV. +- **REML** (restricted maximum likelihood), which treats the + penalty as arising from a normal-prior random effect and selects + $\l$ by likelihood. REML is not the default in `mgcv::gam()` + (which uses `method = "GCV.Cp"`), but it is generally preferred for + its better statistical properties + [@wood2017generalized, §6.2]; pass `method = "REML"` explicitly. + +The user typically specifies a maximum dimension for the basis +(`k = ...` in `mgcv::s()`) and lets the software select the +effective smoothness automatically. + +## Smoothing splines vs regression splines + +| Aspect | Regression spline | Smoothing spline | +|:--|:--|:--| +| Knot count | Modeler picks $K$ (small) | One per data point ($\sim n$) | +| Knot placement | Modeler or quantile rule | All $x_i$ values | +| Smoothness control | Number of knots | Penalty parameter $\l$ | +| Overfitting risk | Choose $K$ too large | Choose $\l$ too small | +| Inference | Standard linear-model output | Bayesian / EDF-based | +| Software default | `splines::ns(df = ...)` | `mgcv::s(...)`, `smooth.spline()` | + +The "inference" row reflects the typical software pairings shown in +the last row, not a fundamental property of the bases. Regression +splines fit via `splines::ns()` + `lm()` / `glm()` use the usual +linear-model machinery; smoothing splines fit via `mgcv::s()` + +`mgcv::gam()` use Bayesian posterior intervals. A regression spline +fit through `mgcv::gam()` with a fixed penalty (`sp = 0`) gets the +Bayesian intervals too — the table is about software defaults, not +the underlying bases. + +The two approaches **span the same function space** (piecewise +cubic polynomials with continuity constraints) and often produce +very similar fitted curves. The practical difference is who picks +the smoothness: + +- **Regression splines**: the modeler picks `df` (or `k`), then OLS + picks the best fit at that complexity. +- **Smoothing splines**: the basis is intentionally over-large, and + the smoothness parameter is data-adaptive. + +For routine work, **smoothing splines via `mgcv::s()` are the +default recommendation**: REML smoothness selection just works, +and the EDF in the model summary gives an honest answer to "how +wiggly is my smooth". + +## Smoothing splines in R + +Base R has `smooth.spline()`, but for any nontrivial use we'll go +through `mgcv`: + +```r +library(mgcv) + +# Univariate smoother, REML smoothness selection +m_ss <- gam(y ~ s(x, k = 20, bs = "tp"), + data = d, method = "REML") + +# Same idea inside a GLM +m_ss_logistic <- gam(y ~ s(x, k = 20, bs = "tp"), + data = d, family = binomial(), + method = "REML") + +summary(m_ss) # reports EDF of the s(x) term +plot(m_ss) # plots the fitted smooth with CI band +``` + +We'll return to `mgcv::s()` in the GAM-fitting section below — the +same `s()` syntax extends to multi-predictor additive models with +no additional machinery. \ No newline at end of file diff --git a/_subfiles/generalized-additive-models/_sec_step_functions.qmd b/_subfiles/generalized-additive-models/_sec_step_functions.qmd new file mode 100644 index 0000000000..1b63c1ece9 --- /dev/null +++ b/_subfiles/generalized-additive-models/_sec_step_functions.qmd @@ -0,0 +1,98 @@ +::: notes +Adapted from @james2021islr2e [§7.2]. +::: + +**Step functions** are the opposite extreme from polynomials: instead +of letting the response depend smoothly on $X$, they cut the range of +$X$ into bins and fit a separate constant within each bin. The fit is +piecewise-constant — a staircase rather than a curve. + +::::{#def-step-function-regression} +#### Step-function regression + +Choose **cutpoints** $c_1 < c_2 < \cdots < c_K$ on the support of $X$, +producing $K+1$ bins: +$(-\infty, c_1]$, $(c_1, c_2]$, ..., $(c_K, \infty)$. +Using the conventions $c_0 = -\infty$ and $c_{K+1} = \infty$, define indicator +variables + +$$C_k(X) = \indicp{c_{k-1} < X \le c_k}, \quad k = 1, \ldots, K+1.$$ + +Because +$\sum_{k=1}^{K+1} C_k(X) = 1$ for every $X$ (exactly one indicator is +"on"), the full set of $K+1$ indicators plus an intercept is linearly +dependent. One bin is therefore dropped as the **reference category**. +Taking bin $1$ as the reference and using $C_2, \ldots, C_{K+1}$, the +identified step-function regression model is + +$$y_i = \b_0 + \b_2 C_2(x_i) + \b_3 C_3(x_i) + \cdots + \b_{K+1} C_{K+1}(x_i) + \eps_i,$$ + +with $K$ free slope parameters (one per non-reference bin) plus the +intercept. $\b_0$ is the expected response in the reference bin and +$\b_k$ ($k \ge 2$) is the contrast in mean response between bin $k$ +and the reference bin. +:::: + +::::{#exm-step-function-regression} +#### BMI categories as a step function + +Step-function regression is the regression-modeling equivalent of +categorizing a continuous predictor. The familiar "age groups" or +"BMI categories" used in many epidemiology tables are exactly this: +cut the continuous predictor into bins, then enter the bin +indicators as regressors. For example, WHO BMI categories --- +underweight ($< 18.5$), normal weight ($18.5$--$24.9$), overweight +($25$--$29.9$), and obese ($\ge 30$) --- use $K = 3$ cutpoints +producing four bin indicators, with three slope parameters estimating +the mean response contrast between each non-reference category and +the reference (here, normal weight). +:::: + +## Strengths + +- **Interpretable on the original scale.** Each $\b_k$ is just "mean + response in bin $k$ minus mean response in the reference bin," with + the same units as the outcome (or on the link scale for a GLM). + Easy to put in a table. +- **No assumption about within-bin shape.** Step functions don't + impose linearity, monotonicity, or any other within-bin + relationship — they just average. +- **Robust to outliers in $X$.** Extreme $X$ values pull only the + outermost bin's mean. + +## Limitations + +- **Discontinuous fit.** Step functions can't represent a smooth + relationship: the fitted curve jumps at each cutpoint, which is + almost never the right scientific model for a continuous exposure. +- **Bin choice is arbitrary.** Different cutpoint sets produce + different fits, and there is no statistically principled way to + pick cutpoints from the data without invalidating inference. +- **Power loss.** Within each bin, all variation in $X$ is treated as + noise. Compared with a smooth fit that exploits the continuous + structure, step functions waste information about where in the bin + each observation falls. +- **Effective parameter count.** A step function with $K$ cutpoints + uses $K$ parameters beyond the intercept. For a meaningful fit, $K$ + has to be large enough to resolve the underlying shape — but every + extra cutpoint costs a degree of freedom. + +Step functions are most useful when bin boundaries are +**substantively meaningful** (e.g. clinical BMI categories) and the +goal is to communicate effects in those familiar categories rather +than to estimate a smooth dose-response curve. For pure curve +estimation, splines almost always do better. + +## Step functions in R + +The base-R function `cut()` produces a factor coding the bins: + +```r +d$bmi_cat <- cut(d$bmi, breaks = c(-Inf, 18.5, 25, 30, Inf), + labels = c("underweight", "normal", "overweight", "obese")) +lm(y ~ bmi_cat, data = d) +``` + +The factor is automatically expanded into $K$ contrast indicators by +`model.matrix`, with the first level as the reference. The fitted +coefficients are the contrasts relative to that reference category. \ No newline at end of file diff --git a/chapters/generalized-additive-models.qmd b/chapters/generalized-additive-models.qmd new file mode 100644 index 0000000000..b759dec7aa --- /dev/null +++ b/chapters/generalized-additive-models.qmd @@ -0,0 +1,78 @@ +--- +title: "Generalized Additive Models" +format: + html: default + revealjs: + output-file: generalized-additive-models-slides.html + pdf: + output-file: generalized-additive-models-handout.pdf + docx: + output-file: generalized-additive-models-handout.docx + +--- + +{{< include shared-config.qmd >}} + +# Motivation: beyond the linear predictor {#sec-gam-motivation} + +{{< include _subfiles/generalized-additive-models/_sec_motivation.qmd >}} + +# Polynomial regression {#sec-polynomial-regression} + +{{< include _subfiles/generalized-additive-models/_sec_polynomial.qmd >}} + +# Step functions {#sec-step-functions} + +{{< include _subfiles/generalized-additive-models/_sec_step_functions.qmd >}} + +# Basis functions {#sec-basis-functions} + +{{< include _subfiles/generalized-additive-models/_sec_basis_functions.qmd >}} + +# Regression splines {#sec-regression-splines} + +{{< include _subfiles/generalized-additive-models/_sec_regression_splines.qmd >}} + +# Smoothing splines {#sec-smoothing-splines} + +{{< include _subfiles/generalized-additive-models/_sec_smoothing_splines.qmd >}} + +# Local regression {#sec-local-regression} + +{{< include _subfiles/generalized-additive-models/_sec_local_regression.qmd >}} + +# Method comparison on a shared synthetic dataset {#sec-method-comparison} + +{{< include _subfiles/generalized-additive-models/_sec_method_comparison.qmd >}} + +# Generalized additive models {#sec-gams} + +{{< include _subfiles/generalized-additive-models/_sec_gams.qmd >}} + +# Fitting GAMs in R {#sec-gam-fitting} + +{{< include _subfiles/generalized-additive-models/_sec_gam_fitting.qmd >}} + +# Inference and model selection {#sec-gam-inference} + +{{< include _subfiles/generalized-additive-models/_sec_gam_inference.qmd >}} + +# Worked example {#sec-gam-example} + +{{< include _subfiles/generalized-additive-models/_sec_gam_example.qmd >}} + +# Lab: Non-linear modeling {#sec-gam-lab} + +## Setup {.unnumbered} + +{{< include _subfiles/generalized-additive-models/_sec_lab.qmd >}} + +# Exercises {#sec-gam-exercises} + +## Conceptual exercises + +{{< include _subfiles/generalized-additive-models/_exr-gams-conceptual.qmd >}} + +## Applied exercises + +{{< include _subfiles/generalized-additive-models/_exr-gams-applied.qmd >}} diff --git a/references.bib b/references.bib index 845703a653..5246243b6e 100644 --- a/references.bib +++ b/references.bib @@ -798,7 +798,8 @@ @book{wood2017generalized title={Generalized additive models: an introduction with R}, author={Wood, Simon N}, year={2017}, - publisher={chapman and hall/CRC} + edition={2}, + publisher={Chapman and Hall/CRC} } @book{roback2021beyond, @@ -1689,3 +1690,45 @@ @article{debruyn2011mira year={2011}, doi={10.1136/sti.2010.044990} } + +@article{aune2016bmi, + title = {{BMI} and all cause mortality: systematic review and non-linear dose-response meta-analysis of 230 cohort studies with 3.74 million deaths among 30.3 million participants}, + author = {Aune, Dagfinn and Sen, Abhijit and Prasad, Manya and Norat, Teresa and Janszky, Imre and Tonstad, Serena and Romundstad, Pal and Vatten, Lars J}, + journal = {BMJ}, + volume = {353}, + pages = {i2156}, + year = {2016}, + doi = {10.1136/bmj.i2156} +} + +@article{gasparrini2015mortality, + title = {Mortality risk attributable to high and low ambient temperature: a multicountry observational study}, + author = {Gasparrini, Antonio and Guo, Yuming and Hashizume, Masahiro and Lavigne, Eric and Zanobetti, Antonella and Schwartz, Joel and Tobias, Aurelio and Tong, Shilu and Rocklov, Joacim and Forsberg, Bertil and others}, + journal = {The Lancet}, + volume = {386}, + number = {9991}, + pages = {369--375}, + year = {2015}, + doi = {10.1016/S0140-6736(14)62114-0} +} + +@article{wood2013pvalues, + title = {On p-values for smooth components of an extended generalized additive model}, + author = {Wood, Simon N}, + journal = {Biometrika}, + volume = {100}, + number = {1}, + pages = {221--228}, + year = {2013}, + doi = {10.1093/biomet/ass048} +} + +@book{james2023islp, + title = {An Introduction to Statistical Learning: with Applications in {Python}}, + author = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert and Taylor, Jonathan}, + year = {2023}, + publisher = {Springer}, + address = {New York}, + isbn = {978-3-031-38746-3}, + url = {https://www.statlearning.com} +} From 719594b02e00122a7369987eb89edd8c65dbe669 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 08:02:26 +0000 Subject: [PATCH 2/6] GAM chapter cleanups: trailing newlines, eval:false, content-hidden, deps - Add trailing newlines to 9 _subfiles/generalized-additive-models/ files. POSIX requires text files to end with a newline; many tools (cat, sed, diff, git's last-line tracking) misbehave without one. - Convert 7 subfiles' display-only ```r blocks to ```{r}\n#| eval: false so they render as code fences without trying to evaluate. Keeps the prose-illustration semantics but pulls the snippets into Quarto's code-display path (gets syntax highlighting + copy button + per-language config), instead of the plain Pandoc one. - _sec_gam_example.qmd: remove the outer .content-hidden div around the callout-warning. The whole point of the warning ("dataset is illustrative; swap for an Epi 204 case study") is to be visible to readers; hiding it defeats it. The unwrapped callout shows in HTML, slides, and PDF, which is the intended audience. - DESCRIPTION: declare gratia (>=0.9) and mgcv in Suggests. Both are loaded in the chapter (eval:false), and listing them lets renv resolve consistent versions when the chapter ever gets flipped to eval:true. --- DESCRIPTION | 2 ++ .../_exr-gams-applied.qmd | 2 +- .../_sec_basis_functions.qmd | 2 +- .../_sec_gam_example.qmd | 26 ++++++++++++------- .../_sec_gam_fitting.qmd | 9 ++++--- .../generalized-additive-models/_sec_gams.qmd | 2 +- .../_sec_local_regression.qmd | 8 +++--- .../_sec_motivation.qmd | 2 +- .../_sec_polynomial.qmd | 5 ++-- .../_sec_regression_splines.qmd | 8 +++--- .../_sec_smoothing_splines.qmd | 5 ++-- .../_sec_step_functions.qmd | 5 ++-- 12 files changed, 47 insertions(+), 29 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 6cffdb3d7d..7269bf6dad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -46,9 +46,11 @@ Suggests: ggdag, ggplot2, glmx, + gratia (>= 0.9), haven, kableExtra, KMsurv, + mgcv, muhaz, palmerpenguins, parameters, diff --git a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd index 09b313c94f..1f6c6cd131 100644 --- a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd +++ b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd @@ -157,4 +157,4 @@ print(f"OLS: b1={b_ols[1]:.4f}, b2={b_ols[2]:.4f}") The coefficients agree, confirming backfitting converges to OLS when all component smoothers are linear [@hastie2009esl2e, §9.1.1]. -:::: \ No newline at end of file +:::: diff --git a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd index 14cfbab9f5..521ad849a7 100644 --- a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd +++ b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd @@ -93,4 +93,4 @@ $b_1(\cdot), \ldots, b_K(\cdot)$: The remaining design decision is **which basis functions to use** — and that choice is what distinguishes regression splines from -smoothing splines from local regression from GAMs. \ No newline at end of file +smoothing splines from local regression from GAMs. diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd index 223a0cd61f..d48cadeda1 100644 --- a/_subfiles/generalized-additive-models/_sec_gam_example.qmd +++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd @@ -1,10 +1,8 @@ -:::: {.content-hidden} ::: callout-warning This section is illustrative only — the dataset is a built-in R example, not an Epi 204 case study. Replace with a course-appropriate exposure when the chapter is finalized. ::: -:::: We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting diabetes status (`type`) from continuous risk factors. The exposures @@ -13,7 +11,8 @@ age (`age`). ## Setup {.unnumbered} -```r +```{r} +#| eval: false library(MASS) # Pima.tr dataset library(mgcv) # gam(), s() library(ggplot2) @@ -38,7 +37,8 @@ Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes") Start with an ordinary logistic GLM as a baseline: -```r +```{r} +#| eval: false m_lin <- glm( diabetes ~ bmi + glu + age, family = binomial(), data = Pima.tr @@ -55,7 +55,8 @@ age, where diabetes risk is well known to be nonlinear. Now refit with smooths on each continuous predictor: -```r +```{r} +#| eval: false m_gam <- gam( diabetes ~ s(bmi) + s(glu) + s(age), family = binomial(), @@ -85,7 +86,8 @@ For each smooth, ask: If `gam.check(m_gam)` flags any smooth as having `edf` close to `k'`, bump `k` up: -```r +```{r} +#| eval: false m_gam2 <- gam( diabetes ~ s(bmi) + s(glu, k = 20) + s(age), family = binomial(), @@ -96,7 +98,8 @@ m_gam2 <- gam( ## Plotting partial-effect curves {.unnumbered} -```r +```{r} +#| eval: false plot(m_gam, pages = 1, shade = TRUE, seWithMean = TRUE) ``` @@ -112,7 +115,8 @@ figure. For a cleaner ggplot rendering, `gratia::draw()` is a one-line alternative: -```r +```{r} +#| eval: false gratia::draw(m_gam) ``` @@ -123,7 +127,8 @@ binary). To plot predicted probability vs a single predictor, hold the others at typical values, build the Wald CI on the **link** scale, then back-transform with `plogis()`: -```r +```{r} +#| eval: false typical <- data.frame( bmi = median(Pima.tr$bmi), glu = median(Pima.tr$glu), @@ -160,7 +165,8 @@ the middle), which is the right qualitative shape for a probability. ## Comparing GLM and GAM {.unnumbered} -```r +```{r} +#| eval: false # Refit the GAM with method = "ML" for AIC comparison. # REML and ML likelihoods are on different scales; comparing AIC # from glm() (always ML) with a REML-fitted gam() mixes criteria. diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd index 971694b45f..fec824cf68 100644 --- a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd +++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd @@ -12,7 +12,8 @@ REML or GCV. The basic syntax mirrors `lm()`/`glm()`, with smoothers introduced by `s()`: -```r +```{r} +#| eval: false library(mgcv) m <- gam( @@ -84,7 +85,8 @@ there at all. After fitting, run `gam.check()` to verify: -```r +```{r} +#| eval: false gam.check(m) ``` @@ -102,7 +104,8 @@ Output includes: ## Plotting smooths -```r +```{r} +#| eval: false plot(m, pages = 1, shade = TRUE) ``` diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd index 4be0d099e8..98f3d90969 100644 --- a/_subfiles/generalized-additive-models/_sec_gams.qmd +++ b/_subfiles/generalized-additive-models/_sec_gams.qmd @@ -110,4 +110,4 @@ two standard escape hatches are: For routine epidemiology work, the additive model is usually the right starting point; tensor smooths come up mainly when interactions between continuous predictors are scientifically central (e.g. -space–time models, age–calendar-time mortality surfaces). \ No newline at end of file +space–time models, age–calendar-time mortality surfaces). diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd index 7339b628f6..b32ac93378 100644 --- a/_subfiles/generalized-additive-models/_sec_local_regression.qmd +++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd @@ -37,7 +37,8 @@ values and connect the resulting predictions. To smooth the relationship between age and plasma glucose in `MASS::Pima.tr` using LOESS with a span of 0.5: -```r +```{r} +#| eval: false m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5) ``` @@ -77,7 +78,8 @@ over that neighborhood. ## Local regression in R -```r +```{r} +#| eval: false m_loess <- loess(y ~ x, data = d, span = 0.5) grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200)) grid$fit <- predict(m_loess, newdata = grid) @@ -94,4 +96,4 @@ smoothing-spline approach via `mgcv::s()`, which produces a global fit with full inferential machinery. LOESS is best thought of as a **diagnostic smoother** — an unobtrusive way to overlay the local mean of a scatterplot to check whether a parametric fit is missing -nonlinearity. \ No newline at end of file +nonlinearity. diff --git a/_subfiles/generalized-additive-models/_sec_motivation.qmd b/_subfiles/generalized-additive-models/_sec_motivation.qmd index 46f4a0b785..57b837e4e9 100644 --- a/_subfiles/generalized-additive-models/_sec_motivation.qmd +++ b/_subfiles/generalized-additive-models/_sec_motivation.qmd @@ -80,4 +80,4 @@ The rest of this chapter builds up to GAMs in stages: We'll fit GAMs in R using the `mgcv` package [@wood2017generalized], which implements the modern penalized-regression approach with -automatic smoothness selection via REML or GCV. \ No newline at end of file +automatic smoothness selection via REML or GCV. diff --git a/_subfiles/generalized-additive-models/_sec_polynomial.qmd b/_subfiles/generalized-additive-models/_sec_polynomial.qmd index 2ad8332b48..cbdcc7f354 100644 --- a/_subfiles/generalized-additive-models/_sec_polynomial.qmd +++ b/_subfiles/generalized-additive-models/_sec_polynomial.qmd @@ -72,7 +72,8 @@ better choices. In R, polynomial regression is fit by including `poly(x, degree)` in the formula: -```r +```{r} +#| eval: false # Naive polynomial: raw powers x, x^2, x^3 lm(y ~ x + I(x^2) + I(x^3), data = d) @@ -95,4 +96,4 @@ differ in the interpretation of individual $\hat\b_k$ coefficients: For prediction (the fitted curve), both parameterizations are equivalent — but the orthogonal form has better numerical -conditioning and is the default we recommend. \ No newline at end of file +conditioning and is the default we recommend. diff --git a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd index 4c8716fa36..d81d0806a0 100644 --- a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd +++ b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd @@ -125,7 +125,8 @@ of the requested dimension. ## Regression splines in R -```r +```{r} +#| eval: false library(splines) # Natural cubic spline with 4 degrees of freedom @@ -152,10 +153,11 @@ EDF 15 depending on the data. The fitted curve and pointwise CIs come from `predict()` with `se.fit = TRUE` in the usual way: -```r +```{r} +#| eval: false grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200)) pred <- predict(m_ns, newdata = grid, se.fit = TRUE) grid$fit <- pred$fit grid$lower <- pred$fit - 1.96 * pred$se.fit grid$upper <- pred$fit + 1.96 * pred$se.fit -``` \ No newline at end of file +``` diff --git a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd index 6019114d2d..048d75f5ea 100644 --- a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd +++ b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd @@ -156,7 +156,8 @@ wiggly is my smooth". Base R has `smooth.spline()`, but for any nontrivial use we'll go through `mgcv`: -```r +```{r} +#| eval: false library(mgcv) # Univariate smoother, REML smoothness selection @@ -174,4 +175,4 @@ plot(m_ss) # plots the fitted smooth with CI band We'll return to `mgcv::s()` in the GAM-fitting section below — the same `s()` syntax extends to multi-predictor additive models with -no additional machinery. \ No newline at end of file +no additional machinery. diff --git a/_subfiles/generalized-additive-models/_sec_step_functions.qmd b/_subfiles/generalized-additive-models/_sec_step_functions.qmd index 1b63c1ece9..29d96f4108 100644 --- a/_subfiles/generalized-additive-models/_sec_step_functions.qmd +++ b/_subfiles/generalized-additive-models/_sec_step_functions.qmd @@ -87,7 +87,8 @@ estimation, splines almost always do better. The base-R function `cut()` produces a factor coding the bins: -```r +```{r} +#| eval: false d$bmi_cat <- cut(d$bmi, breaks = c(-Inf, 18.5, 25, 30, Inf), labels = c("underweight", "normal", "overweight", "obese")) lm(y ~ bmi_cat, data = d) @@ -95,4 +96,4 @@ lm(y ~ bmi_cat, data = d) The factor is automatically expanded into $K$ contrast indicators by `model.matrix`, with the first level as the reference. The fitted -coefficients are the contrasts relative to that reference category. \ No newline at end of file +coefficients are the contrasts relative to that reference category. From 91b60e96f867c22061d4e742c8007bea58ea26a2 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 20:35:24 +0000 Subject: [PATCH 3/6] GAM worked example: use WCGS instead of MASS::Pima.tr Replace the built-in MASS::Pima.tr placeholder with the course's WCGS coronary-heart-disease cohort, addressing the review concern that the worked example was not an Epi 204 case study (and the student-visible 'illustrative only / replace dataset' callout-warning). - Outcome: incident CHD (chd69); smoothed continuous predictors: age, bmi, chol, sbp. - Load WCGS from the package's bundled inst/extdata/wcgs.rds. - Drop the placeholder callout-warning. - Adjust the narrative to WCGS reality (most smooths near EDF 1, cholesterol the predictor most likely to show curvature) without asserting specific run-dependent numbers. Chunks remain eval:false, consistent with the section's existing style and the book's other WCGS code listings. https://claude.ai/code/session_01NQSLJmjMN22RpwpgJ6Lw6u --- .../_sec_gam_example.qmd | 112 +++++++++--------- 1 file changed, 53 insertions(+), 59 deletions(-) diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd index d48cadeda1..e88c016480 100644 --- a/_subfiles/generalized-additive-models/_sec_gam_example.qmd +++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd @@ -1,36 +1,29 @@ -::: callout-warning -This section is illustrative only — the dataset is a built-in R -example, not an Epi 204 case study. Replace with a course-appropriate -exposure when the chapter is finalized. -::: - -We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting -diabetes status (`type`) from continuous risk factors. The exposures -of interest are body mass index (`bmi`), plasma glucose (`glu`), and -age (`age`). +We'll fit a logistic GAM to the Western Collaborative Group Study (WCGS) +data [@rosenman1975coronary], +the coronary-heart-disease cohort introduced in the +[logistic regression chapter](logistic-regression.qmd). +The outcome is incident coronary heart disease (`chd69`); +the continuous risk factors we smooth are +age (`age`), +body mass index (`bmi`), +serum cholesterol (`chol`), +and systolic blood pressure (`sbp`) — +predictors whose effect on CHD risk need not be linear on the log-odds scale. ## Setup {.unnumbered} ```{r} #| eval: false -library(MASS) # Pima.tr dataset library(mgcv) # gam(), s() library(ggplot2) library(gratia) # tidy plots for gam fits (optional) -data(Pima.tr) -str(Pima.tr) -# 'data.frame': 200 obs. of 8 variables: (sample values omitted; columns are) -# $ npreg: int number of pregnancies -# $ glu : int plasma glucose concentration (2 h OGTT) -# $ bp : int diastolic blood pressure (mm Hg) -# $ skin : int triceps skinfold thickness (mm) -# $ bmi : num body mass index (kg/m^2) -# $ ped : num diabetes pedigree function -# $ age : int age (years) -# $ type : Factor w/ 2 levels "No","Yes": diabetes diagnosis - -Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes") +# WCGS ships with this package: +wcgs <- readRDS(system.file("extdata", "wcgs.rds", package = "rme")) + +# Binary 0/1 outcome, and drop the one known cholesterol outlier (645): +wcgs <- subset(wcgs, !is.na(chol) & chol < 645) +wcgs$chd <- as.integer(wcgs$chd69 == "Yes") ``` ## Linear baseline {.unnumbered} @@ -40,35 +33,40 @@ Start with an ordinary logistic GLM as a baseline: ```{r} #| eval: false m_lin <- glm( - diabetes ~ bmi + glu + age, - family = binomial(), data = Pima.tr + chd ~ age + bmi + chol + sbp, + family = binomial(), data = wcgs ) summary(m_lin) ``` -Each coefficient is a log-odds increment per unit of the predictor — -a single number for the slope of risk across the entire range of -BMI, glucose, and age. That's a strong assumption, especially for -age, where diabetes risk is well known to be nonlinear. +Each coefficient is a single log-odds increment per unit of the +predictor — one slope for the entire range of age, BMI, cholesterol, +and blood pressure. That's a strong assumption: it forces risk to +climb at a constant rate across the whole range of each exposure. ## GAM with smooth terms {.unnumbered} -Now refit with smooths on each continuous predictor: +Now refit with a smooth on each continuous predictor: ```{r} #| eval: false m_gam <- gam( - diabetes ~ s(bmi) + s(glu) + s(age), + chd ~ s(age) + s(bmi) + s(chol) + s(sbp), family = binomial(), - data = Pima.tr, + data = wcgs, method = "REML" ) summary(m_gam) ``` -The summary will report effective degrees of freedom for each -smooth. In a typical run on `Pima.tr` you'll see EDFs in the -1.5–4 range — modest nonlinearity but not extreme. +The summary reports an effective degrees of freedom (EDF) for each +smooth. A smooth whose EDF stays close to 1 has collapsed to a +straight line — the data give no evidence of curvature for that +predictor — while an EDF meaningfully above 1 signals a nonlinear +effect. In this dataset most of the smooths sit near 1 (the linear +baseline already captures them well), with cholesterol the predictor +most likely to show curvature; treat the actual EDFs from your run, +not these expectations, as the evidence. ## Reading the EDFs {.unnumbered} @@ -89,9 +87,9 @@ If `gam.check(m_gam)` flags any smooth as having `edf` close to ```{r} #| eval: false m_gam2 <- gam( - diabetes ~ s(bmi) + s(glu, k = 20) + s(age), + chd ~ s(age) + s(bmi) + s(chol, k = 20) + s(sbp), family = binomial(), - data = Pima.tr, + data = wcgs, method = "REML" ) ``` @@ -123,21 +121,18 @@ gratia::draw(m_gam) ## Translating to the response scale {.unnumbered} Partial-effect plots are on the **link scale** (log-odds for -binary). To plot predicted probability vs a single predictor, hold +binary). To plot predicted probability against a single predictor — +here cholesterol, the predictor with the clearest curvature — hold the others at typical values, build the Wald CI on the **link** scale, then back-transform with `plogis()`: ```{r} #| eval: false -typical <- data.frame( - bmi = median(Pima.tr$bmi), - glu = median(Pima.tr$glu), - age = median(Pima.tr$age) -) grid <- data.frame( - bmi = seq(min(Pima.tr$bmi), max(Pima.tr$bmi), length.out = 200), - glu = typical$glu, - age = typical$age + chol = seq(min(wcgs$chol), max(wcgs$chol), length.out = 200), + age = median(wcgs$age), + bmi = median(wcgs$bmi), + sbp = median(wcgs$sbp) ) pred <- predict(m_gam, newdata = grid, type = "link", se.fit = TRUE) @@ -145,11 +140,11 @@ grid$prob <- plogis(pred$fit) grid$lower <- plogis(pred$fit - 1.96 * pred$se.fit) grid$upper <- plogis(pred$fit + 1.96 * pred$se.fit) -ggplot(grid, aes(x = bmi, y = prob)) + +ggplot(grid, aes(x = chol, y = prob)) + geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) + geom_line() + - labs(x = "BMI (kg/m²)", - y = "Predicted probability of diabetes", + labs(x = "Serum cholesterol (mg/dL)", + y = "Predicted probability of CHD", caption = "Other predictors held at sample medians") + theme_minimal() ``` @@ -174,22 +169,21 @@ the middle), which is the right qualitative shape for a probability. # null-space penalty), so a smooth that's really just a line — or # zero — isn't penalized as more complex than the linear baseline. m_gam_ml <- gam( - diabetes ~ s(bmi) + s(glu) + s(age), + chd ~ s(age) + s(bmi) + s(chol) + s(sbp), family = binomial(), - data = Pima.tr, + data = wcgs, method = "ML", select = TRUE ) AIC(m_lin, m_gam_ml) ``` -For `Pima.tr`, the GAM typically improves AIC by a few units — -consistent with the modest EDFs above 1 we saw in the summary. -Whether that improvement is worth the added model complexity is a -judgment call: if the smooths all have EDF very close to 1, the GLM -is a parsimonious choice; if EDFs are clearly $> 1$ and the curve -plots show a substantively interesting shape (e.g. a J in BMI), the -GAM is the better report. +Compare the two fits on AIC. When most smooths sit at EDF near 1 — +as several do here — the GLM is the parsimonious choice and the AIC +gap will be small or may even favor the GLM. The GAM earns its extra +complexity only when one or more smooths show clear curvature and +the curve plots reveal a substantively interesting shape; if they +do, the GAM is the better report. The refit with `method = "ML"` is required only for the AIC comparison; the REML fit `m_gam` above remains the preferred fit for From 225baa773274944daa7786288ff5e1ef8a85e18d Mon Sep 17 00:00:00 2001 From: Douglas Ezra Morrison Date: Thu, 18 Jun 2026 10:34:14 -0700 Subject: [PATCH 4/6] Address 3 rme#779 low-priority items + merge main MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. _sec_lab.qmd:36 — remove the # noqa: E402 trailing the commented- out pygam import. The import is already commented out, so the linter suppression is meaningless and just noise. 2. _sec_gams.qmd:95 — the inline `s(age, by = sex)` example was the one ```r block missed by the prior cleanup commit; convert to ```{r} #| eval: false to match the rest of the chapter. 3. _sec_gam_fitting.qmd: add the `id` argument to the s() argument table — used to share a single smoothness parameter across two or more smooths (e.g. s(x1, id = 1) + s(x2, id = 1)). Commonly needed in multi-predictor epi models. Also merged origin/main (the branch was 125 commits behind). Co-Authored-By: Claude Opus 4.7 (1M context) Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com> --- _subfiles/generalized-additive-models/_sec_gam_fitting.qmd | 1 + _subfiles/generalized-additive-models/_sec_gams.qmd | 3 ++- _subfiles/generalized-additive-models/_sec_lab.qmd | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd index fec824cf68..cd511d4d5c 100644 --- a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd +++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd @@ -46,6 +46,7 @@ curves with effective-degrees-of-freedom summaries. | `bs` | basis type (`"tp"`, `"cr"`, `"ps"`, ...) | rarely; `"tp"` is the default and the safe choice | | `k` | basis dimension (max EDF + 1) | when `gam.check()` flags `k-index < 1` | | `by` | factor for per-level smooths | predictor-by-factor interactions | +| `id` | label for sharing smoothness across smooths | when two or more smooths should share a single $\l$, e.g. `s(x1, id = 1) + s(x2, id = 1)` | | `m` | order of the penalty derivative | rarely; default penalizes 2nd derivative | | `pc` | constraint point | when you need the smooth to pass through 0 at a chosen $x$ | diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd index 98f3d90969..1de8c2e813 100644 --- a/_subfiles/generalized-additive-models/_sec_gams.qmd +++ b/_subfiles/generalized-additive-models/_sec_gams.qmd @@ -92,7 +92,8 @@ two standard escape hatches are: identifiability (the smooth is centered to mean zero in each level, so the level-specific intercepts have to come from somewhere else); the correct idiom is: - ```r + ```{r} + #| eval: false gam(y ~ sex + s(age, by = sex), data = d, method = "REML") ``` The result is still additive in the *factors*, but allows diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd index d45ef30668..61ef98b924 100644 --- a/_subfiles/generalized-additive-models/_sec_lab.qmd +++ b/_subfiles/generalized-additive-models/_sec_lab.qmd @@ -33,7 +33,7 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, LogisticRegression import statsmodels.api as sm from patsy import dmatrix -# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f # noqa: E402 +# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f Wage = load_data("Wage") print(Wage[["age","wage","education","year"]].head()) ``` From 7ad9aa82f9bdce55d7fcc75437b2227bfaffc996 Mon Sep 17 00:00:00 2001 From: Douglas Ezra Morrison Date: Thu, 18 Jun 2026 10:42:46 -0700 Subject: [PATCH 5/6] Drop dead pygam import + switch Pima examples to WCGS (rme#779) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. _sec_lab.qmd:36 — drop the commented-out pygam import entirely. The active imports of LogisticGAM/gam_f live in the chunks that use them (lines 293, 350), so the setup-block comment was just dead code that puzzled students. 2. _sec_gams.qmd:38 — switch #exm-gam conceptual example from "Pima diabetes data with bmi/glu/age → diabetes" to "WCGS data with age/sbp/chol → chd69" so it matches the worked example's WCGS framing. 3. _sec_local_regression.qmd:34 — switch #exm-local-regression from loess(glu ~ age, MASS::Pima.tr) to loess(chol ~ age, wcgs) for the same reason; pivot the surrounding prose from age 40/glucose to age 50/cholesterol to match. Co-Authored-By: Claude Opus 4.7 (1M context) Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com> --- .../generalized-additive-models/_sec_gams.qmd | 12 ++++++------ .../generalized-additive-models/_sec_lab.qmd | 1 - .../_sec_local_regression.qmd | 18 +++++++++--------- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd index 1de8c2e813..6b893b1422 100644 --- a/_subfiles/generalized-additive-models/_sec_gams.qmd +++ b/_subfiles/generalized-additive-models/_sec_gams.qmd @@ -36,17 +36,17 @@ the coefficients $\b_{jk}$ estimated jointly across all predictors. :::: ::::{#exm-gam} -#### GAM for diabetes risk +#### GAM for CHD risk -For the Pima diabetes data with continuous predictors BMI, plasma -glucose, and age, a GAM on the logit scale is +For the WCGS data with continuous predictors age, systolic blood +pressure, and cholesterol, a GAM for CHD on the logit scale is -$$\logitf{\Pr(\text{diabetes} = 1 \mid \text{bmi}, \text{glu}, \text{age})} -= \b_0 + f_1(\text{bmi}) + f_2(\text{glu}) + f_3(\text{age}),$$ +$$\logitf{\Pr(\text{chd69} = 1 \mid \text{age}, \text{sbp}, \text{chol})} += \b_0 + f_1(\text{age}) + f_2(\text{sbp}) + f_3(\text{chol}),$$ where each $f_j$ is a smoothing spline estimated by REML. The three partial-effect curves can be plotted separately, revealing, for -example, whether diabetes risk rises linearly with age or accelerates +example, whether CHD risk rises linearly with age or accelerates nonlinearly. :::: diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd index 61ef98b924..dffe3c058f 100644 --- a/_subfiles/generalized-additive-models/_sec_lab.qmd +++ b/_subfiles/generalized-additive-models/_sec_lab.qmd @@ -33,7 +33,6 @@ from sklearn.preprocessing import PolynomialFeatures from sklearn.linear_model import LinearRegression, LogisticRegression import statsmodels.api as sm from patsy import dmatrix -# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f Wage = load_data("Wage") print(Wage[["age","wage","education","year"]].head()) ``` diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd index b32ac93378..07a63ffc1b 100644 --- a/_subfiles/generalized-additive-models/_sec_local_regression.qmd +++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd @@ -32,21 +32,21 @@ values and connect the resulting predictions. :::: ::::{#exm-local-regression} -#### LOESS fit on age--glucose +#### LOESS fit on age--cholesterol -To smooth the relationship between age and plasma glucose in -`MASS::Pima.tr` using LOESS with a span of 0.5: +To smooth the relationship between age and total cholesterol in the +WCGS data using LOESS with a span of 0.5: ```{r} #| eval: false -m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5) +m_loess <- loess(chol ~ age, data = wcgs, span = 0.5) ``` -At age 40, the local neighborhood is the 50% of observations with -ages closest to 40. Each observation in that window is weighted by -the tricube kernel --- down-weighted as its age moves away from 40, -zero weight outside the window. The predicted glucose at age 40 is -the fitted value from the weighted least-squares linear regression +At age 50, the local neighborhood is the 50% of observations with +ages closest to 50. Each observation in that window is weighted by +the tricube kernel --- down-weighted as its age moves away from 50, +zero weight outside the window. The predicted cholesterol at age 50 +is the fitted value from the weighted least-squares linear regression over that neighborhood. :::: From 46f37d81e539b895ff8d97fb96ec2fa965e75a31 Mon Sep 17 00:00:00 2001 From: Douglas Ezra Morrison Date: Thu, 18 Jun 2026 10:51:39 -0700 Subject: [PATCH 6/6] Record gratia 0.11.2 in renv.lock (rme#779 low-priority) DESCRIPTION:49 declares gratia (>= 0.9) in Suggests, but renv::snapshot() doesn't auto-detect it because all gratia code lives in eval:false chunks. Force-record via renv::record('gratia') so renv::restore() on a fresh clone installs it. Other Suggests dependencies are already present in renv.lock (mgcv, ISLR2, boot, etc.), so this brings gratia into line with that pattern. Co-Authored-By: Claude Opus 4.7 (1M context) Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com> --- renv.lock | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/renv.lock b/renv.lock index f5008745dd..257974e31f 100644 --- a/renv.lock +++ b/renv.lock @@ -4801,6 +4801,12 @@ "Maintainer": "David Schoch ", "Repository": "CRAN" }, + "gratia": { + "Package": "gratia", + "Version": "0.11.2", + "Source": "Repository", + "Repository": "CRAN" + }, "gridExtra": { "Package": "gridExtra", "Version": "2.3",