From ef26f15e2ecd9379ad840c756fe72a00cb0652b7 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 06:28:14 +0000
Subject: [PATCH 1/6] =?UTF-8?q?GAM=20chapter=20(squashed)=20=E2=80=94=20cl?=
 =?UTF-8?q?oses=20#775?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Initial draft of a chapter on generalized additive models. Squashed
from the original 69-commit history because the action's recursive
submodule fetch was failing on commits that referenced a now-deleted
latex-macros SHA. See the PR description for the chapter outline,
new bib entries, and caveats.
---
 _quarto-book.yml                              |   1 +
 _quarto-website.yml                           |   3 +
 .../_exr-gams-applied.qmd                     | 160 ++++++++
 .../_exr-gams-conceptual.qmd                  |  76 ++++
 .../_sec_basis_functions.qmd                  |  96 +++++
 .../_sec_gam_example.qmd                      | 209 ++++++++++
 .../_sec_gam_fitting.qmd                      | 118 ++++++
 .../_sec_gam_inference.qmd                    | 122 ++++++
 .../generalized-additive-models/_sec_gams.qmd | 113 ++++++
 .../generalized-additive-models/_sec_lab.qmd  | 378 ++++++++++++++++++
 .../_sec_local_regression.qmd                 |  97 +++++
 .../_sec_method_comparison.qmd                | 244 +++++++++++
 .../_sec_motivation.qmd                       |  83 ++++
 .../_sec_polynomial.qmd                       |  98 +++++
 .../_sec_regression_splines.qmd               | 161 ++++++++
 .../_sec_smoothing_splines.qmd                | 177 ++++++++
 .../_sec_step_functions.qmd                   |  98 +++++
 chapters/generalized-additive-models.qmd      |  78 ++++
 references.bib                                |  45 ++-
 19 files changed, 2356 insertions(+), 1 deletion(-)
 create mode 100644 _subfiles/generalized-additive-models/_exr-gams-applied.qmd
 create mode 100644 _subfiles/generalized-additive-models/_exr-gams-conceptual.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_basis_functions.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_gam_example.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_gam_fitting.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_gam_inference.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_gams.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_lab.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_local_regression.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_method_comparison.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_motivation.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_polynomial.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_regression_splines.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
 create mode 100644 _subfiles/generalized-additive-models/_sec_step_functions.qmd
 create mode 100644 chapters/generalized-additive-models.qmd

diff --git a/_quarto-book.yml b/_quarto-book.yml
index 0910c98925..b8aad36c1b 100644
--- a/_quarto-book.yml
+++ b/_quarto-book.yml
@@ -29,6 +29,7 @@ book:
        - chapters/causal-inference.qmd
        - chapters/predictor-selection.qmd
        - chapters/intro-multilevel-models.qmd
+       - chapters/generalized-additive-models.qmd
     - part: chapters/time-to-event-models.qmd
       chapters:
        - chapters/intro-to-survival-analysis.qmd
diff --git a/_quarto-website.yml b/_quarto-website.yml
index 47d8ff5e11..3284c72e67 100644
--- a/_quarto-website.yml
+++ b/_quarto-website.yml
@@ -12,6 +12,7 @@ project:
     - chapters/causal-inference.qmd
     - chapters/predictor-selection.qmd
     - chapters/intro-multilevel-models.qmd
+    - chapters/generalized-additive-models.qmd
     - chapters/time-to-event-models.qmd
     - chapters/intro-to-survival-analysis.qmd
     - chapters/proportional-hazards-models.qmd
@@ -77,6 +78,8 @@ website:
             href: chapters/predictor-selection.qmd
           - text: "Multilevel Models"
             href: chapters/intro-multilevel-models.qmd
+          - text: "Generalized Additive Models"
+            href: chapters/generalized-additive-models.qmd
           - text: "---"
           - text: "Time to Event Models"
             href: chapters/time-to-event-models.qmd
diff --git a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd
new file mode 100644
index 0000000000..09b313c94f
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd
@@ -0,0 +1,160 @@
+::: notes
+Applied exercises adapted from @james2021islr2e [§7.9] and
+@hastie2009esl2e [§9.1].
+:::
+
+### Polynomial regression and step functions (ISLRv2 Applied 7) {#sec-exr-wage-poly}
+
+::::{#exr-wage-polynomial-cv}
+
+Using the `Wage` dataset from the `ISLR2` package [@james2021islr2e]:
+
+**(a)** Use 10-fold cross-validation to choose the polynomial degree $d^*$ for
+predicting `wage` from `age`. Plot the CV errors for degrees 1 through 10.
+
+**(b)** Fit the degree-$d^*$ polynomial and plot it with a 95% confidence band.
+
+**(c)** Separately, choose the number of cut-points $K^*$ for a step-function
+model using cross-validation. Plot the chosen step-function fit.
+
+::::
+
+
+::::{#sol-wage-polynomial-cv}
+
+::: {.panel-tabset}
+
+#### R
+
+```{r}
+#| code-fold: true
+#| eval: false
+library(ISLR2)
+library(boot)
+library(ggplot2)
+set.seed(1)
+cv_poly <- vapply(1:10, function(d) {
+  cv.glm(Wage, glm(wage ~ poly(age, d), data = Wage), K = 10)$delta[1]
+}, numeric(1))
+best_d <- which.min(cv_poly)
+cat("CV-selected degree:", best_d, "\n")
+plot(1:10, cv_poly, type = "b", xlab = "Degree", ylab = "CV MSE",
+     main = "Polynomial degree selection")
+points(best_d, cv_poly[best_d], col = "red", pch = 19)
+```
+
+#### Python
+
+```{python}
+#| eval: false
+#| python.reticulate: false
+import numpy as np
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.linear_model import LinearRegression
+from sklearn.pipeline import Pipeline
+from sklearn.model_selection import cross_val_score
+from ISLP import load_data
+Wage = load_data("Wage")
+X = Wage["age"].values.reshape(-1, 1)
+y = Wage["wage"].values
+cv_errs = [-cross_val_score(
+    Pipeline([("poly", PolynomialFeatures(d, include_bias=False)),
+              ("reg", LinearRegression())]),
+    X, y, cv=10, scoring="neg_mean_squared_error").mean() for d in range(1, 11)]
+best_d = int(np.argmin(cv_errs)) + 1
+print(f"CV-selected degree: {best_d}")
+```
+
+:::
+
+::::
+
+
+### Backfitting algorithm (ESL §9.1.1 / ISLRv2 Applied 11) {#sec-exr-backfitting}
+
+::::{#exr-backfitting}
+
+The **backfitting algorithm** [@hastie2009esl2e, §9.1.1] fits an additive model
+$Y = \a + \sum_j f_j(X_j) + \eps$ by iteratively smoothing partial residuals.
+
+For $p=2$ predictors with linear component smoothers $\hat f_j(X_j) = \hat\b_j X_j$:
+
+1. Initialise: $\hat\a = \bar{y}$, $\hat f_1 = \hat f_2 = 0$.
+2. Repeat until convergence (Gauss–Seidel order — re-centre each
+   $\hat f_j$ immediately, so the centred $\hat f_1$ is used when
+   computing $r_2$):
+   a. $r_1 \leftarrow y - \hat\a - \hat f_2(x_2)$; regress $r_1$ on
+      $x_1$ to update $\hat f_1$; $\hat f_1 \leftarrow \hat f_1 -
+      \overline{\hat f_1}$.
+   b. $r_2 \leftarrow y - \hat\a - \hat f_1(x_1)$; regress $r_2$ on
+      $x_2$ to update $\hat f_2$; $\hat f_2 \leftarrow \hat f_2 -
+      \overline{\hat f_2}$.
+
+**(a)** Implement this on simulated data with known true coefficients.
+
+**(b)** Verify the coefficients match `lm(y ~ x1 + x2)`.
+
+::::
+
+
+::::{#sol-backfitting}
+
+::: {.panel-tabset}
+
+#### R
+
+```{r}
+#| code-fold: true
+#| eval: false
+set.seed(1)
+n <- 100
+x1 <- rnorm(n)
+x2 <- rnorm(n)
+y <- 2 + 3 * x1 - 1.5 * x2 + rnorm(n, sd = 0.5)
+alpha_hat <- mean(y)
+f1 <- f2 <- rep(0, n)
+for (iter in seq_len(500)) {
+  r1 <- y - alpha_hat - f2
+  b1 <- coef(lm(r1 ~ x1))[2]
+  f1 <- b1 * x1
+  f1 <- f1 - mean(f1)
+  r2 <- y - alpha_hat - f1
+  b2 <- coef(lm(r2 ~ x2))[2]
+  f2 <- b2 * x2
+  f2 <- f2 - mean(f2)
+}
+lm_fit <- lm(y ~ x1 + x2)
+cat(sprintf("Backfitting: b1=%.4f, b2=%.4f\n", b1, b2))
+cat(sprintf("lm():        b1=%.4f, b2=%.4f\n",
+            coef(lm_fit)[2], coef(lm_fit)[3]))
+```
+
+#### Python
+
+```{python}
+#| eval: false
+#| python.reticulate: false
+import numpy as np
+rng = np.random.default_rng(1)
+n = 100; x1 = rng.standard_normal(n); x2 = rng.standard_normal(n)
+y = 2 + 3*x1 - 1.5*x2 + rng.normal(0, 0.5, n)
+alpha_hat = y.mean(); f1 = np.zeros(n); f2 = np.zeros(n)
+for _ in range(500):
+    r1 = y - alpha_hat - f2
+    b1 = float(np.cov(x1, r1)[0, 1] / np.var(x1, ddof=1))
+    f1 = b1*x1 - (b1*x1).mean()
+    r2 = y - alpha_hat - f1
+    b2 = float(np.cov(x2, r2)[0, 1] / np.var(x2, ddof=1))
+    f2 = b2*x2 - (b2*x2).mean()
+X_ols = np.column_stack([np.ones(n), x1, x2])
+b_ols = np.linalg.lstsq(X_ols, y, rcond=None)[0]
+print(f"Backfitting: b1={b1:.4f}, b2={b2:.4f}")
+print(f"OLS:         b1={b_ols[1]:.4f}, b2={b_ols[2]:.4f}")
+```
+
+:::
+
+The coefficients agree, confirming backfitting converges to OLS when
+all component smoothers are linear [@hastie2009esl2e, §9.1.1].
+
+::::
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd b/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd
new file mode 100644
index 0000000000..59596d90f1
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_exr-gams-conceptual.qmd
@@ -0,0 +1,76 @@
+::: notes
+Exercises adapted from @james2021islr2e [§7.9] and @hastie2009esl2e [§9.1].
+:::
+
+### Piecewise polynomial continuity (ISLRv2 Conceptual 1) {#sec-exr-continuity}
+
+::::{#exr-spline-continuity}
+
+A cubic polynomial with one interior knot at $\xi$ uses the truncated power basis:
+
+$$f(X) = \b_0 + \b_1 X + \b_2 X^2 + \b_3 X^3 + \b_4 (X-\xi)_+^3$$
+
+where $(u)_+ = \max(u,0)$.
+
+**(a)** Write $f(X)$ as two cubics: $f_1(X)$ for $X < \xi$ and $f_2(X)$ for $X \ge \xi$.
+
+**(b)** Show $f_1(\xi) = f_2(\xi)$ (continuity).
+
+**(c)** Show the first derivatives match at $\xi$.
+
+**(d)** Show the second derivatives match at $\xi$.
+
+**(e)** Do the third derivatives match? What order of smoothness does this cubic spline achieve?
+
+::::
+
+
+::::{#sol-spline-continuity}
+
+**(a)** For $X < \xi$: $(X-\xi)_+ = 0$, giving $f_1(X) = \b_0 + \b_1 X + \b_2 X^2 + \b_3 X^3$.
+For $X \ge \xi$: $f_2(X) = f_1(X) + \b_4(X-\xi)^3$.
+
+**(b)** At $X=\xi$: $(X-\xi)_+^3 = 0$, so $f_2(\xi) = f_1(\xi)$. $\checkmark$
+
+**(c)** $f_2^{\prime}(X) = f_1^{\prime}(X) + 3\b_4(X-\xi)_+^2$.
+At $X=\xi$: $(X-\xi)_+^2 = 0$, so the derivatives agree. $\checkmark$
+
+**(d)** $f_2^{\prime\prime}(X) = f_1^{\prime\prime}(X) + 6\b_4(X-\xi)_+$.
+At $X=\xi$: $(X-\xi)_+ = 0$. $\checkmark$
+
+**(e)** $f_1^{\prime\prime\prime} = 6\b_3$ and $f_2^{\prime\prime\prime} = 6\b_3 + 6\b_4$ for $X > \xi$.
+These differ (unless $\b_4=0$). The truncated power basis gives a **cubic spline**:
+continuous with continuous 1st and 2nd derivatives, but a jump in the 3rd derivative at each knot.
+
+::::
+
+
+### Smoothing spline penalty behavior (ISLRv2 Conceptual 2) {#sec-exr-smoothing-penalty}
+
+::::{#exr-smoothing-penalty}
+
+The smoothing spline minimizes
+
+$$\hat{g} = \arg\min_g \left\{ \sum_{i=1}^n (y_i - g(x_i))^2 + \l \int g^{\prime\prime}(t)^2 \, dt \right\}.$$
+
+**(a)** What is the limiting form of $\hat{g}$ as $\l \to \infty$?
+
+**(b)** What is the limiting form as $\l \to 0$?
+
+**(c)** For $\l \to 0$, is the training RSS equal to zero?
+
+::::
+
+
+::::{#sol-smoothing-penalty}
+
+**(a)** As $\l \to \infty$ the roughness penalty dominates. The only $g$ with
+$\int g^{\prime\prime}(t)^2\,dt = 0$ is a linear function $g(x) = a + bx$.
+So $\hat{g}$ converges to the ordinary least-squares line.
+
+**(b)** As $\l \to 0$ there is no penalty; the minimizer interpolates every
+training point: $g(x_i) = y_i$ for all $i$.
+
+**(c)** Yes. Interpolation means training RSS $= 0$ -- the overfit extreme.
+
+::::
diff --git a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd
new file mode 100644
index 0000000000..14cfbab9f5
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd
@@ -0,0 +1,96 @@
+::: notes
+Adapted from @james2021islr2e [§7.3] and @wood2017generalized [§4].
+:::
+
+Polynomial regression and step-function regression look very
+different, but they share a common structure: both replace $X$ with
+a fixed set of **transformations** of $X$, then fit a linear
+regression in those transformations. This shared structure is the
+**basis function** framework, and it's the foundation for splines.
+
+::::{#def-basis-expansion}
+#### Basis expansion
+
+A **basis expansion** of $X$ is a fixed (data-free) set of functions
+$b_1(X), \ldots, b_K(X)$. The basis-expansion regression model
+replaces $X$ with these transformations:
+
+$$y_i = \b_0 + \b_1 b_1(x_i) + \b_2 b_2(x_i) + \cdots + \b_K b_K(x_i) + \eps_i.$$
+
+The basis functions $b_k(\cdot)$ are chosen ahead of time; the
+coefficients $\b_k$ are estimated by least squares (or, more
+generally, by maximum likelihood for a GLM with link
+$g(\E{Y \mid X}) = \b_0 + \sum_k \b_k b_k(X)$).
+::::
+
+::::{#exm-basis-polynomial}
+#### Polynomial regression as a basis expansion
+
+For polynomial regression of degree $d$ (@def-polynomial-regression),
+the basis functions are the monomials
+
+$$b_k(X) = X^k, \quad k = 1, \ldots, d.$$
+::::
+
+::::{#exm-basis-step}
+#### Step-function regression as a basis expansion
+
+For step-function regression with cutpoints $c_1 < \cdots < c_K$
+(@def-step-function-regression), the basis functions are the bin
+indicators
+
+$$b_k(X) = \indicp{c_{k-1} < X \le c_k}, \quad k = 1, \ldots, K+1.$$
+::::
+
+## Why the basis-function abstraction matters
+
+Once we see polynomial and step-function regression as instances of
+the same template, we can ask: *which other basis functions might be
+worth trying?* Two answers turn out to be especially useful:
+
+- **Truncated power basis functions** —
+  $b_k(X) = (X - \xi_k)_+^d$ for chosen knots $\xi_k$. These give the
+  **regression spline** family, which enforces piecewise-polynomial
+  structure with continuity and smoothness constraints at the knots
+  (developed in the next section).
+- **B-spline basis functions** — a numerically well-conditioned
+  alternative basis that spans the same space of piecewise
+  polynomials as the truncated power basis, but with locally
+  supported basis functions (each $b_k$ is nonzero only on a small
+  region of the $x$-axis).
+
+Both of these are still **linear** in the coefficients, so all of the
+GLM machinery — likelihood, standard errors, Wald tests, AIC, etc. —
+applies unchanged. The only thing that changes is the design matrix:
+each row $i$ of the design matrix has entries
+$1, b_1(x_i), \ldots, b_K(x_i)$ instead of $1, x_i$.
+
+## The general estimation recipe
+
+For a continuous response $Y$ and a basis expansion
+$b_1(\cdot), \ldots, b_K(\cdot)$:
+
+1. **Build the design matrix.** Construct the $n \times (K+1)$
+   matrix
+
+   $$\mat{B} = \sbmat{
+   1 & b_1(x_1) & \cdots & b_K(x_1) \\
+   1 & b_1(x_2) & \cdots & b_K(x_2) \\
+   \vdots & \vdots & \ddots & \vdots \\
+   1 & b_1(x_n) & \cdots & b_K(x_n)
+   }.$$
+
+2. **Fit by least squares (or MLE for a GLM).**
+   The OLS estimator $\hat\b = (\tp{\mat{B}} \mat{B})^{-1} \tp{\mat{B}} \mat{y}$
+   minimizes the residual sum of squares — exactly the linear
+   regression machinery from earlier chapters, just with a different
+   design matrix.
+
+3. **Plot the fitted curve.** For a dense grid of $x$ values, compute
+   $\hat y(x) = \hat\b_0 + \sum_k \hat\b_k b_k(x)$ and plot. Standard
+   errors and pointwise confidence bands come from the usual linear
+   model formulas applied to $\mat{B}$.
+
+The remaining design decision is **which basis functions to use** —
+and that choice is what distinguishes regression splines from
+smoothing splines from local regression from GAMs.
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
new file mode 100644
index 0000000000..223a0cd61f
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
@@ -0,0 +1,209 @@
+:::: {.content-hidden}
+::: callout-warning
+This section is illustrative only — the dataset is a built-in R
+example, not an Epi 204 case study. Replace with a course-appropriate
+exposure when the chapter is finalized.
+:::
+::::
+
+We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting
+diabetes status (`type`) from continuous risk factors. The exposures
+of interest are body mass index (`bmi`), plasma glucose (`glu`), and
+age (`age`).
+
+## Setup {.unnumbered}
+
+```r
+library(MASS)        # Pima.tr dataset
+library(mgcv)        # gam(), s()
+library(ggplot2)
+library(gratia)      # tidy plots for gam fits (optional)
+
+data(Pima.tr)
+str(Pima.tr)
+#  'data.frame': 200 obs. of  8 variables:  (sample values omitted; columns are)
+#  $ npreg: int  number of pregnancies
+#  $ glu  : int  plasma glucose concentration (2 h OGTT)
+#  $ bp   : int  diastolic blood pressure (mm Hg)
+#  $ skin : int  triceps skinfold thickness (mm)
+#  $ bmi  : num  body mass index (kg/m^2)
+#  $ ped  : num  diabetes pedigree function
+#  $ age  : int  age (years)
+#  $ type : Factor w/ 2 levels "No","Yes":  diabetes diagnosis
+
+Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes")
+```
+
+## Linear baseline {.unnumbered}
+
+Start with an ordinary logistic GLM as a baseline:
+
+```r
+m_lin <- glm(
+  diabetes ~ bmi + glu + age,
+  family = binomial(), data = Pima.tr
+)
+summary(m_lin)
+```
+
+Each coefficient is a log-odds increment per unit of the predictor —
+a single number for the slope of risk across the entire range of
+BMI, glucose, and age. That's a strong assumption, especially for
+age, where diabetes risk is well known to be nonlinear.
+
+## GAM with smooth terms {.unnumbered}
+
+Now refit with smooths on each continuous predictor:
+
+```r
+m_gam <- gam(
+  diabetes ~ s(bmi) + s(glu) + s(age),
+  family = binomial(),
+  data = Pima.tr,
+  method = "REML"
+)
+summary(m_gam)
+```
+
+The summary will report effective degrees of freedom for each
+smooth. In a typical run on `Pima.tr` you'll see EDFs in the
+1.5–4 range — modest nonlinearity but not extreme.
+
+## Reading the EDFs {.unnumbered}
+
+For each smooth, ask:
+
+- **EDF $\approx 1$**: the smooth has collapsed to a straight line.
+  No evidence the relationship is nonlinear. You can drop the
+  smoother and use a linear term without loss.
+- **EDF in 2–5**: meaningfully nonlinear. The fitted curve is more
+  than just a line but doesn't have lots of bumps.
+- **EDF $> 5$**: substantially nonlinear, possibly with multiple
+  inflections. Check whether the data actually support that wiggle
+  (look at the plot; check `gam.check()`'s `k-index`).
+
+If `gam.check(m_gam)` flags any smooth as having `edf` close to
+`k'`, bump `k` up:
+
+```r
+m_gam2 <- gam(
+  diabetes ~ s(bmi) + s(glu, k = 20) + s(age),
+  family = binomial(),
+  data = Pima.tr,
+  method = "REML"
+)
+```
+
+## Plotting partial-effect curves {.unnumbered}
+
+```r
+plot(m_gam, pages = 1, shade = TRUE, seWithMean = TRUE)
+```
+
+produces one panel per smooth, each showing the partial effect on
+the log-odds scale with a 2-SE band. `seWithMean = TRUE` adds the
+variance of the smooth's overall level (its centering offset) to
+the pointwise standard errors. Without it, the band is the variance
+of the *deviation* of the smooth around its mean; with it, the band
+is the variance of the smooth itself — the right interpretation when
+comparing partial-effect curves across predictors on a publication
+figure.
+
+For a cleaner ggplot rendering, `gratia::draw()` is a one-line
+alternative:
+
+```r
+gratia::draw(m_gam)
+```
+
+## Translating to the response scale {.unnumbered}
+
+Partial-effect plots are on the **link scale** (log-odds for
+binary). To plot predicted probability vs a single predictor, hold
+the others at typical values, build the Wald CI on the **link** scale,
+then back-transform with `plogis()`:
+
+```r
+typical <- data.frame(
+  bmi = median(Pima.tr$bmi),
+  glu = median(Pima.tr$glu),
+  age = median(Pima.tr$age)
+)
+grid <- data.frame(
+  bmi = seq(min(Pima.tr$bmi), max(Pima.tr$bmi), length.out = 200),
+  glu = typical$glu,
+  age = typical$age
+)
+pred <- predict(m_gam, newdata = grid,
+                type = "link", se.fit = TRUE)
+grid$prob  <- plogis(pred$fit)
+grid$lower <- plogis(pred$fit - 1.96 * pred$se.fit)
+grid$upper <- plogis(pred$fit + 1.96 * pred$se.fit)
+
+ggplot(grid, aes(x = bmi, y = prob)) +
+  geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
+  geom_line() +
+  labs(x = "BMI (kg/m²)",
+       y = "Predicted probability of diabetes",
+       caption = "Other predictors held at sample medians") +
+  theme_minimal()
+```
+
+We compute the CI on the link scale and back-transform because the
+delta-method SE on `type = "response"` doesn't respect the [0, 1]
+boundary — Wald intervals built from it can extend outside (0, 1)
+near the tails of the curve and have non-symmetric coverage. Working
+on the link scale and applying the monotone `plogis()` transformation
+keeps the interval inside (0, 1) by construction and preserves
+asymmetry around the boundaries (wider near 0 and 1, narrower near
+the middle), which is the right qualitative shape for a probability.
+
+## Comparing GLM and GAM {.unnumbered}
+
+```r
+# Refit the GAM with method = "ML" for AIC comparison.
+# REML and ML likelihoods are on different scales; comparing AIC
+# from glm() (always ML) with a REML-fitted gam() mixes criteria.
+# Use select = TRUE to enable term-level smoothness selection (the
+# null-space penalty), so a smooth that's really just a line — or
+# zero — isn't penalized as more complex than the linear baseline.
+m_gam_ml <- gam(
+  diabetes ~ s(bmi) + s(glu) + s(age),
+  family = binomial(),
+  data = Pima.tr,
+  method = "ML",
+  select = TRUE
+)
+AIC(m_lin, m_gam_ml)
+```
+
+For `Pima.tr`, the GAM typically improves AIC by a few units —
+consistent with the modest EDFs above 1 we saw in the summary.
+Whether that improvement is worth the added model complexity is a
+judgment call: if the smooths all have EDF very close to 1, the GLM
+is a parsimonious choice; if EDFs are clearly $> 1$ and the curve
+plots show a substantively interesting shape (e.g. a J in BMI), the
+GAM is the better report.
+
+The refit with `method = "ML"` is required only for the AIC
+comparison; the REML fit `m_gam` above remains the preferred fit for
+final inference and plotting [@wood2017generalized, §6.2].
+
+## Reporting in a manuscript {.unnumbered}
+
+A typical methods-section sentence is:
+
+> Continuous predictors were modeled using thin-plate regression
+> splines (the default basis for `mgcv::s()`) via penalized restricted
+> maximum likelihood (`mgcv` *version*, R *version*;
+> @wood2017generalized). The number of basis functions per predictor
+> was set to 10 and reduced by penalization to the effective degrees
+> of freedom shown in Table X.
+
+Replace *version* with the actual versions you used — these can
+drift between drafts.
+
+Pair this with a table giving each smooth's EDF and approximate
+p-value, plus a figure of the partial-effect curves on the
+link scale (with the response-scale plot of the predictor of
+primary interest as a supplementary figure).
diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
new file mode 100644
index 0000000000..971694b45f
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
@@ -0,0 +1,118 @@
+::: notes
+Adapted from @wood2017generalized [§6].
+:::
+
+The standard tool for fitting GAMs in R is the `mgcv` package
+[@wood2017generalized]. We'll use `mgcv::gam()`, which fits GAMs by
+**penalized likelihood** with automatic smoothness selection via
+REML or GCV.
+
+## The `gam()` call
+
+The basic syntax mirrors `lm()`/`glm()`, with smoothers introduced by
+`s()`:
+
+```r
+library(mgcv)
+
+m <- gam(
+  y ~ s(age) + s(bmi) + sex + s(year, k = 10),
+  data = d,
+  family = binomial(),
+  method = "REML"
+)
+```
+
+Reading the formula:
+
+- `s(age)`, `s(bmi)`, `s(year, k = 10)` are **smooth terms**. Each
+  introduces a basis expansion plus a smoothness penalty. The
+  default basis (`bs = "tp"`, thin-plate regression spline) is
+  almost always a good choice. `k = 10` requests a basis of
+  dimension up to 10 — large enough that the smoothness penalty,
+  not the basis, controls flexibility.
+- `sex` is a **parametric term**, treated exactly as in a GLM.
+
+You can mix smooth and parametric terms freely. Parametric terms
+contribute conventional regression coefficients (with $t$/$z$
+statistics in the model summary); smooth terms contribute fitted
+curves with effective-degrees-of-freedom summaries.
+
+## Useful `s()` arguments
+
+| Argument | What it controls | When to change from default |
+|:--|:--|:--|
+| `bs` | basis type (`"tp"`, `"cr"`, `"ps"`, ...) | rarely; `"tp"` is the default and the safe choice |
+| `k` | basis dimension (max EDF + 1) | when `gam.check()` flags `k-index < 1` |
+| `by` | factor for per-level smooths | predictor-by-factor interactions |
+| `m` | order of the penalty derivative | rarely; default penalizes 2nd derivative |
+| `pc` | constraint point | when you need the smooth to pass through 0 at a chosen $x$ |
+
+## Smoothness selection: REML vs GCV
+
+`mgcv::gam()` picks the smoothness parameter(s) $\l$ automatically by
+optimizing one of:
+
+- **REML** (restricted maximum likelihood). Treats the penalty as
+  arising from a normal prior on the spline coefficients and selects
+  $\l$ by the marginal likelihood. **Not the default** — `gam()`
+  defaults to `method = "GCV.Cp"`, so you have to pass
+  `method = "REML"` explicitly (as in the examples in this
+  section). Generally **more stable** than GCV, less prone to
+  undersmoothing.
+- **ML** (marginal likelihood). Similar to REML but with a slightly
+  different effective sample size adjustment.
+- **GCV** (generalized cross-validation). The default. Computes
+  faster than REML but can occasionally select a much wigglier
+  smooth than is justified by the data, especially when $n$ is
+  small or signal is weak.
+
+**Recommendation: use `method = "REML"`** for routine work. If
+you're comparing GAMs to GLMs via AIC, also set `select = TRUE` to
+turn on smoothness selection at the term level. `select = TRUE`
+adds an extra null-space penalty to each smooth (separate from the
+wiggliness penalty) that can shrink an *entire* smooth term to zero
+— effectively functioning as variable selection
+[@wood2017generalized, §5.4]. Without it, the ordinary wiggliness
+penalty can only shrink a smooth down to a straight line, never to
+identically zero; `select = TRUE` extends that floor all the way
+down. This makes AIC comparisons more honest because the GAM is
+then allowed to "discover" that a predictor's smooth shouldn't be
+there at all.
+
+## Checking the fit
+
+After fitting, run `gam.check()` to verify:
+
+```r
+gam.check(m)
+```
+
+Output includes:
+
+- A **convergence diagnostic** (REML/GCV optimization should converge
+  to a stationary point — usually it does without intervention).
+- **Basis-dimension check** (`k'`, `edf`, `k-index`, `p-value`) for
+  each smooth. If `edf` is close to `k'`, the smooth is hitting
+  its maximum allowed flexibility — bump `k` up and refit. If
+  `k-index < 1` with a small p-value, the basis may be too small.
+- **Residual diagnostics**: QQ plot, residuals vs. linear predictor,
+  histogram of residuals, observed vs. fitted. Same intuition as for
+  a GLM.
+
+## Plotting smooths
+
+```r
+plot(m, pages = 1, shade = TRUE)
+```
+
+produces a panel of partial-effect plots — one per smooth term —
+with a 2-SE pointwise confidence band. The y-axis is on the **linear
+predictor scale** by default, centered to mean zero (the smooth's
+contribution to $g(\E{Y})$ above its mean), not on the response
+scale.
+
+For a publication-quality plot on the response scale, use
+`predict(m, type = "response", se.fit = TRUE)` on a grid of
+$X$ values with the other predictors held at typical values.
+
diff --git a/_subfiles/generalized-additive-models/_sec_gam_inference.qmd b/_subfiles/generalized-additive-models/_sec_gam_inference.qmd
new file mode 100644
index 0000000000..aec9f5a8f1
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_gam_inference.qmd
@@ -0,0 +1,122 @@
+::: notes
+Adapted from @wood2017generalized [§6.10–6.12].
+:::
+
+GAM output from `summary()` looks similar to GLM output but has a
+few quirks driven by the penalized fit. Here's what to read off it.
+
+## Reading `summary(gam_fit)`
+
+A `summary.gam` object splits the report into two sections:
+
+### Parametric coefficients
+
+These are the non-smooth terms in the formula — intercept,
+categorical predictors, terms entered as `x` rather than `s(x)`.
+Their output is identical to a GLM:
+
+```
+Parametric coefficients:
+            Estimate Std. Error z value Pr(>|z|)
+(Intercept)  -2.143      0.087  -24.63  < 2e-16 ***
+sexFemale     0.342      0.061    5.61  2.0e-08 ***
+```
+
+Read these exactly as you would GLM coefficients.
+
+### Smooth terms
+
+```
+Approximate significance of smooth terms:
+        edf  Ref.df  Chi.sq  p-value
+s(age)  4.82   5.93   76.2   3.6e-14 ***
+s(bmi)  2.13   2.78   18.4   2.4e-04 ***
+```
+
+Each column reports:
+
+- **`edf`** — effective degrees of freedom of the smooth, as defined
+  in @def-effective-degrees-of-freedom. An EDF of 1.0 means the smooth
+  has been penalized down to a straight line; higher values mean more
+  wiggliness.
+- **`Ref.df`** — a reference degrees of freedom used in constructing
+  the p-value (slightly larger than EDF; the difference accounts for
+  the penalized-fit uncertainty in selecting $\l$).
+- **`Chi.sq`** — the test statistic for "is this smooth different
+  from zero?".
+- **`p-value`** — an *approximate* p-value for that null. The standard
+  caveat applies: the smoothing parameter was selected from the
+  data, so the test statistic doesn't have an exact $\chi^2$
+  distribution — `mgcv` uses a careful approximation accounting for
+  the selection, but it's still an approximation [@wood2013pvalues].
+
+### Adjusted $R^2$ and deviance explained
+
+For Gaussian GAMs, the summary reports an adjusted $R^2$; for GLM
+families, **deviance explained** (the GLM analog of $R^2$). Same
+interpretation as in the GLM chapters.
+
+## Confidence bands for smooth terms
+
+The pointwise CIs that `plot.gam()` draws come from the Bayesian
+posterior distribution of the spline coefficients
+[@wood2017generalized, §6.10]. They have the property:
+
+- They are **roughly the right Bayesian credible intervals** for the
+  smooth at each point.
+- They are **roughly the right frequentist confidence bands**
+  "across the function" — i.e. averaged over $x$, the band has
+  approximately 95% coverage of the true $f(x)$.
+- They **are not** pointwise frequentist CIs in the strict sense;
+  at any single $x$, coverage can be a few percentage points off.
+
+For routine plotting and reporting in epidemiology, these bands are
+the right thing to display. If you need something stricter (e.g.
+simultaneous bands valid across the entire $x$-range), posterior
+simulation from the spline coefficients gives the right answer:
+`gratia::smooth_samples()` (gratia >= 0.9) returns draws of the smooth
+from the posterior on the spline coefficients (equivalently, you can
+roll your own via `mgcv::rmvn()` with the fitted `Vp`). See
+[@wood2017generalized, §6.10] for the underlying theory.
+Note that `simulate(m_gam)` is a *different* tool — it returns
+response-scale draws (posterior predictive samples of new
+observations), not smooth-coefficient draws, so it's appropriate for
+predictive checks but not for constructing simultaneous bands on the
+smooth.
+
+## Comparing models
+
+Two GAMs can be compared in the usual ways:
+
+- **AIC**: `AIC(m1, m2)`. Lower is better. `mgcv` reports the AIC
+  with an effective-degrees-of-freedom correction that's
+  appropriate for the penalized fit.
+- **Likelihood-ratio test**: `anova(m1, m2, test = "Chisq")`. Same
+  caveat as the smooth-term p-values — the smoothing parameter was
+  data-chosen, so the LRT is approximate. Use with the same care.
+- **`gratia::compare_smooths()`** (from the companion `gratia`
+  package) — a more comprehensive comparison framework, including
+  visual overlays of smooths and difference plots.
+
+## When to prefer a GLM over a GAM
+
+GAMs are strictly more flexible than GLMs, but that flexibility costs
+something:
+
+- **Communication.** A linear effect is easier to describe in a
+  table of coefficients than a curve.
+- **Power.** With small $n$ and a smooth that's truly linear, the
+  GAM wastes degrees of freedom on penalty selection. The EDF will
+  drop toward 1, but the inference loses a little efficiency.
+- **Reporting standards.** Some epidemiology journals still require
+  linear or quadratic exposure terms.
+
+A reasonable workflow:
+
+1. Plot the data + a LOESS smoother for an initial scatterplot
+   diagnostic.
+2. Fit a GAM with `s(x)` for every continuous predictor where
+   nonlinearity is plausible.
+3. Read the EDF: if EDF $\approx 1$, the linear form is fine — drop
+   `s()` and refit as a GLM for parsimony.
+4. If EDF $> 1$, report the GAM with a curve plot.
diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd
new file mode 100644
index 0000000000..4be0d099e8
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_gams.qmd
@@ -0,0 +1,113 @@
+::: notes
+Adapted from @james2021islr2e [§7.7] and @wood2017generalized [§4–7].
+:::
+
+We're now ready to assemble the **generalized additive model**. The
+idea is to take any of the nonlinear smoothers we've built up —
+regression splines, smoothing splines, local regression — and let
+**each predictor have its own smoother**, with the smoother
+contributions added together on the link scale.
+
+## The GAM formulation
+
+::::{#def-gam}
+#### Generalized additive model
+
+Let $Y$ be a response variable with conditional distribution from
+an exponential family (Gaussian, binomial, Poisson, etc.), and let
+$X = (X_1, \ldots, X_p)$ be predictors. A **generalized additive
+model** (GAM) specifies
+
+$$g(\E{Y \mid X = x}) = \b_0 + f_1(x_1) + f_2(x_2) + \cdots + f_p(x_p),$$
+
+where:
+
+- $g$ is a link function (identity for Gaussian, logit for binary,
+  log for Poisson — same as in a GLM).
+- Each $f_j$ is a **smooth function** of the $j$-th predictor,
+  estimated from the data.
+- $\b_0$ is an intercept (sometimes absorbed into one of the $f_j$'s
+  via a centering constraint).
+
+A "smooth function" in practice means a regression spline,
+smoothing spline, or other smoother — anything that can be written
+as a basis expansion $f_j(x_j) = \sum_k \b_{jk} b_{jk}(x_j)$ with
+the coefficients $\b_{jk}$ estimated jointly across all predictors.
+::::
+
+::::{#exm-gam}
+#### GAM for diabetes risk
+
+For the Pima diabetes data with continuous predictors BMI, plasma
+glucose, and age, a GAM on the logit scale is
+
+$$\logitf{\Pr(\text{diabetes} = 1 \mid \text{bmi}, \text{glu}, \text{age})}
+= \b_0 + f_1(\text{bmi}) + f_2(\text{glu}) + f_3(\text{age}),$$
+
+where each $f_j$ is a smoothing spline estimated by REML. The three
+partial-effect curves can be plotted separately, revealing, for
+example, whether diabetes risk rises linearly with age or accelerates
+nonlinearly.
+::::
+
+The model class includes ordinary linear regression as a special
+case (every $f_j$ is just $\b_j x_j$) and ordinary GLMs as another
+special case (same — every $f_j$ linear, with the right link
+function). The point of the GAM extension is to let *some* $f_j$
+be nonlinear while keeping the predictor contributions additive.
+
+## What "additive" buys you
+
+The additive structure makes GAMs strictly less flexible than fully
+nonparametric models like
+
+$$g(\E{Y \mid X = x}) = f(x_1, x_2, \ldots, x_p),$$
+
+which allow arbitrary interactions among predictors. In return, GAMs
+gain:
+
+- **Interpretability per predictor.** Each $f_j$ can be plotted on its
+  own axis as the predicted shift in the linear predictor as $X_j$
+  varies, holding the other predictors fixed. The same partial-effect
+  intuition that makes linear-model coefficients useful carries over.
+- **Avoidance of the curse of dimensionality.** Fitting a smooth in
+  one dimension takes $O(n)$ data; fitting it in $p$ dimensions
+  jointly takes $O(n^p)$ data. Additivity is a structural assumption
+  that makes the problem tractable even with several predictors.
+- **Direct extension of GLM workflow.** The link function $g$,
+  exponential-family likelihood, residual diagnostics, and AIC
+  comparisons all work essentially unchanged — GAMs are GLMs in
+  spline coordinates.
+
+## Adding interactions and tensor smooths
+
+The strict-additivity restriction can be relaxed when needed. The
+two standard escape hatches are:
+
+- **Predictor-by-factor interactions.** A smooth that varies by
+  category — e.g. an age effect that's different for men and women
+  — is fit as $f_j(X_j) + f_{j,\text{by}}(X_j) \cdot \indicp{\text{sex} = M}$
+  via `s(age, by = sex)` in `mgcv`. When `sex` is a factor, this
+  syntax requires an accompanying parametric main effect for
+  identifiability (the smooth is centered to mean zero in each
+  level, so the level-specific intercepts have to come from
+  somewhere else); the correct idiom is:
+  ```r
+  gam(y ~ sex + s(age, by = sex), data = d, method = "REML")
+  ```
+  The result is still additive in the *factors*, but allows
+  separate smooth shapes within each level.
+- **Tensor-product smooths.** A genuine two-dimensional smooth surface
+  in two continuous predictors — e.g. $f(\text{age}, \text{bmi})$ —
+  fit via `te(age, bmi)` in `mgcv`. The smoothing penalty is applied
+  separately to wiggliness in each marginal direction, so the smooth
+  doesn't assume the two predictors are on comparable scales — the
+  right choice when predictors have different units (e.g. age in years,
+  BMI in kg/m²). An isotropic 2D smooth (`s(age, bmi)`) would penalize
+  the same amount of "wiggliness per year of age" as per "unit of
+  BMI", which is rarely substantively defensible.
+
+For routine epidemiology work, the additive model is usually the
+right starting point; tensor smooths come up mainly when interactions
+between continuous predictors are scientifically central (e.g.
+space–time models, age–calendar-time mortality surfaces).
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd
new file mode 100644
index 0000000000..d45ef30668
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_lab.qmd
@@ -0,0 +1,378 @@
+::: notes
+Lab adapted from @james2021islr2e [§7.8] and @james2023islp [§7.8].
+:::
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-setup-r
+#| eval: false
+#| message: false
+library(ISLR2)
+library(splines)
+library(mgcv)
+library(ggplot2)
+library(boot)
+data(Wage)
+head(Wage[, c("age", "wage", "education", "year")])
+```
+
+#### Python
+The smoothing-spline and GAM sections also require `pygam`, which is
+not part of the standard ISLP/sklearn ecosystem — install with
+`pip install pygam` before running those blocks.
+
+```{python}
+#| eval: false
+#| python.reticulate: false
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from ISLP import load_data
+from sklearn.preprocessing import PolynomialFeatures
+from sklearn.linear_model import LinearRegression, LogisticRegression
+import statsmodels.api as sm
+from patsy import dmatrix
+# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f  # noqa: E402
+Wage = load_data("Wage")
+print(Wage[["age","wage","education","year"]].head())
+```
+:::
+
+## Polynomial regression {.unnumbered}
+
+Fit degree-1 through degree-5 polynomials, then use ANOVA to select degree [@james2021islr2e, §7.1].
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-poly-r
+#| eval: false
+#| code-fold: true
+fit1 <- lm(wage ~ poly(age, 1), data = Wage)
+fit2 <- lm(wage ~ poly(age, 2), data = Wage)
+fit3 <- lm(wage ~ poly(age, 3), data = Wage)
+fit4 <- lm(wage ~ poly(age, 4), data = Wage)
+fit5 <- lm(wage ~ poly(age, 5), data = Wage)
+anova(fit1, fit2, fit3, fit4, fit5)
+```
+
+The ANOVA p-values suggest degree 3 or 4 is adequate.
+
+```{r}
+#| label: lab-poly-plot-r
+#| eval: false
+#| code-fold: true
+age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 200)
+pred <- predict(fit4, newdata = data.frame(age = age_grid), se.fit = TRUE)
+df_pred <- data.frame(
+  age = age_grid,
+  fit = pred$fit,
+  lo  = pred$fit - 2 * pred$se.fit,
+  hi  = pred$fit + 2 * pred$se.fit
+)
+ggplot(Wage, aes(age, wage)) +
+  geom_point(alpha = 0.2, size = 0.8) +
+  geom_ribbon(data = df_pred, aes(y = fit, ymin = lo, ymax = hi),
+              alpha = 0.3, fill = "steelblue") +
+  geom_line(data = df_pred, aes(y = fit), colour = "steelblue", linewidth = 1) +
+  labs(title = "Degree-4 polynomial fit", x = "Age", y = "Wage")
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+from scipy import stats
+results = {}
+for deg in range(1, 6):
+    X = np.column_stack([Wage["age"] ** k for k in range(1, deg + 1)])
+    X = sm.add_constant(X)
+    results[deg] = sm.OLS(Wage["wage"], X).fit()
+
+# F-tests between consecutive nested models. compare_f_test() is
+# called on the *unrestricted* (more flexible) model and given the
+# restricted model as the argument.
+for d in range(1, 5):
+    f, p, df_diff = results[d + 1].compare_f_test(results[d])
+    print(f"Degree {d} vs {d+1}: F={f:.3f}, p={p:.4f}")
+```
+
+```{python}
+#| eval: false
+#| python.reticulate: false
+age_grid = np.linspace(Wage["age"].min(), Wage["age"].max(), 200)
+X_grid = sm.add_constant(np.column_stack([age_grid ** k for k in range(1, 5)]))
+pred4 = results[4].get_prediction(X_grid)
+pred_df = pred4.summary_frame(alpha=0.05)
+
+plt.figure(figsize=(7, 4))
+plt.scatter(Wage["age"], Wage["wage"], alpha=0.15, s=8)
+plt.plot(age_grid, pred_df["mean"], color="steelblue", linewidth=2)
+plt.fill_between(age_grid, pred_df["mean_ci_lower"], pred_df["mean_ci_upper"],
+                 alpha=0.3, color="steelblue")
+plt.title("Degree-4 polynomial fit"); plt.xlabel("Age"); plt.ylabel("Wage")
+plt.tight_layout(); plt.show()
+```
+:::
+
+## Step functions {.unnumbered}
+
+Cut `age` into four intervals and fit a piecewise-constant model [@james2021islr2e, §7.2].
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-step-r
+#| eval: false
+#| code-fold: true
+Wage$age_cut <- cut(Wage$age, 4)
+fit_step <- lm(wage ~ age_cut, data = Wage)
+summary(fit_step)$coef
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+Wage["age_cut"] = pd.cut(Wage["age"], bins=4)
+step_dummies = pd.get_dummies(Wage["age_cut"], drop_first=True).astype(float)
+X_step = sm.add_constant(step_dummies)
+fit_step = sm.OLS(Wage["wage"], X_step).fit()
+print(fit_step.summary().tables[1])
+```
+:::
+
+## Regression splines {.unnumbered}
+
+Fit a cubic B-spline with knots at ages 25, 40, and 60, and a natural cubic spline with the same knots [@james2021islr2e, §7.4].
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-splines-r
+#| eval: false
+#| code-fold: true
+# Redefine age_grid so the splines section runs in isolation
+# (it's also defined in the poly-plot chunk above).
+age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 200)
+knots <- c(25, 40, 60)
+fit_bs <- lm(wage ~ bs(age, knots = knots), data = Wage)
+fit_ns <- lm(wage ~ ns(age, knots = knots), data = Wage)
+
+pred_bs <- predict(fit_bs, newdata = data.frame(age = age_grid), se.fit = TRUE)
+pred_ns <- predict(fit_ns, newdata = data.frame(age = age_grid), se.fit = TRUE)
+
+df_sp <- data.frame(
+  age = rep(age_grid, 2),
+  fit = c(pred_bs$fit, pred_ns$fit),
+  lo  = c(pred_bs$fit - 2*pred_bs$se.fit, pred_ns$fit - 2*pred_ns$se.fit),
+  hi  = c(pred_bs$fit + 2*pred_bs$se.fit, pred_ns$fit + 2*pred_ns$se.fit),
+  type = rep(c("B-spline", "Natural spline"), each = 200)
+)
+ggplot(Wage, aes(age, wage)) +
+  geom_point(alpha = 0.15, size = 0.6) +
+  geom_ribbon(data = df_sp, aes(y = fit, ymin = lo, ymax = hi, fill = type), alpha = 0.25) +
+  geom_line(data = df_sp, aes(y = fit, colour = type), linewidth = 1) +
+  labs(x = "Age", y = "Wage", colour = NULL, fill = NULL)
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+bs_mat = dmatrix("bs(age, knots=(25,40,60), include_intercept=False)",
+                 {"age": Wage["age"]}, return_type="dataframe")
+ns_mat = dmatrix("cr(age, knots=(25,40,60))",
+                 {"age": Wage["age"]}, return_type="dataframe")
+
+fit_bs_py = sm.OLS(Wage["wage"], bs_mat).fit()
+fit_ns_py = sm.OLS(Wage["wage"], ns_mat).fit()
+
+bs_grid = dmatrix("bs(age, knots=(25,40,60), include_intercept=False)",
+                  {"age": age_grid}, return_type="dataframe")
+ns_grid = dmatrix("cr(age, knots=(25,40,60))",
+                  {"age": age_grid}, return_type="dataframe")
+
+plt.figure(figsize=(7, 4))
+plt.scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6)
+plt.plot(age_grid, fit_bs_py.predict(bs_grid), label="B-spline", linewidth=2)
+plt.plot(age_grid, fit_ns_py.predict(ns_grid), label="Natural spline", linewidth=2)
+plt.legend(); plt.xlabel("Age"); plt.ylabel("Wage")
+plt.tight_layout(); plt.show()
+```
+:::
+
+## Smoothing splines and local regression {.unnumbered}
+
+A smoothing spline selects the penalty parameter $\l$ by generalized cross-validation (GCV); LOESS uses a local polynomial fit [@james2021islr2e, §7.5–7.6].
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-smooth-r
+#| eval: false
+#| code-fold: true
+fit_ss_16 <- smooth.spline(Wage$age, Wage$wage, df = 16)
+fit_ss_gcv <- smooth.spline(Wage$age, Wage$wage, cv = FALSE)  # GCV
+fit_lo    <- loess(wage ~ age, data = Wage, span = 0.2)
+
+age_seq <- seq(min(Wage$age), max(Wage$age), length.out = 200)
+df_sm <- data.frame(
+  age  = rep(age_seq, 3),
+  fit  = c(predict(fit_ss_16, age_seq)$y,
+           predict(fit_ss_gcv, age_seq)$y,
+           predict(fit_lo, data.frame(age = age_seq))),
+  type = rep(c("Smooth spline df=16",
+               paste0("Smooth spline GCV (df=",
+                      round(fit_ss_gcv$df, 1), ")"),
+               "LOESS span=0.2"),
+             each = 200)
+)
+ggplot(Wage, aes(age, wage)) +
+  geom_point(alpha = 0.15, size = 0.6) +
+  geom_line(data = df_sm, aes(y = fit, colour = type), linewidth = 1) +
+  labs(x = "Age", y = "Wage", colour = NULL)
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+from pygam import LinearGAM, s as gam_s
+from statsmodels.nonparametric.smoothers_lowess import lowess
+
+# Smoothing spline via pyGAM
+gam_smooth = LinearGAM(gam_s(0, n_splines=25)).gridsearch(
+    Wage[["age"]].values, Wage["wage"].values)
+XX = gam_smooth.generate_X_grid(term=0, n=200)
+pdep, confi = gam_smooth.partial_dependence(term=0, X=XX, width=0.95)
+
+# LOESS
+lo_fit = lowess(Wage["wage"], Wage["age"], frac=0.2)
+
+fig, axes = plt.subplots(1, 2, figsize=(12, 4))
+axes[0].scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6)
+axes[0].plot(XX.ravel(), pdep, color="steelblue", linewidth=2, label="Smoothing spline (GCV)")
+axes[0].fill_between(XX.ravel(), confi[:,0], confi[:,1], alpha=0.3)
+axes[0].set_xlabel("Age"); axes[0].set_ylabel("Wage"); axes[0].legend()
+
+axes[1].scatter(Wage["age"], Wage["wage"], alpha=0.15, s=6)
+axes[1].plot(lo_fit[:,0], lo_fit[:,1], color="tomato", linewidth=2, label="LOESS span=0.2")
+axes[1].set_xlabel("Age"); axes[1].set_ylabel("Wage"); axes[1].legend()
+plt.tight_layout(); plt.show()
+```
+:::
+
+## GAMs: continuous outcome {.unnumbered}
+
+Fit three nested GAMs for `wage` on the `Wage` data and compare with AIC [@james2021islr2e, §7.7].
+This lab follows the ISLRv2 code, which uses the `mgcv` default
+`method = "GCV.Cp"` for smoothness selection; for new analyses prefer
+`method = "REML"` (see @sec-smoothing-splines and @sec-gam-fitting).
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-gam-r
+#| eval: false
+#| code-fold: true
+gam1 <- gam(wage ~ s(year, k = 4) + s(age, k = 5) + education,
+            data = Wage)
+gam2 <- gam(wage ~ year + s(age, k = 5) + education,
+            data = Wage)
+gam3 <- gam(wage ~ s(year, k = 4) + poly(age, 4) + education,
+            data = Wage)
+
+AIC(gam1, gam2, gam3)
+
+par(mfrow = c(1, 3))
+plot(gam1, shade = TRUE, seWithMean = TRUE,
+     main = "GAM1: s(year) + s(age) + education")
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+from pygam import LinearGAM, s as gam_s, f as gam_f
+
+edu_codes = Wage["education"].cat.codes.values
+X_gam = np.column_stack([Wage["year"].values,
+                         Wage["age"].values,
+                         edu_codes])
+y_gam = Wage["wage"].values
+
+gam_py = LinearGAM(
+    gam_s(0, n_splines=4) + gam_s(1, n_splines=5) + gam_f(2)
+).gridsearch(X_gam, y_gam)
+
+print("AIC:", gam_py.statistics_["AIC"])
+
+fig, axes = plt.subplots(1, 3, figsize=(14, 4))
+titles = ["year", "age", "education"]
+for i, ax in enumerate(axes):
+    XX = gam_py.generate_X_grid(term=i)
+    pdep, confi = gam_py.partial_dependence(term=i, X=XX, width=0.95)
+    ax.plot(XX[:, i], pdep, linewidth=2)
+    ax.fill_between(XX[:, i], confi[:,0], confi[:,1], alpha=0.3)
+    ax.set_xlabel(titles[i]); ax.set_ylabel("Partial effect")
+plt.tight_layout(); plt.show()
+```
+:::
+
+## GAMs: binary outcome {.unnumbered}
+
+Predict whether a worker earns more than \$250k using a logistic GAM. We subset to workers
+with at least some college education, following @james2021islr2e [§7.7].
+
+:::{.panel-tabset}
+#### R
+```{r}
+#| label: lab-gam-logistic-r
+#| eval: false
+#| code-fold: true
+Wage$high <- factor(ifelse(Wage$wage > 250, "High", "Low"),
+                    levels = c("Low", "High"))
+
+fit_logit_gam <- gam(
+  high ~ s(year, k = 4) + s(age, k = 5) + education,
+  family = binomial,
+  data = Wage,
+  subset = (education != "1. < HS Grad")
+)
+summary(fit_logit_gam)
+
+par(mfrow = c(1, 3))
+plot(fit_logit_gam, shade = TRUE, seWithMean = TRUE,
+     trans = plogis,
+     main = "Logistic GAM (P(wage > 250k))")
+```
+#### Python
+```{python}
+#| eval: false
+#| python.reticulate: false
+from pygam import LogisticGAM, s as gam_s, f as gam_f
+
+mask = Wage["education"] != "1. < HS Grad"
+W2 = Wage[mask].copy()
+W2["high"] = (W2["wage"] > 250).astype(int)
+edu2 = W2["education"].cat.codes.values
+
+X_log = np.column_stack([W2["year"].values,
+                          W2["age"].values,
+                          edu2])
+y_log = W2["high"].values
+
+gam_log = LogisticGAM(
+    gam_s(0, n_splines=4) + gam_s(1, n_splines=5) + gam_f(2)
+).gridsearch(X_log, y_log)
+
+print("Training accuracy:", (gam_log.predict(X_log) == y_log).mean())
+
+fig, axes = plt.subplots(1, 3, figsize=(14, 4))
+titles = ["year", "age", "education"]
+for i, ax in enumerate(axes):
+    XX = gam_log.generate_X_grid(term=i)
+    pdep, confi = gam_log.partial_dependence(term=i, X=XX, width=0.95)
+    ax.plot(XX[:, i], pdep, linewidth=2)
+    ax.fill_between(XX[:, i], confi[:,0], confi[:,1], alpha=0.3)
+    ax.set_xlabel(titles[i]); ax.set_ylabel("Partial effect (log-odds)")
+plt.tight_layout(); plt.show()
+```
+:::
diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
new file mode 100644
index 0000000000..7339b628f6
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
@@ -0,0 +1,97 @@
+::: notes
+Adapted from @james2021islr2e [§7.6].
+:::
+
+**Local regression**, also called **LOESS** (LOcally Estimated
+Scatterplot Smoothing) or **LOWESS**, takes yet another approach to
+nonlinear regression: instead of fitting a single global model with
+flexible basis functions, it fits **many local models**, one for
+each point on the $x$-axis where we want a prediction.
+
+## The local fit
+
+::::{#def-local-regression}
+#### Local regression at $x_0$
+
+To estimate the regression function $f(x_0)$ at a target point $x_0$:
+
+1. Choose a **bandwidth** $s \in (0, 1]$, also called the **span**.
+   Take the fraction $s$ of training observations with $x_i$ closest
+   to $x_0$ — this is the **local neighborhood**.
+2. **Weight** the neighborhood observations using a kernel that
+   gives higher weight to points closer to $x_0$ and zero weight to
+   points outside the neighborhood. The tricube kernel
+   $K(u) = (1 - |u|^3)^3 \indicp{|u| \le 1}$ is the standard choice.
+3. **Fit a low-degree polynomial** (constant, linear, or quadratic)
+   by weighted least squares to the neighborhood, using the kernel
+   weights. The fitted value at $x_0$ is the prediction
+   $\hat f(x_0)$.
+
+To get the entire fitted curve, repeat for a dense grid of $x_0$
+values and connect the resulting predictions.
+::::
+
+::::{#exm-local-regression}
+#### LOESS fit on age--glucose
+
+To smooth the relationship between age and plasma glucose in
+`MASS::Pima.tr` using LOESS with a span of 0.5:
+
+```r
+m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5)
+```
+
+At age 40, the local neighborhood is the 50% of observations with
+ages closest to 40. Each observation in that window is weighted by
+the tricube kernel --- down-weighted as its age moves away from 40,
+zero weight outside the window. The predicted glucose at age 40 is
+the fitted value from the weighted least-squares linear regression
+over that neighborhood.
+::::
+
+## Strengths
+
+- **Conceptually simple.** No basis functions, no penalty matrices.
+  Just "fit a line through the nearby points, and call that the
+  curve at this $x$."
+- **Adaptive to local structure.** Regions with rapid change get
+  their own local polynomial, just like flat regions do.
+- **Robust to outliers** in the standard `loess()` implementation,
+  which iteratively down-weights observations with large residuals.
+
+## Limitations
+
+- **No global model.** Local regression doesn't produce a closed-form
+  $\hat f(x)$ — predictions at any new $x$ are computed on demand by
+  re-running the local fit at that point, but there is no parametric
+  formula for the curve. Inference for the entire function (e.g. "is this exposure associated with
+  outcome at all") needs special tooling.
+- **Hard to extend to many predictors.** With $p$ predictors, the
+  local neighborhood is in $p$-dimensional space, and the curse of
+  dimensionality bites hard: even for moderate $p$, "the nearest
+  fraction $s$ of the data" is geometrically nothing like local. LOESS
+  is rarely used with more than 2–4 predictors.
+- **Bandwidth selection.** Just like the smoothing parameter $\l$ in
+  smoothing splines, $s$ has to be chosen. Cross-validation works
+  but is slower.
+
+## Local regression in R
+
+```r
+m_loess <- loess(y ~ x, data = d, span = 0.5)
+grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200))
+grid$fit <- predict(m_loess, newdata = grid)
+
+# Inside ggplot2, geom_smooth() with method = "loess"
+# is the standard scatterplot smoother:
+ggplot(d, aes(x = x, y = y)) +
+  geom_point() +
+  geom_smooth(method = "loess", span = 0.5)
+```
+
+For most modeling work in this chapter we'll prefer the
+smoothing-spline approach via `mgcv::s()`, which produces a global
+fit with full inferential machinery. LOESS is best thought of as a
+**diagnostic smoother** — an unobtrusive way to overlay the local
+mean of a scatterplot to check whether a parametric fit is missing
+nonlinearity.
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_method_comparison.qmd b/_subfiles/generalized-additive-models/_sec_method_comparison.qmd
new file mode 100644
index 0000000000..92a54bb248
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_method_comparison.qmd
@@ -0,0 +1,244 @@
+::: notes
+This section illustrates the methods of @sec-polynomial-regression
+through @sec-local-regression on a shared synthetic dataset, so
+the strengths and failure modes can be compared side-by-side.
+:::
+
+```{r}
+#| label: method-comparison-setup
+#| include: false
+set.seed(204)
+library(ggplot2)
+
+n <- 200
+x_lim <- c(0, 2 * pi)
+truth <- function(x) sin(x) + 0.3 * sin(3 * x)
+demo_data <- data.frame(
+  x = runif(n, x_lim[1], x_lim[2])
+)
+demo_data$y <- truth(demo_data$x) + rnorm(n, sd = 0.4)
+
+grid <- data.frame(x = seq(x_lim[1], x_lim[2], length.out = 400))
+grid$truth <- truth(grid$x)
+
+predict_with <- function(model) {
+  predict(model, newdata = grid)
+}
+
+theme_set(theme_minimal())
+```
+
+We generate $n = `r n`$ noisy observations from a smooth nonlinear
+function
+
+$$f(x) = \sin(x) + 0.3 \sin(3 x), \quad x \in [0, 2\pi],$$
+
+with additive Gaussian noise ($\s = 0.4$). The truth is bumpy
+enough to expose differences between the methods but not so wild
+that any one of them obviously breaks.
+
+```{r}
+#| label: method-comparison-data-fits
+#| include: false
+# Fit each method on the same demo_data; predictions on `grid` for
+# overlay. Comments on each fit explain the design choice.
+
+# (1) Linear baseline.
+fit_lin <- lm(y ~ x, data = demo_data)
+grid$linear <- predict_with(fit_lin)
+
+# (2) Quadratic polynomial.
+fit_poly2 <- lm(y ~ poly(x, 2), data = demo_data)
+grid$poly2 <- predict_with(fit_poly2)
+
+# (3) Degree-12 polynomial — deliberately high degree to surface
+# Runge-style boundary swinging.
+fit_poly12 <- lm(y ~ poly(x, 12), data = demo_data)
+grid$poly12 <- predict_with(fit_poly12)
+
+# (4) Step function with 5 bins on equal-quantile cuts. Pin the
+# outer breakpoints to x_lim so the grid endpoints don't fall
+# outside breaks[1] / breaks[length(breaks)] — otherwise cut()
+# returns NA at the boundaries, leaving the step line truncated.
+breaks <- quantile(demo_data$x, probs = seq(0, 1, length.out = 6))
+breaks[c(1L, length(breaks))] <- x_lim
+demo_data$bin <- cut(demo_data$x, breaks = breaks,
+                     include.lowest = TRUE)
+grid$bin <- cut(grid$x, breaks = breaks, include.lowest = TRUE)
+fit_step <- lm(y ~ bin, data = demo_data)
+grid$step <- predict_with(fit_step)
+
+# (5) Natural cubic spline, 6 df (knots at internal quantiles).
+fit_ns <- lm(y ~ splines::ns(x, df = 6), data = demo_data)
+grid$ns <- predict_with(fit_ns)
+
+# (6) Smoothing spline via mgcv::gam, REML smoothness selection.
+fit_gam <- mgcv::gam(
+  y ~ s(x, bs = "tp", k = 20),
+  data = demo_data,
+  method = "REML"
+)
+grid$gam <- predict_with(fit_gam)
+
+# (7) LOESS, default span.
+fit_loess <- loess(y ~ x, data = demo_data, span = 0.5)
+grid$loess <- predict_with(fit_loess)
+```
+
+The fitted curves are shown in @fig-method-comparison-overlay.
+
+::::{#fig-method-comparison-overlay layout-ncol="2"}
+
+:::{#fig-method-comparison-poly}
+
+```{r}
+#| label: method-comparison-poly-plot
+#| code-fold: true
+ggplot(grid, aes(x = x)) +
+  geom_point(
+    data = demo_data, aes(y = y),
+    alpha = 0.3, size = 0.8
+  ) +
+  geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) +
+  geom_line(aes(y = linear, color = "Linear"),
+            linewidth = 0.7, linetype = "longdash") +
+  geom_line(aes(y = poly2, color = "Quadratic"), linewidth = 0.7) +
+  geom_line(aes(y = poly12, color = "Degree 12"),
+            linewidth = 0.7) +
+  scale_color_manual(
+    name = NULL,
+    values = c("Truth" = "black", "Linear" = "grey50",
+               "Quadratic" = "tomato", "Degree 12" = "steelblue")
+  ) +
+  labs(x = "x", y = "y") +
+  theme(legend.position = "bottom")
+```
+
+Polynomial fits. Quadratic underfits the central inflection; the
+degree-12 polynomial chases the noise and swings wildly near the
+boundaries (Runge's phenomenon).
+:::
+
+:::{#fig-method-comparison-step}
+
+```{r}
+#| label: method-comparison-step-plot
+#| code-fold: true
+ggplot(grid, aes(x = x)) +
+  geom_point(
+    data = demo_data, aes(y = y),
+    alpha = 0.3, size = 0.8
+  ) +
+  geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) +
+  geom_step(aes(y = step, color = "Step (5 bins)"),
+            linewidth = 0.7) +
+  scale_color_manual(
+    name = NULL,
+    values = c("Truth" = "black", "Step (5 bins)" = "tomato")
+  ) +
+  labs(x = "x", y = "y") +
+  theme(legend.position = "bottom")
+```
+
+Step function with 5 quantile-spaced bins. Captures the broad shape
+but is discontinuous at each cutpoint — almost never the right
+model for a continuous exposure.
+:::
+
+:::{#fig-method-comparison-splines}
+
+```{r}
+#| label: method-comparison-splines-plot
+#| code-fold: true
+ggplot(grid, aes(x = x)) +
+  geom_point(
+    data = demo_data, aes(y = y),
+    alpha = 0.3, size = 0.8
+  ) +
+  geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) +
+  geom_line(
+    aes(y = ns, color = "Natural cubic spline (df = 6)"),
+    linewidth = 0.7
+  ) +
+  geom_line(
+    aes(y = gam, color = "Smoothing spline (REML, k = 20)"),
+    linewidth = 0.7
+  ) +
+  scale_color_manual(
+    name = NULL,
+    values = c(
+      "Truth" = "black",
+      "Natural cubic spline (df = 6)" = "tomato",
+      "Smoothing spline (REML, k = 20)" = "steelblue"
+    )
+  ) +
+  labs(x = "x", y = "y") +
+  theme(legend.position = "bottom")
+```
+
+Splines. Both spline fits track the truth closely without the
+boundary swings of the degree-12 polynomial. The smoothing spline
+(REML-selected $\l$) has slightly less peak-to-peak amplitude
+because the penalty shrinks toward a straight line.
+:::
+
+:::{#fig-method-comparison-loess}
+
+```{r}
+#| label: method-comparison-loess-plot
+#| code-fold: true
+ggplot(grid, aes(x = x)) +
+  geom_point(
+    data = demo_data, aes(y = y),
+    alpha = 0.3, size = 0.8
+  ) +
+  geom_line(aes(y = truth, color = "Truth"), linewidth = 0.9) +
+  geom_line(aes(y = loess, color = "LOESS (span = 0.5)"),
+            linewidth = 0.7) +
+  scale_color_manual(
+    name = NULL,
+    values = c("Truth" = "black", "LOESS (span = 0.5)" = "tomato")
+  ) +
+  labs(x = "x", y = "y") +
+  theme(legend.position = "bottom")
+```
+
+LOESS with the default 50% span. Similar shape to the splines but
+without a global closed-form parametric representation — the curve
+is computed on demand at any evaluation point rather than stored as
+a parametric formula.
+:::
+
+Comparison of nonlinear regression methods on synthetic data
+$y = \sin(x) + 0.3\sin(3x) + \eps$, $\eps \sim \ndist{0, 0.4^2}$,
+$n = `r n`$. Splines (both regression and smoothing) and LOESS
+track the truth closely; quadratic polynomial underfits the multi-
+modal structure; degree-12 polynomial overfits and swings at
+boundaries.
+::::
+
+## Takeaways
+
+- **Polynomials beyond degree 3–4 are usually a bad idea.** The
+  degree-12 fit is a clean illustration: it's interpolating noise in
+  the middle of the data and swinging beyond the data range. With 13
+  parameters (intercept + 12 polynomial terms) it has more flexibility
+  than the natural cubic spline (`df = 6`), yet uses that flexibility
+  on noise rather than on the curve's shape — a reminder that global
+  polynomials concentrate their degrees of freedom in the wrong
+  places.
+- **Splines and LOESS are essentially indistinguishable on this
+  example.** Both produce smooth curves close to the truth. The
+  practical differences are in how easily they extend to multi-
+  predictor settings (splines do, LOESS doesn't beyond 2-D) and in
+  how inference works (splines have full Wald/Bayesian machinery,
+  LOESS doesn't).
+- **REML-selected smoothing spline doesn't require a degrees-of-
+  freedom choice.** The `df = 6` for the natural cubic spline was a
+  modeler choice; the REML smoothing spline picked its own
+  smoothness from the data.
+- **Step functions are for categorization, not smoothing.** The 5-
+  bin step function is the worst fit on a smooth-truth dataset, but
+  on real epidemiologic data where bins have substantive meaning
+  (clinical BMI categories, age groups), the interpretability
+  benefit can outweigh the within-bin information loss.
diff --git a/_subfiles/generalized-additive-models/_sec_motivation.qmd b/_subfiles/generalized-additive-models/_sec_motivation.qmd
new file mode 100644
index 0000000000..46f4a0b785
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_motivation.qmd
@@ -0,0 +1,83 @@
+::: notes
+Adapted from @james2021islr2e [§7] and @wood2017generalized [§1].
+:::
+
+So far the regression models in this book have assumed that each
+continuous predictor enters the linear predictor through a single
+coefficient:
+
+$$g(\E{Y \mid X = x}) = \b_0 + \b_1 x_1 + \b_2 x_2 + \cdots + \b_p x_p$$
+
+This forces the effect of every predictor to be **monotone**: if
+$\b_j > 0$, raising $x_j$ always increases the predicted response on
+the link scale; if $\b_j < 0$, it always decreases it. Many real
+epidemiologic exposures don't behave that way.
+
+::::{#exm-age-mortality}
+#### Age and all-cause mortality
+
+All-cause mortality is **U-shaped** in age: high in infancy, low through
+young adulthood, then rising steeply in old age [@me4]. A linear model in
+$\text{age}$ would average those regimes into a single slope, badly
+misrepresenting risk at both ends.
+::::
+
+::::{#exm-bmi-mortality}
+#### BMI and mortality
+
+BMI–mortality curves are **J-** or **U-shaped**: very low and very high
+BMI both carry excess risk, with the minimum somewhere in the 22–25
+kg/m² range [@aune2016bmi].
+A linear or even quadratic fit misses the asymmetry between the
+underweight and obese tails.
+::::
+
+::::{#exm-temperature-mortality}
+#### Outdoor temperature and daily mortality
+
+Time-series studies of mortality and daily temperature consistently
+show a **smooth nonlinear "bathtub"** relationship: both very cold
+and very hot days raise short-term mortality risk, with a
+location-specific optimum in between [@gasparrini2015mortality]. The
+optimum and the steepness of the cold/heat slopes vary by city,
+season, and population — features that no single linear term can
+capture.
+::::
+
+The pattern in @exm-age-mortality, @exm-bmi-mortality, and
+@exm-temperature-mortality is general: many epidemiologic exposures
+have an effect on outcome that is **smooth but not monotone**. We need
+a regression framework that lets the response depend on a continuous
+predictor through an arbitrary smooth function, while keeping the
+GLM machinery (link function, likelihood, inference) we have already
+built up.
+
+**Generalized additive models** (GAMs) provide exactly this. A GAM
+replaces the linear predictor
+
+$$g(\E{Y \mid X = x}) = \b_0 + \sum_{j=1}^p \b_j x_j$$
+
+with an **additive** sum of smooth functions
+
+$$g(\E{Y \mid X = x}) = \b_0 + \sum_{j=1}^p f_j(x_j),$$
+
+where each $f_j$ is estimated nonparametrically from the data. The
+linear predictor remains additive across predictors (so each
+predictor's contribution is still interpretable on its own), but the
+shape of each $f_j$ is no longer constrained to be linear.
+
+The rest of this chapter builds up to GAMs in stages:
+
+1. **Polynomial regression** and **step functions** — the simplest
+   ways to introduce nonlinearity, useful but limited.
+2. **Basis functions** — the general framework that polynomial and
+   step expansions both fit into, and the route to splines.
+3. **Regression splines** and **smoothing splines** — the smoothers
+   most commonly used inside GAMs.
+4. **Local regression (LOESS)** — an alternative smoother family.
+5. **GAMs** — additive combinations of any of the above smoothers,
+   fit via penalized likelihood with automatic smoothness selection.
+
+We'll fit GAMs in R using the `mgcv` package [@wood2017generalized],
+which implements the modern penalized-regression approach with
+automatic smoothness selection via REML or GCV.
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_polynomial.qmd b/_subfiles/generalized-additive-models/_sec_polynomial.qmd
new file mode 100644
index 0000000000..2ad8332b48
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_polynomial.qmd
@@ -0,0 +1,98 @@
+::: notes
+Adapted from @james2021islr2e [§7.1].
+:::
+
+The simplest way to allow a nonlinear effect of $X$ is to extend the
+linear regression model by including powers of $X$ as additional
+predictors:
+
+::::{#def-polynomial-regression}
+#### Polynomial regression
+
+For a continuous predictor $X$ and degree $d \ge 1$, the
+**polynomial regression model** is
+
+$$y_i = \b_0 + \b_1 x_i + \b_2 x_i^2 + \cdots + \b_d x_i^d + \eps_i$$
+
+with $\eps_i \siid \ndist{0, \s^2}$.
+
+This is still a *linear* regression model in the coefficients
+$\b_0, \ldots, \b_d$ — the nonlinearity lives in the deterministic
+transformations $x_i^k$ of the predictor, not in the relationship
+between coefficients and response. So everything we know about
+linear-regression estimation, inference, and prediction (least squares,
+standard errors, confidence intervals) applies directly.
+::::
+
+::::{#exm-polynomial-quadratic}
+#### Quadratic fit to a U-shaped exposure
+
+For BMI–mortality, a degree-2 polynomial
+
+$$\logitf{\Pr(Y = 1 \mid \text{BMI} = b)} = \b_0 + \b_1 b + \b_2 b^2$$
+
+can capture a single-minimum U-shape, with the minimum-risk BMI at
+$-\b_1 / (2 \b_2)$ (assuming $\b_2 > 0$, i.e., the parabola opens
+upward). A quadratic is the lowest-degree polynomial that
+admits a turning point, so it's a natural first step beyond a linear
+fit.
+::::
+
+## Limitations of polynomial regression
+
+Polynomials are convenient but have well-known problems:
+
+- **Global behavior.** A single $\b_k$ controls the function's
+  behavior everywhere on the $x$-axis. A polynomial that fits the
+  middle of the data well often **swings wildly** at the extremes.
+  The classical name for this in the *interpolation* setting is
+  **Runge's phenomenon**; for high-degree least-squares regression
+  polynomials the analogous oscillation comes from boundary
+  instability and ill-conditioning of the design matrix, but the
+  qualitative warning is the same.
+- **No local control.** If the true relationship is bumpy in one
+  region and flat in another, no single polynomial fits both well.
+- **Degree selection is unstable.** Adding one more degree changes
+  every fitted value, not just the local fit. Standard model
+  selection criteria (AIC, cross-validation) can recommend very
+  different degrees from one dataset to the next.
+- **Extrapolation explodes.** Outside the observed range of $x$, the
+  highest-degree term dominates: a degree-4 fit predicts that
+  log-odds grows like $x^4$ at extreme exposures, which is essentially
+  never the right scientific model.
+
+A common rule of thumb is to use polynomials of degree at most 3 or 4,
+and only when the underlying mechanism plausibly produces that shape.
+For genuinely flexible nonlinear effects, the **basis-function** and
+**spline** approaches developed later in this chapter are usually
+better choices.
+
+## Polynomial regression in R
+
+In R, polynomial regression is fit by including `poly(x, degree)` in
+the formula:
+
+```r
+# Naive polynomial: raw powers x, x^2, x^3
+lm(y ~ x + I(x^2) + I(x^3), data = d)
+
+# Orthogonal polynomial: same fitted values, better-conditioned design
+lm(y ~ poly(x, 3), data = d)
+```
+
+The two parameterizations produce identical fitted values $\hat y_i$
+and identical hypothesis tests of "is the polynomial nonzero", but
+differ in the interpretation of individual $\hat\b_k$ coefficients:
+
+- `x + I(x^2) + I(x^3)` uses **raw powers**. Coefficients are tightly
+  correlated (the columns of the design matrix are not orthogonal),
+  so the individual $t$-statistic on $\hat\b_2$, say, is hard to
+  interpret on its own.
+- `poly(x, 3)` uses an **orthogonal basis** that decorrelates the
+  columns. Individual $t$-statistics now test the marginal
+  contribution of each degree given the lower-degree terms, which is
+  usually what we want for "does this degree improve the fit".
+
+For prediction (the fitted curve), both parameterizations are
+equivalent — but the orthogonal form has better numerical
+conditioning and is the default we recommend.
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd
new file mode 100644
index 0000000000..4c8716fa36
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd
@@ -0,0 +1,161 @@
+::: notes
+Adapted from @james2021islr2e [§7.4] and @wood2017generalized [§5].
+:::
+
+A **regression spline** is a piecewise polynomial that's constrained
+to be smooth at the points where the pieces meet ("knots"). The
+result is much more flexible than a single global polynomial but
+much smoother than a step function.
+
+## Piecewise polynomials and knots
+
+::::{#def-piecewise-polynomial}
+#### Piecewise polynomial
+
+Choose **knots** $\xi_1 < \xi_2 < \cdots < \xi_K$ on the support of
+$X$. A **piecewise polynomial of degree $d$** with knots
+$\xi_1, \ldots, \xi_K$ is a function that is a polynomial of degree
+$d$ on each of the intervals
+$(-\infty, \xi_1], (\xi_1, \xi_2], \ldots, (\xi_K, \infty)$ — but
+the polynomials in different intervals can have different
+coefficients.
+::::
+
+::::{#exm-piecewise-polynomial}
+#### Unconstrained piecewise polynomial parameter count
+
+Without any continuity constraint, an unconstrained piecewise
+polynomial has $(K+1)(d+1)$ free coefficients ($d+1$ per piece, $K+1$
+pieces) and can have **jumps and corners** at every knot — basically
+$K+1$ independent polynomial fits glued side by side. That's almost
+always *too* flexible.
+::::
+
+## Continuity and smoothness constraints
+
+Splines pay for their flexibility by adding **smoothness
+constraints** at the knots.
+
+::::{#def-spline}
+#### Spline
+
+A **spline of degree $d$** with knots $\xi_1, \ldots, \xi_K$ is a
+piecewise polynomial of degree $d$ that is continuous, has
+continuous first derivative, ..., has continuous $(d-1)$-th
+derivative at every knot.
+::::
+
+::::{#exm-spline}
+#### Effective parameter count for a cubic spline
+
+Each continuity constraint removes one degree of freedom. A
+piecewise-cubic ($d = 3$) with $K$ knots has $(K+1) \cdot 4 = 4K + 4$
+unconstrained parameters. Requiring continuity of the function and
+its first two derivatives at each of the $K$ knots imposes $3K$
+constraints, leaving $4K + 4 - 3K = K + 4$ effective parameters.
+::::
+
+The most common choice in practice is $d = 3$ — the **cubic
+spline** — because human eyes can't see discontinuities in third or
+higher derivatives, so a cubic spline already looks "smooth" to the
+viewer.
+
+## Truncated power basis
+
+One concrete way to construct a degree-$d$ spline with knots
+$\xi_1, \ldots, \xi_K$ is the **truncated power basis**:
+
+$$\b_0 + \b_1 X + \b_2 X^2 + \cdots + \b_d X^d
++ \sum_{k=1}^K \b_{d+k} (X - \xi_k)_+^d,$$
+
+where $(u)_+ = \max(u, 0)$. The first $d+1$ terms are a global
+polynomial; each truncated-power term $(X - \xi_k)_+^d$ adds a "kink"
+of order $d$ that turns on at knot $\xi_k$. The basis has $K + d + 1$
+columns — one for the intercept, $d$ for the global polynomial, and
+$K$ for the knot-activated terms — matching the $K + 4$ effective
+parameters of a cubic spline (when $d = 3$).
+
+This basis is conceptually clean — every basis function has an
+obvious geometric meaning — but **numerically poorly conditioned**
+when knots are close together or far from zero. The design matrix
+$\tp{\mat{B}} \mat{B}$ can be nearly singular, so OLS
+estimates are unstable. For numerical work, the **B-spline basis**
+(equivalent space of splines, locally supported basis functions) is
+the standard choice. R's `splines::bs()` returns a B-spline basis.
+
+## Natural cubic splines
+
+Cubic splines often **misbehave at the boundary**: with no data
+beyond the outermost knots, the cubic terms can swing wildly past the
+last observed $x$. The **natural cubic spline** fixes this by adding
+four more constraints — the function is required to be **linear**
+beyond the outermost knots, which is equivalent to forcing the
+second and third derivatives to vanish at each boundary knot:
+
+$$f''(\xi_1) = f'''(\xi_1) = f''(\xi_K) = f'''(\xi_K) = 0.$$
+
+These four boundary constraints (linearity past each end of the
+data) reduce the effective parameter count from $K + 4$ to $K$. The
+fit is identical to a cubic spline in the *interior* but is forced
+to be linear in the tails — almost always a better-behaved
+extrapolation than an unconstrained cubic.
+
+R provides natural cubic splines via `splines::ns()`. For routine
+work, **natural cubic splines are the default recommendation** over
+unconstrained cubic splines.
+
+## Knot placement
+
+A spline fit depends on three choices:
+
+1. **Number of knots** $K$. More knots = more flexibility = more risk
+   of overfitting. Common practice: pick $K$ so that the resulting
+   degrees of freedom is in the 4–8 range for most exposures, or
+   tune $K$ by cross-validation.
+2. **Location of knots** $\xi_1, \ldots, \xi_K$. The standard
+   recommendation is to place knots at **quantiles of the $X$
+   distribution** (e.g. for $K = 4$: at the 20th, 40th, 60th, 80th
+   percentiles). This puts more knots where there's more data — the
+   exact opposite of what equally-spaced knots would do.
+3. **Boundary handling** (cubic vs natural cubic).
+
+The `df` argument to `ns()` or `bs()` automates (1) and (2): R
+chooses knots at the appropriate sample quantiles to produce a basis
+of the requested dimension.
+
+## Regression splines in R
+
+```r
+library(splines)
+
+# Natural cubic spline with 4 degrees of freedom
+# (knots auto-placed at quantiles)
+m_ns <- lm(y ~ ns(x, df = 4), data = d)
+
+# Cubic B-spline basis with 6 degrees of freedom
+m_bs <- lm(y ~ bs(x, df = 6), data = d)
+
+# In a GLM, just substitute glm() and a family
+m_glm <- glm(y ~ ns(x, df = 4),
+             data = d, family = binomial())
+```
+
+Both `ns()` and `bs()` return numeric matrices that slot directly
+into `lm()`/`glm()` formulas. The `df` argument is the **exact**
+degrees of freedom (basis dimension), in contrast to the `k`
+argument of `mgcv::s()` introduced later in this chapter, which is
+an **upper bound** that REML penalization can reduce to the actual
+effective degrees of freedom. With `splines::ns(df = 4)` you always
+get 4 df; with `mgcv::s(x, k = 20)` you might end up with EDF 3 or
+EDF 15 depending on the data.
+
+The fitted curve and pointwise CIs come from `predict()` with
+`se.fit = TRUE` in the usual way:
+
+```r
+grid  <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200))
+pred  <- predict(m_ns, newdata = grid, se.fit = TRUE)
+grid$fit   <- pred$fit
+grid$lower <- pred$fit - 1.96 * pred$se.fit
+grid$upper <- pred$fit + 1.96 * pred$se.fit
+```
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
new file mode 100644
index 0000000000..6019114d2d
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
@@ -0,0 +1,177 @@
+::: notes
+Adapted from @james2021islr2e [§7.5] and @wood2017generalized [§5–6].
+:::
+
+Regression splines force the modeler to make several choices —
+basis type, number of knots, knot locations — that affect the
+result. **Smoothing splines** sidestep that decision tree by taking
+a different route: they place a knot at every data point and then
+**penalize wiggliness** to control overfitting.
+
+## The penalized fit
+
+::::{#def-smoothing-spline}
+#### Smoothing spline (penalized least squares)
+
+Among all twice-differentiable functions $g$, the **smoothing
+spline with smoothing parameter $\l \ge 0$** is the minimizer of
+
+$$\underbrace{\sum_{i=1}^n (y_i - g(x_i))^2}_{\text{fit to data}}
++ \l \underbrace{\int g''(t)^2 \, dt}_{\text{penalty on wiggliness}}.$$
+
+The first term rewards close fit to the data; the second term
+penalizes large second derivatives, i.e. fast changes in slope.
+The trade-off is controlled by $\l$:
+
+- $\l \to 0$: the function passes through every data point
+  ($g(x_i) = y_i$). No penalty for wiggling — pure interpolation,
+  badly overfits.
+- $\l \to \infty$: the integrated squared second derivative is
+  forced to zero, so $g$ is a straight line. Maximum smoothness —
+  reduces to ordinary linear regression.
+- Intermediate $\l$: a smooth nonparametric fit.
+
+A foundational result is that the minimizer is always a **natural
+cubic spline with knots at every unique $x_i$**
+[@wood2017generalized, §5.1; @james2021islr2e, §7.5]. So even though
+the problem is defined over all twice-differentiable functions, the
+answer lives in a finite-dimensional space we can compute with.
+::::
+
+::::{#exm-smoothing-spline}
+#### Effect of the smoothing parameter on a fitted curve
+
+Consider fitting a smoothing spline to a scatterplot of 200
+observations from a U-shaped exposure--outcome relationship.
+
+- With $\l$ very small (near zero), the fitted curve passes through
+  every point --- it wiggles through the noise and overfits.
+- With $\l$ very large, the fitted curve is a straight line ---
+  it misses the U-shape entirely.
+- At the GCV/REML-optimal $\hat\l$, the effective degrees of freedom
+  lies somewhere between 2 (linear) and $n$ (interpolation),
+  capturing the U-shape without chasing individual data points.
+
+The optimal $\hat\l$ is selected automatically by
+`smooth.spline(x, y)` (the default `cv = FALSE` uses GCV; pass
+`cv = TRUE` to switch to exact leave-one-out CV) or by
+`mgcv::gam(y ~ s(x), method = "REML")`.
+::::
+
+## Effective degrees of freedom
+
+The smoothing spline fit can be written as
+$\hat{\mat{y}} = \mat{S}_\l \mat{y}$
+for a smoother matrix $\mat{S}_\l$ that depends on $\l$ and the
+$x_i$ values. Because the function is penalized rather than
+projected onto a low-dimensional subspace, the usual "number of
+parameters" count doesn't apply.
+
+The standard replacement is the **effective degrees of freedom**
+(EDF):
+
+::::{#def-effective-degrees-of-freedom}
+#### Effective degrees of freedom
+
+$$\text{EDF}(\l) = \text{tr}(\mat{S}_\l)
+= \sum_{i=1}^n \frac{\partial \hat y_i}{\partial y_i}.$$
+
+The EDF interpolates smoothly between two endpoints. For the **full
+smoothing-spline model** (intercept + smooth term), EDF ranges from:
+
+- $\text{EDF} \approx 2$ when $\l$ is large (model is essentially linear,
+  i.e. 2 parameters: intercept + slope).
+- $\text{EDF} \approx n$ when $\l \to 0$ (model interpolates every data
+  point).
+
+When reading `mgcv::summary.gam()` output later in this chapter, the
+column labeled `edf` reports a **per-smooth EDF that excludes the
+global model intercept**, so a smooth that has been penalized down
+to a straight line shows up as EDF $\approx 1$ — *not* 2. The two
+conventions are consistent (the intercept is counted once, outside
+the per-smooth column); they just split the parameter count
+differently. An EDF of 1 in `summary()` output therefore means
+"this smooth has collapsed to a line"; 5–8 means a moderately
+wiggly curve; larger EDFs indicate more local features.
+::::
+
+## Choosing $\l$ automatically
+
+We don't pick $\l$ by hand — we pick it from the data using one
+of:
+
+- **Cross-validation** (typically leave-one-out for smoothing splines,
+  which has a closed-form formula avoiding an actual leave-one-out
+  loop).
+- **Generalized cross-validation** (GCV), a fast computational
+  approximation to LOO-CV.
+- **REML** (restricted maximum likelihood), which treats the
+  penalty as arising from a normal-prior random effect and selects
+  $\l$ by likelihood. REML is not the default in `mgcv::gam()`
+  (which uses `method = "GCV.Cp"`), but it is generally preferred for
+  its better statistical properties
+  [@wood2017generalized, §6.2]; pass `method = "REML"` explicitly.
+
+The user typically specifies a maximum dimension for the basis
+(`k = ...` in `mgcv::s()`) and lets the software select the
+effective smoothness automatically.
+
+## Smoothing splines vs regression splines
+
+| Aspect | Regression spline | Smoothing spline |
+|:--|:--|:--|
+| Knot count | Modeler picks $K$ (small) | One per data point ($\sim n$) |
+| Knot placement | Modeler or quantile rule | All $x_i$ values |
+| Smoothness control | Number of knots | Penalty parameter $\l$ |
+| Overfitting risk | Choose $K$ too large | Choose $\l$ too small |
+| Inference | Standard linear-model output | Bayesian / EDF-based |
+| Software default | `splines::ns(df = ...)` | `mgcv::s(...)`, `smooth.spline()` |
+
+The "inference" row reflects the typical software pairings shown in
+the last row, not a fundamental property of the bases. Regression
+splines fit via `splines::ns()` + `lm()` / `glm()` use the usual
+linear-model machinery; smoothing splines fit via `mgcv::s()` +
+`mgcv::gam()` use Bayesian posterior intervals. A regression spline
+fit through `mgcv::gam()` with a fixed penalty (`sp = 0`) gets the
+Bayesian intervals too — the table is about software defaults, not
+the underlying bases.
+
+The two approaches **span the same function space** (piecewise
+cubic polynomials with continuity constraints) and often produce
+very similar fitted curves. The practical difference is who picks
+the smoothness:
+
+- **Regression splines**: the modeler picks `df` (or `k`), then OLS
+  picks the best fit at that complexity.
+- **Smoothing splines**: the basis is intentionally over-large, and
+  the smoothness parameter is data-adaptive.
+
+For routine work, **smoothing splines via `mgcv::s()` are the
+default recommendation**: REML smoothness selection just works,
+and the EDF in the model summary gives an honest answer to "how
+wiggly is my smooth".
+
+## Smoothing splines in R
+
+Base R has `smooth.spline()`, but for any nontrivial use we'll go
+through `mgcv`:
+
+```r
+library(mgcv)
+
+# Univariate smoother, REML smoothness selection
+m_ss <- gam(y ~ s(x, k = 20, bs = "tp"),
+            data = d, method = "REML")
+
+# Same idea inside a GLM
+m_ss_logistic <- gam(y ~ s(x, k = 20, bs = "tp"),
+                     data = d, family = binomial(),
+                     method = "REML")
+
+summary(m_ss)  # reports EDF of the s(x) term
+plot(m_ss)     # plots the fitted smooth with CI band
+```
+
+We'll return to `mgcv::s()` in the GAM-fitting section below — the
+same `s()` syntax extends to multi-predictor additive models with
+no additional machinery.
\ No newline at end of file
diff --git a/_subfiles/generalized-additive-models/_sec_step_functions.qmd b/_subfiles/generalized-additive-models/_sec_step_functions.qmd
new file mode 100644
index 0000000000..1b63c1ece9
--- /dev/null
+++ b/_subfiles/generalized-additive-models/_sec_step_functions.qmd
@@ -0,0 +1,98 @@
+::: notes
+Adapted from @james2021islr2e [§7.2].
+:::
+
+**Step functions** are the opposite extreme from polynomials: instead
+of letting the response depend smoothly on $X$, they cut the range of
+$X$ into bins and fit a separate constant within each bin. The fit is
+piecewise-constant — a staircase rather than a curve.
+
+::::{#def-step-function-regression}
+#### Step-function regression
+
+Choose **cutpoints** $c_1 < c_2 < \cdots < c_K$ on the support of $X$,
+producing $K+1$ bins:
+$(-\infty, c_1]$, $(c_1, c_2]$, ..., $(c_K, \infty)$.
+Using the conventions $c_0 = -\infty$ and $c_{K+1} = \infty$, define indicator
+variables
+
+$$C_k(X) = \indicp{c_{k-1} < X \le c_k}, \quad k = 1, \ldots, K+1.$$
+
+Because
+$\sum_{k=1}^{K+1} C_k(X) = 1$ for every $X$ (exactly one indicator is
+"on"), the full set of $K+1$ indicators plus an intercept is linearly
+dependent. One bin is therefore dropped as the **reference category**.
+Taking bin $1$ as the reference and using $C_2, \ldots, C_{K+1}$, the
+identified step-function regression model is
+
+$$y_i = \b_0 + \b_2 C_2(x_i) + \b_3 C_3(x_i) + \cdots + \b_{K+1} C_{K+1}(x_i) + \eps_i,$$
+
+with $K$ free slope parameters (one per non-reference bin) plus the
+intercept. $\b_0$ is the expected response in the reference bin and
+$\b_k$ ($k \ge 2$) is the contrast in mean response between bin $k$
+and the reference bin.
+::::
+
+::::{#exm-step-function-regression}
+#### BMI categories as a step function
+
+Step-function regression is the regression-modeling equivalent of
+categorizing a continuous predictor. The familiar "age groups" or
+"BMI categories" used in many epidemiology tables are exactly this:
+cut the continuous predictor into bins, then enter the bin
+indicators as regressors. For example, WHO BMI categories ---
+underweight ($< 18.5$), normal weight ($18.5$--$24.9$), overweight
+($25$--$29.9$), and obese ($\ge 30$) --- use $K = 3$ cutpoints
+producing four bin indicators, with three slope parameters estimating
+the mean response contrast between each non-reference category and
+the reference (here, normal weight).
+::::
+
+## Strengths
+
+- **Interpretable on the original scale.** Each $\b_k$ is just "mean
+  response in bin $k$ minus mean response in the reference bin," with
+  the same units as the outcome (or on the link scale for a GLM).
+  Easy to put in a table.
+- **No assumption about within-bin shape.** Step functions don't
+  impose linearity, monotonicity, or any other within-bin
+  relationship — they just average.
+- **Robust to outliers in $X$.** Extreme $X$ values pull only the
+  outermost bin's mean.
+
+## Limitations
+
+- **Discontinuous fit.** Step functions can't represent a smooth
+  relationship: the fitted curve jumps at each cutpoint, which is
+  almost never the right scientific model for a continuous exposure.
+- **Bin choice is arbitrary.** Different cutpoint sets produce
+  different fits, and there is no statistically principled way to
+  pick cutpoints from the data without invalidating inference.
+- **Power loss.** Within each bin, all variation in $X$ is treated as
+  noise. Compared with a smooth fit that exploits the continuous
+  structure, step functions waste information about where in the bin
+  each observation falls.
+- **Effective parameter count.** A step function with $K$ cutpoints
+  uses $K$ parameters beyond the intercept. For a meaningful fit, $K$
+  has to be large enough to resolve the underlying shape — but every
+  extra cutpoint costs a degree of freedom.
+
+Step functions are most useful when bin boundaries are
+**substantively meaningful** (e.g. clinical BMI categories) and the
+goal is to communicate effects in those familiar categories rather
+than to estimate a smooth dose-response curve. For pure curve
+estimation, splines almost always do better.
+
+## Step functions in R
+
+The base-R function `cut()` produces a factor coding the bins:
+
+```r
+d$bmi_cat <- cut(d$bmi, breaks = c(-Inf, 18.5, 25, 30, Inf),
+                 labels = c("underweight", "normal", "overweight", "obese"))
+lm(y ~ bmi_cat, data = d)
+```
+
+The factor is automatically expanded into $K$ contrast indicators by
+`model.matrix`, with the first level as the reference. The fitted
+coefficients are the contrasts relative to that reference category.
\ No newline at end of file
diff --git a/chapters/generalized-additive-models.qmd b/chapters/generalized-additive-models.qmd
new file mode 100644
index 0000000000..b759dec7aa
--- /dev/null
+++ b/chapters/generalized-additive-models.qmd
@@ -0,0 +1,78 @@
+---
+title: "Generalized Additive Models"
+format:
+  html: default
+  revealjs:
+    output-file: generalized-additive-models-slides.html
+  pdf:
+    output-file: generalized-additive-models-handout.pdf
+  docx:
+    output-file: generalized-additive-models-handout.docx
+
+---
+
+{{< include shared-config.qmd >}}
+
+# Motivation: beyond the linear predictor {#sec-gam-motivation}
+
+{{< include _subfiles/generalized-additive-models/_sec_motivation.qmd >}}
+
+# Polynomial regression {#sec-polynomial-regression}
+
+{{< include _subfiles/generalized-additive-models/_sec_polynomial.qmd >}}
+
+# Step functions {#sec-step-functions}
+
+{{< include _subfiles/generalized-additive-models/_sec_step_functions.qmd >}}
+
+# Basis functions {#sec-basis-functions}
+
+{{< include _subfiles/generalized-additive-models/_sec_basis_functions.qmd >}}
+
+# Regression splines {#sec-regression-splines}
+
+{{< include _subfiles/generalized-additive-models/_sec_regression_splines.qmd >}}
+
+# Smoothing splines {#sec-smoothing-splines}
+
+{{< include _subfiles/generalized-additive-models/_sec_smoothing_splines.qmd >}}
+
+# Local regression {#sec-local-regression}
+
+{{< include _subfiles/generalized-additive-models/_sec_local_regression.qmd >}}
+
+# Method comparison on a shared synthetic dataset {#sec-method-comparison}
+
+{{< include _subfiles/generalized-additive-models/_sec_method_comparison.qmd >}}
+
+# Generalized additive models {#sec-gams}
+
+{{< include _subfiles/generalized-additive-models/_sec_gams.qmd >}}
+
+# Fitting GAMs in R {#sec-gam-fitting}
+
+{{< include _subfiles/generalized-additive-models/_sec_gam_fitting.qmd >}}
+
+# Inference and model selection {#sec-gam-inference}
+
+{{< include _subfiles/generalized-additive-models/_sec_gam_inference.qmd >}}
+
+# Worked example {#sec-gam-example}
+
+{{< include _subfiles/generalized-additive-models/_sec_gam_example.qmd >}}
+
+# Lab: Non-linear modeling {#sec-gam-lab}
+
+## Setup {.unnumbered}
+
+{{< include _subfiles/generalized-additive-models/_sec_lab.qmd >}}
+
+# Exercises {#sec-gam-exercises}
+
+## Conceptual exercises
+
+{{< include _subfiles/generalized-additive-models/_exr-gams-conceptual.qmd >}}
+
+## Applied exercises
+
+{{< include _subfiles/generalized-additive-models/_exr-gams-applied.qmd >}}
diff --git a/references.bib b/references.bib
index 845703a653..5246243b6e 100644
--- a/references.bib
+++ b/references.bib
@@ -798,7 +798,8 @@ @book{wood2017generalized
   title={Generalized additive models: an introduction with R},
   author={Wood, Simon N},
   year={2017},
-  publisher={chapman and hall/CRC}
+  edition={2},
+  publisher={Chapman and Hall/CRC}
 }
 
 @book{roback2021beyond,
@@ -1689,3 +1690,45 @@ @article{debruyn2011mira
   year={2011},
   doi={10.1136/sti.2010.044990}
 }
+
+@article{aune2016bmi,
+  title = {{BMI} and all cause mortality: systematic review and non-linear dose-response meta-analysis of 230 cohort studies with 3.74 million deaths among 30.3 million participants},
+  author = {Aune, Dagfinn and Sen, Abhijit and Prasad, Manya and Norat, Teresa and Janszky, Imre and Tonstad, Serena and Romundstad, Pal and Vatten, Lars J},
+  journal = {BMJ},
+  volume = {353},
+  pages = {i2156},
+  year = {2016},
+  doi = {10.1136/bmj.i2156}
+}
+
+@article{gasparrini2015mortality,
+  title = {Mortality risk attributable to high and low ambient temperature: a multicountry observational study},
+  author = {Gasparrini, Antonio and Guo, Yuming and Hashizume, Masahiro and Lavigne, Eric and Zanobetti, Antonella and Schwartz, Joel and Tobias, Aurelio and Tong, Shilu and Rocklov, Joacim and Forsberg, Bertil and others},
+  journal = {The Lancet},
+  volume = {386},
+  number = {9991},
+  pages = {369--375},
+  year = {2015},
+  doi = {10.1016/S0140-6736(14)62114-0}
+}
+
+@article{wood2013pvalues,
+  title = {On p-values for smooth components of an extended generalized additive model},
+  author = {Wood, Simon N},
+  journal = {Biometrika},
+  volume = {100},
+  number = {1},
+  pages = {221--228},
+  year = {2013},
+  doi = {10.1093/biomet/ass048}
+}
+
+@book{james2023islp,
+  title     = {An Introduction to Statistical Learning: with Applications in {Python}},
+  author    = {James, Gareth and Witten, Daniela and Hastie, Trevor and Tibshirani, Robert and Taylor, Jonathan},
+  year      = {2023},
+  publisher = {Springer},
+  address   = {New York},
+  isbn      = {978-3-031-38746-3},
+  url       = {https://www.statlearning.com}
+}

From 719594b02e00122a7369987eb89edd8c65dbe669 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 08:02:26 +0000
Subject: [PATCH 2/6] GAM chapter cleanups: trailing newlines, eval:false,
 content-hidden, deps

- Add trailing newlines to 9 _subfiles/generalized-additive-models/
  files. POSIX requires text files to end with a newline; many tools
  (cat, sed, diff, git's last-line tracking) misbehave without one.
- Convert 7 subfiles' display-only ```r blocks to ```{r}\n#| eval: false
  so they render as code fences without trying to evaluate. Keeps
  the prose-illustration semantics but pulls the snippets into
  Quarto's code-display path (gets syntax highlighting + copy
  button + per-language config), instead of the plain Pandoc one.
- _sec_gam_example.qmd: remove the outer .content-hidden div around
  the callout-warning. The whole point of the warning ("dataset is
  illustrative; swap for an Epi 204 case study") is to be visible
  to readers; hiding it defeats it. The unwrapped callout shows in
  HTML, slides, and PDF, which is the intended audience.
- DESCRIPTION: declare gratia (>=0.9) and mgcv in Suggests. Both
  are loaded in the chapter (eval:false), and listing them lets
  renv resolve consistent versions when the chapter ever gets
  flipped to eval:true.
---
 DESCRIPTION                                   |  2 ++
 .../_exr-gams-applied.qmd                     |  2 +-
 .../_sec_basis_functions.qmd                  |  2 +-
 .../_sec_gam_example.qmd                      | 26 ++++++++++++-------
 .../_sec_gam_fitting.qmd                      |  9 ++++---
 .../generalized-additive-models/_sec_gams.qmd |  2 +-
 .../_sec_local_regression.qmd                 |  8 +++---
 .../_sec_motivation.qmd                       |  2 +-
 .../_sec_polynomial.qmd                       |  5 ++--
 .../_sec_regression_splines.qmd               |  8 +++---
 .../_sec_smoothing_splines.qmd                |  5 ++--
 .../_sec_step_functions.qmd                   |  5 ++--
 12 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 6cffdb3d7d..7269bf6dad 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -46,9 +46,11 @@ Suggests:
     ggdag,
     ggplot2,
     glmx,
+    gratia (>= 0.9),
     haven,
     kableExtra,
     KMsurv,
+    mgcv,
     muhaz,
     palmerpenguins,
     parameters,
diff --git a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd
index 09b313c94f..1f6c6cd131 100644
--- a/_subfiles/generalized-additive-models/_exr-gams-applied.qmd
+++ b/_subfiles/generalized-additive-models/_exr-gams-applied.qmd
@@ -157,4 +157,4 @@ print(f"OLS:         b1={b_ols[1]:.4f}, b2={b_ols[2]:.4f}")
 The coefficients agree, confirming backfitting converges to OLS when
 all component smoothers are linear [@hastie2009esl2e, §9.1.1].
 
-::::
\ No newline at end of file
+::::
diff --git a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd
index 14cfbab9f5..521ad849a7 100644
--- a/_subfiles/generalized-additive-models/_sec_basis_functions.qmd
+++ b/_subfiles/generalized-additive-models/_sec_basis_functions.qmd
@@ -93,4 +93,4 @@ $b_1(\cdot), \ldots, b_K(\cdot)$:
 
 The remaining design decision is **which basis functions to use** —
 and that choice is what distinguishes regression splines from
-smoothing splines from local regression from GAMs.
\ No newline at end of file
+smoothing splines from local regression from GAMs.
diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
index 223a0cd61f..d48cadeda1 100644
--- a/_subfiles/generalized-additive-models/_sec_gam_example.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
@@ -1,10 +1,8 @@
-:::: {.content-hidden}
 ::: callout-warning
 This section is illustrative only — the dataset is a built-in R
 example, not an Epi 204 case study. Replace with a course-appropriate
 exposure when the chapter is finalized.
 :::
-::::
 
 We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting
 diabetes status (`type`) from continuous risk factors. The exposures
@@ -13,7 +11,8 @@ age (`age`).
 
 ## Setup {.unnumbered}
 
-```r
+```{r}
+#| eval: false
 library(MASS)        # Pima.tr dataset
 library(mgcv)        # gam(), s()
 library(ggplot2)
@@ -38,7 +37,8 @@ Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes")
 
 Start with an ordinary logistic GLM as a baseline:
 
-```r
+```{r}
+#| eval: false
 m_lin <- glm(
   diabetes ~ bmi + glu + age,
   family = binomial(), data = Pima.tr
@@ -55,7 +55,8 @@ age, where diabetes risk is well known to be nonlinear.
 
 Now refit with smooths on each continuous predictor:
 
-```r
+```{r}
+#| eval: false
 m_gam <- gam(
   diabetes ~ s(bmi) + s(glu) + s(age),
   family = binomial(),
@@ -85,7 +86,8 @@ For each smooth, ask:
 If `gam.check(m_gam)` flags any smooth as having `edf` close to
 `k'`, bump `k` up:
 
-```r
+```{r}
+#| eval: false
 m_gam2 <- gam(
   diabetes ~ s(bmi) + s(glu, k = 20) + s(age),
   family = binomial(),
@@ -96,7 +98,8 @@ m_gam2 <- gam(
 
 ## Plotting partial-effect curves {.unnumbered}
 
-```r
+```{r}
+#| eval: false
 plot(m_gam, pages = 1, shade = TRUE, seWithMean = TRUE)
 ```
 
@@ -112,7 +115,8 @@ figure.
 For a cleaner ggplot rendering, `gratia::draw()` is a one-line
 alternative:
 
-```r
+```{r}
+#| eval: false
 gratia::draw(m_gam)
 ```
 
@@ -123,7 +127,8 @@ binary). To plot predicted probability vs a single predictor, hold
 the others at typical values, build the Wald CI on the **link** scale,
 then back-transform with `plogis()`:
 
-```r
+```{r}
+#| eval: false
 typical <- data.frame(
   bmi = median(Pima.tr$bmi),
   glu = median(Pima.tr$glu),
@@ -160,7 +165,8 @@ the middle), which is the right qualitative shape for a probability.
 
 ## Comparing GLM and GAM {.unnumbered}
 
-```r
+```{r}
+#| eval: false
 # Refit the GAM with method = "ML" for AIC comparison.
 # REML and ML likelihoods are on different scales; comparing AIC
 # from glm() (always ML) with a REML-fitted gam() mixes criteria.
diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
index 971694b45f..fec824cf68 100644
--- a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
@@ -12,7 +12,8 @@ REML or GCV.
 The basic syntax mirrors `lm()`/`glm()`, with smoothers introduced by
 `s()`:
 
-```r
+```{r}
+#| eval: false
 library(mgcv)
 
 m <- gam(
@@ -84,7 +85,8 @@ there at all.
 
 After fitting, run `gam.check()` to verify:
 
-```r
+```{r}
+#| eval: false
 gam.check(m)
 ```
 
@@ -102,7 +104,8 @@ Output includes:
 
 ## Plotting smooths
 
-```r
+```{r}
+#| eval: false
 plot(m, pages = 1, shade = TRUE)
 ```
 
diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd
index 4be0d099e8..98f3d90969 100644
--- a/_subfiles/generalized-additive-models/_sec_gams.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gams.qmd
@@ -110,4 +110,4 @@ two standard escape hatches are:
 For routine epidemiology work, the additive model is usually the
 right starting point; tensor smooths come up mainly when interactions
 between continuous predictors are scientifically central (e.g.
-space–time models, age–calendar-time mortality surfaces).
\ No newline at end of file
+space–time models, age–calendar-time mortality surfaces).
diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
index 7339b628f6..b32ac93378 100644
--- a/_subfiles/generalized-additive-models/_sec_local_regression.qmd
+++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
@@ -37,7 +37,8 @@ values and connect the resulting predictions.
 To smooth the relationship between age and plasma glucose in
 `MASS::Pima.tr` using LOESS with a span of 0.5:
 
-```r
+```{r}
+#| eval: false
 m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5)
 ```
 
@@ -77,7 +78,8 @@ over that neighborhood.
 
 ## Local regression in R
 
-```r
+```{r}
+#| eval: false
 m_loess <- loess(y ~ x, data = d, span = 0.5)
 grid <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200))
 grid$fit <- predict(m_loess, newdata = grid)
@@ -94,4 +96,4 @@ smoothing-spline approach via `mgcv::s()`, which produces a global
 fit with full inferential machinery. LOESS is best thought of as a
 **diagnostic smoother** — an unobtrusive way to overlay the local
 mean of a scatterplot to check whether a parametric fit is missing
-nonlinearity.
\ No newline at end of file
+nonlinearity.
diff --git a/_subfiles/generalized-additive-models/_sec_motivation.qmd b/_subfiles/generalized-additive-models/_sec_motivation.qmd
index 46f4a0b785..57b837e4e9 100644
--- a/_subfiles/generalized-additive-models/_sec_motivation.qmd
+++ b/_subfiles/generalized-additive-models/_sec_motivation.qmd
@@ -80,4 +80,4 @@ The rest of this chapter builds up to GAMs in stages:
 
 We'll fit GAMs in R using the `mgcv` package [@wood2017generalized],
 which implements the modern penalized-regression approach with
-automatic smoothness selection via REML or GCV.
\ No newline at end of file
+automatic smoothness selection via REML or GCV.
diff --git a/_subfiles/generalized-additive-models/_sec_polynomial.qmd b/_subfiles/generalized-additive-models/_sec_polynomial.qmd
index 2ad8332b48..cbdcc7f354 100644
--- a/_subfiles/generalized-additive-models/_sec_polynomial.qmd
+++ b/_subfiles/generalized-additive-models/_sec_polynomial.qmd
@@ -72,7 +72,8 @@ better choices.
 In R, polynomial regression is fit by including `poly(x, degree)` in
 the formula:
 
-```r
+```{r}
+#| eval: false
 # Naive polynomial: raw powers x, x^2, x^3
 lm(y ~ x + I(x^2) + I(x^3), data = d)
 
@@ -95,4 +96,4 @@ differ in the interpretation of individual $\hat\b_k$ coefficients:
 
 For prediction (the fitted curve), both parameterizations are
 equivalent — but the orthogonal form has better numerical
-conditioning and is the default we recommend.
\ No newline at end of file
+conditioning and is the default we recommend.
diff --git a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd
index 4c8716fa36..d81d0806a0 100644
--- a/_subfiles/generalized-additive-models/_sec_regression_splines.qmd
+++ b/_subfiles/generalized-additive-models/_sec_regression_splines.qmd
@@ -125,7 +125,8 @@ of the requested dimension.
 
 ## Regression splines in R
 
-```r
+```{r}
+#| eval: false
 library(splines)
 
 # Natural cubic spline with 4 degrees of freedom
@@ -152,10 +153,11 @@ EDF 15 depending on the data.
 The fitted curve and pointwise CIs come from `predict()` with
 `se.fit = TRUE` in the usual way:
 
-```r
+```{r}
+#| eval: false
 grid  <- data.frame(x = seq(min(d$x), max(d$x), length.out = 200))
 pred  <- predict(m_ns, newdata = grid, se.fit = TRUE)
 grid$fit   <- pred$fit
 grid$lower <- pred$fit - 1.96 * pred$se.fit
 grid$upper <- pred$fit + 1.96 * pred$se.fit
-```
\ No newline at end of file
+```
diff --git a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
index 6019114d2d..048d75f5ea 100644
--- a/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
+++ b/_subfiles/generalized-additive-models/_sec_smoothing_splines.qmd
@@ -156,7 +156,8 @@ wiggly is my smooth".
 Base R has `smooth.spline()`, but for any nontrivial use we'll go
 through `mgcv`:
 
-```r
+```{r}
+#| eval: false
 library(mgcv)
 
 # Univariate smoother, REML smoothness selection
@@ -174,4 +175,4 @@ plot(m_ss)     # plots the fitted smooth with CI band
 
 We'll return to `mgcv::s()` in the GAM-fitting section below — the
 same `s()` syntax extends to multi-predictor additive models with
-no additional machinery.
\ No newline at end of file
+no additional machinery.
diff --git a/_subfiles/generalized-additive-models/_sec_step_functions.qmd b/_subfiles/generalized-additive-models/_sec_step_functions.qmd
index 1b63c1ece9..29d96f4108 100644
--- a/_subfiles/generalized-additive-models/_sec_step_functions.qmd
+++ b/_subfiles/generalized-additive-models/_sec_step_functions.qmd
@@ -87,7 +87,8 @@ estimation, splines almost always do better.
 
 The base-R function `cut()` produces a factor coding the bins:
 
-```r
+```{r}
+#| eval: false
 d$bmi_cat <- cut(d$bmi, breaks = c(-Inf, 18.5, 25, 30, Inf),
                  labels = c("underweight", "normal", "overweight", "obese"))
 lm(y ~ bmi_cat, data = d)
@@ -95,4 +96,4 @@ lm(y ~ bmi_cat, data = d)
 
 The factor is automatically expanded into $K$ contrast indicators by
 `model.matrix`, with the first level as the reference. The fitted
-coefficients are the contrasts relative to that reference category.
\ No newline at end of file
+coefficients are the contrasts relative to that reference category.

From 91b60e96f867c22061d4e742c8007bea58ea26a2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 9 Jun 2026 20:35:24 +0000
Subject: [PATCH 3/6] GAM worked example: use WCGS instead of MASS::Pima.tr

Replace the built-in MASS::Pima.tr placeholder with the course's WCGS
coronary-heart-disease cohort, addressing the review concern that the
worked example was not an Epi 204 case study (and the student-visible
'illustrative only / replace dataset' callout-warning).

- Outcome: incident CHD (chd69); smoothed continuous predictors:
  age, bmi, chol, sbp.
- Load WCGS from the package's bundled inst/extdata/wcgs.rds.
- Drop the placeholder callout-warning.
- Adjust the narrative to WCGS reality (most smooths near EDF 1,
  cholesterol the predictor most likely to show curvature) without
  asserting specific run-dependent numbers.

Chunks remain eval:false, consistent with the section's existing
style and the book's other WCGS code listings.

https://claude.ai/code/session_01NQSLJmjMN22RpwpgJ6Lw6u
---
 .../_sec_gam_example.qmd                      | 112 +++++++++---------
 1 file changed, 53 insertions(+), 59 deletions(-)

diff --git a/_subfiles/generalized-additive-models/_sec_gam_example.qmd b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
index d48cadeda1..e88c016480 100644
--- a/_subfiles/generalized-additive-models/_sec_gam_example.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gam_example.qmd
@@ -1,36 +1,29 @@
-::: callout-warning
-This section is illustrative only — the dataset is a built-in R
-example, not an Epi 204 case study. Replace with a course-appropriate
-exposure when the chapter is finalized.
-:::
-
-We'll fit a GAM to the built-in `MASS::Pima.tr` dataset, predicting
-diabetes status (`type`) from continuous risk factors. The exposures
-of interest are body mass index (`bmi`), plasma glucose (`glu`), and
-age (`age`).
+We'll fit a logistic GAM to the Western Collaborative Group Study (WCGS)
+data [@rosenman1975coronary],
+the coronary-heart-disease cohort introduced in the
+[logistic regression chapter](logistic-regression.qmd).
+The outcome is incident coronary heart disease (`chd69`);
+the continuous risk factors we smooth are
+age (`age`),
+body mass index (`bmi`),
+serum cholesterol (`chol`),
+and systolic blood pressure (`sbp`) —
+predictors whose effect on CHD risk need not be linear on the log-odds scale.
 
 ## Setup {.unnumbered}
 
 ```{r}
 #| eval: false
-library(MASS)        # Pima.tr dataset
 library(mgcv)        # gam(), s()
 library(ggplot2)
 library(gratia)      # tidy plots for gam fits (optional)
 
-data(Pima.tr)
-str(Pima.tr)
-#  'data.frame': 200 obs. of  8 variables:  (sample values omitted; columns are)
-#  $ npreg: int  number of pregnancies
-#  $ glu  : int  plasma glucose concentration (2 h OGTT)
-#  $ bp   : int  diastolic blood pressure (mm Hg)
-#  $ skin : int  triceps skinfold thickness (mm)
-#  $ bmi  : num  body mass index (kg/m^2)
-#  $ ped  : num  diabetes pedigree function
-#  $ age  : int  age (years)
-#  $ type : Factor w/ 2 levels "No","Yes":  diabetes diagnosis
-
-Pima.tr$diabetes <- as.integer(Pima.tr$type == "Yes")
+# WCGS ships with this package:
+wcgs <- readRDS(system.file("extdata", "wcgs.rds", package = "rme"))
+
+# Binary 0/1 outcome, and drop the one known cholesterol outlier (645):
+wcgs <- subset(wcgs, !is.na(chol) & chol < 645)
+wcgs$chd <- as.integer(wcgs$chd69 == "Yes")
 ```
 
 ## Linear baseline {.unnumbered}
@@ -40,35 +33,40 @@ Start with an ordinary logistic GLM as a baseline:
 ```{r}
 #| eval: false
 m_lin <- glm(
-  diabetes ~ bmi + glu + age,
-  family = binomial(), data = Pima.tr
+  chd ~ age + bmi + chol + sbp,
+  family = binomial(), data = wcgs
 )
 summary(m_lin)
 ```
 
-Each coefficient is a log-odds increment per unit of the predictor —
-a single number for the slope of risk across the entire range of
-BMI, glucose, and age. That's a strong assumption, especially for
-age, where diabetes risk is well known to be nonlinear.
+Each coefficient is a single log-odds increment per unit of the
+predictor — one slope for the entire range of age, BMI, cholesterol,
+and blood pressure. That's a strong assumption: it forces risk to
+climb at a constant rate across the whole range of each exposure.
 
 ## GAM with smooth terms {.unnumbered}
 
-Now refit with smooths on each continuous predictor:
+Now refit with a smooth on each continuous predictor:
 
 ```{r}
 #| eval: false
 m_gam <- gam(
-  diabetes ~ s(bmi) + s(glu) + s(age),
+  chd ~ s(age) + s(bmi) + s(chol) + s(sbp),
   family = binomial(),
-  data = Pima.tr,
+  data = wcgs,
   method = "REML"
 )
 summary(m_gam)
 ```
 
-The summary will report effective degrees of freedom for each
-smooth. In a typical run on `Pima.tr` you'll see EDFs in the
-1.5–4 range — modest nonlinearity but not extreme.
+The summary reports an effective degrees of freedom (EDF) for each
+smooth. A smooth whose EDF stays close to 1 has collapsed to a
+straight line — the data give no evidence of curvature for that
+predictor — while an EDF meaningfully above 1 signals a nonlinear
+effect. In this dataset most of the smooths sit near 1 (the linear
+baseline already captures them well), with cholesterol the predictor
+most likely to show curvature; treat the actual EDFs from your run,
+not these expectations, as the evidence.
 
 ## Reading the EDFs {.unnumbered}
 
@@ -89,9 +87,9 @@ If `gam.check(m_gam)` flags any smooth as having `edf` close to
 ```{r}
 #| eval: false
 m_gam2 <- gam(
-  diabetes ~ s(bmi) + s(glu, k = 20) + s(age),
+  chd ~ s(age) + s(bmi) + s(chol, k = 20) + s(sbp),
   family = binomial(),
-  data = Pima.tr,
+  data = wcgs,
   method = "REML"
 )
 ```
@@ -123,21 +121,18 @@ gratia::draw(m_gam)
 ## Translating to the response scale {.unnumbered}
 
 Partial-effect plots are on the **link scale** (log-odds for
-binary). To plot predicted probability vs a single predictor, hold
+binary). To plot predicted probability against a single predictor —
+here cholesterol, the predictor with the clearest curvature — hold
 the others at typical values, build the Wald CI on the **link** scale,
 then back-transform with `plogis()`:
 
 ```{r}
 #| eval: false
-typical <- data.frame(
-  bmi = median(Pima.tr$bmi),
-  glu = median(Pima.tr$glu),
-  age = median(Pima.tr$age)
-)
 grid <- data.frame(
-  bmi = seq(min(Pima.tr$bmi), max(Pima.tr$bmi), length.out = 200),
-  glu = typical$glu,
-  age = typical$age
+  chol = seq(min(wcgs$chol), max(wcgs$chol), length.out = 200),
+  age  = median(wcgs$age),
+  bmi  = median(wcgs$bmi),
+  sbp  = median(wcgs$sbp)
 )
 pred <- predict(m_gam, newdata = grid,
                 type = "link", se.fit = TRUE)
@@ -145,11 +140,11 @@ grid$prob  <- plogis(pred$fit)
 grid$lower <- plogis(pred$fit - 1.96 * pred$se.fit)
 grid$upper <- plogis(pred$fit + 1.96 * pred$se.fit)
 
-ggplot(grid, aes(x = bmi, y = prob)) +
+ggplot(grid, aes(x = chol, y = prob)) +
   geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.2) +
   geom_line() +
-  labs(x = "BMI (kg/m²)",
-       y = "Predicted probability of diabetes",
+  labs(x = "Serum cholesterol (mg/dL)",
+       y = "Predicted probability of CHD",
        caption = "Other predictors held at sample medians") +
   theme_minimal()
 ```
@@ -174,22 +169,21 @@ the middle), which is the right qualitative shape for a probability.
 # null-space penalty), so a smooth that's really just a line — or
 # zero — isn't penalized as more complex than the linear baseline.
 m_gam_ml <- gam(
-  diabetes ~ s(bmi) + s(glu) + s(age),
+  chd ~ s(age) + s(bmi) + s(chol) + s(sbp),
   family = binomial(),
-  data = Pima.tr,
+  data = wcgs,
   method = "ML",
   select = TRUE
 )
 AIC(m_lin, m_gam_ml)
 ```
 
-For `Pima.tr`, the GAM typically improves AIC by a few units —
-consistent with the modest EDFs above 1 we saw in the summary.
-Whether that improvement is worth the added model complexity is a
-judgment call: if the smooths all have EDF very close to 1, the GLM
-is a parsimonious choice; if EDFs are clearly $> 1$ and the curve
-plots show a substantively interesting shape (e.g. a J in BMI), the
-GAM is the better report.
+Compare the two fits on AIC. When most smooths sit at EDF near 1 —
+as several do here — the GLM is the parsimonious choice and the AIC
+gap will be small or may even favor the GLM. The GAM earns its extra
+complexity only when one or more smooths show clear curvature and
+the curve plots reveal a substantively interesting shape; if they
+do, the GAM is the better report.
 
 The refit with `method = "ML"` is required only for the AIC
 comparison; the REML fit `m_gam` above remains the preferred fit for

From 225baa773274944daa7786288ff5e1ef8a85e18d Mon Sep 17 00:00:00 2001
From: Douglas Ezra Morrison <demorrison@ucdavis.edu>
Date: Thu, 18 Jun 2026 10:34:14 -0700
Subject: [PATCH 4/6] Address 3 rme#779 low-priority items + merge main
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. _sec_lab.qmd:36 — remove the # noqa: E402 trailing the commented-
   out pygam import. The import is already commented out, so the
   linter suppression is meaningless and just noise.

2. _sec_gams.qmd:95 — the inline `s(age, by = sex)` example was the
   one ```r block missed by the prior cleanup commit; convert to
   ```{r} #| eval: false to match the rest of the chapter.

3. _sec_gam_fitting.qmd: add the `id` argument to the s() argument
   table — used to share a single smoothness parameter across two
   or more smooths (e.g. s(x1, id = 1) + s(x2, id = 1)). Commonly
   needed in multi-predictor epi models.

Also merged origin/main (the branch was 125 commits behind).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com>
---
 _subfiles/generalized-additive-models/_sec_gam_fitting.qmd | 1 +
 _subfiles/generalized-additive-models/_sec_gams.qmd        | 3 ++-
 _subfiles/generalized-additive-models/_sec_lab.qmd         | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
index fec824cf68..cd511d4d5c 100644
--- a/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gam_fitting.qmd
@@ -46,6 +46,7 @@ curves with effective-degrees-of-freedom summaries.
 | `bs` | basis type (`"tp"`, `"cr"`, `"ps"`, ...) | rarely; `"tp"` is the default and the safe choice |
 | `k` | basis dimension (max EDF + 1) | when `gam.check()` flags `k-index < 1` |
 | `by` | factor for per-level smooths | predictor-by-factor interactions |
+| `id` | label for sharing smoothness across smooths | when two or more smooths should share a single $\l$, e.g. `s(x1, id = 1) + s(x2, id = 1)` |
 | `m` | order of the penalty derivative | rarely; default penalizes 2nd derivative |
 | `pc` | constraint point | when you need the smooth to pass through 0 at a chosen $x$ |
 
diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd
index 98f3d90969..1de8c2e813 100644
--- a/_subfiles/generalized-additive-models/_sec_gams.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gams.qmd
@@ -92,7 +92,8 @@ two standard escape hatches are:
   identifiability (the smooth is centered to mean zero in each
   level, so the level-specific intercepts have to come from
   somewhere else); the correct idiom is:
-  ```r
+  ```{r}
+  #| eval: false
   gam(y ~ sex + s(age, by = sex), data = d, method = "REML")
   ```
   The result is still additive in the *factors*, but allows
diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd
index d45ef30668..61ef98b924 100644
--- a/_subfiles/generalized-additive-models/_sec_lab.qmd
+++ b/_subfiles/generalized-additive-models/_sec_lab.qmd
@@ -33,7 +33,7 @@ from sklearn.preprocessing import PolynomialFeatures
 from sklearn.linear_model import LinearRegression, LogisticRegression
 import statsmodels.api as sm
 from patsy import dmatrix
-# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f  # noqa: E402
+# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f
 Wage = load_data("Wage")
 print(Wage[["age","wage","education","year"]].head())
 ```

From 7ad9aa82f9bdce55d7fcc75437b2227bfaffc996 Mon Sep 17 00:00:00 2001
From: Douglas Ezra Morrison <demorrison@ucdavis.edu>
Date: Thu, 18 Jun 2026 10:42:46 -0700
Subject: [PATCH 5/6] Drop dead pygam import + switch Pima examples to WCGS
 (rme#779)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

1. _sec_lab.qmd:36 — drop the commented-out pygam import entirely.
   The active imports of LogisticGAM/gam_f live in the chunks that
   use them (lines 293, 350), so the setup-block comment was just
   dead code that puzzled students.

2. _sec_gams.qmd:38 — switch #exm-gam conceptual example from "Pima
   diabetes data with bmi/glu/age → diabetes" to "WCGS data with
   age/sbp/chol → chd69" so it matches the worked example's WCGS
   framing.

3. _sec_local_regression.qmd:34 — switch #exm-local-regression from
   loess(glu ~ age, MASS::Pima.tr) to loess(chol ~ age, wcgs) for
   the same reason; pivot the surrounding prose from age 40/glucose
   to age 50/cholesterol to match.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com>
---
 .../generalized-additive-models/_sec_gams.qmd  | 12 ++++++------
 .../generalized-additive-models/_sec_lab.qmd   |  1 -
 .../_sec_local_regression.qmd                  | 18 +++++++++---------
 3 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/_subfiles/generalized-additive-models/_sec_gams.qmd b/_subfiles/generalized-additive-models/_sec_gams.qmd
index 1de8c2e813..6b893b1422 100644
--- a/_subfiles/generalized-additive-models/_sec_gams.qmd
+++ b/_subfiles/generalized-additive-models/_sec_gams.qmd
@@ -36,17 +36,17 @@ the coefficients $\b_{jk}$ estimated jointly across all predictors.
 ::::
 
 ::::{#exm-gam}
-#### GAM for diabetes risk
+#### GAM for CHD risk
 
-For the Pima diabetes data with continuous predictors BMI, plasma
-glucose, and age, a GAM on the logit scale is
+For the WCGS data with continuous predictors age, systolic blood
+pressure, and cholesterol, a GAM for CHD on the logit scale is
 
-$$\logitf{\Pr(\text{diabetes} = 1 \mid \text{bmi}, \text{glu}, \text{age})}
-= \b_0 + f_1(\text{bmi}) + f_2(\text{glu}) + f_3(\text{age}),$$
+$$\logitf{\Pr(\text{chd69} = 1 \mid \text{age}, \text{sbp}, \text{chol})}
+= \b_0 + f_1(\text{age}) + f_2(\text{sbp}) + f_3(\text{chol}),$$
 
 where each $f_j$ is a smoothing spline estimated by REML. The three
 partial-effect curves can be plotted separately, revealing, for
-example, whether diabetes risk rises linearly with age or accelerates
+example, whether CHD risk rises linearly with age or accelerates
 nonlinearly.
 ::::
 
diff --git a/_subfiles/generalized-additive-models/_sec_lab.qmd b/_subfiles/generalized-additive-models/_sec_lab.qmd
index 61ef98b924..dffe3c058f 100644
--- a/_subfiles/generalized-additive-models/_sec_lab.qmd
+++ b/_subfiles/generalized-additive-models/_sec_lab.qmd
@@ -33,7 +33,6 @@ from sklearn.preprocessing import PolynomialFeatures
 from sklearn.linear_model import LinearRegression, LogisticRegression
 import statsmodels.api as sm
 from patsy import dmatrix
-# from pygam import LinearGAM, LogisticGAM, s as gam_s, f as gam_f
 Wage = load_data("Wage")
 print(Wage[["age","wage","education","year"]].head())
 ```
diff --git a/_subfiles/generalized-additive-models/_sec_local_regression.qmd b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
index b32ac93378..07a63ffc1b 100644
--- a/_subfiles/generalized-additive-models/_sec_local_regression.qmd
+++ b/_subfiles/generalized-additive-models/_sec_local_regression.qmd
@@ -32,21 +32,21 @@ values and connect the resulting predictions.
 ::::
 
 ::::{#exm-local-regression}
-#### LOESS fit on age--glucose
+#### LOESS fit on age--cholesterol
 
-To smooth the relationship between age and plasma glucose in
-`MASS::Pima.tr` using LOESS with a span of 0.5:
+To smooth the relationship between age and total cholesterol in the
+WCGS data using LOESS with a span of 0.5:
 
 ```{r}
 #| eval: false
-m_loess <- loess(glu ~ age, data = MASS::Pima.tr, span = 0.5)
+m_loess <- loess(chol ~ age, data = wcgs, span = 0.5)
 ```
 
-At age 40, the local neighborhood is the 50% of observations with
-ages closest to 40. Each observation in that window is weighted by
-the tricube kernel --- down-weighted as its age moves away from 40,
-zero weight outside the window. The predicted glucose at age 40 is
-the fitted value from the weighted least-squares linear regression
+At age 50, the local neighborhood is the 50% of observations with
+ages closest to 50. Each observation in that window is weighted by
+the tricube kernel --- down-weighted as its age moves away from 50,
+zero weight outside the window. The predicted cholesterol at age 50
+is the fitted value from the weighted least-squares linear regression
 over that neighborhood.
 ::::
 

From 46f37d81e539b895ff8d97fb96ec2fa965e75a31 Mon Sep 17 00:00:00 2001
From: Douglas Ezra Morrison <demorrison@ucdavis.edu>
Date: Thu, 18 Jun 2026 10:51:39 -0700
Subject: [PATCH 6/6] Record gratia 0.11.2 in renv.lock (rme#779 low-priority)

DESCRIPTION:49 declares gratia (>= 0.9) in Suggests, but
renv::snapshot() doesn't auto-detect it because all gratia code
lives in eval:false chunks. Force-record via renv::record('gratia')
so renv::restore() on a fresh clone installs it.

Other Suggests dependencies are already present in renv.lock (mgcv,
ISLR2, boot, etc.), so this brings gratia into line with that
pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

Co-authored-by: d-morrison <2474437+d-morrison@users.noreply.github.com>
---
 renv.lock | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/renv.lock b/renv.lock
index f5008745dd..257974e31f 100644
--- a/renv.lock
+++ b/renv.lock
@@ -4801,6 +4801,12 @@
       "Maintainer": "David Schoch <david@schochastics.net>",
       "Repository": "CRAN"
     },
+    "gratia": {
+      "Package": "gratia",
+      "Version": "0.11.2",
+      "Source": "Repository",
+      "Repository": "CRAN"
+    },
     "gridExtra": {
       "Package": "gridExtra",
       "Version": "2.3",