diff --git a/.claude/commands/new-chapter.md b/.claude/commands/new-chapter.md index 08d8051435..133cc0b5de 100644 --- a/.claude/commands/new-chapter.md +++ b/.claude/commands/new-chapter.md @@ -14,9 +14,12 @@ First, parse `$ARGUMENTS`: the first whitespace-delimited token is the **slug** Steps: 1. Create `chapters/<slug>.qmd` with YAML frontmatter holding just `title:` (set to the title). Do NOT set `date:` — the book sets `date: last-modified` globally, and a per-page `date:` would override it. Do NOT add a top-level `#` heading in the body — Quarto renders the frontmatter `title:` as the page heading. -2. Register the chapter in the `book.chapters:` list in `_quarto-book.yml` at a logical position (read the file first). If it belongs to an existing `part:`, nest it under that part. +2. Register the chapter in the `book.chapters:` list in `_quarto-book.yml` at a logical position (read the file first). If it belongs to an existing `part:`, nest it under that part. **Also** add an entry to the appropriate navbar dropdown (`Chapters` or `Appendices`) in `_quarto-website.yml`, and ensure the file is in the `render:` list. The navbar is NOT auto-generated from `_quarto-book.yml` -- manual addition is required. 3. If the chapter is long, you may split content into includes under `chapters/_subfiles/<slug>/`. Subfiles must NOT start with a heading and must NOT contain a references section. 4. Confirm it renders: `quarto render chapters/<slug>.qmd --to html`. +5. If the chapter contains `def`/`thm`/`lem`/`cor`/`prp` callout divs, + re-run `Rscript data-raw/callout-graph.R` to refresh + `inst/extdata/callout-graph.rds` and keep the concept map current. 
Style rules (see `.github/instructions/`): diff --git a/_quarto-book.yml b/_quarto-book.yml index 0910c98925..5d6783472e 100644 --- a/_quarto-book.yml +++ b/_quarto-book.yml @@ -57,6 +57,7 @@ book: - chapters/CONTRIBUTING.qmd - chapters/exam-formula-sheet.qmd - chapters/practice-exam-mle-linreg.qmd + - chapters/concept-map.qmd back-to-top-navigation: true page-navigation: true diff --git a/_quarto-website.yml b/_quarto-website.yml index 47d8ff5e11..9f1f915072 100644 --- a/_quarto-website.yml +++ b/_quarto-website.yml @@ -36,6 +36,7 @@ project: - chapters/CONTRIBUTING.qmd - chapters/exam-formula-sheet.qmd - chapters/package-versions.qmd + - chapters/concept-map.qmd website: title: "Regression Models for Epidemiology" @@ -125,6 +126,8 @@ website: href: chapters/exam-formula-sheet.qmd - text: "Package Versions" href: chapters/package-versions.qmd + - text: "Concept Map" + href: chapters/concept-map.qmd bibliography: references.bib diff --git a/chapters/concept-map.qmd b/chapters/concept-map.qmd new file mode 100644 index 0000000000..923c9c776c --- /dev/null +++ b/chapters/concept-map.qmd @@ -0,0 +1,187 @@ +--- +title: "Concept Map: Definitions and Results" +--- + +{{< include shared-config.qmd >}} + +This appendix shows how the definitions and results +(`def`, `thm`, `lem`, `cor`, `prp` callouts) +in the notes depend on one another. + +We say result $B$ is a **descendant** of result $A$ +if $A$ is referenced inside the statement or proof of $B$ +(directly, or transitively through a chain of intermediate results). +The more descendants a result has, +the more of the rest of the notes rests on it --- +so descendant count is a rough measure of how foundational a result is. + +The dependency graph is built by `data-raw/callout-graph.R`, +which scans the source `.qmd` files and saves the result to +`inst/extdata/callout-graph.rds`. 
+This appendix reads that saved file rather than re-scanning the notes on +every render, so **re-run that script whenever divs are added, removed, or +re-titled** to refresh the diagram and table below. + +```{r} +#| label: load-concept-graph +#| code-fold: true +#| message: false +#| warning: false + +library(igraph) +library(ggraph) +library(ggrepel) + +cg <- readRDS(here::here("inst/extdata/callout-graph.rds")) + +type_levels <- c("def", "thm", "lem", "cor", "prp") +type_labels <- c( + def = "Definition", thm = "Theorem", lem = "Lemma", + cor = "Corollary", prp = "Proposition" +) +type_palette <- c( + def = "#1b9e77", thm = "#d95f02", lem = "#7570b3", + cor = "#e7298a", prp = "#66a61e" +) +``` + +There are `r nrow(cg$nodes)` labeled definitions and results in the notes, +connected by `r nrow(cg$edges)` direct dependency links. + +## Dependency diagram + +::: {.content-visible unless-format="pdf"} +::: {.content-visible unless-format="docx"} + +@fig-concept-map shows the results that participate in at least one +dependency link, laid out so that connected results sit near each other. +Color encodes the type of result. +Results with no detected dependency links are omitted from the diagram. +Node labels are larger for results with more total descendants, +so the most foundational results are visually prominent. +Zoom in with Ctrl+scroll (or pinch on mobile) to read individual labels. +Lines from labels to dots indicate where ggrepel moved a label to avoid overlap. 
+ +:::{#fig-concept-map} + +```{r} +#| label: concept-map-ggraph +#| code-fold: true +#| message: false +#| warning: false +#| fig-width: 50 +#| fig-height: 50 +#| out-width: "100%" +#| fig-format: "svg" + +connected <- cg$nodes$id[cg$nodes$id %in% c(cg$edges$from, cg$edges$to)] +core <- graph_from_data_frame( + cg$edges, + vertices = cg$nodes[cg$nodes$id %in% connected, ], + directed = TRUE +) +V(core)$type <- factor(V(core)$type, levels = type_levels) + +set.seed(204) +# Fruchterman-Reingold applies repulsion between all node pairs (not just +# connected ones), spreading nodes more evenly across the canvas. Normalise +# axes afterwards to fill the full canvas and eliminate edge whitespace. +layout <- create_layout(core, layout = "fr", niter = 2000) +norm_axis <- function(x) { + r <- diff(range(x)) + if (r == 0) return(x) + (x - min(x)) / r * 2 - 1 +} +layout$x <- norm_axis(layout$x) +layout$y <- norm_axis(layout$y) + +ggraph(layout) + + geom_edge_link( + arrow = arrow(length = unit(1.8, "mm"), type = "closed"), + end_cap = circle(2, "mm"), + edge_alpha = 0.25, edge_width = 0.3 + ) + + geom_node_point(color = "grey60", size = 0.8, alpha = 0.7) + + geom_label_repel( + data = as.data.frame(layout), + aes(x = x, y = y, label = title, fill = type, size = n_desc), + inherit.aes = FALSE, + color = "white", + fontface = "bold", + max.overlaps = Inf, + force = 3, + force_pull = 0.5, + box.padding = unit(0.25, "lines"), + label.padding = unit(0.1, "lines"), + label.r = unit(0.05, "lines"), + label.size = 0.1, + alpha = 0.88, + segment.color = "grey60", + segment.alpha = 0.6, + segment.size = 0.3, + min.segment.length = 0, + seed = 204 + ) + + scale_fill_manual( + values = type_palette, labels = type_labels, name = "Type", drop = FALSE + ) + + scale_size_continuous(range = c(8.0, 16.0), guide = "none") + + theme_void() + + theme(legend.position = "bottom") +``` + +Dependency structure of the definitions and results in the notes. 
+An arrow points from a result to each result that uses it. +Color indicates result type (see legend). +Label font size increases with the number of total descendants. + +::: + +::: +::: + +::: {.content-visible when-format="pdf"} + +The dependency diagram is only available in the +[HTML version of the notes](https://d-morrison.github.io/rme/chapters/concept-map.html). + +::: + +::: {.content-visible when-format="docx"} + +The dependency diagram is only available in the +[HTML version of the notes](https://d-morrison.github.io/rme/chapters/concept-map.html). + +::: + +## Descendants of each result {#sec-descendants-table} + +@tbl-descendants lists every result that has at least one descendant, +sorted by the number of total descendants (direct or transitive). + +::: {#tbl-descendants} + +```{r} +#| label: descendants-table-build +#| code-fold: true +#| message: false +#| warning: false + +ranked <- cg$nodes[cg$nodes$n_desc >= 1, ] +ranked <- ranked[order(-ranked$n_desc, ranked$id), ] + +descendant_table <- data.frame( + Result = ranked$title, + Type = type_labels[ranked$type], + `Direct descendants` = ranked$n_direct, + `Total descendants` = ranked$n_desc, + check.names = FALSE, row.names = NULL, stringsAsFactors = FALSE +) + +knitr::kable(descendant_table, format = "pipe", align = "llrr") +``` + +All results with at least one descendant, +sorted by number of total descendants (most-foundational first). + +::: diff --git a/data-raw/callout-graph.R b/data-raw/callout-graph.R new file mode 100644 index 0000000000..62ebda9c72 --- /dev/null +++ b/data-raw/callout-graph.R @@ -0,0 +1,210 @@ +# Build the dependency graph of definitions and results in the lecture notes. +# +# This scans every `.qmd` for `def`/`thm`/`lem`/`cor`/`prp` callout divs and the +# cross-references (`@type-id`) inside each one, then saves the resulting graph +# to `inst/extdata/callout-graph.rds`. 
# The `concept-map.qmd` appendix reads the saved artifact
# (`inst/extdata/callout-graph.rds`), so the scan does NOT re-run on every
# render.
#
# Re-run this script (from the repo root) whenever divs are added, removed, or
# re-titled:
#
#   Rscript data-raw/callout-graph.R

library(igraph)
library(stringr)

# Return the title of the callout div opened on `lines[i]`.
#
# The title is the heading on the first non-blank line inside the div, with
# any `\index{...}` commands stripped. At most `max_ahead` lines after the
# opening fence are inspected.
#
# Arguments:
#   lines     character vector holding the lines of one `.qmd` file.
#   i         index of the line that opens the callout div.
#   fallback  string returned when no usable heading is found (the callout id).
#   head_re   regex whose first capture group is the heading text.
#   max_ahead how many lines after `i` to inspect.
#
# Returns a single non-NA, non-empty string: the heading text or `fallback`.
callout_title <- function(lines, i, fallback, head_re, max_ahead = 5L) {
  n <- length(lines)
  # Guard the descending `seq(n + 1, n)` that would otherwise index past the
  # end of the file when the opening fence is the last line.
  if (i >= n) {
    return(fallback)
  }
  for (j in seq(i + 1L, min(i + max_ahead, n))) {
    if (str_trim(lines[j]) == "") {
      next
    }
    # Only the first non-blank line is considered; if it is not a heading the
    # callout has no title.
    hm <- str_match(lines[j], head_re)
    if (is.na(hm[1, 1])) {
      return(fallback)
    }
    title <- str_trim(gsub("\\\\index\\{[^}]*\\}", "", hm[1, 2]))
    if (is.na(title) || !nzchar(title)) {
      return(fallback)
    }
    return(title)
  }
  fallback
}

# Scan all `.qmd` files and return a list of `nodes` and `edges` data frames.
#
# A reference creates a dependency edge from the referenced result to the result
# whose statement or proof contains it. References are attributed to:
#   * the enclosing callout div, if the reference is inside one; otherwise
#   * the callout that an enclosing `proof`/`solution` div *immediately* follows
#     (only blank lines and `---`/slidebreak separators may sit between them).
# References that are neither inside a callout nor inside a directly-attached
# proof (e.g. plain prose) are not turned into edges.
#
# Arguments:
#   root  path of the repository root; `chapters/` and `_subfiles/` beneath it
#         are searched recursively for `.qmd` files.
#
# Returns a list with
#   nodes  data frame with columns `id`, `type`, `title`, `file` (one row per
#          callout id; `title` falls back to `id` when the div has no heading);
#   edges  data frame with columns `from`, `to`, de-duplicated and restricted
#          to ids present in `nodes`.
extract_callout_graph <- function(root) {
  qmds <- list.files(
    c(file.path(root, "chapters"), file.path(root, "_subfiles")),
    pattern = "[.]qmd$", recursive = TRUE, full.names = TRUE
  )
  # `chapters/_subfiles` is a symlink to `_subfiles`; drop the duplicate paths.
  qmds <- qmds[!grepl("/chapters/_subfiles/", qmds)]

  open_re <- "^:::+\\s*\\{#(def|thm|lem|cor|prp)-([A-Za-z0-9_-]+)([^}]*)\\}\\s*$"
  head_re <- "^#{2,6}\\s+(.*\\S)\\s*$"
  ref_re <- "@((?:def|thm|lem|cor|prp)-[A-Za-z0-9_-]+)"
  sep_re <- "^(-{3,}|\\{\\{< *slidebreak *>\\}\\})$" # adjacency-preserving lines

  nodes <- list()
  edges <- list()

  for (f in qmds) {
    lines <- readLines(f, warn = FALSE)
    stack <- list() # open fenced divs, innermost last; may carry $owner
    pending <- NA_character_ # callout just closed, still adjacent to a proof

    for (i in seq_along(lines)) {
      l <- lines[i]

      # Case 1: the line opens a def/thm/lem/cor/prp callout div.
      mo <- str_match(l, open_re)
      if (!is.na(mo[1, 1])) {
        type <- mo[1, 2]
        full <- paste0(type, "-", mo[1, 3])
        nodes[[full]] <- data.frame(
          id = full, type = type,
          # Untitled callouts are labelled with their id rather than NA.
          title = callout_title(lines, i, fallback = full, head_re = head_re),
          file = sub(paste0(root, "/"), "", f, fixed = TRUE),
          stringsAsFactors = FALSE
        )
        stack[[length(stack) + 1]] <- list(kind = "callout", id = full)
        pending <- NA_character_
        next
      }

      # Case 2: any other fence line, either opening a div (text after the
      # colons) or closing the innermost open div (bare colons).
      if (grepl("^:::+", l)) {
        content <- str_trim(sub("^:::+", "", l))
        if (nzchar(content)) {
          # A proof/solution div directly following a callout inherits it.
          is_proof <- grepl("\\b(proof|solution)\\b", content, ignore.case = TRUE)
          owner <- if (is_proof) pending else NA_character_
          stack[[length(stack) + 1]] <-
            list(kind = "div", class = content, owner = owner)
          pending <- NA_character_
        } else if (length(stack) > 0) {
          top <- stack[[length(stack)]]
          stack[[length(stack)]] <- NULL
          # Only a callout that has just closed can bind a following proof.
          pending <- if (top$kind == "callout") top$id else NA_character_
        }
        next
      }

      # Case 3: ordinary content. Attribute every reference on the line to the
      # innermost enclosing callout, or to the owner of the innermost
      # enclosing proof/solution div.
      refs <- str_match_all(l, ref_re)[[1]]
      if (nrow(refs) > 0) {
        target <- NA_character_
        for (k in rev(seq_along(stack))) {
          s <- stack[[k]]
          if (s$kind == "callout") {
            target <- s$id
            break
          }
          if (!is.null(s$owner) && !is.na(s$owner)) {
            target <- s$owner
            break
          }
        }
        if (!is.na(target)) {
          for (r in refs[, 2]) {
            # Skip self-references so the graph has no self-loops.
            if (r != target) {
              edges[[length(edges) + 1]] <-
                data.frame(from = r, to = target, stringsAsFactors = FALSE)
            }
          }
        }
      }

      # Real content at the top level ends the previous callout's eligibility to
      # bind a following proof (blank lines and separators don't count).
      trimmed <- str_trim(l)
      if (length(stack) == 0 && trimmed != "" && !grepl(sep_re, trimmed)) {
        pending <- NA_character_
      }
    }
  }

  if (length(nodes) > 0) {
    nodes <- do.call(rbind, nodes)
  } else {
    nodes <- data.frame(
      id = character(), type = character(),
      title = character(), file = character(),
      stringsAsFactors = FALSE
    )
  }
  rownames(nodes) <- NULL

  if (length(edges) > 0) {
    edges <- unique(do.call(rbind, edges))
  } else {
    edges <- data.frame(from = character(), to = character())
  }
  # Drop references to ids that are not defined by any scanned callout.
  edges <- edges[edges$from %in% nodes$id & edges$to %in% nodes$id, , drop = FALSE]
  rownames(edges) <- NULL

  list(nodes = nodes, edges = edges)
}

root <- here::here()
cg <- extract_callout_graph(root)

# Curated *implicit* dependencies: foundational prerequisites that one result
# relies on but does not cite with an explicit `@ref` (so the scan above misses
# them). Each row is `prerequisite -> dependent` (same direction as the scanned
# edges: from the thing depended on, to the thing that depends on it). Extend
# this list as obvious gaps are noticed; ids must match `cg$nodes$id`.
implicit_edges <- data.frame(
  stringsAsFactors = FALSE,
  rbind(
    c("def-probability", "def-odds"),
    c("def-probability", "def-conditional-prob"),
    c("def-probability", "def-indpt"),
    c("def-probability", "def-pdf"),
    c("def-probability", "def-cdf"),
    c("def-conditional-prob", "def-c-odds"),
    c("def-conditional-prob", "def-cond-expectation"),
    c("def-odds", "def-logodds"),
    c("def-odds", "def-odds-fn"),
    c("def-odds", "def-c-odds"),
    c("def-logodds", "def-logit-fn"),
    c("def-logit-fn", "def-expit"),
    c("def-indpt", "def-iid"),
    c("def-indpt", "def-cident"),
    c("def-indpt", "def-independence-diagnostics"),
    c("def-expectation", "def-variance"),
    c("def-expectation", "def-cov"),
    c("def-expectation", "def-cond-expectation"),
    c("def-variance", "def-cov"),
    c("def-variance", "def-cov-vec-x"),
    c("def-cov", "def-cov-vec-x"),
    c("def-cdf", "def-pdf"),
    c("def-cdf", "def-surv-fn"),
    c("def-hazard", "def-cuhaz"),
    c("def-hazard", "def-cond-hazard"),
    c("def-hazard", "def-hazard-ratio"),
    c("def-cond-hazard", "def-cond-loghaz")
  )
)
names(implicit_edges) <- c("from", "to")

# Curated ids that no longer exist are dropped by the filter below; surface
# them so a renamed or removed callout does not silently lose its edges.
unknown_ids <- setdiff(
  unique(c(implicit_edges$from, implicit_edges$to)),
  cg$nodes$id
)
if (length(unknown_ids) > 0) {
  warning(
    "implicit_edges reference ids not found in the notes (edges dropped): ",
    paste(unknown_ids, collapse = ", "),
    call. = FALSE
  )
}

cg$edges <- unique(rbind(cg$edges, implicit_edges))
cg$edges <- cg$edges[
  cg$edges$from %in% cg$nodes$id & cg$edges$to %in% cg$nodes$id, ,
  drop = FALSE
]
rownames(cg$edges) <- NULL

# Precompute descendant counts and the direct/indirect descendant id lists, so
# the chapter does no graph analysis at render time.
ig <- graph_from_data_frame(cg$edges, vertices = cg$nodes, directed = TRUE)

# `subcomponent(mode = "out")` returns exactly the vertices reachable from `v`
# (including `v` itself), so the count does not depend on how `bfs()` reports
# unreached vertices in its `order` result.
cg$nodes$n_desc <- vapply(
  cg$nodes$id,
  function(v) length(subcomponent(ig, v, mode = "out")) - 1L,
  integer(1)
)
cg$nodes$n_direct <- as.integer(degree(ig, mode = "out")[cg$nodes$id])

cg$descendants <- lapply(stats::setNames(cg$nodes$id, cg$nodes$id), function(v) {
  # Direct descendants are the vertices at out-distance exactly 1 from `v`.
  direct <- setdiff(names(which(distances(ig, v, mode = "out")[1, ] == 1)), v)
  reach <- setdiff(names(subcomponent(ig, v, mode = "out")), v)
  list(direct = sort(direct), indirect = sort(setdiff(reach, direct)))
})

cg$generated_from <- "data-raw/callout-graph.R"

saveRDS(cg, here::here("inst/extdata/callout-graph.rds"))

message(sprintf(
  "callout-graph.rds: %d results, %d dependency links (%d results with >=1 descendant)",
  nrow(cg$nodes), nrow(cg$edges), sum(cg$nodes$n_direct >= 1)
))