Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 14 additions & 2 deletions configuration-reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -343,14 +343,14 @@ html_theme_options = {
#### `llm_generate_md`
- **Type:** String (`"true"` or `"false"`)
- **Default:** `"true"` (enabled by default)
- **Description:** When enabled, generates a clean `.md` (Markdown) file alongside each `.html` page in the build output, and links to `.md` files in `llms.txt` instead of `.html` files. Also generates `llms-full.txt`, a single file concatenating all page content for easy LLM ingestion (per the [llms.txt spec](https://llmstxt.org/)).
- **Description:** When enabled, generates a clean `.md` (Markdown) file alongside each `.html` page in the build output, and links to `.md` files in `llms.txt` instead of `.html` files.

The generated `.md` files strip all navigation, sidebars, scripts, and theme-injected metadata (e.g., date info), producing clean, readable content suitable for LLMs. Unicode smart quotes and other typographic characters are normalized to ASCII equivalents for maximum compatibility.

**Generated files:**
- `*.md` — one per HTML page, alongside the `.html` files
- `llms.txt` — page index with links to `.md` files
- `llms-full.txt` — all page content concatenated into a single file
- `llms-full.txt` — all page content concatenated into a single file (generated only when `llm_generate_full` is also enabled, which it is by default)

```python
html_theme_options = {
Expand All @@ -366,6 +366,18 @@ html_theme_options = {
}
```

#### `llm_generate_full`
- **Type:** String (`"true"` or `"false"`)
- **Default:** `"true"` (enabled by default)
- **Description:** When enabled, generates `llms-full.txt` — a single file concatenating all page content for LLM ingestion. For large projects (e.g., PyTorch with thousands of API pages), this file can be extremely large and may not fit in most LLM context windows. Set to `"false"` to skip generating `llms-full.txt` while still generating individual `.md` files and `llms.txt`. This option only takes effect when `llm_generate_md` is enabled, because `llms-full.txt` is assembled from the generated Markdown content.

```python
html_theme_options = {
"llm_generate_md": "true",
"llm_generate_full": "false", # Skip llms-full.txt for large projects
}
```

#### `llm_deduplicate_titles`
- **Type:** String (`"true"` or `"false"`)
- **Default:** `"false"`
Expand Down
38 changes: 17 additions & 21 deletions pytorch_sphinx_theme2/llm_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

import re
import shutil
from concurrent.futures import ProcessPoolExecutor, as_completed
from concurrent.futures import ThreadPoolExecutor, as_completed
from html import unescape
from pathlib import Path

Expand Down Expand Up @@ -336,9 +336,9 @@ def _convert_single_doc(args):
def _generate_md_files(app, docs):
"""Generate .md files from built HTML pages using parallel processing.

Uses ProcessPoolExecutor for parallel conversion, with automatic fallback
to sequential processing if multiprocessing is unavailable (e.g., in
sandboxed build environments like Netlify).
Uses ThreadPoolExecutor for parallel conversion. Preferred over
ProcessPoolExecutor because fork-based multiprocessing can deadlock
in complex Python environments like Sphinx builds.

Args:
app: The Sphinx application object.
Expand All @@ -351,21 +351,13 @@ def _generate_md_files(app, docs):
args_list = [(doc["docname"], outdir_str) for doc in docs]
results = {}

try:
with ProcessPoolExecutor() as executor:
futures = {
executor.submit(_convert_single_doc, args): args[0]
for args in args_list
}
for future in as_completed(futures):
docname, md_content = future.result()
if md_content is not None:
results[docname] = md_content
except (OSError, RuntimeError) as e:
print(f"Parallel processing unavailable ({e}), falling back to sequential")
results = {}
for args in args_list:
docname, md_content = _convert_single_doc(args)
with ThreadPoolExecutor() as executor:
futures = {
executor.submit(_convert_single_doc, args): args[0]
for args in args_list
}
for future in as_completed(futures):
docname, md_content = future.result()
if md_content is not None:
results[docname] = md_content

Expand Down Expand Up @@ -542,12 +534,16 @@ def make_url(relative_path):
print(f"Warning: Could not discover pages for llms.txt: {e}")

# Generate .md files from HTML if enabled
generate_full = (
str(theme_options.get("llm_generate_full", "true")).lower() == "true"
)
if generate_md and docs:
md_contents = _generate_md_files(app, docs)
print(f"Generated {len(md_contents)} markdown files from HTML pages")

# Also generate llms-full.txt with all content concatenated (using in-memory content)
_generate_llms_full_txt(app, docs, app.outdir, md_contents)
# Generate llms-full.txt with all content concatenated (unless disabled)
if generate_full:
_generate_llms_full_txt(app, docs, app.outdir, md_contents)

# Deduplicate titles if enabled
# This adds a disambiguating suffix to duplicate titles based on their URL path
Expand Down
4 changes: 3 additions & 1 deletion pytorch_sphinx_theme2/theme.conf
Original file line number Diff line number Diff line change
Expand Up @@ -73,5 +73,7 @@ llm_custom_file =
# Set to true to deduplicate titles
llm_deduplicate_titles = false
# Set to true to generate .md files from HTML and link to them in llms.txt
# Also generates llms-full.txt with all content concatenated
llm_generate_md = true
# Set to true to generate llms-full.txt with all content concatenated
# (only takes effect when llm_generate_md is also true).
# For large projects, set to false to skip this file, which can be very large
llm_generate_full = true
Loading