Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 83 additions & 4 deletions claude_code_log/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,23 +306,102 @@ def __init__(
else:
self.db_path = get_cache_db_path(project_path.parent)

# When inside a batch() scope this holds the single shared connection
# reused by every _get_connection() call; None means the default
# open-a-connection-per-call behaviour. Set before _init_database() so
# the field always exists before any _get_connection() could run, even
# if migrations are ever routed through it.
self._shared_conn: Optional[sqlite3.Connection] = None

# Initialise database and ensure project exists
self._init_database()
self._project_id: Optional[int] = None
self._ensure_project_exists()

@contextmanager
def _get_connection(self) -> Generator[sqlite3.Connection, None, None]:
"""Get a database connection with proper settings."""
conn = sqlite3.connect(self.db_path, timeout=30.0)
def _configure_connection(self, conn: sqlite3.Connection) -> None:
    """Prepare a freshly opened connection for use by the cache.

    Installs ``sqlite3.Row`` as the row factory and then issues the standard
    pragmas, in order: foreign-key enforcement, WAL journaling, and
    ``synchronous = NORMAL``.

    Args:
        conn: The connection to configure; it is modified in place.
    """
    conn.row_factory = sqlite3.Row

    # synchronous=NORMAL is the recommended companion to WAL: committed data
    # survives an application crash (only a power/OS crash can lose the last
    # committed transaction) and each commit skips an fsync. The cache can be
    # rebuilt from the JSONL source, so that residual risk is acceptable.
    standard_pragmas = (
        "PRAGMA foreign_keys = ON",
        "PRAGMA journal_mode = WAL",
        "PRAGMA synchronous = NORMAL",
    )
    for statement in standard_pragmas:
        conn.execute(statement)

def _open_configured_connection(self) -> sqlite3.Connection:
    """Open a connection to ``self.db_path`` and configure it.

    The handle is opened with a 30 second busy timeout and passed to
    ``_configure_connection``. If configuration raises, the handle is closed
    before the error propagates, so a failed PRAGMA cannot leak an open
    handle that keeps the .db/.db-wal/.db-shm files locked (the Windows
    WinError 32 failure mode).

    Returns:
        The opened and configured connection; the caller owns closing it.

    Raises:
        Whatever ``sqlite3.connect`` or ``_configure_connection`` raises.
    """
    fresh = sqlite3.connect(self.db_path, timeout=30.0)
    configured = False
    try:
        self._configure_connection(fresh)
        configured = True
    finally:
        # Only reached with configured == False when setup raised: release
        # the handle, then let the original exception continue upward.
        if not configured:
            fresh.close()
    return fresh

@contextmanager
def _get_connection(self) -> Generator[sqlite3.Connection, None, None]:
    """Yield a configured database connection for one cache operation.

    Inside a ``batch()`` scope this yields the shared connection and does
    not close it (the batch owns its lifecycle). Outside a batch it opens a
    fresh connection and closes it on exit — the default, Windows-safe
    behaviour (no lingering handle on the .db/.db-wal/.db-shm files).

    Error handling is kept equivalent in both modes: closing a per-call
    connection discards any uncommitted transaction, so when the caller's
    body raises while using the shared connection the pending transaction
    is rolled back explicitly. Without that, a half-finished write would
    stay open on the shared connection and be persisted by the next
    operation's ``commit()``.

    Yields:
        A ``sqlite3.Connection`` configured by ``_configure_connection``.

    Raises:
        Any exception raised by the caller's body is re-raised unchanged.
    """
    shared = self._shared_conn
    if shared is not None:
        try:
            yield shared
        except BaseException:
            # Mirror the implicit rollback that close() gives the per-call
            # path; rollback() is a no-op when no transaction is open.
            shared.rollback()
            raise
        return

    conn = self._open_configured_connection()
    try:
        yield conn
    finally:
        conn.close()

@contextmanager
def batch(self) -> Generator[None, None, None]:
    """Share one connection across every cache operation in the scope.

    A full project build makes roughly 190 ``_get_connection`` calls, and
    opening a fresh SQLite connection for each dominates cache-build cost.
    Running the build under ``with cache_manager.batch():`` reduces that to
    a single connection.

    Lifecycle guarantees:
    - The outermost scope **always closes** the shared connection on exit,
      including when the body raises. That releases the lock on the
      .db/.db-wal/.db-shm files *before* a caller removes a
      TemporaryDirectory or runs ``clear_cache``/rmtree — essential on
      Windows, which refuses to delete open files (WinError 32).
    - Outside a batch nothing changes (one connection per call).
    - Nested scopes are a no-op: an inner ``batch()`` reuses the outer
      connection and never closes it, so an enclosing scope cannot have its
      connection closed or re-opened underneath it.
    """
    already_batching = self._shared_conn is not None
    if not already_batching:
        # Open and configure first; publish to _shared_conn only after setup
        # succeeded, so a failed PRAGMA never leaves a closed,
        # half-initialised handle for other calls to pick up.
        owned = self._open_configured_connection()
        self._shared_conn = owned
        try:
            yield
        finally:
            self._shared_conn = None
            owned.close()
    else:
        # An enclosing batch() owns the connection; just run the body.
        yield

def _init_database(self) -> None:
"""Create schema if needed using migration runner."""
# Run any pending migrations
Expand Down
243 changes: 133 additions & 110 deletions claude_code_log/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -835,11 +835,19 @@ def load_directory_transcripts(
f for f in directory_path.glob("*.jsonl") if not f.name.startswith("agent-")
]

for jsonl_file in jsonl_files:
messages = load_transcript(
jsonl_file, cache_manager, from_date, to_date, silent
)
all_messages.extend(messages)
# Reuse one connection across all per-file cache reads/writes in this load
# pass. Nested under an outer batch() (e.g. ensure_fresh_cache) this is a
# no-op reuse; called standalone (Phase 2 reload) it opens and closes its
# own shared connection. nullcontext keeps the no-cache path unchanged.
load_batch = (
cache_manager.batch() if cache_manager is not None else contextlib.nullcontext()
)
with load_batch:
for jsonl_file in jsonl_files:
messages = load_transcript(
jsonl_file, cache_manager, from_date, to_date, silent
)
all_messages.extend(messages)

# Parent agent entries and assign synthetic session IDs so they
# form separate DAG-lines spliced at their anchor points.
Expand Down Expand Up @@ -2022,33 +2030,39 @@ def ensure_fresh_cache(
if not session_jsonl_files:
return False

# Get cached project data
cached_project_data = cache_manager.get_cached_project_data()

# Check various invalidation conditions
modified_files = cache_manager.get_modified_files(session_jsonl_files)
needs_update = (
cached_project_data is None
or from_date is not None
or to_date is not None
or bool(modified_files) # Session files changed
or (
cached_project_data.total_message_count == 0 and session_jsonl_files
) # Stale cache
)
# Reuse one connection for the invalidation reads AND the whole populate
# pass (per-file load + save + the session/aggregate writes) instead of
# opening one per call. batch() closes the shared connection on scope exit
# (incl. on exception), so the cache files are unlocked before any caller
# tears down a temp dir.
with cache_manager.batch():
# Get cached project data
cached_project_data = cache_manager.get_cached_project_data()

# Check various invalidation conditions
modified_files = cache_manager.get_modified_files(session_jsonl_files)
needs_update = (
cached_project_data is None
or from_date is not None
or to_date is not None
or bool(modified_files) # Session files changed
or (
cached_project_data.total_message_count == 0 and session_jsonl_files
) # Stale cache
)

if not needs_update:
return False # Cache is already fresh
if not needs_update:
return False # Cache is already fresh

# Load and process messages to populate cache
if not silent:
print(f"Updating cache for {project_dir.name}...")
messages, _tree = load_directory_transcripts(
project_dir, cache_manager, from_date, to_date, silent
)
# Load and process messages to populate cache
if not silent:
print(f"Updating cache for {project_dir.name}...")
messages, _tree = load_directory_transcripts(
project_dir, cache_manager, from_date, to_date, silent
)

# Update cache with fresh data
_update_cache_with_session_data(cache_manager, messages)
# Update cache with fresh data
_update_cache_with_session_data(cache_manager, messages)
return True


Expand Down Expand Up @@ -2228,93 +2242,102 @@ def _generate_individual_session_files(
)
regenerated_count = 0

# Generate HTML file for each session
for session_id in session_ids:
# Create session-specific title using cache data if available
session_title = build_session_title(
project_title,
session_id,
session_data.get(session_id),
)

# Add date range if specified
if from_date or to_date:
date_range_parts: list[str] = []
if from_date:
date_range_parts.append(f"from {from_date}")
if to_date:
date_range_parts.append(f"to {to_date}")
date_range_str = " ".join(date_range_parts)
session_title += f" ({date_range_str})"

# Check if session file needs regeneration
session_file_name = f"session-{session_id}{suffix}.{ext}"
session_file_path = output_dir / session_file_name

# Use incremental regeneration: check per-session staleness via html_cache
if cache_manager is not None and format == "html":
is_stale, _reason = cache_manager.is_html_stale(
session_file_name, session_id
)
should_regenerate_session = (
is_stale
or renderer.is_outdated(session_file_path)
or from_date is not None
or to_date is not None
or not session_file_path.exists()
)
else:
# Fallback without cache or non-HTML formats
should_regenerate_session = (
renderer.is_outdated(session_file_path)
or from_date is not None
or to_date is not None
or not session_file_path.exists()
or cache_was_updated
)

if should_regenerate_session:
# Generate session content. Under `--combined no` the
# combined file is never written, so the per-session
# back-link would 404 — suppress it.
session_content = renderer.generate_session(
messages,
# Reuse one connection for every per-session staleness check + html_cache
# write, plus the per-session cache reads inside renderer.generate_session.
# Without this each session reopens the DB several times. nullcontext keeps
# the no-cache path unchanged; nested under an outer batch it's a no-op
# reuse, and the shared connection is closed on scope exit.
session_batch = (
cache_manager.batch() if cache_manager is not None else contextlib.nullcontext()
)
with session_batch:
# Generate HTML file for each session
for session_id in session_ids:
# Create session-specific title using cache data if available
session_title = build_session_title(
project_title,
session_id,
session_title,
cache_manager,
output_dir,
session_tree=session_tree,
suppress_combined_link=not write_combined,
)
assert session_content is not None
# Write session file
# See issue #139: errors="replace" for lone-surrogate safety.
session_file_path.write_text(
session_content, encoding="utf-8", errors="replace"
session_data.get(session_id),
)
regenerated_count += 1

# Update html_cache to track this generation (HTML only)
# Add date range if specified
if from_date or to_date:
date_range_parts: list[str] = []
if from_date:
date_range_parts.append(f"from {from_date}")
if to_date:
date_range_parts.append(f"to {to_date}")
date_range_str = " ".join(date_range_parts)
session_title += f" ({date_range_str})"

# Check if session file needs regeneration
session_file_name = f"session-{session_id}{suffix}.{ext}"
session_file_path = output_dir / session_file_name

# Use incremental regeneration: check per-session staleness via html_cache
if cache_manager is not None and format == "html":
# Use message count from cache (pre-deduplication) to match
# the count used in is_html_stale()
if session_id in session_data:
session_message_count = session_data[session_id].message_count
else:
# Fallback: count from messages list (less accurate due to dedup)
session_message_count = sum(
1
for m in messages
if hasattr(m, "sessionId")
and getattr(m, "sessionId") == session_id
is_stale, _reason = cache_manager.is_html_stale(
session_file_name, session_id
)
should_regenerate_session = (
is_stale
or renderer.is_outdated(session_file_path)
or from_date is not None
or to_date is not None
or not session_file_path.exists()
)
else:
# Fallback without cache or non-HTML formats
should_regenerate_session = (
renderer.is_outdated(session_file_path)
or from_date is not None
or to_date is not None
or not session_file_path.exists()
or cache_was_updated
)

if should_regenerate_session:
# Generate session content. Under `--combined no` the
# combined file is never written, so the per-session
# back-link would 404 — suppress it.
session_content = renderer.generate_session(
messages,
session_id,
session_title,
cache_manager,
output_dir,
session_tree=session_tree,
suppress_combined_link=not write_combined,
)
assert session_content is not None
# Write session file
# See issue #139: errors="replace" for lone-surrogate safety.
session_file_path.write_text(
session_content, encoding="utf-8", errors="replace"
)
regenerated_count += 1

# Update html_cache to track this generation (HTML only)
if cache_manager is not None and format == "html":
# Use message count from cache (pre-deduplication) to match
# the count used in is_html_stale()
if session_id in session_data:
session_message_count = session_data[session_id].message_count
else:
# Fallback: count from messages list (less accurate due to dedup)
session_message_count = sum(
1
for m in messages
if hasattr(m, "sessionId")
and getattr(m, "sessionId") == session_id
)
cache_manager.update_html_cache(
session_file_name, session_id, session_message_count
)
cache_manager.update_html_cache(
session_file_name, session_id, session_message_count
elif not silent:
print(
f"Session file {session_file_path.name} is current, skipping regeneration"
)
elif not silent:
print(
f"Session file {session_file_path.name} is current, skipping regeneration"
)

return regenerated_count

Expand Down
10 changes: 10 additions & 0 deletions dev-docs/application_model.md
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ cache row, the session is reparsed. The schema-version row also
invalidates the entire HTML cache when migrations bump the version,
since rendered output may have changed even when source data hasn't.

Connections run in WAL mode with `synchronous=NORMAL` (durable across
app crashes; only a power/OS crash can lose the last commit — fine for a
regenerable cache). By default `_get_connection()` opens and closes a
connection per call, so no file handle lingers to block temp-dir cleanup
on Windows. A build issues ~190 such opens, which dominates cache-build
cost, so the converter wraps its hotspots (`ensure_fresh_cache`, the
per-file load loop, per-session generation) in `CacheManager.batch()`:
one shared connection reused for the scope and closed on exit (including
on exception). `batch()` nesting is a no-op reuse, so the wraps compose.

For the operations / recovery side (archived sessions, manual
deletion, `cleanupPeriodDays`), see
[`docs/restoring-archived-sessions.md`](../docs/restoring-archived-sessions.md).
Expand Down
Loading
Loading