From abd5eb5bd6212d43fc891b0c52771abdcac4cabb Mon Sep 17 00:00:00 2001
From: Mircea Lungu <mircea.lungu@gmail.com>
Date: Mon, 1 Jun 2026 22:33:48 +0200
Subject: [PATCH] perf(video): cache caption tokenization + batch past-bookmark
 lookup

`GET /user_video?video_id=` returned ~1 MB and took ~2s end-to-end on
captioned videos with hundreds of segments. Two dominant costs:

1. `Video.video_info(with_content=True)` ran Stanza tokenization on every
   caption on every request -- no cache. Articles have
   `article_tokenization_cache`; videos didn't.
2. `UserVideo.user_video_info` then looped over the freshly-tokenized
   captions and made one DB query per caption to fetch past bookmarks
   (N+1).

This PR adds:

- `caption_tokenization_cache` table (caption_id PK, tokenized_text
  MEDIUMTEXT, created_at) mirroring the article cache. Captions are
  immutable post-ingestion so entries are populated lazily on first read
  and never invalidated. Foreign-key cascade on delete.
- `CaptionTokenizationCache` model with `find_or_create` (race-safe via
  flush+IntegrityError-then-fetch, same pattern as the article one),
  `get_many` for batched reads, and `delete_older_than` for cleanup.
- `Video.video_info` now batch-fetches the cache, parses cached JSON
  when present, runs Stanza only on misses, persists the new rows, and
  commits once at the end.
- `VideoCaptionContext.get_user_bookmarks_grouped_by_caption` -- one
  IN-clause query that replaces the per-caption helper; returns a
  {caption_id: [bookmark_json, ...]} dict.
- `UserVideo.user_video_info` uses the grouped result instead of the
  per-caption helper. The single-caption helper is left in place for
  other callers.

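Expected effect once the cache is warm (i.e. after the first request for
a video has populated it): /user_video drops from ~2s to roughly the
time of two batched SELECTs (cached tokens, past bookmarks) plus JSON
(de)serialisation. The title is still tokenized with Stanza on every
request. The bookmark N+1 collapses to one query regardless of caption
count.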

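Migration: `tools/migrations/26-06-01--add_caption_tokenization_cache.sql`
must be applied before the new code serves traffic: `Video.video_info`
queries `caption_tokenization_cache` on every `with_content=True` call
for a video that has captions.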

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ...-06-01--add_caption_tokenization_cache.sql | 15 ++++
 zeeguu/core/model/__init__.py                 |  1 +
 .../core/model/caption_tokenization_cache.py  | 74 +++++++++++++++++++
 zeeguu/core/model/user_video.py               | 16 ++--
 zeeguu/core/model/video.py                    | 57 ++++++++++----
 zeeguu/core/model/video_caption_context.py    | 27 +++++++
 6 files changed, 171 insertions(+), 19 deletions(-)
 create mode 100644 tools/migrations/26-06-01--add_caption_tokenization_cache.sql
 create mode 100644 zeeguu/core/model/caption_tokenization_cache.py

diff --git a/tools/migrations/26-06-01--add_caption_tokenization_cache.sql b/tools/migrations/26-06-01--add_caption_tokenization_cache.sql
new file mode 100644
index 000000000..00ee33ae3
--- /dev/null
+++ b/tools/migrations/26-06-01--add_caption_tokenization_cache.sql
@@ -0,0 +1,15 @@
+-- Cache for Stanza-tokenized caption text so /user_video doesn't re-tokenize
+-- every caption on every request (a captioned 16-min video has hundreds of
+-- captions, each one a Stanza call -- it was the dominant cost of the
+-- ~2-second /user_video response observed on share-to-video traffic).
+--
+-- Captions are immutable after ingestion, so rows are written lazily on first
+-- read and never invalidated; ON DELETE CASCADE removes them with the caption.
+
+CREATE TABLE caption_tokenization_cache (
+    caption_id     INT NOT NULL PRIMARY KEY,
+    tokenized_text MEDIUMTEXT,
+    created_at     DATETIME DEFAULT CURRENT_TIMESTAMP,
+    CONSTRAINT fk_caption_tok_cache_caption_id
+        FOREIGN KEY (caption_id) REFERENCES caption (id) ON DELETE CASCADE
+);
diff --git a/zeeguu/core/model/__init__.py b/zeeguu/core/model/__init__.py
index e6533ae9c..7731b1726 100644
--- a/zeeguu/core/model/__init__.py
+++ b/zeeguu/core/model/__init__.py
@@ -10,6 +10,7 @@
 from .article_upload import ArticleUpload
 from .article_cefr_assessment import ArticleCefrAssessment
 from .article_tokenization_cache import ArticleTokenizationCache
+from .caption_tokenization_cache import CaptionTokenizationCache
 from .text import Text
 from .phrase import Phrase
 from .user import User
diff --git a/zeeguu/core/model/caption_tokenization_cache.py b/zeeguu/core/model/caption_tokenization_cache.py
new file mode 100644
index 000000000..542fc1230
--- /dev/null
+++ b/zeeguu/core/model/caption_tokenization_cache.py
@@ -0,0 +1,74 @@
+import json
+import logging
+
+from sqlalchemy import Column, Integer, UnicodeText, ForeignKey, DateTime
+from sqlalchemy.orm import relationship
+from sqlalchemy.exc import IntegrityError, OperationalError
+from datetime import datetime, timedelta
+from zeeguu.core.model.db import db
+
+log = logging.getLogger(__name__)
+
+
+class CaptionTokenizationCache(db.Model):
+    """
+    Cache of Stanza-tokenized caption text (a JSON string, one row per caption)
+    so GET /user_video need not re-tokenize. Mirrors ArticleTokenizationCache.
+
+    Captions are immutable once a video is ingested, so entries are populated
+    on first read and never invalidated. find_or_create() rolls back `session`
+    when its insert fails and may return None if the re-fetch finds no row.
+    """
+
+    __tablename__ = "caption_tokenization_cache"
+
+    caption_id = Column(
+        Integer, ForeignKey("caption.id", ondelete="CASCADE"), primary_key=True
+    )
+    tokenized_text = Column(UnicodeText)
+    created_at = Column(DateTime, default=datetime.now)
+
+    @classmethod
+    def find_or_create(cls, session, caption_id):
+        cache = session.query(cls).filter_by(caption_id=caption_id).first()
+        if cache:
+            return cache
+
+        cache = cls(caption_id=caption_id)
+        session.add(cache)
+        try:
+            session.flush()
+        except IntegrityError:
+            # Presumably a concurrent insert won the race -- roll back, fetch its row.
+            session.rollback()
+            cache = session.query(cls).filter_by(caption_id=caption_id).first()
+        except OperationalError as e:
+            log.warning(
+                f"[CACHE] OperationalError during cache creation for caption "
+                f"{caption_id}: {e}"
+            )
+            session.rollback()
+            cache = session.query(cls).filter_by(caption_id=caption_id).first()
+        return cache
+
+    @classmethod
+    def get_many(cls, session, caption_ids):
+        """One query for many ids. Returns {caption_id: tokenized_text JSON}; ids with no row are absent."""
+        if not caption_ids:
+            return {}
+        rows = (
+            session.query(cls.caption_id, cls.tokenized_text)
+            .filter(cls.caption_id.in_(caption_ids))
+            .all()
+        )
+        return {cid: tok for (cid, tok) in rows}
+
+    @classmethod
+    def delete_older_than(cls, session, days=30):
+        cutoff = datetime.now() - timedelta(days=days)
+        deleted = session.query(cls).filter(cls.created_at < cutoff).delete()
+        session.commit()
+        log.info(
+            f"[CACHE-CLEANUP] Deleted {deleted} caption-cache entries older than {days} days"
+        )
+        return deleted
diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py
index 1e15b9fd2..9797fe320 100644
--- a/zeeguu/core/model/user_video.py
+++ b/zeeguu/core/model/user_video.py
@@ -190,12 +190,18 @@ def user_video_info(
                 ]
 
             if "captions" in returned_info:
+                # Fetch past bookmarks for every caption with one query instead
+                # of one per caption; captions with no bookmarks fall back to [].
+                caption_ids = [
+                    c["context_identifier"]["video_caption_id"]
+                    for c in returned_info["captions"]
+                ]
+                grouped = VideoCaptionContext.get_user_bookmarks_grouped_by_caption(
+                    user.id, caption_ids
+                )
                 for caption in returned_info["captions"]:
-                    caption["past_bookmarks"] = (
-                        VideoCaptionContext.get_all_user_bookmarks_for_caption(
-                            user.id, caption["context_identifier"]["video_caption_id"]
-                        )
-                    )
+                    caption_id = caption["context_identifier"]["video_caption_id"]
+                    caption["past_bookmarks"] = grouped.get(caption_id, [])
 
             if "tokenized_title" in returned_info:
                 returned_info["tokenized_title"]["past_bookmarks"] = (
diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py
index fc6f73669..51339e6d2 100644
--- a/zeeguu/core/model/video.py
+++ b/zeeguu/core/model/video.py
@@ -281,22 +281,55 @@ def video_info(self, with_content=False):
             result_dict["published_time"] = datetime_to_json(self.published_time)
 
         if with_content:
+            import json
             from zeeguu.core.mwe import tokenize_for_reading
-
-            result_dict["captions"] = [
-                {
-                    "time_start": caption.time_start / 1000,  # convert to seconds
-                    "time_end": caption.time_end / 1000,
-                    "text": caption.get_content(),
-                    "tokenized_text": tokenize_for_reading(
+            from zeeguu.core.model.caption_tokenization_cache import (
+                CaptionTokenizationCache,
+            )
+            from . import db
+
+            # Stanza tokenization is by far the slowest piece of this method --
+            # for a 16-min auto-captioned video, the uncached path was ~2s
+            # spread across hundreds of Stanza calls. Batch-load the cache so
+            # the typical (warm-cache) request tokenizes no captions at all.
+            caption_ids = [c.id for c in self.captions]
+            cached = CaptionTokenizationCache.get_many(db.session, caption_ids)
+            populated_any = False
+
+            captions_out = []
+            for caption in self.captions:
+                cache_json = cached.get(caption.id)
+                if cache_json:
+                    tokenized = json.loads(cache_json)
+                else:
+                    tokenized = tokenize_for_reading(
                         caption.get_content(), self.language, mode="stanza"
-                    ),
-                    "context_identifier": ContextIdentifier(
-                        ContextType.VIDEO_CAPTION, video_caption_id=caption.id
-                    ).as_dictionary(),
-                }
-                for caption in self.captions
-            ]
+                    )
+                    row = CaptionTokenizationCache.find_or_create(
+                        db.session, caption.id
+                    )
+                    # find_or_create returns None when its insert failed and
+                    # the re-fetch found no row. The cache is best-effort:
+                    # serve the fresh tokens and let a later request retry.
+                    if row is not None:
+                        row.tokenized_text = json.dumps(tokenized)
+                        populated_any = True
+
+                captions_out.append(
+                    {
+                        "time_start": caption.time_start / 1000,  # convert to seconds
+                        "time_end": caption.time_end / 1000,
+                        "text": caption.get_content(),
+                        "tokenized_text": tokenized,
+                        "context_identifier": ContextIdentifier(
+                            ContextType.VIDEO_CAPTION, video_caption_id=caption.id
+                        ).as_dictionary(),
+                    }
+                )
+            result_dict["captions"] = captions_out
+
+            if populated_any:
+                db.session.commit()
 
             result_dict["tokenized_title"] = {
                 "tokens": tokenize_for_reading(self.title, self.language, mode="stanza"),
diff --git a/zeeguu/core/model/video_caption_context.py b/zeeguu/core/model/video_caption_context.py
index 80cf187ae..e32383bd6 100644
--- a/zeeguu/core/model/video_caption_context.py
+++ b/zeeguu/core/model/video_caption_context.py
@@ -76,3 +76,30 @@ def get_all_user_bookmarks_for_caption(
         ).all()
 
         return [each.to_json(True) if as_json_serializable else each for each in result]
+
+    @classmethod
+    def get_user_bookmarks_grouped_by_caption(cls, user_id: int, caption_ids):
+        """Fetch one user's bookmarks for many captions with a single query.
+
+        Returns {caption_id: [bookmark_json, ...]}; captions without bookmarks
+        get no key. Batched alternative to get_all_user_bookmarks_for_caption,
+        which would cost one query per caption for a whole transcript.
+        """
+        if not caption_ids:
+            return {}
+
+        from zeeguu.core.model.user_word import UserWord
+
+        # Bookmark -> caption context -> owning user word, limited to the ids.
+        query = Bookmark.query.join(cls)
+        query = query.join(UserWord, Bookmark.user_word_id == UserWord.id)
+        query = query.filter(cls.caption_id.in_(caption_ids))
+        query = query.filter(UserWord.user_id == user_id)
+        query = query.add_columns(cls.caption_id)
+
+        bookmarks_by_caption = {}
+        for bookmark, caption_id in query.all():
+            if caption_id not in bookmarks_by_caption:
+                bookmarks_by_caption[caption_id] = []
+            bookmarks_by_caption[caption_id].append(bookmark.to_json(True))
+        return bookmarks_by_caption