zeeguu · mircealungu · Jun 1, 2026
diff --git a/tools/migrations/26-06-01--add_caption_tokenization_cache.sql b/tools/migrations/26-06-01--add_caption_tokenization_cache.sql
@@ -0,0 +1,15 @@
+-- Cache for Stanza-tokenized caption text so /user_video doesn't re-tokenize
+-- every caption on every request (a captioned 16-min video has hundreds of
+-- captions, each one a Stanza call -- it was the dominant cost of the
+-- ~2-second /user_video response observed on share-to-video traffic).
+--
+-- Captions are immutable after ingestion, so entries are populated lazily on
+-- first read and never invalidated. delete_older_than() exists for housekeeping.
+
+CREATE TABLE caption_tokenization_cache (
+    caption_id     INT NOT NULL PRIMARY KEY,  -- one cache row per caption
+    tokenized_text MEDIUMTEXT,  -- JSON string written by the app, NULL until first populated
+    created_at     DATETIME DEFAULT CURRENT_TIMESTAMP,  -- age used by delete_older_than() housekeeping
+    CONSTRAINT fk_caption_tok_cache_caption_id
+        FOREIGN KEY (caption_id) REFERENCES caption (id) ON DELETE CASCADE  -- deleting a caption deletes its cache row
+);
diff --git a/zeeguu/core/model/__init__.py b/zeeguu/core/model/__init__.py
@@ -10,6 +10,7 @@
 from .article_upload import ArticleUpload
 from .article_cefr_assessment import ArticleCefrAssessment
 from .article_tokenization_cache import ArticleTokenizationCache
+from .caption_tokenization_cache import CaptionTokenizationCache
 from .text import Text
 from .phrase import Phrase
 from .user import User

diff --git a/zeeguu/core/model/caption_tokenization_cache.py b/zeeguu/core/model/caption_tokenization_cache.py
@@ -0,0 +1,107 @@
+import json
+import logging
+
+from sqlalchemy import Column, Integer, UnicodeText, ForeignKey, DateTime
+from sqlalchemy.orm import relationship
+from sqlalchemy.exc import IntegrityError, OperationalError
+from datetime import datetime, timedelta
+from zeeguu.core.model.db import db
+
+log = logging.getLogger(__name__)
+
+
+class CaptionTokenizationCache(db.Model):
+    """
+    Caches the Stanza-based tokenized text for video captions to avoid
+    re-running tokenization on every GET /user_video. Mirrors the
+    ArticleTokenizationCache pattern.
+
+    Captions are immutable once a video is ingested, so cache entries do not
+    need to be invalidated -- only populated on first read.
+    """
+
+    __tablename__ = "caption_tokenization_cache"
+
+    # One row per caption; the row is removed together with its caption.
+    caption_id = Column(
+        Integer, ForeignKey("caption.id", ondelete="CASCADE"), primary_key=True
+    )
+    # JSON string of the tokenized caption; NULL until a caller fills it in.
+    tokenized_text = Column(UnicodeText)
+    # Insertion time (naive local time); only read by delete_older_than().
+    created_at = Column(DateTime, default=datetime.now)
+
+    @classmethod
+    def find_or_create(cls, session, caption_id):
+        """Return the cache row for ``caption_id``, inserting an empty one if missing.
+
+        The new row is flushed but not committed, and its ``tokenized_text``
+        is left unset: the caller fills it in and commits.
+
+        If the insert fails, the whole session is rolled back -- which also
+        discards any other uncommitted work in ``session`` -- and the row is
+        looked up again. Returns None when that second lookup finds nothing,
+        so callers should be prepared for a missing row.
+        """
+        lookup = session.query(cls).filter_by(caption_id=caption_id)
+        cache = lookup.first()
+        if cache:
+            return cache
+
+        cache = cls(caption_id=caption_id)
+        session.add(cache)
+        try:
+            session.flush()
+        except (IntegrityError, OperationalError) as e:
+            # IntegrityError: another request beat us to the insert, which is
+            # expected under concurrency. OperationalError is worth a warning.
+            if isinstance(e, OperationalError):
+                log.warning(
+                    f"[CACHE] OperationalError during cache creation for caption "
+                    f"{caption_id}: {e}"
+                )
+            # Both cases recover the same way: reset the session, then fetch
+            # whatever row (if any) exists now.
+            session.rollback()
+            cache = lookup.first()
+        return cache
+
+    @classmethod
+    def get_many(cls, session, caption_ids):
+        """One query for many caption ids. Returns {caption_id: tokenized_text_json_string}.
+
+        ``caption_ids`` may be any iterable (or None). Ids without a cache row
+        are absent from the result; a row that was never populated maps to None.
+        """
+        # Materialize first: a generator is always truthy and can only be
+        # consumed once, so the emptiness check below needs a real list.
+        caption_ids = list(caption_ids or ())
+        if not caption_ids:
+            return {}
+        rows = (
+            session.query(cls.caption_id, cls.tokenized_text)
+            .filter(cls.caption_id.in_(caption_ids))
+            .all()
+        )
+        return {cid: tok for (cid, tok) in rows}
+
+    @classmethod
+    def delete_older_than(cls, session, days=30):
+        """Delete entries created more than ``days`` days ago and commit.
+
+        Returns the number of deleted rows. On failure the session is rolled
+        back and the original exception is re-raised.
+        """
+        cutoff = datetime.now() - timedelta(days=days)
+        try:
+            deleted = session.query(cls).filter(cls.created_at < cutoff).delete()
+            session.commit()
+        except Exception:
+            # Leave the session usable for the caller instead of stuck in a
+            # failed transaction, then let the error propagate unchanged.
+            session.rollback()
+            raise
+        log.info(
+            f"[CACHE-CLEANUP] Deleted {deleted} caption-cache entries older than {days} days"
+        )
+        return deleted
diff --git a/zeeguu/core/model/user_video.py b/zeeguu/core/model/user_video.py
@@ -190,12 +190,18 @@ def user_video_info(
                 ]
 
             if "captions" in returned_info:
+                # One query for all captions' past bookmarks instead of N --
+                # this used to be the N+1 next to the Stanza loop.
+                caption_ids = [
+                    c["context_identifier"]["video_caption_id"]
+                    for c in returned_info["captions"]
+                ]
+                grouped = VideoCaptionContext.get_user_bookmarks_grouped_by_caption(
+                    user.id, caption_ids
+                )
                 for caption in returned_info["captions"]:
-                    caption["past_bookmarks"] = (
-                        VideoCaptionContext.get_all_user_bookmarks_for_caption(
-                            user.id, caption["context_identifier"]["video_caption_id"]
-                        )
-                    )
+                    caption_id = caption["context_identifier"]["video_caption_id"]
+                    caption["past_bookmarks"] = grouped.get(caption_id, [])
 
             if "tokenized_title" in returned_info:
                 returned_info["tokenized_title"]["past_bookmarks"] = (

diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py
@@ -281,22 +281,51 @@ def video_info(self, with_content=False):
             result_dict["published_time"] = datetime_to_json(self.published_time)
 
         if with_content:
+            import json
             from zeeguu.core.mwe import tokenize_for_reading
-
-            result_dict["captions"] = [
-                {
-                    "time_start": caption.time_start / 1000,  # convert to seconds
-                    "time_end": caption.time_end / 1000,
-                    "text": caption.get_content(),
-                    "tokenized_text": tokenize_for_reading(
+            from zeeguu.core.model.caption_tokenization_cache import (
+                CaptionTokenizationCache,
+            )
+            from . import db
+
+            # Stanza tokenization is by far the slowest piece of this method --
+            # for a 16-min auto-captioned video, the uncached path was ~2s
+            # spread across hundreds of Stanza calls. Batch-load the cache so
+            # the typical (warm-cache) request does zero tokenization work.
+            caption_ids = [c.id for c in self.captions]
+            cached = CaptionTokenizationCache.get_many(db.session, caption_ids)
+            populated_any = False
+
+            captions_out = []
+            for caption in self.captions:
+                cache_json = cached.get(caption.id)
+                if cache_json:
+                    tokenized = json.loads(cache_json)
+                else:
+                    tokenized = tokenize_for_reading(
                         caption.get_content(), self.language, mode="stanza"
-                    ),
-                    "context_identifier": ContextIdentifier(
-                        ContextType.VIDEO_CAPTION, video_caption_id=caption.id
-                    ).as_dictionary(),
-                }
-                for caption in self.captions
-            ]
+                    )
+                    row = CaptionTokenizationCache.find_or_create(
+                        db.session, caption.id
+                    )
+                    row.tokenized_text = json.dumps(tokenized)
+                    populated_any = True
+
+                captions_out.append(
+                    {
+                        "time_start": caption.time_start / 1000,  # convert to seconds
+                        "time_end": caption.time_end / 1000,
+                        "text": caption.get_content(),
+                        "tokenized_text": tokenized,
+                        "context_identifier": ContextIdentifier(
+                            ContextType.VIDEO_CAPTION, video_caption_id=caption.id
+                        ).as_dictionary(),
+                    }
+                )
+            result_dict["captions"] = captions_out
+
+            if populated_any:
+                db.session.commit()
 
             result_dict["tokenized_title"] = {
                 "tokens": tokenize_for_reading(self.title, self.language, mode="stanza"),

diff --git a/zeeguu/core/model/video_caption_context.py b/zeeguu/core/model/video_caption_context.py
@@ -76,3 +76,30 @@ def get_all_user_bookmarks_for_caption(
         ).all()
 
         return [each.to_json(True) if as_json_serializable else each for each in result]
+
+    @classmethod
+    def get_user_bookmarks_grouped_by_caption(cls, user_id: int, caption_ids):
+        """Fetch one user's bookmarks for many captions with a single query.
+
+        Batch counterpart of get_all_user_bookmarks_for_caption, meant for
+        rendering a whole transcript without issuing one query per caption.
+        Returns {caption_id: [bookmark_json, ...]}; captions with no bookmarks
+        are simply absent from the dict.
+        """
+        if not caption_ids:
+            return {}
+
+        from zeeguu.core.model.user_word import UserWord
+
+        query = Bookmark.query.join(cls)
+        query = query.join(UserWord, Bookmark.user_word_id == UserWord.id)
+        query = query.filter(cls.caption_id.in_(caption_ids))
+        query = query.filter(UserWord.user_id == user_id)
+        query = query.add_columns(cls.caption_id)  # rows: (bookmark, caption_id)
+
+        by_caption = {}
+        for bookmark, cap_id in query.all():
+            if cap_id not in by_caption:
+                by_caption[cap_id] = []
+            by_caption[cap_id].append(bookmark.to_json(True))
+        return by_caption