From abd5eb5bd6212d43fc891b0c52771abdcac4cabb Mon Sep 17 00:00:00 2001 From: Mircea Lungu Date: Mon, 1 Jun 2026 22:33:48 +0200 Subject: [PATCH] perf(video): cache caption tokenization + batch past-bookmark lookup `GET /user_video?video_id=` returned ~1 MB and took ~2s end-to-end on captioned videos with hundreds of segments. Two dominant costs: 1. `Video.video_info(with_content=True)` ran Stanza tokenization on every caption on every request -- no cache. Articles have `article_tokenization_cache`; videos didn't. 2. `UserVideo.user_video_info` then looped over the freshly-tokenized captions and made one DB query per caption to fetch past bookmarks (N+1). This PR adds: - `caption_tokenization_cache` table (caption_id PK, tokenized_text MEDIUMTEXT, created_at) mirroring the article cache. Captions are immutable post-ingestion so entries are populated lazily on first read and never invalidated. Foreign-key cascade on delete. - `CaptionTokenizationCache` model with `find_or_create` (race-safe via flush+IntegrityError-then-fetch, same pattern as the article one), `get_many` for batched reads, and `delete_older_than` for cleanup. - `Video.video_info` now batch-fetches the cache, parses cached JSON when present, runs Stanza only on misses, persists the new rows, and commits once at the end. - `VideoCaptionContext.get_user_bookmarks_grouped_by_caption` -- one IN-clause query that replaces the per-caption helper; returns a {caption_id: [bookmark_json, ...]} dict. - `UserVideo.user_video_info` uses the grouped result instead of the per-caption helper. The single-caption helper is left in place for other callers. Expected effect after first warm cache: /user_video drops from ~2s to roughly the time of one batched SELECT plus the JSON serialisation. The N+1 collapses to one query regardless of caption count. Migration: `tools/migrations/26-06-01--add_caption_tokenization_cache.sql` must be run before/with deploy. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- ...-06-01--add_caption_tokenization_cache.sql | 15 ++++ zeeguu/core/model/__init__.py | 1 + .../core/model/caption_tokenization_cache.py | 74 +++++++++++++++++++ zeeguu/core/model/user_video.py | 16 ++-- zeeguu/core/model/video.py | 57 ++++++++++---- zeeguu/core/model/video_caption_context.py | 27 +++++++ 6 files changed, 171 insertions(+), 19 deletions(-) create mode 100644 tools/migrations/26-06-01--add_caption_tokenization_cache.sql create mode 100644 zeeguu/core/model/caption_tokenization_cache.py diff --git a/tools/migrations/26-06-01--add_caption_tokenization_cache.sql b/tools/migrations/26-06-01--add_caption_tokenization_cache.sql new file mode 100644 index 000000000..00ee33ae3 --- /dev/null +++ b/tools/migrations/26-06-01--add_caption_tokenization_cache.sql @@ -0,0 +1,15 @@ +-- Cache for Stanza-tokenized caption text so /user_video doesn't re-tokenize +-- every caption on every request (a captioned 16-min video has hundreds of +-- captions, each one a Stanza call -- it was the dominant cost of the +-- ~2-second /user_video response observed on share-to-video traffic). +-- +-- Captions are immutable after ingestion, so entries are populated lazily on +-- first read and never invalidated. delete_older_than() exists for housekeeping. 
class CaptionTokenizationCache(db.Model):
    """
    Per-caption cache of tokenized caption text.

    Stores the tokenization of a video caption as a JSON string so that
    GET /user_video does not have to re-run Stanza on every request.
    Mirrors the ArticleTokenizationCache pattern.

    Captions are immutable once a video is ingested, so cache entries do not
    need to be invalidated -- they are only populated on first read.
    """

    __tablename__ = "caption_tokenization_cache"

    # Primary key and foreign key at once: at most one cache row per caption,
    # removed together with the caption (ON DELETE CASCADE).
    caption_id = Column(
        Integer, ForeignKey("caption.id", ondelete="CASCADE"), primary_key=True
    )
    # JSON string of the tokenization; stays NULL until a caller fills it in.
    tokenized_text = Column(UnicodeText)
    # Row creation time; delete_older_than() filters on it.
    created_at = Column(DateTime, default=datetime.now)

    @classmethod
    def find_or_create(cls, session, caption_id):
        """
        Return the cache row for ``caption_id``, inserting an empty one if
        none exists yet.

        The insert runs inside a SAVEPOINT so that losing an insert race to a
        concurrent request only undoes this single INSERT. A plain
        ``session.rollback()`` here would throw away everything the caller
        has pending in the same transaction, e.g. the rows it created for
        earlier captions while filling the cache in a loop before one commit.

        :param session: SQLAlchemy session to work in.
        :param caption_id: id of the caption the row belongs to.
        :return: the existing or newly flushed row. Can be ``None`` when the
            insert failed and no row for ``caption_id`` can be read back, so
            callers should be prepared for that.
        """
        existing = session.query(cls).filter_by(caption_id=caption_id).first()
        if existing is not None:
            return existing

        cache = cls(caption_id=caption_id)
        try:
            # Savepoint scope: on failure only this insert is rolled back and
            # the pending object is removed from the session again.
            with session.begin_nested():
                session.add(cache)
                session.flush()
        except IntegrityError:
            # Another request beat us to the insert -- fetch and return that one.
            return session.query(cls).filter_by(caption_id=caption_id).first()
        except OperationalError as e:
            log.warning(
                "[CACHE] OperationalError during cache creation for caption "
                "%s: %s",
                caption_id,
                e,
            )
            # The state of the transaction is uncertain after an operational
            # error, so reset it completely before retrying the read.
            session.rollback()
            return session.query(cls).filter_by(caption_id=caption_id).first()
        return cache

    @classmethod
    def get_many(cls, session, caption_ids):
        """
        Fetch the cached tokenizations of many captions with one query.

        :param session: SQLAlchemy session to query with.
        :param caption_ids: iterable of caption ids; ``None`` or an empty
            iterable short-circuits without touching the database.
        :return: ``{caption_id: tokenized_text_json_string}`` for the ids that
            have a cache row. The value is ``None`` for a row whose
            ``tokenized_text`` was never filled in.
        """
        # Materialize first: a generator is always truthy, so the emptiness
        # check below would otherwise let an empty one through.
        wanted_ids = list(caption_ids) if caption_ids is not None else []
        if not wanted_ids:
            return {}
        rows = (
            session.query(cls.caption_id, cls.tokenized_text)
            .filter(cls.caption_id.in_(wanted_ids))
            .all()
        )
        return {cid: tok for (cid, tok) in rows}

    @classmethod
    def delete_older_than(cls, session, days=30):
        """
        Delete cache rows created more than ``days`` days ago and commit.

        :param session: SQLAlchemy session to delete and commit with.
        :param days: age threshold in days, compared against ``created_at``.
        :return: the number of rows deleted.
        """
        cutoff = datetime.now() - timedelta(days=days)
        deleted = session.query(cls).filter(cls.created_at < cutoff).delete()
        session.commit()
        log.info(
            "[CACHE-CLEANUP] Deleted %s caption-cache entries older than %s days",
            deleted,
            days,
        )
        return deleted
+ caption_ids = [ + c["context_identifier"]["video_caption_id"] + for c in returned_info["captions"] + ] + grouped = VideoCaptionContext.get_user_bookmarks_grouped_by_caption( + user.id, caption_ids + ) for caption in returned_info["captions"]: - caption["past_bookmarks"] = ( - VideoCaptionContext.get_all_user_bookmarks_for_caption( - user.id, caption["context_identifier"]["video_caption_id"] - ) - ) + caption_id = caption["context_identifier"]["video_caption_id"] + caption["past_bookmarks"] = grouped.get(caption_id, []) if "tokenized_title" in returned_info: returned_info["tokenized_title"]["past_bookmarks"] = ( diff --git a/zeeguu/core/model/video.py b/zeeguu/core/model/video.py index fc6f73669..51339e6d2 100644 --- a/zeeguu/core/model/video.py +++ b/zeeguu/core/model/video.py @@ -281,22 +281,51 @@ def video_info(self, with_content=False): result_dict["published_time"] = datetime_to_json(self.published_time) if with_content: + import json from zeeguu.core.mwe import tokenize_for_reading - - result_dict["captions"] = [ - { - "time_start": caption.time_start / 1000, # convert to seconds - "time_end": caption.time_end / 1000, - "text": caption.get_content(), - "tokenized_text": tokenize_for_reading( + from zeeguu.core.model.caption_tokenization_cache import ( + CaptionTokenizationCache, + ) + from . import db + + # Stanza tokenization is by far the slowest piece of this method -- + # for a 16-min auto-captioned video, the uncached path was ~2s + # spread across hundreds of Stanza calls. Batch-load the cache so + # the typical (warm-cache) request does zero tokenization work. 
@classmethod
def get_user_bookmarks_grouped_by_caption(cls, user_id: int, caption_ids):
    """Fetch one user's bookmarks for many captions in a single query.

    Batched counterpart of get_all_user_bookmarks_for_caption, meant for
    rendering the whole transcript of a video without issuing one query
    per caption.

    Returns a dict {caption_id: [bookmark_json, ...]}. Captions without
    any bookmark for the user have no key in the result, and an empty or
    missing ``caption_ids`` yields an empty dict without querying.
    """
    if not caption_ids:
        return {}

    from zeeguu.core.model.user_word import UserWord

    # Build the query step by step: bookmarks joined to their caption
    # context and to the owning user word, restricted to the requested
    # captions and user, with the caption id carried along per row.
    query = Bookmark.query.join(cls)
    query = query.join(UserWord, Bookmark.user_word_id == UserWord.id)
    query = query.filter(cls.caption_id.in_(caption_ids))
    query = query.filter(UserWord.user_id == user_id)
    query = query.add_columns(cls.caption_id)

    bookmarks_by_caption = {}
    for found_bookmark, owning_caption_id in query.all():
        if owning_caption_id not in bookmarks_by_caption:
            bookmarks_by_caption[owning_caption_id] = []
        bookmarks_by_caption[owning_caption_id].append(found_bookmark.to_json(True))
    return bookmarks_by_caption