Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions tools/migrations/26-06-01--add_caption_tokenization_cache.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Cache for Stanza-tokenized caption text so /user_video doesn't re-tokenize
-- every caption on every request (a captioned 16-min video has hundreds of
-- captions, each one a Stanza call -- it was the dominant cost of the
-- ~2-second /user_video response observed on share-to-video traffic).
--
-- Captions are immutable after ingestion, so entries are populated lazily on
-- first read and never invalidated. delete_older_than() exists for housekeeping.

-- One cache row per caption, keyed by the caption's own id.
CREATE TABLE caption_tokenization_cache (
-- Primary key and foreign key at once: at most one cache row per caption.
caption_id INT NOT NULL PRIMARY KEY,
-- Tokenization result for the caption; the application writes it as a
-- JSON string. Nullable, so a row may exist before its text is filled in.
tokenized_text MEDIUMTEXT,
-- Insertion time; this is the column age-based cleanup filters on.
created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
-- Deleting a caption automatically deletes its cache row.
CONSTRAINT fk_caption_tok_cache_caption_id
FOREIGN KEY (caption_id) REFERENCES caption (id) ON DELETE CASCADE
);
1 change: 1 addition & 0 deletions zeeguu/core/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .article_upload import ArticleUpload
from .article_cefr_assessment import ArticleCefrAssessment
from .article_tokenization_cache import ArticleTokenizationCache
from .caption_tokenization_cache import CaptionTokenizationCache
from .text import Text
from .phrase import Phrase
from .user import User
Expand Down
74 changes: 74 additions & 0 deletions zeeguu/core/model/caption_tokenization_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import json
import logging

from sqlalchemy import Column, Integer, UnicodeText, ForeignKey, DateTime
from sqlalchemy.orm import relationship
from sqlalchemy.exc import IntegrityError, OperationalError
from datetime import datetime, timedelta
from zeeguu.core.model.db import db

log = logging.getLogger(__name__)


class CaptionTokenizationCache(db.Model):
    """
    Caches the Stanza-based tokenized text for video captions to avoid
    re-running tokenization on every GET /user_video. Mirrors the
    ArticleTokenizationCache pattern.

    Captions are immutable once a video is ingested, so cache entries do not
    need to be invalidated -- only populated on first read.
    """

    __tablename__ = "caption_tokenization_cache"

    # Primary key and foreign key at once: at most one cache row per caption,
    # removed by the database when the caption itself is deleted.
    caption_id = Column(
        Integer, ForeignKey("caption.id", ondelete="CASCADE"), primary_key=True
    )
    # Tokenization result stored as a JSON string (see get_many).
    tokenized_text = Column(UnicodeText)
    # Set on insert; delete_older_than() filters on it.
    created_at = Column(DateTime, default=datetime.now)

    @classmethod
    def _find(cls, session, caption_id):
        """Return the cache row for ``caption_id``, or None if there is none."""
        return session.query(cls).filter_by(caption_id=caption_id).first()

    @classmethod
    def find_or_create(cls, session, caption_id):
        """
        Return the cache row for ``caption_id``, inserting an empty one if
        it does not exist yet.

        The insert runs inside a SAVEPOINT, so losing an insert race against
        a concurrent request only undoes this one INSERT. A plain
        ``session.rollback()`` would also throw away whatever the caller has
        staged in the same transaction -- e.g. the cache rows filled in for
        earlier captions of the same video, which are committed only once at
        the end of the request.

        The new row is flushed but not committed; committing is left to the
        caller.

        :param session: SQLAlchemy session to work in
        :param caption_id: id of the caption the cache row belongs to
        :return: the cache row, or None if the insert failed and no row
            could be read back afterwards -- callers must handle None
        """
        existing = cls._find(session, caption_id)
        if existing:
            return existing

        cache = cls(caption_id=caption_id)
        try:
            # add() must happen inside the savepoint: a failed nested
            # transaction then also removes the pending object from the
            # session, leaving the outer transaction untouched.
            with session.begin_nested():
                session.add(cache)
                session.flush()
        except IntegrityError:
            # Another request beat us to the insert -- fetch and return that one.
            return cls._find(session, caption_id)
        except OperationalError as e:
            log.warning(
                "[CACHE] OperationalError during cache creation for caption %s: %s",
                caption_id,
                e,
            )
            # After an operational error (deadlock, lock timeout, ...) the
            # transaction state cannot be trusted, so reset the whole session.
            session.rollback()
            return cls._find(session, caption_id)
        return cache

    @classmethod
    def get_many(cls, session, caption_ids):
        """One query for many caption ids. Returns {caption_id: tokenized_text_json_string}.

        Captions without a cache row are simply absent from the result; an
        empty ``caption_ids`` short-circuits to ``{}`` without touching the
        database.
        """
        if not caption_ids:
            return {}
        rows = (
            session.query(cls.caption_id, cls.tokenized_text)
            .filter(cls.caption_id.in_(caption_ids))
            .all()
        )
        return {cid: tok for (cid, tok) in rows}

    @classmethod
    def delete_older_than(cls, session, days=30):
        """
        Delete cache rows created more than ``days`` days ago and commit.

        :param session: SQLAlchemy session to work in
        :param days: age threshold in days (default 30)
        :return: number of rows deleted
        """
        cutoff = datetime.now() - timedelta(days=days)
        deleted = session.query(cls).filter(cls.created_at < cutoff).delete()
        session.commit()
        log.info(
            "[CACHE-CLEANUP] Deleted %s caption-cache entries older than %s days",
            deleted,
            days,
        )
        return deleted
16 changes: 11 additions & 5 deletions zeeguu/core/model/user_video.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,12 +190,18 @@ def user_video_info(
]

if "captions" in returned_info:
# One query for all captions' past bookmarks instead of N --
# this used to be the N+1 next to the Stanza loop.
caption_ids = [
c["context_identifier"]["video_caption_id"]
for c in returned_info["captions"]
]
grouped = VideoCaptionContext.get_user_bookmarks_grouped_by_caption(
user.id, caption_ids
)
for caption in returned_info["captions"]:
caption["past_bookmarks"] = (
VideoCaptionContext.get_all_user_bookmarks_for_caption(
user.id, caption["context_identifier"]["video_caption_id"]
)
)
caption_id = caption["context_identifier"]["video_caption_id"]
caption["past_bookmarks"] = grouped.get(caption_id, [])

if "tokenized_title" in returned_info:
returned_info["tokenized_title"]["past_bookmarks"] = (
Expand Down
57 changes: 43 additions & 14 deletions zeeguu/core/model/video.py
Original file line number Diff line number Diff line change
Expand Up @@ -281,22 +281,51 @@ def video_info(self, with_content=False):
result_dict["published_time"] = datetime_to_json(self.published_time)

if with_content:
import json
from zeeguu.core.mwe import tokenize_for_reading

result_dict["captions"] = [
{
"time_start": caption.time_start / 1000, # convert to seconds
"time_end": caption.time_end / 1000,
"text": caption.get_content(),
"tokenized_text": tokenize_for_reading(
from zeeguu.core.model.caption_tokenization_cache import (
CaptionTokenizationCache,
)
from . import db

# Stanza tokenization is by far the slowest piece of this method --
# for a 16-min auto-captioned video, the uncached path was ~2s
# spread across hundreds of Stanza calls. Batch-load the cache so
# the typical (warm-cache) request does zero tokenization work.
caption_ids = [c.id for c in self.captions]
cached = CaptionTokenizationCache.get_many(db.session, caption_ids)
populated_any = False

captions_out = []
for caption in self.captions:
cache_json = cached.get(caption.id)
if cache_json:
tokenized = json.loads(cache_json)
else:
tokenized = tokenize_for_reading(
caption.get_content(), self.language, mode="stanza"
),
"context_identifier": ContextIdentifier(
ContextType.VIDEO_CAPTION, video_caption_id=caption.id
).as_dictionary(),
}
for caption in self.captions
]
)
row = CaptionTokenizationCache.find_or_create(
db.session, caption.id
)
row.tokenized_text = json.dumps(tokenized)
populated_any = True

captions_out.append(
{
"time_start": caption.time_start / 1000, # convert to seconds
"time_end": caption.time_end / 1000,
"text": caption.get_content(),
"tokenized_text": tokenized,
"context_identifier": ContextIdentifier(
ContextType.VIDEO_CAPTION, video_caption_id=caption.id
).as_dictionary(),
}
)
result_dict["captions"] = captions_out

if populated_any:
db.session.commit()

result_dict["tokenized_title"] = {
"tokens": tokenize_for_reading(self.title, self.language, mode="stanza"),
Expand Down
27 changes: 27 additions & 0 deletions zeeguu/core/model/video_caption_context.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,3 +76,30 @@ def get_all_user_bookmarks_for_caption(
).all()

return [each.to_json(True) if as_json_serializable else each for each in result]

@classmethod
def get_user_bookmarks_grouped_by_caption(cls, user_id: int, caption_ids):
    """Fetch one user's bookmarks for many captions in a single query.

    Batched counterpart of the per-caption lookup, meant for rendering a
    whole transcript without issuing one query per caption.

    Returns a dict mapping caption id to a list of ``bookmark.to_json(True)``
    results. Captions with no matching bookmark have no key at all, so
    callers should read it with ``.get(caption_id, [])``. A falsy
    ``caption_ids`` returns ``{}`` without querying.
    """
    if not caption_ids:
        return {}

    # Imported only once we know a query is needed, as in the original.
    from zeeguu.core.model.user_word import UserWord

    query = Bookmark.query.join(cls)
    query = query.join(UserWord, Bookmark.user_word_id == UserWord.id)
    query = query.filter(cls.caption_id.in_(caption_ids))
    query = query.filter(UserWord.user_id == user_id)
    # Carry the caption id along with each bookmark so rows can be bucketed.
    query = query.add_columns(cls.caption_id)

    bookmarks_by_caption = {}
    for bookmark, owning_caption_id in query.all():
        if owning_caption_id not in bookmarks_by_caption:
            bookmarks_by_caption[owning_caption_id] = []
        bookmarks_by_caption[owning_caption_id].append(bookmark.to_json(True))
    return bookmarks_by_caption
Loading