From 51b0bdfa4f1fc1fb99371fa6ff32549310aeb0d0 Mon Sep 17 00:00:00 2001 From: Eric Litman Date: Mon, 4 May 2026 23:39:44 +0300 Subject: [PATCH 1/5] feat: capture bookmark reply threads --- src/bookmark-media.ts | 24 +- src/bookmarks-db.ts | 122 ++++++- src/cli.ts | 87 ++++- src/graphql-bookmarks.ts | 607 +++++++++++++++++++++++++++++++- src/types.ts | 39 ++ tests/bookmark-media.test.ts | 79 +++++ tests/graphql-bookmarks.test.ts | 300 ++++++++++++++++ 7 files changed, 1242 insertions(+), 16 deletions(-) diff --git a/src/bookmark-media.ts b/src/bookmark-media.ts index 3a052ec..1c9f6a1 100644 --- a/src/bookmark-media.ts +++ b/src/bookmark-media.ts @@ -3,7 +3,7 @@ import { createHash } from 'node:crypto'; import { writeFile } from 'node:fs/promises'; import { ensureDir, pathExists, readJson, readJsonLines, writeJson } from './fs.js'; import { bookmarkMediaDir, bookmarkMediaManifestPath, twitterBookmarksCachePath } from './paths.js'; -import type { BookmarkRecord } from './types.js'; +import type { BookmarkMediaObject, BookmarkRecord, ThreadTweetSnapshot } from './types.js'; export const DEFAULT_MEDIA_MAX_BYTES = 200 * 1024 * 1024; @@ -60,7 +60,7 @@ interface MediaTargetSource { authorName?: string; authorProfileImageUrl?: string; media?: string[]; - mediaObjects?: BookmarkRecord['mediaObjects']; + mediaObjects?: BookmarkMediaObject[]; } interface CachedMediaResult { @@ -105,7 +105,10 @@ function hasTargets(source: { media?: unknown[]; mediaObjects?: unknown[]; autho } function hasMediaCandidate(bookmark: BookmarkRecord): boolean { - return hasTargets(bookmark) || hasTargets(bookmark.quotedTweet); + return hasTargets(bookmark) + || hasTargets(bookmark.quotedTweet) + || (bookmark.threadContext ?? []).some(hasTargets) + || (bookmark.threadBelow ?? []).some(hasTargets); } function pushTarget( @@ -199,6 +202,21 @@ function resolveMediaTargets( }, downloadedProfileImageUrls, skipProfileImages); } + const appendThreadTweet = (tweet: ThreadTweetSnapshot): void => { + appendMediaTargets(targets, seenKeys, bookmark.id, { + tweetId: tweet.id, + tweetUrl: tweet.url, + authorHandle: tweet.authorHandle, + authorName: tweet.authorName, + authorProfileImageUrl: tweet.authorProfileImageUrl, + media: tweet.media, + mediaObjects: tweet.mediaObjects, + }, downloadedProfileImageUrls, skipProfileImages); + }; + + for (const tweet of bookmark.threadContext ?? []) appendThreadTweet(tweet); + for (const tweet of bookmark.threadBelow ?? []) appendThreadTweet(tweet); + return targets; } diff --git a/src/bookmarks-db.ts b/src/bookmarks-db.ts index cfc336b..910bcb0 100644 --- a/src/bookmarks-db.ts +++ b/src/bookmarks-db.ts @@ -3,11 +3,11 @@ import { openDb, saveDb } from './db.js'; import { parseTimestampMs, toIsoDate } from './date-utils.js'; import { readJsonLines } from './fs.js'; import { twitterBookmarksCachePath, twitterBookmarksIndexPath } from './paths.js'; -import type { BookmarkRecord, QuotedTweetSnapshot } from './types.js'; +import type { BookmarkRecord, QuotedTweetSnapshot, ThreadTweetSnapshot } from './types.js'; import { classifyCorpus, formatClassificationSummary } from './bookmark-classify.js'; import type { ClassificationSummary } from './bookmark-classify.js'; -const SCHEMA_VERSION = 6; +const SCHEMA_VERSION = 7; export interface SearchResult { id: string; @@ -51,6 +51,10 @@ export interface BookmarkTimelineItem { enrichedAt?: string | null; quotedStatusId?: string | null; quotedTweet?: QuotedTweetSnapshot | null; + threadContext: ThreadTweetSnapshot[]; + threadBelow: ThreadTweetSnapshot[]; + threadExpandedAt?: string | null; + threadExpansionFailedAt?: string | null; mediaCount: number; linkCount: number; likeCount?: number | null; @@ -104,6 +108,21 @@ function parseQuotedTweet(value: unknown): QuotedTweetSnapshot | null { } } +function parseThreadTweets(value: unknown): ThreadTweetSnapshot[] { + if (typeof value !== 'string' || !value.trim()) return []; + try { + const parsed = JSON.parse(value) as Array>; + if (!Array.isArray(parsed)) return []; + return parsed.filter((entry): entry is ThreadTweetSnapshot => + typeof entry?.id === 'string' && + typeof entry?.text === 'string' && + typeof entry?.url === 'string' + ); + } catch { + return []; + } +} + function parseCsv(value: unknown): string[] { if (typeof value !== 'string' || !value.trim()) return []; return value @@ -171,6 +190,10 @@ function mapTimelineRow(row: unknown[]): BookmarkTimelineItem { enrichedAt: (row[29] as string) ?? null, quotedStatusId: (row[30] as string) ?? null, quotedTweet: parseQuotedTweet(row[31]), + threadContext: parseThreadTweets(row[32]), + threadBelow: parseThreadTweets(row[33]), + threadExpandedAt: (row[34] as string) ?? null, + threadExpansionFailedAt: (row[35] as string) ?? null, }; } @@ -271,7 +294,11 @@ function initSchema(db: Database): void { article_site TEXT, enriched_at TEXT, folder_ids TEXT, - folder_names TEXT + folder_names TEXT, + thread_context_json TEXT, + thread_below_json TEXT, + thread_expanded_at TEXT, + thread_expansion_failed_at TEXT )`); db.run(`CREATE INDEX IF NOT EXISTS idx_bookmarks_author ON bookmarks(author_handle)`); @@ -345,6 +372,10 @@ function ensureMigrations(db: Database): void { ensureColumn(db, 'bookmarks', 'folder_ids', 'TEXT'); ensureColumn(db, 'bookmarks', 'folder_names', 'TEXT'); + ensureColumn(db, 'bookmarks', 'thread_context_json', 'TEXT'); + ensureColumn(db, 'bookmarks', 'thread_below_json', 'TEXT'); + ensureColumn(db, 'bookmarks', 'thread_expanded_at', 'TEXT'); + ensureColumn(db, 'bookmarks', 'thread_expansion_failed_at', 'TEXT'); // FTS rebuild: only if the FTS table is missing the article_text column. // Check via a zero-row SELECT so we don't rebuild unnecessarily. @@ -375,6 +406,10 @@ interface PreservedBookmarkFields { enrichedAt: string | null; folderIds: string | null; folderNames: string | null; + threadContextJson: string | null; + threadBelowJson: string | null; + threadExpandedAt: string | null; + threadExpansionFailedAt: string | null; } function serializeJsonArray(values: string[] | undefined | null): string | null { @@ -382,6 +417,11 @@ function serializeJsonArray(values: string[] | undefined | null): string | null return JSON.stringify(values); } +function serializeThreadTweets(values: ThreadTweetSnapshot[] | undefined): string | null { + if (values === undefined) return null; + return JSON.stringify(values); +} + function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBookmarkFields): void { // Extract GitHub URLs (kept inline — no LLM needed for URL parsing) const text = r.text ?? ''; @@ -390,7 +430,7 @@ function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBook const githubUrls = [...new Set([...githubMatches.map((m) => `https://${m}`), ...githubFromLinks])]; db.run( - `INSERT OR REPLACE INTO bookmarks VALUES (${Array(37).fill('?').join(',')})`, + `INSERT OR REPLACE INTO bookmarks VALUES (${Array(41).fill('?').join(',')})`, [ r.id, r.tweetId, @@ -429,6 +469,10 @@ function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBook preserved?.enrichedAt ?? null, serializeJsonArray(r.folderIds) ?? preserved?.folderIds ?? null, serializeJsonArray(r.folderNames) ?? preserved?.folderNames ?? null, + serializeThreadTweets(r.threadContext) ?? preserved?.threadContextJson ?? null, + serializeThreadTweets(r.threadBelow) ?? preserved?.threadBelowJson ?? null, + r.threadExpandedAt ?? preserved?.threadExpandedAt ?? null, + r.threadExpansionFailedAt ?? preserved?.threadExpansionFailedAt ?? null, ] ); } @@ -459,7 +503,8 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat const rows = db.exec( `SELECT id, categories, primary_category, github_urls, domains, primary_domain, quoted_tweet_json, article_title, article_text, article_site, enriched_at, - folder_ids, folder_names + folder_ids, folder_names, thread_context_json, thread_below_json, + thread_expanded_at, thread_expansion_failed_at FROM bookmarks` ); for (const r of (rows[0]?.values ?? [])) { @@ -476,6 +521,10 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat enrichedAt: (r[10] as string) ?? null, folderIds: (r[11] as string) ?? null, folderNames: (r[12] as string) ?? null, + threadContextJson: (r[13] as string) ?? null, + threadBelowJson: (r[14] as string) ?? null, + threadExpandedAt: (r[15] as string) ?? null, + threadExpansionFailedAt: (r[16] as string) ?? null, }); } } catch { /* table may be empty */ } @@ -664,7 +713,11 @@ export async function listBookmarks( b.synced_at, b.enriched_at, b.quoted_status_id, - b.quoted_tweet_json + b.quoted_tweet_json, + b.thread_context_json, + b.thread_below_json, + b.thread_expanded_at, + b.thread_expansion_failed_at FROM bookmarks b ${where} ${bookmarkSortClause(filters.sort)} @@ -732,7 +785,11 @@ export async function exportBookmarksForSyncSeed(): Promise { b.view_count, b.links_json, b.folder_ids, - b.folder_names + b.folder_names, + b.thread_context_json, + b.thread_below_json, + b.thread_expanded_at, + b.thread_expansion_failed_at FROM bookmarks b ${bookmarkSortClause('desc')} `; @@ -765,6 +822,10 @@ export async function exportBookmarksForSyncSeed(): Promise { links: parseJsonArray(row[20]), folderIds: parseJsonArray(row[21]), folderNames: parseJsonArray(row[22]), + threadContext: parseThreadTweets(row[23]), + threadBelow: parseThreadTweets(row[24]), + threadExpandedAt: (row[25] as string) ?? undefined, + threadExpansionFailedAt: (row[26] as string) ?? undefined, tags: [], ingestedVia: 'graphql', })); @@ -812,7 +873,11 @@ export async function getBookmarkById(id: string): Promise { + if (!records.length) return; + const dbPath = twitterBookmarksIndexPath(); + const db = await openDb(dbPath); + ensureMigrations(db); + + try { + const tableExists = db.exec("SELECT name FROM sqlite_master WHERE type='table' AND name='bookmarks'"); + if (tableExists.length === 0 || tableExists[0].values.length === 0) return; + const stmt = db.prepare( + `UPDATE bookmarks + SET thread_context_json = ?, + thread_below_json = ?, + thread_expanded_at = ?, + thread_expansion_failed_at = ? + WHERE id = ?` + ); + for (const record of records) { + stmt.run([ + record.threadContext === undefined ? null : JSON.stringify(record.threadContext), + record.threadBelow === undefined ? null : JSON.stringify(record.threadBelow), + record.threadExpandedAt ?? null, + record.threadExpansionFailedAt ?? null, + record.id, + ]); + } + stmt.free(); + saveDb(db, dbPath); + } finally { + db.close(); + } +} + export function formatSearchResults(results: SearchResult[]): string { if (results.length === 0) return 'No results found.'; diff --git a/src/cli.ts b/src/cli.ts index b32621c..92a3396 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -3,9 +3,9 @@ import { Command, Option } from 'commander'; import { syncTwitterBookmarks } from './bookmarks.js'; import { getBookmarkStatusView, formatBookmarkStatus } from './bookmarks-service.js'; import { runTwitterOAuthFlow } from './xauth.js'; -import { syncBookmarksGraphQL, syncGaps, syncBookmarkFolders } from './graphql-bookmarks.js'; -import type { SyncProgress, GapFillProgress, FolderSyncProgress } from './graphql-bookmarks.js'; -import type { BookmarkFolder, QuotedTweetSnapshot } from './types.js'; +import { syncBookmarksGraphQL, syncGaps, syncBookmarkFolders, syncThreads } from './graphql-bookmarks.js'; +import type { SyncProgress, GapFillProgress, FolderSyncProgress, ThreadSyncProgress } from './graphql-bookmarks.js'; +import type { BookmarkFolder, QuotedTweetSnapshot, ThreadTweetSnapshot } from './types.js'; import { DEFAULT_MEDIA_MAX_BYTES, fetchBookmarkMediaBatch } from './bookmark-media.js'; import type { MediaFetchManifest, MediaFetchProgress } from './bookmark-media.js'; import { @@ -567,6 +567,26 @@ function formatQuotedTweetLines(quoted: QuotedTweetSnapshot): string[] { ]; } +function formatThreadTweetLines(tweet: ThreadTweetSnapshot): string[] { + const author = tweet.authorHandle ? `@${tweet.authorHandle}` : (tweet.authorName ?? 'thread tweet'); + const date = tweet.postedAt ? ` · ${tweet.postedAt.slice(0, 10)}` : ''; + const text = tweet.text.split(/\r?\n/).map((line) => ` | ${sanitizeForDisplay(line)}`); + return [ + ` | ${sanitizeForDisplay(author)}${date}`, + ...text, + ` | ${tweet.url}`, + ]; +} + +function formatThreadSectionLines(title: string, tweets: ThreadTweetSnapshot[]): string[] { + if (tweets.length === 0) return []; + const lines = ['', title]; + for (const tweet of tweets) { + lines.push(...formatThreadTweetLines(tweet), ''); + } + return lines; +} + export function formatFolderMirrorStats(stats: { added: number; tagged: number; untagged: number; unchanged: number }): string { const parts: string[] = []; if (stats.added > 0) parts.push(`${stats.added} new`); @@ -751,6 +771,7 @@ export function buildCli() { .option('--firefox-profile-dir ', 'Firefox profile directory') .option('--folders', 'Also sync bookmark folder tags (mirrors X\u2019s current folder state)', false) .option('--folder ', 'Sync only this folder (case-insensitive, supports unambiguous prefix)') + .option('--threads', 'Capture parent context and same-author thread continuations', false) .addOption(engineOption()) .action(async (options) => { const firstRun = isFirstRun(); @@ -763,12 +784,18 @@ export function buildCli() { await resolveEngine({ override: engineOverride }); } + const syncThreadsEnabled = Boolean(options.threads); const mutuallyExclusive = [options.rebuild, options.continue, options.gaps].filter(Boolean).length; if (mutuallyExclusive > 1) { console.error(' Error: --rebuild, --continue, and --gaps cannot be used together.'); process.exitCode = 1; return; } + if (syncThreadsEnabled && options.gaps) { + console.error(' Error: --threads cannot be combined with --gaps yet. Run them separately.'); + process.exitCode = 1; + return; + } // Folder flags: --folders (all) and --folder (one) are mutually exclusive. const folderAll = Boolean(options.folders); @@ -784,6 +811,11 @@ export function buildCli() { process.exitCode = 1; return; } + if (syncThreadsEnabled && options.api) { + console.error(' Error: Thread sync requires browser session (GraphQL). Remove --api.'); + process.exitCode = 1; + return; + } if (folderMode !== 'off' && options.gaps) { console.error(' Error: --folders/--folder cannot be combined with --gaps. Run them separately.'); process.exitCode = 1; @@ -801,6 +833,47 @@ export function buildCli() { console.log(''); }; + const runThreadSync = async (cookieArgs: { csrfToken?: string; cookieHeader?: string }): Promise => { + if (!syncThreadsEnabled) return; + const startTime = Date.now(); + process.stderr.write(`\n Expanding reply threads...\n`); + let lastProgress: ThreadSyncProgress = { done: 0, total: 0, contextFilled: 0, belowFilled: 0, emptyChecked: 0, failed: 0 }; + const spinner = createSpinner(() => { + const p = lastProgress; + const pct = p.total > 0 ? Math.round((p.done / p.total) * 100) : 0; + const elapsed = Math.round((Date.now() - startTime) / 1000); + const parts = [`${p.done}/${p.total} (${pct}%)`]; + if (p.contextFilled) parts.push(`${p.contextFilled} context`); + if (p.belowFilled) parts.push(`${p.belowFilled} continuations`); + if (p.emptyChecked) parts.push(`${p.emptyChecked} empty`); + if (p.failed) parts.push(`${p.failed} failed`); + parts.push(`${elapsed}s`); + return parts.join(' \u2502 '); + }); + const result = await runWithSpinner(spinner, () => syncThreads({ + delayMs: Number(options.delayMs) || 300, + browser: options.browser ? String(options.browser) : undefined, + chromeUserDataDir: options.chromeUserDataDir ? String(options.chromeUserDataDir) : undefined, + chromeProfileDirectory: options.chromeProfileDirectory ? String(options.chromeProfileDirectory) : undefined, + firefoxProfileDir: options.firefoxProfileDir ? String(options.firefoxProfileDir) : undefined, + csrfToken: cookieArgs.csrfToken, + cookieHeader: cookieArgs.cookieHeader, + onProgress: (progress: ThreadSyncProgress) => { + lastProgress = progress; + spinner.update(); + }, + })); + if (result.total === 0) { + console.log(' No thread gaps found.'); + } else { + if (result.contextFilled > 0) console.log(` \u2713 ${result.contextFilled} bookmarks got parent context`); + if (result.belowFilled > 0) console.log(` \u2713 ${result.belowFilled} bookmarks got same-author continuations`); + if (result.emptyChecked > 0) console.log(` \u2713 ${result.emptyChecked} bookmarks checked with no thread continuation`); + if (result.failed > 0) console.log(` ${result.failed} thread expansions failed`); + } + console.log(''); + }; + // ── gaps mode: backfill missing data for existing bookmarks ── if (options.gaps) { const startTime = Date.now(); @@ -1032,6 +1105,8 @@ export function buildCli() { } } + await runThreadSync({ csrfToken, cookieHeader }); + await postSyncMediaFetch(); const newCount = await rebuildIndex(); @@ -1193,6 +1268,12 @@ export function buildCli() { console.log(`${item.id} \u00b7 ${item.authorHandle ? `@${item.authorHandle}` : '@?'}`); console.log(item.url); console.log(item.text); + if (item.threadContext.length) { + console.log(formatThreadSectionLines('thread context', item.threadContext).join('\n')); + } + if (item.threadBelow.length) { + console.log(formatThreadSectionLines('thread continuation', item.threadBelow).join('\n')); + } if (item.quotedTweet) { console.log(formatQuotedTweetLines(item.quotedTweet).join('\n')); } diff --git a/src/graphql-bookmarks.ts b/src/graphql-bookmarks.ts index 86400bc..4f521c7 100644 --- a/src/graphql-bookmarks.ts +++ b/src/graphql-bookmarks.ts @@ -4,8 +4,8 @@ import { loadChromeSessionConfig } from './config.js'; import { extractChromeXCookies } from './chrome-cookies.js'; import { extractFirefoxXCookies } from './firefox-cookies.js'; import { parseTimestampMs } from './date-utils.js'; -import type { BookmarkBackfillState, BookmarkCacheMeta, BookmarkFolder, BookmarkRecord, QuotedTweetSnapshot } from './types.js'; -import { exportBookmarksForSyncSeed, updateQuotedTweets, updateBookmarkText, updateArticleContent } from './bookmarks-db.js'; +import type { BookmarkBackfillState, BookmarkCacheMeta, BookmarkFolder, BookmarkRecord, QuotedTweetSnapshot, ThreadTweetSnapshot } from './types.js'; +import { exportBookmarksForSyncSeed, updateQuotedTweets, updateBookmarkText, updateArticleContent, updateThreadData } from './bookmarks-db.js'; import type { ArticleUpdate } from './bookmarks-db.js'; import { fetchArticle, resolveTcoLink } from './bookmark-enrich.js'; import type { ArticleContent } from './bookmark-enrich.js'; @@ -26,6 +26,14 @@ const BOOKMARKS_OPERATION = 'Bookmarks'; const TWEET_RESULT_BY_REST_ID_QUERY_ID = 'fHLDP3qFEjnTqhWBVvsREg'; const TWEET_RESULT_BY_REST_ID_OPERATION = 'TweetResultByRestId'; +// TweetDetail — used by `--threads` to fetch same-author continuations below a +// bookmarked tweet. Query ids rotate; refresh by searching for +// `operationName:"TweetDetail"` inside the current +// `abs.twimg.com/responsive-web/client-web/main..js` bundle or by +// capturing a live `/i/api/graphql//TweetDetail` request from x.com. +const TWEET_DETAIL_QUERY_ID = '-0WTL1e9Pij-JWAF5ztCCA'; +const TWEET_DETAIL_OPERATION = 'TweetDetail'; + // ────────────────────────────────────────────────────────────────────────── // Folder endpoints — READ ONLY. We never POST/PUT/DELETE to X. // The folder feature makes exactly these two GraphQL GET calls, nothing else. @@ -522,6 +530,18 @@ export function mergeBookmarkRecord(existing: BookmarkRecord | undefined, incomi if (existing.enrichedAt && !incoming.enrichedAt) { merged.enrichedAt = existing.enrichedAt; } + if (existing.threadContext && incoming.threadContext === undefined) { + merged.threadContext = existing.threadContext; + } + if (existing.threadBelow && incoming.threadBelow === undefined) { + merged.threadBelow = existing.threadBelow; + } + if (existing.threadExpandedAt && !incoming.threadExpandedAt) { + merged.threadExpandedAt = existing.threadExpandedAt; + } + if (existing.threadExpansionFailedAt && !incoming.threadExpansionFailedAt) { + merged.threadExpansionFailedAt = existing.threadExpansionFailedAt; + } if ((existing.mediaObjects?.length ?? 0) > 0 && (incoming.mediaObjects?.length ?? 0) === 0) { merged.mediaObjects = existing.mediaObjects; } @@ -1426,6 +1446,41 @@ const TWEET_RESULT_FIELD_TOGGLES = { withAuxiliaryUserLabels: false, }; +const TWEET_DETAIL_FEATURES = { + rweb_video_screen_enabled: false, + payments_enabled: false, + profile_label_improvements_pcf_label_in_post_enabled: true, + rweb_tipjar_consumption_enabled: true, + verified_phone_label_enabled: false, + creator_subscriptions_tweet_preview_api_enabled: true, + responsive_web_graphql_timeline_navigation_enabled: true, + responsive_web_graphql_skip_user_profile_image_extensions_enabled: false, + premium_content_api_read_enabled: false, + communities_web_enable_tweet_community_results_fetch: true, + c9s_tweet_anatomy_moderator_badge_enabled: true, + responsive_web_grok_analyze_button_fetch_trends_enabled: false, + responsive_web_grok_analyze_post_followups_enabled: true, + responsive_web_jetfuel_frame: false, + responsive_web_grok_share_attachment_enabled: true, + articles_preview_enabled: true, + responsive_web_edit_tweet_api_enabled: true, + graphql_is_translatable_rweb_tweet_is_translatable_enabled: true, + view_counts_everywhere_api_enabled: true, + longform_notetweets_consumption_enabled: true, + responsive_web_twitter_article_tweet_consumption_enabled: true, + tweet_awards_web_tipping_enabled: false, + responsive_web_grok_show_grok_translated_post: false, + responsive_web_grok_analysis_button_from_backend: false, + creator_subscriptions_quote_tweet_preview_enabled: false, + freedom_of_speech_not_reach_fetch_enabled: true, + standardized_nudges_misinfo: true, + tweet_with_visibility_results_prefer_gql_limited_actions_policy_enabled: true, + longform_notetweets_rich_text_read_enabled: true, + longform_notetweets_inline_media_enabled: true, + responsive_web_grok_image_annotation_enabled: true, + responsive_web_enhance_cards_enabled: false, +}; + export type TweetFetchSource = 'graphql' | 'syndication'; export interface TweetFetchResult { @@ -1480,6 +1535,59 @@ export function parseTweetResultByRestId(json: any, tweetId: string): QuotedTwee }; } +function parseThreadTweetResult( + value: any, + fallbackId?: string, + metadata: Partial = {}, +): ThreadTweetSnapshot | null { + const tweet = value?.tweet ?? value; + const legacy = tweet?.legacy; + if (!legacy) return null; + + const noteText = tweet?.note_tweet?.note_tweet_results?.result?.text; + const text = noteText ?? legacy.full_text ?? legacy.text ?? ''; + const resolvedId = String(legacy.id_str ?? tweet?.rest_id ?? fallbackId ?? ''); + if (!resolvedId || !text) return null; + + const userResult = tweet?.core?.user_results?.result; + const handle = userResult?.core?.screen_name ?? userResult?.legacy?.screen_name; + const mediaEntities: any[] = legacy?.extended_entities?.media ?? legacy?.entities?.media ?? []; + + return { + id: resolvedId, + text, + authorHandle: handle, + authorName: userResult?.core?.name ?? userResult?.legacy?.name, + authorProfileImageUrl: + userResult?.avatar?.image_url ?? userResult?.legacy?.profile_image_url_https, + postedAt: legacy.created_at ?? null, + media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), + mediaObjects: mediaEntities.map((m: any) => ({ + type: m.type, + url: m.media_url_https ?? m.media_url, + expandedUrl: m.expanded_url, + width: m.original_info?.width, + height: m.original_info?.height, + altText: m.ext_alt_text, + videoVariants: Array.isArray(m.video_info?.variants) + ? m.video_info.variants + .filter((v: any) => v.content_type === 'video/mp4') + .map((v: any) => ({ bitrate: v.bitrate, url: v.url })) + : undefined, + })), + conversationId: legacy.conversation_id_str, + inReplyToStatusId: legacy.in_reply_to_status_id_str, + ...metadata, + url: `https://x.com/${handle ?? '_'}/status/${resolvedId}`, + }; +} + +export function parseThreadTweetResultByRestId(json: any, tweetId: string): ThreadTweetSnapshot | null { + const result = json?.data?.tweetResult?.result; + if (!result) return null; + return parseThreadTweetResult(result, tweetId); +} + function unwrapGraphqlResult(value: any): any { return value?.result?.tweet ?? value?.result ?? value?.tweet ?? value; } @@ -1581,6 +1689,127 @@ function buildTweetResultByRestIdUrl(tweetId: string): string { return `https://x.com/i/api/graphql/${TWEET_RESULT_BY_REST_ID_QUERY_ID}/${TWEET_RESULT_BY_REST_ID_OPERATION}?${params}`; } +function buildTweetDetailUrl(tweetId: string, cursor?: string): string { + const variables: Record = { + focalTweetId: tweetId, + with_rux_injections: false, + includePromotedContent: false, + withCommunity: true, + withQuickPromoteEligibilityTweetFields: true, + withBirdwatchNotes: true, + withVoice: true, + withV2Timeline: true, + rankingMode: 'Relevance', + count: 40, + }; + if (cursor) variables.cursor = cursor; + const params = new URLSearchParams({ + variables: JSON.stringify(variables), + features: JSON.stringify(TWEET_DETAIL_FEATURES), + }); + return `https://x.com/i/api/graphql/${TWEET_DETAIL_QUERY_ID}/${TWEET_DETAIL_OPERATION}?${params}`; +} + +function compareTweetIdsChronologically(a: ThreadTweetSnapshot, b: ThreadTweetSnapshot): number { + const aId = parseSnowflake(a.id); + const bId = parseSnowflake(b.id); + if (aId != null && bId != null && aId !== bId) return aId < bId ? -1 : 1; + const aTime = parseTimestampMs(a.postedAt); + const bTime = parseTimestampMs(b.postedAt); + if (aTime != null && bTime != null && aTime !== bTime) return aTime - bTime; + return a.id.localeCompare(b.id); +} + +function sameHandle(a?: string, b?: string): boolean { + if (!a || !b) return false; + return a.toLowerCase() === b.toLowerCase(); +} + +function conversationSection(content: any): string | undefined { + return content?.clientEventInfo?.details?.conversationDetails?.conversationSection; +} + +function collectThreadEntries(entries: any[], out: ThreadTweetSnapshot[]): string | undefined { + let nextCursor: string | undefined; + for (const entry of entries) { + if (entry?.entryId?.startsWith('cursor-bottom')) { + nextCursor = entry?.content?.value; + continue; + } + + const direct = entry?.content?.itemContent?.tweet_results?.result; + const directSnapshot = direct ? parseThreadTweetResult(direct) : null; + if (directSnapshot) out.push(directSnapshot); + + const moduleItems = entry?.content?.items; + if (Array.isArray(moduleItems)) { + const moduleSnapshots: ThreadTweetSnapshot[] = []; + for (let index = 0; index < moduleItems.length; index++) { + const item = moduleItems[index]; + const result = item?.item?.itemContent?.tweet_results?.result; + const snapshot = result ? parseThreadTweetResult(result, undefined, { + conversationEntryId: entry.entryId, + conversationDisplayType: entry?.content?.displayType, + conversationSection: conversationSection(entry.content), + conversationItemIndex: index, + }) : null; + if (snapshot) moduleSnapshots.push(snapshot); + } + const rootId = moduleSnapshots[0]?.id; + for (const snapshot of moduleSnapshots) { + if (rootId) snapshot.conversationRootId = rootId; + out.push(snapshot); + } + } + } + return nextCursor; +} + +export function parseTweetDetailResponse(json: any): { tweets: ThreadTweetSnapshot[]; nextCursor?: string } { + const instructions = json?.data?.threaded_conversation_with_injections_v2?.instructions ?? []; + const tweets: ThreadTweetSnapshot[] = []; + let nextCursor: string | undefined; + + for (const instruction of instructions) { + if (instruction?.type === 'TimelineAddEntries' && Array.isArray(instruction.entries)) { + nextCursor = collectThreadEntries(instruction.entries, tweets) ?? nextCursor; + } + if (instruction?.type === 'TimelinePinEntry' && instruction.entry) { + collectThreadEntries([instruction.entry], tweets); + } + } + + const byId = new Map(); + for (const tweet of tweets) { + if (!byId.has(tweet.id)) byId.set(tweet.id, tweet); + } + return { tweets: Array.from(byId.values()).sort(compareTweetIdsChronologically), nextCursor }; +} + +export function extractSameAuthorThreadBelow( + tweets: ThreadTweetSnapshot[], + focalTweetId: string, + focalAuthorHandle?: string, +): ThreadTweetSnapshot[] { + const focal = tweets.find((tweet) => tweet.id === focalTweetId); + const authorHandle = focalAuthorHandle ?? focal?.authorHandle; + if (!authorHandle) return []; + + const chainIds = new Set([focalTweetId]); + const below: ThreadTweetSnapshot[] = []; + const sorted = tweets + .filter((tweet) => tweet.id !== focalTweetId && sameHandle(tweet.authorHandle, authorHandle)) + .sort(compareTweetIdsChronologically); + + for (const tweet of sorted) { + if (!tweet.inReplyToStatusId || !chainIds.has(tweet.inReplyToStatusId)) continue; + below.push({ ...tweet, threadRole: 'post-thread' }); + chainIds.add(tweet.id); + } + + return below; +} + export async function fetchTweetByIdViaGraphQL( tweetId: string, csrfToken: string, @@ -1637,6 +1866,49 @@ export async function fetchTweetByIdViaGraphQL( return { snapshot: null, status: 'rate_limited', source: 'graphql' }; } +export async function fetchTweetDetailViaGraphQL( + tweetId: string, + csrfToken: string, + cookieHeader?: string, + options: { maxPages?: number; delayMs?: number } = {}, +): Promise<{ tweets: ThreadTweetSnapshot[]; status: TweetFetchResult['status']; httpStatus?: number }> { + const maxPages = options.maxPages ?? 3; + const delayMs = options.delayMs ?? 300; + const tweets: ThreadTweetSnapshot[] = []; + let cursor: string | undefined; + + for (let page = 0; page < maxPages; page++) { + let response: Response; + try { + response = await fetch(buildTweetDetailUrl(tweetId, cursor), { + headers: buildHeaders(csrfToken, cookieHeader), + }); + } catch { + return { tweets, status: 'error' }; + } + + if (response.status === 429) return { tweets, status: 'rate_limited', httpStatus: 429 }; + if (response.status === 404) return { tweets, status: 'not_found', httpStatus: 404 }; + if (response.status === 401 || response.status === 403) return { tweets, status: 'error', httpStatus: response.status }; + if (response.status >= 500) return { tweets, status: 'server_error', httpStatus: response.status }; + if (!response.ok) return { tweets, status: 'error', httpStatus: response.status }; + + const json = await response.json(); + const parsed = parseTweetDetailResponse(json); + tweets.push(...parsed.tweets); + if (!parsed.nextCursor || parsed.nextCursor === cursor) break; + cursor = parsed.nextCursor; + if (page < maxPages - 1) await new Promise((r) => setTimeout(r, delayMs)); + } + + const byId = new Map(); + for (const tweet of tweets) { + if (!byId.has(tweet.id)) byId.set(tweet.id, tweet); + } + if (byId.size === 0) return { tweets: [], status: 'error' }; + return { tweets: Array.from(byId.values()).sort(compareTweetIdsChronologically), status: 'ok' }; +} + async function fetchTweetViaSyndication(tweetId: string): Promise { for (let attempt = 0; attempt < 4; attempt++) { const response = await fetch(`${SYNDICATION_URL}?id=${tweetId}&token=x`, { @@ -1687,6 +1959,55 @@ async function fetchTweetViaSyndication(tweetId: string): Promise { + for (let attempt = 0; attempt < 4; attempt++) { + const response = await fetch(`${SYNDICATION_URL}?id=${tweetId}&token=x`, { + headers: { + 'user-agent': CHROME_UA, + }, + }); + + if (response.ok) { + const data = await response.json() as any; + if (!data?.text) return { tweet: null, status: 'empty' }; + const handle = data.user?.screen_name; + const mediaEntities: any[] = data.mediaDetails ?? []; + return { + status: 'ok', + tweet: { + id: String(data.id_str ?? tweetId), + text: data.text, + authorHandle: handle, + authorName: data.user?.name, + authorProfileImageUrl: data.user?.profile_image_url_https, + postedAt: data.created_at ?? null, + media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), + mediaObjects: mediaEntities.map((m: any) => ({ + type: m.type, + url: m.media_url_https ?? m.media_url, + width: m.original_info?.width, + height: m.original_info?.height, + })), + conversationId: data.conversation_id_str, + inReplyToStatusId: data.in_reply_to_status_id_str, + url: `https://x.com/${handle ?? '_'}/status/${data.id_str ?? tweetId}`, + }, + }; + } + + if (response.status === 429) { + await new Promise((r) => setTimeout(r, Math.min(15 * Math.pow(2, attempt), 120) * 1000)); + continue; + } + if (response.status >= 500) { + await new Promise((r) => setTimeout(r, 5000 * (attempt + 1))); + continue; + } + return { tweet: null, status: response.status === 404 ? 'not_found' : 'forbidden' }; + } + return { tweet: null, status: 'rate_limited' }; +} + // Text >= 275 chars may be truncated by Twitter's legacy.full_text limit const TRUNCATION_THRESHOLD = 275; const LINK_ONLY_THRESHOLD = 80; @@ -1706,6 +2027,288 @@ const X_ARTICLE_MISSING_REASONS: Record = { unknown: 'X Article body was not returned', }; +const RECENT_THREAD_RECHECK_MS = 72 * 60 * 60_000; +const MIN_THREAD_RECHECK_MS = 6 * 60 * 60_000; + +export interface ThreadSyncProgress { + done: number; + total: number; + contextFilled: number; + belowFilled: number; + emptyChecked: number; + failed: number; +} + +export interface ThreadSyncFailure { + tweetId: string; + reason: string; + url: string; +} + +export interface ThreadSyncResult { + contextFilled: number; + belowFilled: number; + emptyChecked: number; + failed: number; + failures: ThreadSyncFailure[]; + total: number; +} + +type ThreadExpansionFetcher = (record: BookmarkRecord) => Promise<{ + context: ThreadTweetSnapshot[]; + below: ThreadTweetSnapshot[]; + status: TweetFetchResult['status']; +}>; + +const THREAD_TRANSIENT_FAILURE_STATUSES = new Set([ + 'rate_limited', + 'server_error', + 'error', +]); + +export interface SyncThreadsOptions { + onProgress?: (progress: ThreadSyncProgress) => void; + delayMs?: number; + browser?: string; + chromeUserDataDir?: string; + chromeProfileDirectory?: string; + firefoxProfileDir?: string; + csrfToken?: string; + cookieHeader?: string; + threadFetcher?: ThreadExpansionFetcher; +} + +function shouldSyncThread(record: BookmarkRecord, nowMs: number): boolean { + if (record.threadExpansionFailedAt) return false; + if (record.threadContext === undefined || record.threadBelow === undefined) return true; + if (!record.threadExpandedAt) return true; + + const postedAtMs = parseTimestampMs(record.postedAt); + const checkedAtMs = parseTimestampMs(record.threadExpandedAt); + if (postedAtMs == null || checkedAtMs == null) return false; + if (nowMs - postedAtMs > RECENT_THREAD_RECHECK_MS) return false; + return nowMs - checkedAtMs >= MIN_THREAD_RECHECK_MS; +} + +async function fetchThreadTweetById( + tweetId: string, + cookies: { csrfToken?: string; cookieHeader?: string }, +): Promise<{ tweet: ThreadTweetSnapshot | null; status: TweetFetchResult['status'] }> { + if (cookies.csrfToken) { + for (let attempt = 0; attempt < 4; attempt++) { + let response: Response; + try { + response = await fetch(buildTweetResultByRestIdUrl(tweetId), { + headers: buildHeaders(cookies.csrfToken, cookies.cookieHeader), + }); + } catch { + await new Promise((r) => setTimeout(r, 2000 * (attempt + 1))); + continue; + } + + if (response.ok) { + const json = await response.json(); + const result = json?.data?.tweetResult?.result; + const typename = result?.__typename; + if (!result || typename === 'TweetTombstone' || typename === 'TweetUnavailable') { + return { tweet: null, status: 'not_found' }; + } + const tweet = parseThreadTweetResultByRestId(json, tweetId); + return { tweet, status: tweet ? 'ok' : 'empty' }; + } + + if (response.status === 429) { + await new Promise((r) => setTimeout(r, Math.min(15 * Math.pow(2, attempt), 120) * 1000)); + continue; + } + if (response.status >= 500) { + await new Promise((r) => setTimeout(r, 5000 * (attempt + 1))); + continue; + } + if (response.status === 404) return { tweet: null, status: 'not_found' }; + if (response.status === 401 || response.status === 403) break; + return { tweet: null, status: 'error' }; + } + } + return fetchThreadTweetViaSyndication(tweetId); +} + +async function expandThreadForRecord( + record: BookmarkRecord, + cookies: { csrfToken?: string; cookieHeader?: string }, + delayMs: number, +): Promise<{ context: ThreadTweetSnapshot[]; below: ThreadTweetSnapshot[]; status: TweetFetchResult['status'] }> { + const context: ThreadTweetSnapshot[] = []; + let nextParentId = record.inReplyToStatusId; + const seenParents = new Set(); + + while (nextParentId && !seenParents.has(nextParentId) && context.length < 25) { + seenParents.add(nextParentId); + const parent = await fetchThreadTweetById(nextParentId, cookies); + if (!parent.tweet) { + if (parent.status === 'not_found' || parent.status === 'forbidden' || parent.status === 'empty') break; + return { context, below: [], status: parent.status }; + } + context.unshift(parent.tweet); + nextParentId = parent.tweet.inReplyToStatusId; + if (nextParentId) await new Promise((r) => setTimeout(r, delayMs)); + } + + let below: ThreadTweetSnapshot[] = []; + if (cookies.csrfToken) { + const detail = await fetchTweetDetailViaGraphQL(record.tweetId, cookies.csrfToken, cookies.cookieHeader, { delayMs }); + if (detail.status !== 'ok') return { context, below: [], status: detail.status }; + below = extractSameAuthorThreadBelow(detail.tweets, record.tweetId, record.authorHandle); + } + + return { context, below, status: 'ok' }; +} + +export async function syncThreads(options: SyncThreadsOptions = {}): Promise { + const delayMs = options.delayMs ?? 300; + const cachePath = twitterBookmarksCachePath(); + const loaded = sanitizeRecords(await readJsonLines(cachePath)); + const records = loaded.records; + const nowMs = Date.now(); + const now = new Date(nowMs).toISOString(); + const candidates = records.filter((record) => shouldSyncThread(record, nowMs)); + const total = candidates.length; + + if (total === 0) { + return { + contextFilled: 0, + belowFilled: 0, + emptyChecked: 0, + failed: 0, + failures: [], + total: 0, + }; + } + + const cookies = options.threadFetcher ? {} : resolveGapFillCookies({ + browser: options.browser, + chromeUserDataDir: options.chromeUserDataDir, + chromeProfileDirectory: options.chromeProfileDirectory, + firefoxProfileDir: options.firefoxProfileDir, + csrfToken: options.csrfToken, + cookieHeader: options.cookieHeader, + }); + if (!options.threadFetcher && !cookies.csrfToken) { + throw new Error('Thread sync requires X browser cookies with ct0/auth_token; pass --cookies or a browser profile.'); + } + const fetcher: ThreadExpansionFetcher = options.threadFetcher + ?? ((record) => expandThreadForRecord(record, cookies, delayMs)); + + let contextFilled = 0; + let belowFilled = 0; + let emptyChecked = 0; + let failed = 0; + const failures: ThreadSyncFailure[] = []; + const dbUpdates: Array<{ + id: string; + threadContext?: ThreadTweetSnapshot[]; + threadBelow?: ThreadTweetSnapshot[]; + threadExpandedAt?: string; + threadExpansionFailedAt?: string; + }> = []; + const persistProgress = async (): Promise => { + await writeJsonLines(cachePath, records); + if (dbUpdates.length > 0) { + await updateThreadData(dbUpdates); + dbUpdates.length = 0; + } + }; + + for (let i = 0; i < candidates.length; i++) { + const record = candidates[i]; + let status: TweetFetchResult['status'] = 'error'; + let transientFailurePersisted = false; + try { + const expanded = await fetcher(record); + status = expanded.status; + if (expanded.status === 'ok') { + record.threadContext = expanded.context; + record.threadBelow = expanded.below; + record.threadExpandedAt = now; + delete record.threadExpansionFailedAt; + if (expanded.context.length > 0) contextFilled++; + if (expanded.below.length > 0) belowFilled++; + if (expanded.context.length === 0 && expanded.below.length === 0) emptyChecked++; + dbUpdates.push({ + id: record.id, + threadContext: record.threadContext, + threadBelow: record.threadBelow, + threadExpandedAt: record.threadExpandedAt, + }); + } else { + failed++; + const reason = GAP_FILL_FAILURE_REASONS[expanded.status] ?? expanded.status; + failures.push({ + tweetId: record.tweetId, + reason, + url: record.url, + }); + if (THREAD_TRANSIENT_FAILURE_STATUSES.has(expanded.status)) { + await persistProgress(); + transientFailurePersisted = true; + if (expanded.status === 'rate_limited') { + throw new RateLimitError('Thread sync stopped by X rate limiting. Resume on a later run.'); + } + throw new Error(`Thread sync stopped on transient X failure: ${reason}`); + } + } + } catch (err) { + if (THREAD_TRANSIENT_FAILURE_STATUSES.has(status)) { + if (!transientFailurePersisted) await persistProgress(); + throw err; + } + failed++; + failures.push({ + tweetId: record.tweetId, + reason: (err as Error).message ?? 'unknown error', + url: record.url, + }); + await persistProgress(); + throw err; + } + + const isPermanentFailure = status === 'not_found' || status === 'forbidden' || status === 'empty'; + if (isPermanentFailure) { + record.threadExpansionFailedAt = now; + dbUpdates.push({ + id: record.id, + threadContext: record.threadContext, + threadBelow: record.threadBelow, + threadExpandedAt: record.threadExpandedAt, + threadExpansionFailedAt: record.threadExpansionFailedAt, + }); + } + + options.onProgress?.({ + done: i + 1, + total, + contextFilled, + belowFilled, + emptyChecked, + failed, + }); + + if ((i + 1) % 25 === 0) await persistProgress(); + if (i < candidates.length - 1) await new Promise((r) => setTimeout(r, delayMs)); + } + + await persistProgress(); + + return { + contextFilled, + belowFilled, + emptyChecked, + failed, + failures, + total, + }; +} + export interface GapFillProgress { done: number; total: number; diff --git a/src/types.ts b/src/types.ts index 75fb71e..a268b58 100644 --- a/src/types.ts +++ b/src/types.ts @@ -52,6 +52,31 @@ export interface QuotedTweetSnapshot { url: string; } +export interface ThreadTweetSnapshot { + id: string; + text: string; + authorHandle?: string; + authorName?: string; + authorProfileImageUrl?: string; + postedAt?: string | null; + media?: string[]; + mediaObjects?: BookmarkMediaObject[]; + conversationId?: string; + inReplyToStatusId?: string; + /** + * How this tweet relates to the bookmarked tweet in X's conversation UI. + * `post-thread` means X presents it as the author's continuation of the + * focal post, not as the author's response inside someone else's reply branch. + */ + threadRole?: 'post-thread'; + conversationEntryId?: string; + conversationDisplayType?: string; + conversationSection?: string; + conversationRootId?: string; + conversationItemIndex?: number; + url: string; +} + export interface BookmarkRecord { id: string; tweetId: string; @@ -100,6 +125,20 @@ export interface BookmarkRecord { * same dead tweet on every run. */ quotedTweetFailedAt?: string; + /** Parent tweets above the bookmarked tweet, oldest first. */ + threadContext?: ThreadTweetSnapshot[]; + /** Same-author continuation tweets below the bookmarked tweet. */ + threadBelow?: ThreadTweetSnapshot[]; + /** + * Last time thread expansion checked this record. Empty thread arrays are a + * completed check, but recent tweets are rechecked for delayed self-replies. + */ + threadExpandedAt?: string; + /** + * Set when thread expansion hit a permanent failure for the focal tweet, + * preventing repeated fetches for deleted or unavailable tweets. + */ + threadExpansionFailedAt?: string; } export interface BookmarkFolder { diff --git a/tests/bookmark-media.test.ts b/tests/bookmark-media.test.ts index 4c2f661..5f430e7 100644 --- a/tests/bookmark-media.test.ts +++ b/tests/bookmark-media.test.ts @@ -150,6 +150,85 @@ test('fetchBookmarkMediaBatch downloads quoted tweet media targets', async () => } }); +test('fetchBookmarkMediaBatch downloads thread reply media targets', async () => { + const replyPhotoUrl = 'https://pbs.twimg.com/media/reply-photo.jpg'; + const replyPosterUrl = 'https://pbs.twimg.com/amplify_video_thumb/reply.jpg'; + const replyVideoUrl = 'https://video.twimg.com/ext_tw_video/reply.mp4'; + const replyProfileUrl = 'https://pbs.twimg.com/profile_images/789/reply_normal.jpg'; + const contextPhotoUrl = 'https://pbs.twimg.com/media/context-photo.jpg'; + const records = [{ + id: '1', + tweetId: '1', + url: 'https://x.com/alice/status/1', + text: 'thread media test', + authorHandle: 'alice', + authorName: 'Alice', + syncedAt: '2026-04-09T00:00:00.000Z', + mediaObjects: [], + threadContext: [{ + id: '90', + url: 'https://x.com/alice/status/90', + text: 'context with media', + authorHandle: 'alice', + mediaObjects: [{ type: 'photo', url: contextPhotoUrl }], + }], + threadBelow: [{ + id: '101', + url: 'https://x.com/alice/status/101', + text: 'reply with media', + authorHandle: 'alice', + authorName: 'Alice', + authorProfileImageUrl: replyProfileUrl, + mediaObjects: [ + { type: 'photo', url: replyPhotoUrl }, + { type: 'video', url: replyPosterUrl, videoVariants: [{ url: replyVideoUrl, bitrate: 832000 }] }, + ], + }], + links: [], + tags: [], + ingestedVia: 'graphql', + }]; + + const originalFetch = globalThis.fetch; + globalThis.fetch = async (input: string | URL | Request, init?: RequestInit): Promise => { + const url = String(input instanceof Request ? input.url : input); + const method = init?.method ?? 'GET'; + const contentType = url.endsWith('.mp4') ? 'video/mp4' : 'image/jpeg'; + if (method === 'HEAD') { + return new Response(null, { + status: 200, + headers: { 'content-length': '4', 'content-type': contentType }, + }); + } + return new Response(Uint8Array.from([1, 2, 3, 4]), { + status: 200, + headers: { 'content-type': contentType }, + }); + }; + + try { + await withMediaDataDir(records, async () => { + const manifest = await fetchBookmarkMediaBatch({ limit: 10, maxBytes: 1024 }); + const downloaded = manifest.entries + .filter((entry) => entry.status === 'downloaded') + .map((entry) => ({ bookmarkId: entry.bookmarkId, tweetId: entry.tweetId, sourceUrl: entry.sourceUrl })) + .sort((a, b) => a.sourceUrl.localeCompare(b.sourceUrl)); + + const expected = [ + { bookmarkId: '1', tweetId: '90', sourceUrl: contextPhotoUrl }, + { bookmarkId: '1', tweetId: '101', sourceUrl: replyPosterUrl }, + { bookmarkId: '1', tweetId: '101', sourceUrl: replyPhotoUrl }, + { bookmarkId: '1', tweetId: '101', sourceUrl: replyProfileUrl.replace('_normal.', '_400x400.') }, + { bookmarkId: '1', tweetId: '101', sourceUrl: replyVideoUrl }, + ].sort((a, b) => a.sourceUrl.localeCompare(b.sourceUrl)); + assert.deepEqual(downloaded, expected); + assert.ok(downloaded.every((entry) => entry.tweetId !== '1')); + }); + } finally { + globalThis.fetch = originalFetch; + } +}); + test('fetchBookmarkMediaBatch downloads shared profile images only once across bookmarks', async () => { const profileUrl = 'https://pbs.twimg.com/profile_images/123/avatar_normal.jpg'; const fullProfileUrl = profileUrl.replace('_normal.', '_400x400.'); diff --git a/tests/graphql-bookmarks.test.ts b/tests/graphql-bookmarks.test.ts index fc056aa..6f31fab 100644 --- a/tests/graphql-bookmarks.test.ts +++ b/tests/graphql-bookmarks.test.ts @@ -10,6 +10,7 @@ import { parseBookmarksResponse, parseFolderTimelineResponse, parseTweetArticleByRestId, + parseTweetDetailResponse, parseTweetResultByRestId, sanitizeBookmarkedAt, scoreRecord, @@ -20,6 +21,8 @@ import { formatSyncResult, syncBookmarksGraphQL, syncGaps, + syncThreads, + extractSameAuthorThreadBelow, } from '../src/graphql-bookmarks.js'; import { buildIndex, getBookmarkById } from '../src/bookmarks-db.js'; import { resolveFolder, formatFolderMirrorStats } from '../src/cli.js'; @@ -118,6 +121,32 @@ function makeGraphQLResponse(tweetResults: any[], bottomCursor?: string) { }; } +function makeTweetDetailResponse(tweetResults: any[], bottomCursor?: string) { + const entries = tweetResults.map((tr) => ({ + entryId: `tweet-${tr.legacy?.id_str ?? tr.rest_id}`, + content: { + itemContent: { + tweet_results: { result: tr }, + }, + }, + })); + if (bottomCursor) { + entries.push({ + entryId: 'cursor-bottom-thread', + content: { value: bottomCursor } as any, + }); + } + return { + data: { + threaded_conversation_with_injections_v2: { + instructions: [ + { type: 'TimelineAddEntries', entries }, + ], + }, + }, + }; +} + function makeRecord(overrides: Partial = {}): BookmarkRecord { return { id: '100', @@ -381,6 +410,119 @@ test('convertTweetToRecord: handles missing quoted tweet gracefully', () => { assert.equal(result.quotedTweet, undefined); }); +test('parseTweetDetailResponse: extracts tweets from timeline items and modules', () => { + const root = makeTweetResult({ + legacy: { id_str: '100', full_text: 'Launch post', conversation_id_str: '100' }, + }); + const reply = makeTweetResult({ + rest_id: '101', + legacy: { + id_str: '101', + full_text: 'Link below: https://example.com', + conversation_id_str: '100', + in_reply_to_status_id_str: '100', + }, + }); + const moduleResp = { + data: { + threaded_conversation_with_injections_v2: { + instructions: [{ + type: 'TimelineAddEntries', + entries: [{ + entryId: 'conversationthread-100', + content: { + displayType: 'VerticalConversation', + clientEventInfo: { + details: { + conversationDetails: { + conversationSection: 'HighQuality', + }, + }, + }, + items: [ + { item: { itemContent: { tweet_results: { result: root } } } }, + { item: { itemContent: { tweet_results: { result: reply } } } }, + ], + }, + }, { + entryId: 'cursor-bottom-100', + content: { value: 'cursor-next' }, + }], + }], + }, + }, + }; + + const parsed = parseTweetDetailResponse(moduleResp); + assert.equal(parsed.tweets.length, 2); + assert.equal(parsed.tweets[1].id, '101'); + assert.equal(parsed.tweets[1].inReplyToStatusId, '100'); + assert.equal(parsed.tweets[1].conversationDisplayType, 'VerticalConversation'); + assert.equal(parsed.tweets[1].conversationSection, 'HighQuality'); + assert.equal(parsed.tweets[1].conversationRootId, '100'); + assert.equal(parsed.tweets[1].conversationItemIndex, 1); + assert.equal(parsed.nextCursor, 'cursor-next'); +}); + +test('extractSameAuthorThreadBelow: keeps same-author continuations and excludes public replies', () => { + const root = makeTweetResult({ + legacy: { id_str: '100', full_text: 'Launch post', conversation_id_str: '100' }, + }); + const sameAuthorReply = makeTweetResult({ + rest_id: '101', + legacy: { + id_str: '101', + full_text: 'Here is the link', + conversation_id_str: '100', + in_reply_to_status_id_str: '100', + }, + }); + const publicReply = makeTweetResult({ + rest_id: '102', + legacy: { + id_str: '102', + full_text: 'random public reply', + conversation_id_str: '100', + in_reply_to_status_id_str: '100', + }, + userResult: { + rest_id: '1111', + core: { screen_name: 'other', name: 'Other User' }, + }, + }); + const authorSocialReply = makeTweetResult({ + rest_id: '104', + legacy: { + id_str: '104', + full_text: '@other thanks!', + conversation_id_str: '100', + in_reply_to_status_id_str: '102', + }, + }); + const secondContinuation = makeTweetResult({ + rest_id: '103', + legacy: { + id_str: '103', + full_text: 'More detail', + conversation_id_str: '100', + in_reply_to_status_id_str: '101', + }, + }); + + const parsed = parseTweetDetailResponse(makeTweetDetailResponse([ + root, + sameAuthorReply, + publicReply, + authorSocialReply, + secondContinuation, + ])); + const below = extractSameAuthorThreadBelow(parsed.tweets, '100', 'testuser'); + + assert.deepEqual(below.map((tweet) => tweet.id), ['101', '103']); + assert.ok(below.every((tweet) => tweet.authorHandle === 'testuser')); + assert.ok(below.every((tweet) => tweet.threadRole === 'post-thread')); +}); + test('convertTweetToRecord: quoted tweet prefers note_tweet body over legacy full_text', () => { const tr = makeTweetResult({ legacy: { quoted_status_id_str: '8888888' }, @@ -636,6 +778,164 @@ async function withIsolatedGapFillDataDir( } } +test('syncThreads: writes parent context and same-author continuations to JSONL and DB', async () => { + const bookmark = makeRecord({ + id: '100', + tweetId: '100', + url: 'https://x.com/testuser/status/100', + text: 'Launch post', + authorHandle: 'testuser', + postedAt: '2026-04-01T00:00:00.000Z', + inReplyToStatusId: '99', + }); + const parent = { + id: '99', + text: 'Parent context', + authorHandle: 'testuser', + url: 'https://x.com/testuser/status/99', + }; + const reply = { + id: '101', + text: 'Here is the actual link: https://example.com', + authorHandle: 'testuser', + inReplyToStatusId: '100', + url: 'https://x.com/testuser/status/101', + }; + + await withIsolatedGapFillDataDir(async () => { + await buildIndex(); + const result = await syncThreads({ + threadFetcher: async () => ({ context: [parent], below: [reply], status: 'ok' }), + delayMs: 0, + }); + + assert.equal(result.contextFilled, 1); + assert.equal(result.belowFilled, 1); + assert.equal(result.failed, 0); + + const jsonl = await readFile(path.join(process.env.FT_DATA_DIR!, 'bookmarks.jsonl'), 'utf8'); + const stored = JSON.parse(jsonl.trim().split('\n').pop()!); + assert.equal(stored.threadContext[0].text, 'Parent context'); + assert.match(stored.threadBelow[0].text, /actual link/); + assert.ok(stored.threadExpandedAt); + + const refreshed = await getBookmarkById('100'); + assert.equal(refreshed?.threadContext[0]?.id, '99'); + assert.equal(refreshed?.threadBelow[0]?.id, '101'); + }, [bookmark]); +}); + +test('syncThreads: rechecks recent empty threads but skips old checked empties', async () => { + const recentChecked = makeRecord({ + id: '200', + tweetId: '200', + url: 'https://x.com/testuser/status/200', + text: 'Recent launch', + authorHandle: 'testuser', + postedAt: new Date(Date.now() - 24 * 60 * 60_000).toISOString(), + threadContext: [], + threadBelow: [], + threadExpandedAt: new Date(Date.now() - 7 * 60 * 60_000).toISOString(), + }); + const oldChecked = makeRecord({ + id: '300', + tweetId: '300', + url: 'https://x.com/testuser/status/300', + text: 'Old launch', + authorHandle: 'testuser', + postedAt: '2026-01-01T00:00:00.000Z', + threadContext: [], + threadBelow: [], + threadExpandedAt: '2026-01-01T02:00:00.000Z', + }); + + await withIsolatedGapFillDataDir(async () => { + let calls = 0; + const result = await syncThreads({ + threadFetcher: async (record) => { + calls += 1; + assert.equal(record.id, '200'); + return { context: [], below: [], status: 'ok' }; + }, + delayMs: 0, + }); + + assert.equal(calls, 1); + assert.equal(result.emptyChecked, 1); + assert.equal(result.total, 1); + }, [recentChecked, oldChecked]); +}); + +test('syncThreads: permanent focal failure stamps threadExpansionFailedAt', async () => { + const bookmark = makeRecord({ + id: '400', + tweetId: '400', + url: 'https://x.com/testuser/status/400', + text: 'Gone', + authorHandle: 'testuser', + }); + + await withIsolatedGapFillDataDir(async () => { + const result = await syncThreads({ + threadFetcher: async () => ({ context: [], below: [], status: 'not_found' }), + delayMs: 0, + }); + + assert.equal(result.failed, 1); + const jsonl = await readFile(path.join(process.env.FT_DATA_DIR!, 'bookmarks.jsonl'), 'utf8'); + const stored = JSON.parse(jsonl.trim().split('\n').pop()!); + assert.ok(stored.threadExpansionFailedAt); + }, [bookmark]); +}); + +test('syncThreads: transient failures abort without stamping permanent failure', async () => { + const first = makeRecord({ + id: '500', + tweetId: '500', + url: 'https://x.com/testuser/status/500', + text: 'Good thread', + authorHandle: 'testuser', + }); + const second = makeRecord({ + id: '600', + tweetId: '600', + url: 'https://x.com/testuser/status/600', + text: 'Rate limited thread', + authorHandle: 'testuser', + }); + const reply = { + id: '501', + text: 'Continuation', + authorHandle: 'testuser', + url: 'https://x.com/testuser/status/501', + }; + + await withIsolatedGapFillDataDir(async () => { + await buildIndex(); + let calls = 0; + await assert.rejects( + () => syncThreads({ + threadFetcher: async () => { + calls += 1; + if (calls === 1) return { context: [], below: [reply], status: 'ok' }; + return { context: [], below: [], status: 'rate_limited' }; + }, + delayMs: 0, + }), + /rate limiting/, + ); + + const jsonl = await readFile(path.join(process.env.FT_DATA_DIR!, 'bookmarks.jsonl'), 'utf8'); + const rows = jsonl.trim().split('\n').map((line) => JSON.parse(line)); + const storedFirst = rows.find((row) => row.id === '500'); + const storedSecond = rows.find((row) => row.id === '600'); + assert.equal(storedFirst.threadBelow[0].id, '501'); + assert.ok(storedFirst.threadExpandedAt); + assert.equal(storedSecond.threadExpansionFailedAt, undefined); + assert.equal(calls, 2); + }, [first, second]); +}); + test('syncGaps: expands truncated note_tweet and stamps textExpandedAt', async () => { const fixture = loadFixture('tweet-result-by-rest-id-note-tweet.json'); const legacyPreview: string = fixture.data.tweetResult.result.legacy.full_text; From 2d6fe1fa7608f6df4f2133be088111fee2f2d674 Mon Sep 17 00:00:00 2001 From: Eric Litman Date: Thu, 7 May 2026 15:47:50 +0300 Subject: [PATCH 2/5] feat: preserve expanded links in tweet snapshots --- src/bookmarks-db.ts | 21 +++++-- src/graphql-bookmarks.ts | 99 ++++++++++++++++++++++++++------- src/types.ts | 2 + tests/graphql-bookmarks.test.ts | 50 ++++++++++++++++- 4 files changed, 147 insertions(+), 25 deletions(-) diff --git a/src/bookmarks-db.ts b/src/bookmarks-db.ts index 910bcb0..862b95a 100644 --- a/src/bookmarks-db.ts +++ b/src/bookmarks-db.ts @@ -1237,18 +1237,31 @@ export async function updateQuotedTweets( } export async function updateBookmarkText( - records: Array<{ id: string; text: string }>, + records: Array<{ id: string; text: string; links?: string[] }>, ): Promise { const dbPath = twitterBookmarksIndexPath(); const db = await openDb(dbPath); ensureMigrations(db); try { - const stmt = db.prepare('UPDATE bookmarks SET text = ? WHERE id = ?'); + const textOnlyStmt = db.prepare('UPDATE bookmarks SET text = ? WHERE id = ?'); + const textAndLinksStmt = db.prepare( + 'UPDATE bookmarks SET text = ?, link_count = ?, links_json = ? WHERE id = ?' + ); for (const record of records) { - stmt.run([record.text, record.id]); + if (record.links === undefined) { + textOnlyStmt.run([record.text, record.id]); + } else { + textAndLinksStmt.run([ + record.text, + record.links.length, + record.links.length ? JSON.stringify(record.links) : null, + record.id, + ]); + } } - stmt.free(); + textOnlyStmt.free(); + textAndLinksStmt.free(); // Rebuild FTS to reflect updated text db.run("INSERT INTO bookmarks_fts(bookmarks_fts) VALUES('rebuild')"); saveDb(db, dbPath); diff --git a/src/graphql-bookmarks.ts b/src/graphql-bookmarks.ts index 4f521c7..4abfdbf 100644 --- a/src/graphql-bookmarks.ts +++ b/src/graphql-bookmarks.ts @@ -180,6 +180,61 @@ function parseBookmarkTimestamp(record: BookmarkRecord): number | null { return null; } +function uniqueStrings(values: string[]): string[] { + return [...new Set(values)]; +} + +function urlEntityKey(entity: any): string { + return String(entity?.url ?? entity?.expanded_url ?? entity?.expandedUrl ?? entity?.display_url ?? entity?.displayUrl ?? ''); +} + +function tweetUrlEntities(tweet: any, legacy: any): any[] { + const entities = [ + ...(Array.isArray(legacy?.entities?.urls) ? legacy.entities.urls : []), + ...(Array.isArray(tweet?.note_tweet?.note_tweet_results?.result?.entity_set?.urls) + ? tweet.note_tweet.note_tweet_results.result.entity_set.urls + : []), + ]; + const seen = new Set(); + return entities.filter((entity) => { + const key = urlEntityKey(entity); + if (!key || seen.has(key)) return false; + seen.add(key); + return true; + }); +} + +function syndicationUrlEntities(data: any): any[] { + return Array.isArray(data?.entities?.urls) ? data.entities.urls : []; +} + +function extractExpandedLinks(urlEntities: any[]): string[] { + return uniqueStrings( + urlEntities + .map((entity) => entity?.expanded_url ?? entity?.expandedUrl ?? entity?.url) + .filter((url): url is string => typeof url === 'string' && url.length > 0 && !url.includes('t.co/')), + ); +} + +function urlEntityReplacement(entity: any): string | undefined { + const expanded = entity?.expanded_url ?? entity?.expandedUrl; + if (typeof expanded === 'string' && expanded.length > 0 && !expanded.includes('t.co/')) return expanded; + const display = entity?.display_url ?? entity?.displayUrl; + if (typeof display === 'string' && display.length > 0 && !display.includes('t.co/')) return display; + return undefined; +} + +function expandVisibleUrlEntities(text: string, urlEntities: any[]): string { + let expanded = text; + for (const entity of urlEntities) { + if (typeof entity?.url !== 'string') continue; + const replacement = urlEntityReplacement(entity); + if (!replacement) continue; + expanded = expanded.split(entity.url).join(replacement); + } + return expanded; +} + function compareBookmarkChronology(a: BookmarkRecord, b: BookmarkRecord): number { const aSortIndex = parseSnowflake(a.sortIndex); const bSortIndex = parseSnowflake(b.sortIndex); @@ -295,10 +350,8 @@ export function convertTweetToRecord(tweetResult: any, now: string): BookmarkRec : undefined, })); - const urlEntities = legacy?.entities?.urls ?? []; - const links: string[] = urlEntities - .map((u: any) => u.expanded_url) - .filter((u: string | undefined) => u && !u.includes('t.co')); + const urlEntities = tweetUrlEntities(tweet, legacy); + const links = extractExpandedLinks(urlEntities); // Extract quoted tweet if present const quotedResult = tweet?.quoted_status_result?.result; @@ -311,10 +364,11 @@ export function convertTweetToRecord(tweetResult: any, now: string): BookmarkRec const qtUser = qtTweet?.core?.user_results?.result; const qtHandle = qtUser?.core?.screen_name ?? qtUser?.legacy?.screen_name; const qtMediaEntities = qtLegacy?.extended_entities?.media ?? qtLegacy?.entities?.media ?? []; + const qtUrlEntities = tweetUrlEntities(qtTweet, qtLegacy); const qtNoteText = qtTweet?.note_tweet?.note_tweet_results?.result?.text; quotedTweet = { id: qtId, - text: qtNoteText ?? qtLegacy.full_text ?? qtLegacy.text ?? '', + text: expandVisibleUrlEntities(qtNoteText ?? qtLegacy.full_text ?? qtLegacy.text ?? '', qtUrlEntities), authorHandle: qtHandle, authorName: qtUser?.core?.name ?? qtUser?.legacy?.name, authorProfileImageUrl: @@ -334,6 +388,7 @@ export function convertTweetToRecord(tweetResult: any, now: string): BookmarkRec .map((v: any) => ({ bitrate: v.bitrate, url: v.url })) : undefined, })), + links: extractExpandedLinks(qtUrlEntities), url: `https://x.com/${qtHandle ?? '_'}/status/${qtId}`, }; } @@ -341,12 +396,7 @@ export function convertTweetToRecord(tweetResult: any, now: string): BookmarkRec // X Articles / long-form note tweets store full text separately const noteTweetText = tweet?.note_tweet?.note_tweet_results?.result?.text; - let text = noteTweetText ?? legacy.full_text ?? legacy.text ?? ''; - for (const entity of urlEntities) { - if (typeof entity?.url === 'string' && typeof entity?.display_url === 'string') { - text = text.split(entity.url).join(entity.display_url); - } - } + const text = expandVisibleUrlEntities(noteTweetText ?? legacy.full_text ?? legacy.text ?? '', urlEntities); return { id: tweetId, @@ -1506,8 +1556,9 @@ export function parseTweetResultByRestId(json: any, tweetId: string): QuotedTwee const legacy = tweet?.legacy; if (!legacy) return null; + const urlEntities = tweetUrlEntities(tweet, legacy); const noteText = tweet?.note_tweet?.note_tweet_results?.result?.text; - const text = noteText ?? legacy.full_text ?? legacy.text ?? ''; + const text = expandVisibleUrlEntities(noteText ?? legacy.full_text ?? legacy.text ?? '', urlEntities); if (!text) return null; const userResult = tweet?.core?.user_results?.result; @@ -1531,6 +1582,7 @@ export function parseTweetResultByRestId(json: any, tweetId: string): QuotedTwee width: m.original_info?.width, height: m.original_info?.height, })), + links: extractExpandedLinks(urlEntities), url: `https://x.com/${handle ?? '_'}/status/${resolvedId}`, }; } @@ -1544,8 +1596,9 @@ function parseThreadTweetResult( const legacy = tweet?.legacy; if (!legacy) return null; + const urlEntities = tweetUrlEntities(tweet, legacy); const noteText = tweet?.note_tweet?.note_tweet_results?.result?.text; - const text = noteText ?? legacy.full_text ?? legacy.text ?? ''; + const text = expandVisibleUrlEntities(noteText ?? legacy.full_text ?? legacy.text ?? '', urlEntities); const resolvedId = String(legacy.id_str ?? tweet?.rest_id ?? fallbackId ?? ''); if (!resolvedId || !text) return null; @@ -1575,6 +1628,7 @@ function parseThreadTweetResult( .map((v: any) => ({ bitrate: v.bitrate, url: v.url })) : undefined, })), + links: extractExpandedLinks(urlEntities), conversationId: legacy.conversation_id_str, inReplyToStatusId: legacy.in_reply_to_status_id_str, ...metadata, @@ -1922,12 +1976,13 @@ async function fetchTweetViaSyndication(tweetId: string): Promise = []; - const dbTextUpdates: Array<{ id: string; text: string }> = []; + const dbTextUpdates: Array<{ id: string; text: string; links?: string[] }> = []; const articleDbUpdates: ArticleUpdate[] = []; // Fetch and apply incrementally @@ -2543,10 +2601,13 @@ export async function syncGaps(options: SyncGapsOptions = {}): Promise (record.text?.length ?? 0); - if (didExpand) { + const mergedLinks = uniqueStrings([...(record.links ?? []), ...(snapshot?.links ?? [])]); + const didExpandLinks = mergedLinks.length > (record.links?.length ?? 0); + if (didExpand || didExpandLinks) { record.text = snapshot!.text; - dbTextUpdates.push({ id: record.id, text: snapshot!.text }); - textExpanded++; + record.links = mergedLinks; + dbTextUpdates.push({ id: record.id, text: snapshot!.text, links: mergedLinks }); + if (didExpand) textExpanded++; } if (didExpand || graphqlSettled || isPermanentFailure) { record.textExpandedAt = now; diff --git a/src/types.ts b/src/types.ts index a268b58..e6bd8ad 100644 --- a/src/types.ts +++ b/src/types.ts @@ -49,6 +49,7 @@ export interface QuotedTweetSnapshot { postedAt?: string | null; media?: string[]; mediaObjects?: BookmarkMediaObject[]; + links?: string[]; url: string; } @@ -61,6 +62,7 @@ export interface ThreadTweetSnapshot { postedAt?: string | null; media?: string[]; mediaObjects?: BookmarkMediaObject[]; + links?: string[]; conversationId?: string; inReplyToStatusId?: string; /** diff --git a/tests/graphql-bookmarks.test.ts b/tests/graphql-bookmarks.test.ts index 6f31fab..53b91cc 100644 --- a/tests/graphql-bookmarks.test.ts +++ b/tests/graphql-bookmarks.test.ts @@ -222,7 +222,7 @@ test('convertTweetToRecord: extracts links, filtering out t.co', () => { assert.equal(result.links![0], 'https://example.com/article'); }); -test('convertTweetToRecord: expands t.co links in visible text using display_url', () => { +test('convertTweetToRecord: expands t.co links in visible text using expanded_url', () => { const result = convertTweetToRecord(makeTweetResult({ legacy: { full_text: 'Check this: https://t.co/abc and this: https://t.co/def', @@ -235,7 +235,34 @@ test('convertTweetToRecord: expands t.co links in visible text using display_url }, }), NOW)!; - assert.equal(result.text, 'Check this: example.com/foo and this: tools.exec.security'); + assert.equal(result.text, 'Check this: https://example.com/article and this: https://tools.exec.security'); + assert.deepEqual(result.links, ['https://example.com/article', 'https://tools.exec.security']); +}); + +test('convertTweetToRecord: extracts note_tweet entity_set links', () => { + const result = convertTweetToRecord(makeTweetResult({ + legacy: { + full_text: 'Preview https://t.co/note', + entities: { urls: [] }, + }, + tweet: { + note_tweet: { + note_tweet_results: { + result: { + text: 'Full note body with a link: https://t.co/note', + entity_set: { + urls: [ + { expanded_url: 'https://example.com/full-note', url: 'https://t.co/note', display_url: 'example.com/full-note' }, + ], + }, + }, + }, + }, + }, + }), NOW)!; + + assert.equal(result.text, 'Full note body with a link: https://example.com/full-note'); + assert.deepEqual(result.links, ['https://example.com/full-note']); }); test('convertTweetToRecord: handles location as object', () => { @@ -564,6 +591,25 @@ test('convertTweetToRecord: quoted tweet prefers note_tweet body over legacy ful ); }); +test('parseTweetDetailResponse: expands thread tweet t.co links and stores links', () => { + const reply = makeTweetResult({ + legacy: { + id_str: '101', + full_text: 'Reply link below: https://t.co/reply', + entities: { + urls: [ + { expanded_url: 'https://example.com/reply', url: 'https://t.co/reply', display_url: 'example.com/reply' }, + ], + }, + }, + }); + const parsed = parseTweetDetailResponse(makeTweetDetailResponse([reply])); + + assert.equal(parsed.tweets.length, 1); + assert.equal(parsed.tweets[0].text, 'Reply link below: https://example.com/reply'); + assert.deepEqual(parsed.tweets[0].links, ['https://example.com/reply']); +}); + test('parseBookmarksResponse: captures full note_tweet body from live bookmarks-feed fixture', () => { const fixture = loadFixture('bookmark-feed-note-tweet.json'); const { records } = parseBookmarksResponse(fixture, NOW); From db7ef9e35b0284e4728ca3ec5ecd85cd40521d80 Mon Sep 17 00:00:00 2001 From: Eric Litman Date: Thu, 7 May 2026 17:02:40 +0300 Subject: [PATCH 3/5] fix: harden thread capture sync --- src/bookmarks-db.ts | 82 +++++++-- src/cli.ts | 33 ++-- src/graphql-bookmarks.ts | 253 ++++----------------------- src/tweet-snapshots.ts | 293 ++++++++++++++++++++++++++++++++ tests/bookmarks-db.test.ts | 58 +++++++ tests/graphql-bookmarks.test.ts | 63 ++++++- 6 files changed, 537 insertions(+), 245 deletions(-) create mode 100644 src/tweet-snapshots.ts diff --git a/src/bookmarks-db.ts b/src/bookmarks-db.ts index 862b95a..b91024f 100644 --- a/src/bookmarks-db.ts +++ b/src/bookmarks-db.ts @@ -7,7 +7,7 @@ import type { BookmarkRecord, QuotedTweetSnapshot, ThreadTweetSnapshot } from '. import { classifyCorpus, formatClassificationSummary } from './bookmark-classify.js'; import type { ClassificationSummary } from './bookmark-classify.js'; -const SCHEMA_VERSION = 7; +const SCHEMA_VERSION = 8; export interface SearchResult { id: string; @@ -298,7 +298,8 @@ function initSchema(db: Database): void { thread_context_json TEXT, thread_below_json TEXT, thread_expanded_at TEXT, - thread_expansion_failed_at TEXT + thread_expansion_failed_at TEXT, + thread_text TEXT )`); db.run(`CREATE INDEX IF NOT EXISTS idx_bookmarks_author ON bookmarks(author_handle)`); @@ -312,6 +313,7 @@ function initSchema(db: Database): void { author_handle, author_name, article_text, + thread_text, content=bookmarks, content_rowid=rowid, tokenize='porter unicode61' @@ -345,6 +347,46 @@ function ftsHasColumn(db: Database, column: string): boolean { } } +function buildThreadSearchText(context: ThreadTweetSnapshot[] = [], below: ThreadTweetSnapshot[] = []): string | null { + const parts: string[] = []; + for (const tweet of [...context, ...below]) { + if (tweet.text) parts.push(tweet.text); + for (const link of tweet.links ?? []) parts.push(link); + } + + const text = [...new Set(parts)] + .map((part) => part.trim()) + .filter(Boolean) + .join('\n'); + return text.length > 0 ? text : null; +} + +function buildThreadSearchTextFromJson(contextJson: unknown, belowJson: unknown): string | null { + return buildThreadSearchText(parseThreadTweets(contextJson), parseThreadTweets(belowJson)); +} + +function backfillThreadSearchText(db: Database): void { + const rows = db.exec( + `SELECT id, thread_context_json, thread_below_json + FROM bookmarks + WHERE thread_text IS NULL OR thread_text = ''` + ); + const values = rows[0]?.values ?? []; + if (values.length === 0) return; + + const stmt = db.prepare('UPDATE bookmarks SET thread_text = ? WHERE id = ?'); + try { + for (const row of values) { + stmt.run([ + buildThreadSearchTextFromJson(row[1], row[2]), + row[0], + ]); + } + } finally { + stmt.free(); + } +} + function ensureMigrations(db: Database): void { // Ensure meta table exists (may not on a fresh/empty DB) db.run('CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT)'); @@ -376,13 +418,19 @@ function ensureMigrations(db: Database): void { ensureColumn(db, 'bookmarks', 'thread_below_json', 'TEXT'); ensureColumn(db, 'bookmarks', 'thread_expanded_at', 'TEXT'); ensureColumn(db, 'bookmarks', 'thread_expansion_failed_at', 'TEXT'); + const hadThreadText = columnExists(db, 'bookmarks', 'thread_text'); + ensureColumn(db, 'bookmarks', 'thread_text', 'TEXT'); - // FTS rebuild: only if the FTS table is missing the article_text column. + // FTS rebuild: only if the FTS table is missing an indexed content column. // Check via a zero-row SELECT so we don't rebuild unnecessarily. - if (!ftsHasColumn(db, 'article_text')) { + const ftsNeedsRebuild = !ftsHasColumn(db, 'article_text') || !ftsHasColumn(db, 'thread_text'); + if (!hadThreadText || ftsNeedsRebuild) { + backfillThreadSearchText(db); + } + if (ftsNeedsRebuild) { db.run('DROP TABLE IF EXISTS bookmarks_fts'); db.run(`CREATE VIRTUAL TABLE IF NOT EXISTS bookmarks_fts USING fts5( - text, author_handle, author_name, article_text, + text, author_handle, author_name, article_text, thread_text, content=bookmarks, content_rowid=rowid, tokenize='porter unicode61' )`); @@ -410,6 +458,7 @@ interface PreservedBookmarkFields { threadBelowJson: string | null; threadExpandedAt: string | null; threadExpansionFailedAt: string | null; + threadText: string | null; } function serializeJsonArray(values: string[] | undefined | null): string | null { @@ -422,6 +471,14 @@ function serializeThreadTweets(values: ThreadTweetSnapshot[] | undefined): strin return JSON.stringify(values); } +function recordThreadSearchText(r: BookmarkRecord, preserved?: PreservedBookmarkFields): string | null { + if (r.threadContext !== undefined || r.threadBelow !== undefined) { + return buildThreadSearchText(r.threadContext ?? [], r.threadBelow ?? []); + } + return preserved?.threadText + ?? buildThreadSearchTextFromJson(preserved?.threadContextJson, preserved?.threadBelowJson); +} + function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBookmarkFields): void { // Extract GitHub URLs (kept inline — no LLM needed for URL parsing) const text = r.text ?? ''; @@ -430,7 +487,7 @@ function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBook const githubUrls = [...new Set([...githubMatches.map((m) => `https://${m}`), ...githubFromLinks])]; db.run( - `INSERT OR REPLACE INTO bookmarks VALUES (${Array(41).fill('?').join(',')})`, + `INSERT OR REPLACE INTO bookmarks VALUES (${Array(42).fill('?').join(',')})`, [ r.id, r.tweetId, @@ -473,6 +530,7 @@ function insertRecord(db: Database, r: BookmarkRecord, preserved?: PreservedBook serializeThreadTweets(r.threadBelow) ?? preserved?.threadBelowJson ?? null, r.threadExpandedAt ?? preserved?.threadExpandedAt ?? null, r.threadExpansionFailedAt ?? preserved?.threadExpansionFailedAt ?? null, + recordThreadSearchText(r, preserved), ] ); } @@ -504,7 +562,7 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat `SELECT id, categories, primary_category, github_urls, domains, primary_domain, quoted_tweet_json, article_title, article_text, article_site, enriched_at, folder_ids, folder_names, thread_context_json, thread_below_json, - thread_expanded_at, thread_expansion_failed_at + thread_expanded_at, thread_expansion_failed_at, thread_text FROM bookmarks` ); for (const r of (rows[0]?.values ?? [])) { @@ -525,6 +583,7 @@ export async function buildIndex(options?: { force?: boolean }): Promise<{ dbPat threadBelowJson: (r[14] as string) ?? null, threadExpandedAt: (r[15] as string) ?? null, threadExpansionFailedAt: (r[16] as string) ?? null, + threadText: (r[17] as string) ?? null, }); } } catch { /* table may be empty */ } @@ -616,7 +675,7 @@ export async function searchBookmarks(options: SearchOptions): Promise { SET thread_context_json = ?, thread_below_json = ?, thread_expanded_at = ?, - thread_expansion_failed_at = ? + thread_expansion_failed_at = ?, + thread_text = ? WHERE id = ?` ); for (const record of records) { @@ -1332,10 +1392,12 @@ export async function updateThreadData(records: ThreadUpdate[]): Promise { record.threadBelow === undefined ? null : JSON.stringify(record.threadBelow), record.threadExpandedAt ?? null, record.threadExpansionFailedAt ?? null, + buildThreadSearchText(record.threadContext ?? [], record.threadBelow ?? []), record.id, ]); } stmt.free(); + db.run("INSERT INTO bookmarks_fts(bookmarks_fts) VALUES('rebuild')"); saveDb(db, dbPath); } finally { db.close(); diff --git a/src/cli.ts b/src/cli.ts index 92a3396..b3ddca3 100644 --- a/src/cli.ts +++ b/src/cli.ts @@ -850,19 +850,26 @@ export function buildCli() { parts.push(`${elapsed}s`); return parts.join(' \u2502 '); }); - const result = await runWithSpinner(spinner, () => syncThreads({ - delayMs: Number(options.delayMs) || 300, - browser: options.browser ? String(options.browser) : undefined, - chromeUserDataDir: options.chromeUserDataDir ? String(options.chromeUserDataDir) : undefined, - chromeProfileDirectory: options.chromeProfileDirectory ? String(options.chromeProfileDirectory) : undefined, - firefoxProfileDir: options.firefoxProfileDir ? String(options.firefoxProfileDir) : undefined, - csrfToken: cookieArgs.csrfToken, - cookieHeader: cookieArgs.cookieHeader, - onProgress: (progress: ThreadSyncProgress) => { - lastProgress = progress; - spinner.update(); - }, - })); + let result; + try { + result = await runWithSpinner(spinner, () => syncThreads({ + delayMs: Number(options.delayMs) || 300, + browser: options.browser ? String(options.browser) : undefined, + chromeUserDataDir: options.chromeUserDataDir ? String(options.chromeUserDataDir) : undefined, + chromeProfileDirectory: options.chromeProfileDirectory ? String(options.chromeProfileDirectory) : undefined, + firefoxProfileDir: options.firefoxProfileDir ? String(options.firefoxProfileDir) : undefined, + csrfToken: cookieArgs.csrfToken, + cookieHeader: cookieArgs.cookieHeader, + onProgress: (progress: ThreadSyncProgress) => { + lastProgress = progress; + spinner.update(); + }, + })); + } catch (err) { + console.error(`\n Thread sync paused: ${(err as Error).message}`); + console.error(' Partial progress was saved. Re-run `ft sync --threads` later to resume.\n'); + return; + } if (result.total === 0) { console.log(' No thread gaps found.'); } else { diff --git a/src/graphql-bookmarks.ts b/src/graphql-bookmarks.ts index 4abfdbf..f1a859d 100644 --- a/src/graphql-bookmarks.ts +++ b/src/graphql-bookmarks.ts @@ -9,6 +9,24 @@ import { exportBookmarksForSyncSeed, updateQuotedTweets, updateBookmarkText, upd import type { ArticleUpdate } from './bookmarks-db.js'; import { fetchArticle, resolveTcoLink } from './bookmark-enrich.js'; import type { ArticleContent } from './bookmark-enrich.js'; +import { + compareThreadTweetsChronologically, + expandVisibleUrlEntities, + extractExpandedLinks, + extractSameAuthorThreadBelow, + parseSnowflake, + parseThreadTweetResultByRestId, + parseTweetDetailResponse, + syndicationUrlEntities, + tweetUrlEntities, + uniqueStrings, +} from './tweet-snapshots.js'; + +export { + extractSameAuthorThreadBelow, + parseThreadTweetResultByRestId, + parseTweetDetailResponse, +} from './tweet-snapshots.js'; const CHROME_UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/146.0.0.0 Safari/537.36'; @@ -127,15 +145,6 @@ export interface SyncResult { retryAfterSec?: number; } -function parseSnowflake(value?: string | null): bigint | null { - if (!value || !/^\d+$/.test(value)) return null; - try { - return BigInt(value); - } catch { - return null; - } -} - const MAX_FUTURE_BOOKMARK_SKEW_MS = 5 * 60_000; export function sanitizeBookmarkedAt(record: BookmarkRecord): BookmarkRecord { @@ -180,61 +189,6 @@ function parseBookmarkTimestamp(record: BookmarkRecord): number | null { return null; } -function uniqueStrings(values: string[]): string[] { - return [...new Set(values)]; -} - -function urlEntityKey(entity: any): string { - return String(entity?.url ?? entity?.expanded_url ?? entity?.expandedUrl ?? entity?.display_url ?? entity?.displayUrl ?? ''); -} - -function tweetUrlEntities(tweet: any, legacy: any): any[] { - const entities = [ - ...(Array.isArray(legacy?.entities?.urls) ? legacy.entities.urls : []), - ...(Array.isArray(tweet?.note_tweet?.note_tweet_results?.result?.entity_set?.urls) - ? tweet.note_tweet.note_tweet_results.result.entity_set.urls - : []), - ]; - const seen = new Set(); - return entities.filter((entity) => { - const key = urlEntityKey(entity); - if (!key || seen.has(key)) return false; - seen.add(key); - return true; - }); -} - -function syndicationUrlEntities(data: any): any[] { - return Array.isArray(data?.entities?.urls) ? data.entities.urls : []; -} - -function extractExpandedLinks(urlEntities: any[]): string[] { - return uniqueStrings( - urlEntities - .map((entity) => entity?.expanded_url ?? entity?.expandedUrl ?? entity?.url) - .filter((url): url is string => typeof url === 'string' && url.length > 0 && !url.includes('t.co/')), - ); -} - -function urlEntityReplacement(entity: any): string | undefined { - const expanded = entity?.expanded_url ?? entity?.expandedUrl; - if (typeof expanded === 'string' && expanded.length > 0 && !expanded.includes('t.co/')) return expanded; - const display = entity?.display_url ?? entity?.displayUrl; - if (typeof display === 'string' && display.length > 0 && !display.includes('t.co/')) return display; - return undefined; -} - -function expandVisibleUrlEntities(text: string, urlEntities: any[]): string { - let expanded = text; - for (const entity of urlEntities) { - if (typeof entity?.url !== 'string') continue; - const replacement = urlEntityReplacement(entity); - if (!replacement) continue; - expanded = expanded.split(entity.url).join(replacement); - } - return expanded; -} - function compareBookmarkChronology(a: BookmarkRecord, b: BookmarkRecord): number { const aSortIndex = parseSnowflake(a.sortIndex); const bSortIndex = parseSnowflake(b.sortIndex); @@ -1587,61 +1541,6 @@ export function parseTweetResultByRestId(json: any, tweetId: string): QuotedTwee }; } -function parseThreadTweetResult( - value: any, - fallbackId?: string, - metadata: Partial = {}, -): ThreadTweetSnapshot | null { - const tweet = value?.tweet ?? value; - const legacy = tweet?.legacy; - if (!legacy) return null; - - const urlEntities = tweetUrlEntities(tweet, legacy); - const noteText = tweet?.note_tweet?.note_tweet_results?.result?.text; - const text = expandVisibleUrlEntities(noteText ?? legacy.full_text ?? legacy.text ?? '', urlEntities); - const resolvedId = String(legacy.id_str ?? tweet?.rest_id ?? fallbackId ?? ''); - if (!resolvedId || !text) return null; - - const userResult = tweet?.core?.user_results?.result; - const handle = userResult?.core?.screen_name ?? userResult?.legacy?.screen_name; - const mediaEntities: any[] = legacy?.extended_entities?.media ?? legacy?.entities?.media ?? []; - - return { - id: resolvedId, - text, - authorHandle: handle, - authorName: userResult?.core?.name ?? userResult?.legacy?.name, - authorProfileImageUrl: - userResult?.avatar?.image_url ?? userResult?.legacy?.profile_image_url_https, - postedAt: legacy.created_at ?? null, - media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), - mediaObjects: mediaEntities.map((m: any) => ({ - type: m.type, - url: m.media_url_https ?? m.media_url, - expandedUrl: m.expanded_url, - width: m.original_info?.width, - height: m.original_info?.height, - altText: m.ext_alt_text, - videoVariants: Array.isArray(m.video_info?.variants) - ? m.video_info.variants - .filter((v: any) => v.content_type === 'video/mp4') - .map((v: any) => ({ bitrate: v.bitrate, url: v.url })) - : undefined, - })), - links: extractExpandedLinks(urlEntities), - conversationId: legacy.conversation_id_str, - inReplyToStatusId: legacy.in_reply_to_status_id_str, - ...metadata, - url: `https://x.com/${handle ?? '_'}/status/${resolvedId}`, - }; -} - -export function parseThreadTweetResultByRestId(json: any, tweetId: string): ThreadTweetSnapshot | null { - const result = json?.data?.tweetResult?.result; - if (!result) return null; - return parseThreadTweetResult(result, tweetId); -} - function unwrapGraphqlResult(value: any): any { return value?.result?.tweet ?? value?.result ?? value?.tweet ?? value; } @@ -1764,106 +1663,6 @@ function buildTweetDetailUrl(tweetId: string, cursor?: string): string { return `https://x.com/i/api/graphql/${TWEET_DETAIL_QUERY_ID}/${TWEET_DETAIL_OPERATION}?${params}`; } -function compareTweetIdsChronologically(a: ThreadTweetSnapshot, b: ThreadTweetSnapshot): number { - const aId = parseSnowflake(a.id); - const bId = parseSnowflake(b.id); - if (aId != null && bId != null && aId !== bId) return aId < bId ? -1 : 1; - const aTime = parseTimestampMs(a.postedAt); - const bTime = parseTimestampMs(b.postedAt); - if (aTime != null && bTime != null && aTime !== bTime) return aTime - bTime; - return a.id.localeCompare(b.id); -} - -function sameHandle(a?: string, b?: string): boolean { - if (!a || !b) return false; - return a.toLowerCase() === b.toLowerCase(); -} - -function conversationSection(content: any): string | undefined { - return content?.clientEventInfo?.details?.conversationDetails?.conversationSection; -} - -function collectThreadEntries(entries: any[], out: ThreadTweetSnapshot[]): string | undefined { - let nextCursor: string | undefined; - for (const entry of entries) { - if (entry?.entryId?.startsWith('cursor-bottom')) { - nextCursor = entry?.content?.value; - continue; - } - - const direct = entry?.content?.itemContent?.tweet_results?.result; - const directSnapshot = direct ? parseThreadTweetResult(direct) : null; - if (directSnapshot) out.push(directSnapshot); - - const moduleItems = entry?.content?.items; - if (Array.isArray(moduleItems)) { - const moduleSnapshots: ThreadTweetSnapshot[] = []; - for (let index = 0; index < moduleItems.length; index++) { - const item = moduleItems[index]; - const result = item?.item?.itemContent?.tweet_results?.result; - const snapshot = result ? parseThreadTweetResult(result, undefined, { - conversationEntryId: entry.entryId, - conversationDisplayType: entry?.content?.displayType, - conversationSection: conversationSection(entry.content), - conversationItemIndex: index, - }) : null; - if (snapshot) moduleSnapshots.push(snapshot); - } - const rootId = moduleSnapshots[0]?.id; - for (const snapshot of moduleSnapshots) { - if (rootId) snapshot.conversationRootId = rootId; - out.push(snapshot); - } - } - } - return nextCursor; -} - -export function parseTweetDetailResponse(json: any): { tweets: ThreadTweetSnapshot[]; nextCursor?: string } { - const instructions = json?.data?.threaded_conversation_with_injections_v2?.instructions ?? []; - const tweets: ThreadTweetSnapshot[] = []; - let nextCursor: string | undefined; - - for (const instruction of instructions) { - if (instruction?.type === 'TimelineAddEntries' && Array.isArray(instruction.entries)) { - nextCursor = collectThreadEntries(instruction.entries, tweets) ?? nextCursor; - } - if (instruction?.type === 'TimelinePinEntry' && instruction.entry) { - collectThreadEntries([instruction.entry], tweets); - } - } - - const byId = new Map(); - for (const tweet of tweets) { - if (!byId.has(tweet.id)) byId.set(tweet.id, tweet); - } - return { tweets: Array.from(byId.values()).sort(compareTweetIdsChronologically), nextCursor }; -} - -export function extractSameAuthorThreadBelow( - tweets: ThreadTweetSnapshot[], - focalTweetId: string, - focalAuthorHandle?: string, -): ThreadTweetSnapshot[] { - const focal = tweets.find((tweet) => tweet.id === focalTweetId); - const authorHandle = focalAuthorHandle ?? focal?.authorHandle; - if (!authorHandle) return []; - - const chainIds = new Set([focalTweetId]); - const below: ThreadTweetSnapshot[] = []; - const sorted = tweets - .filter((tweet) => tweet.id !== focalTweetId && sameHandle(tweet.authorHandle, authorHandle)) - .sort(compareTweetIdsChronologically); - - for (const tweet of sorted) { - if (!tweet.inReplyToStatusId || !chainIds.has(tweet.inReplyToStatusId)) continue; - below.push({ ...tweet, threadRole: 'post-thread' }); - chainIds.add(tweet.id); - } - - return below; -} - export async function fetchTweetByIdViaGraphQL( tweetId: string, csrfToken: string, @@ -1930,6 +1729,10 @@ export async function fetchTweetDetailViaGraphQL( const delayMs = options.delayMs ?? 300; const tweets: ThreadTweetSnapshot[] = []; let cursor: string | undefined; + let sawRecognizedTimeline = false; + let sawTweetResult = false; + let sawUnavailableTweet = false; + let sawUnparseableTweet = false; for (let page = 0; page < maxPages; page++) { let response: Response; @@ -1949,6 +1752,10 @@ export async function fetchTweetDetailViaGraphQL( const json = await response.json(); const parsed = parseTweetDetailResponse(json); + sawRecognizedTimeline = sawRecognizedTimeline || parsed.recognizedTimeline; + sawTweetResult = sawTweetResult || parsed.sawTweetResult; + sawUnavailableTweet = sawUnavailableTweet || parsed.sawUnavailableTweet; + sawUnparseableTweet = sawUnparseableTweet || parsed.sawUnparseableTweet; tweets.push(...parsed.tweets); if (!parsed.nextCursor || parsed.nextCursor === cursor) break; cursor = parsed.nextCursor; @@ -1959,8 +1766,12 @@ export async function fetchTweetDetailViaGraphQL( for (const tweet of tweets) { if (!byId.has(tweet.id)) byId.set(tweet.id, tweet); } - if (byId.size === 0) return { tweets: [], status: 'error' }; - return { tweets: Array.from(byId.values()).sort(compareTweetIdsChronologically), status: 'ok' }; + if (byId.size === 0) { + if (sawUnavailableTweet && !sawUnparseableTweet) return { tweets: [], status: 'not_found' }; + if (sawRecognizedTimeline && !sawTweetResult) return { tweets: [], status: 'empty' }; + return { tweets: [], status: 'error' }; + } + return { tweets: Array.from(byId.values()).sort(compareThreadTweetsChronologically), status: 'ok' }; } async function fetchTweetViaSyndication(tweetId: string): Promise { diff --git a/src/tweet-snapshots.ts b/src/tweet-snapshots.ts new file mode 100644 index 0000000..791f33b --- /dev/null +++ b/src/tweet-snapshots.ts @@ -0,0 +1,293 @@ +import { parseTimestampMs } from './date-utils.js'; +import type { ThreadTweetSnapshot } from './types.js'; + +export function parseSnowflake(value?: string | null): bigint | null { + if (!value || !/^\d+$/.test(value)) return null; + try { + return BigInt(value); + } catch { + return null; + } +} + +export function uniqueStrings(values: string[]): string[] { + return [...new Set(values)]; +} + +function urlEntityKey(entity: any): string { + return String(entity?.url ?? entity?.expanded_url ?? entity?.expandedUrl ?? entity?.display_url ?? entity?.displayUrl ?? ''); +} + +export function tweetUrlEntities(tweet: any, legacy: any): any[] { + const entities = [ + ...(Array.isArray(legacy?.entities?.urls) ? legacy.entities.urls : []), + ...(Array.isArray(tweet?.note_tweet?.note_tweet_results?.result?.entity_set?.urls) + ? tweet.note_tweet.note_tweet_results.result.entity_set.urls + : []), + ]; + const seen = new Set(); + return entities.filter((entity) => { + const key = urlEntityKey(entity); + if (!key || seen.has(key)) return false; + seen.add(key); + return true; + }); +} + +export function syndicationUrlEntities(data: any): any[] { + return Array.isArray(data?.entities?.urls) ? data.entities.urls : []; +} + +export function extractExpandedLinks(urlEntities: any[]): string[] { + return uniqueStrings( + urlEntities + .map((entity) => entity?.expanded_url ?? entity?.expandedUrl ?? entity?.url) + .filter((url): url is string => typeof url === 'string' && url.length > 0 && !url.includes('t.co/')), + ); +} + +function urlEntityReplacement(entity: any): string | undefined { + const expanded = entity?.expanded_url ?? entity?.expandedUrl; + if (typeof expanded === 'string' && expanded.length > 0 && !expanded.includes('t.co/')) return expanded; + const display = entity?.display_url ?? entity?.displayUrl; + if (typeof display === 'string' && display.length > 0 && !display.includes('t.co/')) return display; + return undefined; +} + +export function expandVisibleUrlEntities(text: string, urlEntities: any[]): string { + let expanded = text; + for (const entity of urlEntities) { + if (typeof entity?.url !== 'string') continue; + const replacement = urlEntityReplacement(entity); + if (!replacement) continue; + expanded = expanded.split(entity.url).join(replacement); + } + return expanded; +} + +export function compareThreadTweetsChronologically(a: ThreadTweetSnapshot, b: ThreadTweetSnapshot): number { + const aId = parseSnowflake(a.id); + const bId = parseSnowflake(b.id); + if (aId != null && bId != null && aId !== bId) return aId < bId ? -1 : 1; + const aTime = parseTimestampMs(a.postedAt); + const bTime = parseTimestampMs(b.postedAt); + if (aTime != null && bTime != null && aTime !== bTime) return aTime - bTime; + return a.id.localeCompare(b.id); +} + +function parseThreadTweetResult( + value: any, + fallbackId?: string, + metadata: Partial = {}, +): ThreadTweetSnapshot | null { + const tweet = value?.tweet ?? value; + const legacy = tweet?.legacy; + if (!legacy) return null; + + const urlEntities = tweetUrlEntities(tweet, legacy); + const noteText = tweet?.note_tweet?.note_tweet_results?.result?.text; + const text = expandVisibleUrlEntities(noteText ?? legacy.full_text ?? legacy.text ?? '', urlEntities); + const resolvedId = String(legacy.id_str ?? tweet?.rest_id ?? fallbackId ?? ''); + if (!resolvedId || !text) return null; + + const userResult = tweet?.core?.user_results?.result; + const handle = userResult?.core?.screen_name ?? userResult?.legacy?.screen_name; + const mediaEntities: any[] = legacy?.extended_entities?.media ?? legacy?.entities?.media ?? []; + + return { + id: resolvedId, + text, + authorHandle: handle, + authorName: userResult?.core?.name ?? userResult?.legacy?.name, + authorProfileImageUrl: + userResult?.avatar?.image_url ?? userResult?.legacy?.profile_image_url_https, + postedAt: legacy.created_at ?? null, + media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), + mediaObjects: mediaEntities.map((m: any) => ({ + type: m.type, + url: m.media_url_https ?? m.media_url, + expandedUrl: m.expanded_url, + width: m.original_info?.width, + height: m.original_info?.height, + altText: m.ext_alt_text, + videoVariants: Array.isArray(m.video_info?.variants) + ? m.video_info.variants + .filter((v: any) => v.content_type === 'video/mp4') + .map((v: any) => ({ bitrate: v.bitrate, url: v.url })) + : undefined, + })), + links: extractExpandedLinks(urlEntities), + conversationId: legacy.conversation_id_str, + inReplyToStatusId: legacy.in_reply_to_status_id_str, + ...metadata, + url: `https://x.com/${handle ?? '_'}/status/${resolvedId}`, + }; +} + +export function parseThreadTweetResultByRestId(json: any, tweetId: string): ThreadTweetSnapshot | null { + const result = json?.data?.tweetResult?.result; + if (!result) return null; + return parseThreadTweetResult(result, tweetId); +} + +function sameHandle(a?: string, b?: string): boolean { + if (!a || !b) return false; + return a.toLowerCase() === b.toLowerCase(); +} + +function conversationSection(content: any): string | undefined { + return content?.clientEventInfo?.details?.conversationDetails?.conversationSection; +} + +function isUnavailableTweetResult(result: any): boolean { + const typename = result?.__typename ?? result?.tweet?.__typename; + return typename === 'TweetTombstone' || typename === 'TweetUnavailable'; +} + +interface CollectThreadResult { + nextCursor?: string; + sawTweetResult: boolean; + sawUnavailableTweet: boolean; + sawUnparseableTweet: boolean; +} + +function emptyCollectThreadResult(): CollectThreadResult { + return { + sawTweetResult: false, + sawUnavailableTweet: false, + sawUnparseableTweet: false, + }; +} + +function mergeCollectThreadResult(target: CollectThreadResult, source: CollectThreadResult): void { + target.nextCursor = source.nextCursor ?? target.nextCursor; + target.sawTweetResult = target.sawTweetResult || source.sawTweetResult; + target.sawUnavailableTweet = target.sawUnavailableTweet || source.sawUnavailableTweet; + target.sawUnparseableTweet = target.sawUnparseableTweet || source.sawUnparseableTweet; +} + +function parseResultInto( + result: any, + tweets: ThreadTweetSnapshot[], + fallbackId?: string, + metadata: Partial = {}, +): Pick { + if (!result) { + return { sawTweetResult: false, sawUnavailableTweet: false, sawUnparseableTweet: false }; + } + + if (isUnavailableTweetResult(result)) { + return { sawTweetResult: true, sawUnavailableTweet: true, sawUnparseableTweet: false }; + } + + const snapshot = parseThreadTweetResult(result, fallbackId, metadata); + if (!snapshot) { + return { sawTweetResult: true, sawUnavailableTweet: false, sawUnparseableTweet: true }; + } + + tweets.push(snapshot); + return { sawTweetResult: true, sawUnavailableTweet: false, sawUnparseableTweet: false }; +} + +function collectThreadEntries(entries: any[], out: ThreadTweetSnapshot[]): CollectThreadResult { + const result = emptyCollectThreadResult(); + for (const entry of entries) { + if (entry?.entryId?.startsWith('cursor-bottom')) { + result.nextCursor = entry?.content?.value; + continue; + } + + const direct = entry?.content?.itemContent?.tweet_results?.result; + const directParsed = parseResultInto(direct, out); + result.sawTweetResult = result.sawTweetResult || directParsed.sawTweetResult; + result.sawUnavailableTweet = result.sawUnavailableTweet || directParsed.sawUnavailableTweet; + result.sawUnparseableTweet = result.sawUnparseableTweet || directParsed.sawUnparseableTweet; + + const moduleItems = entry?.content?.items; + if (Array.isArray(moduleItems)) { + const moduleSnapshots: ThreadTweetSnapshot[] = []; + for (let index = 0; index < moduleItems.length; index++) { + const item = moduleItems[index]; + const itemResult = item?.item?.itemContent?.tweet_results?.result; + const parsed = parseResultInto(itemResult, moduleSnapshots, undefined, { + conversationEntryId: entry.entryId, + conversationDisplayType: entry?.content?.displayType, + conversationSection: conversationSection(entry.content), + conversationItemIndex: index, + }); + result.sawTweetResult = result.sawTweetResult || parsed.sawTweetResult; + result.sawUnavailableTweet = result.sawUnavailableTweet || parsed.sawUnavailableTweet; + result.sawUnparseableTweet = result.sawUnparseableTweet || parsed.sawUnparseableTweet; + } + const rootId = moduleSnapshots[0]?.id; + for (const snapshot of moduleSnapshots) { + if (rootId) snapshot.conversationRootId = rootId; + out.push(snapshot); + } + } + } + return result; +} + +export interface TweetDetailParseResult { + tweets: ThreadTweetSnapshot[]; + nextCursor?: string; + recognizedTimeline: boolean; + sawTweetResult: boolean; + sawUnavailableTweet: boolean; + sawUnparseableTweet: boolean; +} + +export function parseTweetDetailResponse(json: any): TweetDetailParseResult { + const instructionsValue = json?.data?.threaded_conversation_with_injections_v2?.instructions; + const recognizedTimeline = Array.isArray(instructionsValue); + const instructions = recognizedTimeline ? instructionsValue : []; + const tweets: ThreadTweetSnapshot[] = []; + const collectResult = emptyCollectThreadResult(); + + for (const instruction of instructions) { + if (instruction?.type === 'TimelineAddEntries' && Array.isArray(instruction.entries)) { + mergeCollectThreadResult(collectResult, collectThreadEntries(instruction.entries, tweets)); + } + if (instruction?.type === 'TimelinePinEntry' && instruction.entry) { + mergeCollectThreadResult(collectResult, collectThreadEntries([instruction.entry], tweets)); + } + } + + const byId = new Map(); + for (const tweet of tweets) { + if (!byId.has(tweet.id)) byId.set(tweet.id, tweet); + } + return { + tweets: Array.from(byId.values()).sort(compareThreadTweetsChronologically), + nextCursor: collectResult.nextCursor, + recognizedTimeline, + sawTweetResult: collectResult.sawTweetResult, + sawUnavailableTweet: collectResult.sawUnavailableTweet, + sawUnparseableTweet: collectResult.sawUnparseableTweet, + }; +} + +export function extractSameAuthorThreadBelow( + tweets: ThreadTweetSnapshot[], + focalTweetId: string, + focalAuthorHandle?: string, +): ThreadTweetSnapshot[] { + const focal = tweets.find((tweet) => tweet.id === focalTweetId); + const authorHandle = focalAuthorHandle ?? focal?.authorHandle; + if (!authorHandle) return []; + + const chainIds = new Set([focalTweetId]); + const below: ThreadTweetSnapshot[] = []; + const sorted = tweets + .filter((tweet) => tweet.id !== focalTweetId && sameHandle(tweet.authorHandle, authorHandle)) + .sort(compareThreadTweetsChronologically); + + for (const tweet of sorted) { + if (!tweet.inReplyToStatusId || !chainIds.has(tweet.inReplyToStatusId)) continue; + below.push({ ...tweet, threadRole: 'post-thread' }); + chainIds.add(tweet.id); + } + + return below; +} diff --git a/tests/bookmarks-db.test.ts b/tests/bookmarks-db.test.ts index 7bea098..dcdcc8e 100644 --- a/tests/bookmarks-db.test.ts +++ b/tests/bookmarks-db.test.ts @@ -118,6 +118,64 @@ test('searchBookmarks: full-text search returns matching results', async () => { }); }); +test('searchBookmarks: indexes captured reply thread text', async () => { + const fixtures = [{ + ...FIXTURES[0], + text: 'Launch post with details in replies', + threadBelow: [{ + id: '11', + text: 'The actual release is Saperly with the paper link.', + authorHandle: 'alice', + links: ['https://saperly.com'], + url: 'https://x.com/alice/status/11', + }], + threadExpandedAt: '2026-01-01T12:05:00Z', + }]; + + await withIsolatedDataDir(async () => { + await buildIndex(); + const results = await searchBookmarks({ query: 'Saperly', limit: 10 }); + assert.equal(results.length, 1); + assert.equal(results[0].id, '1'); + }, fixtures); +}); + +test('searchBookmarks: migration backfills thread text from existing thread JSON', async () => { + await withIsolatedDataDir(async () => { + await buildIndex(); + + const dbPath = twitterBookmarksIndexPath(); + const db = await openDb(dbPath); + try { + db.run( + `UPDATE bookmarks + SET thread_below_json = ?, thread_text = NULL + WHERE id = ?`, + [JSON.stringify([{ + id: '12', + text: 'Migration-only reply mentioning Portola.', + authorHandle: 'alice', + url: 'https://x.com/alice/status/12', + }]), '1'] + ); + db.run('DROP TABLE IF EXISTS bookmarks_fts'); + db.run(`CREATE VIRTUAL TABLE bookmarks_fts USING fts5( + text, author_handle, author_name, article_text, + content=bookmarks, content_rowid=rowid, + tokenize='porter unicode61' + )`); + db.run("INSERT INTO bookmarks_fts(bookmarks_fts) VALUES('rebuild')"); + saveDb(db, dbPath); + } finally { + db.close(); + } + + const results = await searchBookmarks({ query: 'Portola', limit: 10 }); + assert.equal(results.length, 1); + assert.equal(results[0].id, '1'); + }); +}); + test('searchBookmarks: author filter works', async () => { await withIsolatedDataDir(async () => { await buildIndex(); diff --git a/tests/graphql-bookmarks.test.ts b/tests/graphql-bookmarks.test.ts index 53b91cc..636d66c 100644 --- a/tests/graphql-bookmarks.test.ts +++ b/tests/graphql-bookmarks.test.ts @@ -12,6 +12,7 @@ import { parseTweetArticleByRestId, parseTweetDetailResponse, parseTweetResultByRestId, + fetchTweetDetailViaGraphQL, sanitizeBookmarkedAt, scoreRecord, mergeBookmarkRecord, @@ -24,7 +25,7 @@ import { syncThreads, extractSameAuthorThreadBelow, } from '../src/graphql-bookmarks.js'; -import { buildIndex, getBookmarkById } from '../src/bookmarks-db.js'; +import { buildIndex, getBookmarkById, searchBookmarks } from '../src/bookmarks-db.js'; import { resolveFolder, formatFolderMirrorStats } from '../src/cli.js'; import type { BookmarkFolder, BookmarkRecord } from '../src/types.js'; @@ -610,6 +611,63 @@ test('parseTweetDetailResponse: expands thread tweet t.co links and stores links assert.deepEqual(parsed.tweets[0].links, ['https://example.com/reply']); }); +test('fetchTweetDetailViaGraphQL: treats recognized empty timelines as permanent empty', async () => { + const originalFetch = globalThis.fetch; + globalThis.fetch = (async () => new Response(JSON.stringify({ + data: { + threaded_conversation_with_injections_v2: { + instructions: [{ type: 'TimelineAddEntries', entries: [] }], + }, + }, + }), { + status: 200, + headers: { 'content-type': 'application/json' }, + })) as typeof fetch; + + try { + const result = await fetchTweetDetailViaGraphQL('100', 'ct0', 'ct0=ct0; auth_token=auth', { delayMs: 0 }); + assert.equal(result.status, 'empty'); + } finally { + globalThis.fetch = originalFetch; + } +}); + +test('fetchTweetDetailViaGraphQL: keeps unparseable tweet timelines transient', async () => { + const originalFetch = globalThis.fetch; + globalThis.fetch = (async () => new Response(JSON.stringify({ + data: { + threaded_conversation_with_injections_v2: { + instructions: [{ + type: 'TimelineAddEntries', + entries: [{ + entryId: 'tweet-100', + content: { + itemContent: { + tweet_results: { + result: { + __typename: 'Tweet', + rest_id: '100', + }, + }, + }, + }, + }], + }], + }, + }, + }), { + status: 200, + headers: { 'content-type': 'application/json' }, + })) as typeof fetch; + + try { + const result = await fetchTweetDetailViaGraphQL('100', 'ct0', 'ct0=ct0; auth_token=auth', { delayMs: 0 }); + assert.equal(result.status, 'error'); + } finally { + globalThis.fetch = originalFetch; + } +}); + test('parseBookmarksResponse: captures full note_tweet body from live bookmarks-feed fixture', () => { const fixture = loadFixture('bookmark-feed-note-tweet.json'); const { records } = parseBookmarksResponse(fixture, NOW); @@ -868,6 +926,9 @@ test('syncThreads: writes parent context and same-author continuations to JSONL const refreshed = await getBookmarkById('100'); assert.equal(refreshed?.threadContext[0]?.id, '99'); assert.equal(refreshed?.threadBelow[0]?.id, '101'); + + const searchResults = await searchBookmarks({ query: 'actual', limit: 10 }); + assert.ok(searchResults.some((result) => result.id === '100')); }, [bookmark]); }); From 840c9de1cb9c62e381f7cda3297fb1f748a7e5bf Mon Sep 17 00:00:00 2001 From: Eric Litman Date: Thu, 7 May 2026 17:29:32 +0300 Subject: [PATCH 4/5] fix: dedupe syndication thread fetch --- src/graphql-bookmarks.ts | 121 +++++++++++++++------------------------ 1 file changed, 45 insertions(+), 76 deletions(-) diff --git a/src/graphql-bookmarks.ts b/src/graphql-bookmarks.ts index f1a859d..2a9e035 100644 --- a/src/graphql-bookmarks.ts +++ b/src/graphql-bookmarks.ts @@ -1774,7 +1774,36 @@ export async function fetchTweetDetailViaGraphQL( return { tweets: Array.from(byId.values()).sort(compareThreadTweetsChronologically), status: 'ok' }; } -async function fetchTweetViaSyndication(tweetId: string): Promise { +function parseSyndicationThreadTweet(data: any, tweetId: string): ThreadTweetSnapshot | null { + if (!data?.text) return null; + const handle = data.user?.screen_name; + const mediaEntities: any[] = data.mediaDetails ?? []; + const urlEntities = syndicationUrlEntities(data); + + return { + id: String(data.id_str ?? tweetId), + text: expandVisibleUrlEntities(data.text, urlEntities), + authorHandle: handle, + authorName: data.user?.name, + authorProfileImageUrl: data.user?.profile_image_url_https, + postedAt: data.created_at ?? null, + media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), + mediaObjects: mediaEntities.map((m: any) => ({ + type: m.type, + url: m.media_url_https ?? m.media_url, + width: m.original_info?.width, + height: m.original_info?.height, + })), + links: extractExpandedLinks(urlEntities), + conversationId: data.conversation_id_str, + inReplyToStatusId: data.in_reply_to_status_id_str, + url: `https://x.com/${handle ?? '_'}/status/${data.id_str ?? tweetId}`, + }; +} + +async function fetchSyndicationThreadTweet( + tweetId: string, +): Promise<{ tweet: ThreadTweetSnapshot | null; status: TweetFetchResult['status']; httpStatus?: number }> { for (let attempt = 0; attempt < 4; attempt++) { const response = await fetch(`${SYNDICATION_URL}?id=${tweetId}&token=x`, { headers: { @@ -1784,31 +1813,8 @@ async function fetchTweetViaSyndication(tweetId: string): Promise m.media_url_https ?? m.media_url).filter(Boolean), - mediaObjects: mediaEntities.map((m: any) => ({ - type: m.type, - url: m.media_url_https ?? m.media_url, - width: m.original_info?.width, - height: m.original_info?.height, - })), - links: extractExpandedLinks(urlEntities), - url: `https://x.com/${handle ?? '_'}/status/${data.id_str ?? tweetId}`, - }, - }; + const tweet = parseSyndicationThreadTweet(data, tweetId); + return tweet ? { tweet, status: 'ok' } : { tweet: null, status: 'empty' }; } if (response.status === 429) { @@ -1821,60 +1827,23 @@ async function fetchTweetViaSyndication(tweetId: string): Promise { - for (let attempt = 0; attempt < 4; attempt++) { - const response = await fetch(`${SYNDICATION_URL}?id=${tweetId}&token=x`, { - headers: { - 'user-agent': CHROME_UA, - }, - }); - - if (response.ok) { - const data = await response.json() as any; - if (!data?.text) return { tweet: null, status: 'empty' }; - const handle = data.user?.screen_name; - const mediaEntities: any[] = data.mediaDetails ?? []; - const urlEntities = syndicationUrlEntities(data); - return { - status: 'ok', - tweet: { - id: String(data.id_str ?? tweetId), - text: expandVisibleUrlEntities(data.text, urlEntities), - authorHandle: handle, - authorName: data.user?.name, - authorProfileImageUrl: data.user?.profile_image_url_https, - postedAt: data.created_at ?? null, - media: mediaEntities.map((m: any) => m.media_url_https ?? m.media_url).filter(Boolean), - mediaObjects: mediaEntities.map((m: any) => ({ - type: m.type, - url: m.media_url_https ?? m.media_url, - width: m.original_info?.width, - height: m.original_info?.height, - })), - links: extractExpandedLinks(urlEntities), - conversationId: data.conversation_id_str, - inReplyToStatusId: data.in_reply_to_status_id_str, - url: `https://x.com/${handle ?? '_'}/status/${data.id_str ?? tweetId}`, - }, - }; - } +async function fetchTweetViaSyndication(tweetId: string): Promise { + const result = await fetchSyndicationThreadTweet(tweetId); + return { + snapshot: result.tweet, + status: result.status, + httpStatus: result.httpStatus, + source: 'syndication', + }; +} - if (response.status === 429) { - await new Promise((r) => setTimeout(r, Math.min(15 * Math.pow(2, attempt), 120) * 1000)); - continue; - } - if (response.status >= 500) { - await new Promise((r) => setTimeout(r, 5000 * (attempt + 1))); - continue; - } - return { tweet: null, status: response.status === 404 ? 'not_found' : 'forbidden' }; - } - return { tweet: null, status: 'rate_limited' }; +async function fetchThreadTweetViaSyndication(tweetId: string): Promise<{ tweet: ThreadTweetSnapshot | null; status: TweetFetchResult['status'] }> { + return fetchSyndicationThreadTweet(tweetId); } // Text >= 275 chars may be truncated by Twitter's legacy.full_text limit From 7b630a232860356d3d36ce975a9a3279962e831f Mon Sep 17 00:00:00 2001 From: Eric Litman Date: Thu, 7 May 2026 17:49:29 +0300 Subject: [PATCH 5/5] fix: preserve partial thread sync data --- src/graphql-bookmarks.ts | 54 +++++++++------ tests/graphql-bookmarks.test.ts | 119 ++++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+), 23 deletions(-) diff --git a/src/graphql-bookmarks.ts b/src/graphql-bookmarks.ts index 2a9e035..52d5d6e 100644 --- a/src/graphql-bookmarks.ts +++ b/src/graphql-bookmarks.ts @@ -1904,6 +1904,10 @@ const THREAD_TRANSIENT_FAILURE_STATUSES = new Set([ 'error', ]); +function isPermanentThreadFailure(status: TweetFetchResult['status'] | null): boolean { + return status === 'not_found' || status === 'forbidden' || status === 'empty'; +} + export interface SyncThreadsOptions { onProgress?: (progress: ThreadSyncProgress) => void; delayMs?: number; @@ -1995,6 +1999,7 @@ async function expandThreadForRecord( let below: ThreadTweetSnapshot[] = []; if (cookies.csrfToken) { const detail = await fetchTweetDetailViaGraphQL(record.tweetId, cookies.csrfToken, cookies.cookieHeader, { delayMs }); + if (detail.status === 'empty') return { context, below: [], status: 'ok' }; if (detail.status !== 'ok') return { context, below: [], status: detail.status }; below = extractSameAuthorThreadBelow(detail.tweets, record.tweetId, record.authorHandle); } @@ -2049,6 +2054,15 @@ export async function syncThreads(options: SyncThreadsOptions = {}): Promise = []; + const queueThreadUpdate = (record: BookmarkRecord): void => { + dbUpdates.push({ + id: record.id, + threadContext: record.threadContext, + threadBelow: record.threadBelow, + threadExpandedAt: record.threadExpandedAt, + threadExpansionFailedAt: record.threadExpansionFailedAt, + }); + }; const persistProgress = async (): Promise => { await writeJsonLines(cachePath, records); if (dbUpdates.length > 0) { @@ -2059,7 +2073,7 @@ export async function syncThreads(options: SyncThreadsOptions = {}): Promise 0) contextFilled++; if (expanded.below.length > 0) belowFilled++; if (expanded.context.length === 0 && expanded.below.length === 0) emptyChecked++; - dbUpdates.push({ - id: record.id, - threadContext: record.threadContext, - threadBelow: record.threadBelow, - threadExpandedAt: record.threadExpandedAt, - }); + queueThreadUpdate(record); } else { + const hasPartialSnapshots = expanded.context.length > 0 || expanded.below.length > 0; + if (hasPartialSnapshots) { + if (expanded.context.length > 0) { + record.threadContext = expanded.context; + contextFilled++; + } + if (expanded.below.length > 0) { + record.threadBelow = expanded.below; + belowFilled++; + } + } failed++; const reason = GAP_FILL_FAILURE_REASONS[expanded.status] ?? expanded.status; failures.push({ @@ -2087,6 +2107,7 @@ export async function syncThreads(options: SyncThreadsOptions = {}): Promise { + const bookmark = makeRecord({ + id: '100', + tweetId: '100', + url: 'https://x.com/testuser/status/100', + text: 'Launch post', + authorHandle: 'testuser', + postedAt: '2026-04-01T00:00:00.000Z', + inReplyToStatusId: '99', + }); + const parent = makeTweetResult({ + tweet: { rest_id: '99' }, + legacy: { + id_str: '99', + full_text: 'Parent context from the original announcement', + conversation_id_str: '99', + in_reply_to_status_id_str: undefined, + }, + }); + const emptyDetail = { + data: { + threaded_conversation_with_injections_v2: { + instructions: [ + { type: 'TimelineAddEntries', entries: [] }, + ], + }, + }, + }; + + await withIsolatedGapFillDataDir(async () => { + await buildIndex(); + const originalFetch = globalThis.fetch; + let resultCalls = 0; + let detailCalls = 0; + globalThis.fetch = (async (input: RequestInfo | URL) => { + const url = String(input); + if (url.includes('/TweetResultByRestId?')) { + resultCalls += 1; + return new Response(JSON.stringify({ data: { tweetResult: { result: parent } } })); + } + if (url.includes('/TweetDetail?')) { + detailCalls += 1; + return new Response(JSON.stringify(emptyDetail)); + } + throw new Error(`Unexpected fetch: ${url}`); + }) as typeof fetch; + + try { + const result = await syncThreads({ + csrfToken: 'ct0', + cookieHeader: 'ct0=ct0; auth_token=auth', + delayMs: 0, + }); + + assert.equal(result.contextFilled, 1); + assert.equal(result.belowFilled, 0); + assert.equal(result.emptyChecked, 0); + assert.equal(result.failed, 0); + assert.equal(resultCalls, 1); + assert.equal(detailCalls, 1); + + const jsonl = await readFile(path.join(process.env.FT_DATA_DIR!, 'bookmarks.jsonl'), 'utf8'); + const stored = JSON.parse(jsonl.trim().split('\n').pop()!); + assert.equal(stored.threadContext[0].text, 'Parent context from the original announcement'); + assert.deepEqual(stored.threadBelow, []); + assert.ok(stored.threadExpandedAt); + assert.equal(stored.threadExpansionFailedAt, undefined); + + const refreshed = await getBookmarkById('100'); + assert.equal(refreshed?.threadContext[0]?.id, '99'); + assert.deepEqual(refreshed?.threadBelow, []); + assert.ok(refreshed?.threadExpandedAt); + assert.equal(refreshed?.threadExpansionFailedAt, null); + } finally { + globalThis.fetch = originalFetch; + } + }, [bookmark]); +}); + test('syncThreads: rechecks recent empty threads but skips old checked empties', async () => { const recentChecked = makeRecord({ id: '200', @@ -1043,6 +1122,46 @@ test('syncThreads: transient failures abort without stamping permanent failure', }, [first, second]); }); +test('syncThreads: transient failure checkpoints partial thread data without stamping permanent failure', async () => { + const bookmark = makeRecord({ + id: '700', + tweetId: '700', + url: 'https://x.com/testuser/status/700', + text: 'Rate limited after parent context', + authorHandle: 'testuser', + }); + const parent = { + id: '699', + text: 'Fetched parent before rate limit', + authorHandle: 'testuser', + url: 'https://x.com/testuser/status/699', + }; + + await withIsolatedGapFillDataDir(async () => { + await buildIndex(); + await assert.rejects( + () => syncThreads({ + threadFetcher: async () => ({ context: [parent], below: [], status: 'rate_limited' }), + delayMs: 0, + }), + /rate limiting/, + ); + + const jsonl = await readFile(path.join(process.env.FT_DATA_DIR!, 'bookmarks.jsonl'), 'utf8'); + const stored = JSON.parse(jsonl.trim().split('\n').pop()!); + assert.equal(stored.threadContext[0].text, 'Fetched parent before rate limit'); + assert.equal(stored.threadBelow, undefined); + assert.equal(stored.threadExpandedAt, undefined); + assert.equal(stored.threadExpansionFailedAt, undefined); + + const refreshed = await getBookmarkById('700'); + assert.equal(refreshed?.threadContext[0]?.id, '699'); + assert.deepEqual(refreshed?.threadBelow, []); + assert.equal(refreshed?.threadExpandedAt, null); + assert.equal(refreshed?.threadExpansionFailedAt, null); + }, [bookmark]); +}); + test('syncGaps: expands truncated note_tweet and stamps textExpandedAt', async () => { const fixture = loadFixture('tweet-result-by-rest-id-note-tweet.json'); const legacyPreview: string = fixture.data.tweetResult.result.legacy.full_text;