Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions scripts/tina-migration/algolia-indexer/test_strip.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import fs from 'fs';
import path from 'path';
import { fileURLToPath } from 'url';
import matter from 'gray-matter';

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

/**
 * Strip MDX/JSX components, HTML tags, import statements, and markdown
 * syntax to produce plain text suitable for full-text search indexing.
 *
 * Rule order matters: line-level markers (headings, horizontal rules, list
 * bullets) are removed BEFORE inline emphasis, otherwise the emphasis rule
 * half-consumes `***` / `___` rules and `* item *bold*` bullets and leaves
 * stray `*` / `_` characters in the output.
 *
 * @param {string} rawContent - MDX body (frontmatter already removed).
 *   `null`/`undefined` are treated as an empty string.
 * @returns {string} Trimmed plain text with blank lines collapsed.
 */
function stripMdxToPlainText(rawContent) {
  let text = rawContent == null ? '' : String(rawContent);

  // Normalise CRLF / CR so the line-anchored rules and the blank-line
  // collapse below behave the same on Windows checkouts.
  text = text.replace(/\r\n?/g, '\n');

  // Remove import statements
  text = text.replace(/^import\s+.*?[;\n]/gm, '');

  // Remove <endIntro /> markers
  text = text.replace(/<endIntro\s*\/?>/gi, '');

  // Remove multi-line MDX embed components (may contain JSX such as body={<>...</>})
  text = text.replace(/<(boxEmbed|imageEmbed|emailEmbed|youtubeEmbed)\s[\s\S]*?\/>/gi, '');

  // Remove any remaining self-closing JSX/HTML tags
  text = text.replace(/<[a-zA-Z][a-zA-Z0-9]*\s[^>]*\/>/g, '');

  // Remove <script>/<style> elements together with their content
  text = text.replace(/<(script|style)[\s\S]*?<\/\1>/gi, '');

  // Remove remaining opening/closing tags but keep their inner text
  text = text.replace(/<\/?[a-zA-Z][a-zA-Z0-9]*[^>]*>/g, '');

  // Remove JSX fragment expression wrappers {<> ... </>}
  text = text.replace(/\{<>|<\/>\}/g, '');

  // Remove markdown images ![alt](url)
  text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');

  // Convert markdown links [text](url) to just text
  text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');

  // Remove markdown heading markers
  text = text.replace(/^#{1,6}\s+/gm, '');

  // Remove horizontal rules (before emphasis, see function comment)
  text = text.replace(/^[-*_]{3,}\s*$/gm, '');

  // Remove list markers (before emphasis so a `*` bullet is not read as an opener)
  text = text.replace(/^\s*[\*\-\+]\s+/gm, '');
  text = text.replace(/^\s*\d+\.\s+/gm, '');

  // Remove emphasis markers written with *, ~~ or ==
  text = text.replace(/(\*{1,3}|~~|==)(.*?)\1/g, '$2');

  // Remove underscore emphasis only when the run is not inside a word, so
  // identifiers such as my_config_file keep their underscores.
  text = text.replace(/(?<![A-Za-z0-9_])(_{1,3})(?!_)(.*?)\1(?![A-Za-z0-9_])/g, '$2');

  // Drop fenced code blocks entirely (fences and code), then unwrap inline code
  text = text.replace(/```[\s\S]*?```/g, '');
  text = text.replace(/`([^`]*)`/g, '$1');

  // Collapse multiple newlines and runs of spaces/tabs
  text = text.replace(/\n{2,}/g, '\n');
  text = text.replace(/[ \t]{2,}/g, ' ');

  return text.trim();
}

// Manual smoke test: strip one known rule file and print a short summary
// (title, stripped length, head and tail of the text) for eyeballing.
const ROOT_DIR = path.resolve(__dirname, '../../../public/uploads/rules');
const testFile = path.join(ROOT_DIR, 'ai-assisted-tools-for-prototyping/rule.mdx');
const raw = fs.readFileSync(testFile, 'utf-8');
const { data, content } = matter(raw);
const plain = stripMdxToPlainText(content);

// Each entry is a heading line followed by the arguments of the next log call.
const reportSections = [
  ['--- TITLE ---', [data.title]],
  ['--- CONTENT LENGTH ---', [plain.length, 'chars']],
  ['--- FIRST 500 CHARS ---', [plain.slice(0, 500)]],
  ['--- LAST 200 CHARS ---', [plain.slice(-200)]],
];

for (const [heading, values] of reportSections) {
  console.log(heading);
  console.log(...values);
}
157 changes: 99 additions & 58 deletions scripts/tina-migration/algolia-indexer/update_algolia_index.js
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,76 @@ const APP_ID = process.env.NEXT_PUBLIC_ALGOLIA_APP_ID;
// Algolia admin (write) key, read from the environment.
// NOTE(review): NEXT_PUBLIC_ is the Next.js prefix for variables that may be
// inlined into browser bundles — confirm this admin key is never exposed
// client-side, or move it to a non-public variable name.
const ADMIN_KEY = process.env.NEXT_PUBLIC_ALGOLIA_ADMIN_KEY;
// Target index; falls back to 'index-json' when the variable is unset or empty.
const INDEX_NAME = process.env.NEXT_PUBLIC_ALGOLIA_INDEX_NAME || 'index-json';

// Maximum content length to stay within Algolia's 10KB record limit.
// Frontmatter typically takes ~1-2KB, so we cap content at 8000 chars.
// NOTE(review): the cap is compared against String.length (UTF-16 code units),
// not bytes — text with many non-ASCII characters can serialise to more than
// 8000 bytes; confirm the margin is sufficient.
const MAX_CONTENT_LENGTH = 8000;

// Abort early when credentials are missing. INDEX_NAME always has a value
// because of the default above, so only APP_ID / ADMIN_KEY can trip this.
if (!APP_ID || !ADMIN_KEY || !INDEX_NAME) {
console.error('⛔ Missing .env variable.');
process.exit(1);
}

// Client used for every index read/write below.
const client = algoliasearch(APP_ID, ADMIN_KEY);

/**
 * Strip MDX/JSX components, HTML tags, import statements, and markdown
 * syntax to produce clean plain text suitable for full-text search indexing.
 *
 * Rule order matters: line-level markers (headings, horizontal rules, list
 * bullets) are removed BEFORE inline emphasis, otherwise the emphasis rule
 * half-consumes `***` / `___` rules and `* item *bold*` bullets and leaves
 * stray `*` / `_` characters in the indexed text.
 *
 * @param {string} rawContent - MDX body (frontmatter already removed).
 *   `null`/`undefined` are treated as an empty string.
 * @returns {string} Trimmed plain text with blank lines collapsed.
 */
function stripMdxToPlainText(rawContent) {
  let text = rawContent == null ? '' : String(rawContent);

  // Normalise CRLF / CR so the line-anchored rules and the blank-line
  // collapse below behave the same on Windows checkouts.
  text = text.replace(/\r\n?/g, '\n');

  // Remove import statements
  text = text.replace(/^import\s+.*?[;\n]/gm, '');

  // Remove <endIntro /> self-closing tags
  text = text.replace(/<endIntro\s*\/?>/gi, '');

  // Remove multi-line MDX component blocks (e.g. <boxEmbed ... />, <imageEmbed ... />, etc.)
  // These can span multiple lines with JSX expressions like body={<>...</>}
  text = text.replace(/<(boxEmbed|imageEmbed|emailEmbed|youtubeEmbed)\s[\s\S]*?\/>/gi, '');

  // Remove any remaining self-closing JSX/HTML tags
  text = text.replace(/<[a-zA-Z][a-zA-Z0-9]*\s[^>]*\/>/g, '');

  // Remove paired HTML/JSX tags and their content for non-content tags
  text = text.replace(/<(script|style)[\s\S]*?<\/\1>/gi, '');

  // Remove remaining HTML/JSX opening and closing tags (keep inner content)
  text = text.replace(/<\/?[a-zA-Z][a-zA-Z0-9]*[^>]*>/g, '');

  // Remove JSX expression wrappers {<> ... </>}
  text = text.replace(/\{<>|<\/>\}/g, '');

  // Remove markdown image syntax ![alt](url)
  text = text.replace(/!\[[^\]]*\]\([^)]*\)/g, '');

  // Convert markdown links [text](url) to just text
  text = text.replace(/\[([^\]]*)\]\([^)]*\)/g, '$1');

  // Remove markdown heading markers
  text = text.replace(/^#{1,6}\s+/gm, '');

  // Remove horizontal rules (before emphasis, see function comment)
  text = text.replace(/^[-*_]{3,}\s*$/gm, '');

  // Remove markdown list markers (before emphasis so a `*` bullet is not read as an opener)
  text = text.replace(/^\s*[\*\-\+]\s+/gm, '');
  text = text.replace(/^\s*\d+\.\s+/gm, '');

  // Remove markdown emphasis markers written with *, ~~ or == (bold, italic, strikethrough, highlight)
  text = text.replace(/(\*{1,3}|~~|==)(.*?)\1/g, '$2');

  // Remove underscore emphasis only when the run is not inside a word, so
  // identifiers such as my_config_file keep their underscores.
  text = text.replace(/(?<![A-Za-z0-9_])(_{1,3})(?!_)(.*?)\1(?![A-Za-z0-9_])/g, '$2');

  // Drop fenced code blocks entirely (fences and code), then unwrap inline code
  text = text.replace(/```[\s\S]*?```/g, '');
  text = text.replace(/`([^`]*)`/g, '$1');

  // Collapse multiple newlines and spaces
  text = text.replace(/\n{2,}/g, '\n');
  text = text.replace(/[ \t]{2,}/g, ' ');

  return text.trim();
}

// Get current files
const files = await fg('**/*.mdx', { cwd: ROOT_DIR, absolute: true });

Expand All @@ -31,75 +94,53 @@ const currentObjects = files.map(fp => {
const rawSlug = path.relative(ROOT_DIR, path.dirname(fp)).replace(/\\/g, '/');
const slug = rawSlug.replace(/-+/g, '-');

// Strip MDX to plain text and truncate for Algolia record size limit
let plainContent = stripMdxToPlainText(content);
if (plainContent.length > MAX_CONTENT_LENGTH) {
plainContent = plainContent.substring(0, MAX_CONTENT_LENGTH) + '…';
}

// Only patch the content field — preserve all other fields managed by TinaCMS
return {
objectID: slug,
slug,
...frontmatter,
content: plainContent,
};
});
const currentObjectIDs = new Set(currentObjects.map(obj => obj.objectID));

// Get existing objects from Algolia
console.log('🔍 Fetching existing objects from Algolia...');
const existingObjects = [];

// Use searchForHits with empty query to get all objects
let page = 0;
const hitsPerPage = 1000; // Maximum allowed
let hasMore = true;

while (hasMore) {
try {
const { results } = await client.search({
requests: [{
indexName: INDEX_NAME,
query: '', // Empty query returns all results
page,
hitsPerPage,
attributesToRetrieve: ['objectID'] // Only need objectID for comparison
}]
});

const hits = results[0].hits;
existingObjects.push(...hits);

hasMore = hits.length === hitsPerPage;
page++;

console.log(`📄 Fetched page ${page}, total objects: ${existingObjects.length}`);
} catch (error) {
console.error('Error fetching existing objects:', error);
break;
}
}

const existingObjectIDs = new Set(existingObjects.map(obj => obj.objectID));

// Find objects to delete (exist in Algolia but not in current files)
const objectsToDelete = [...existingObjectIDs].filter(id => !currentObjectIDs.has(id));
console.log(`📊 Rules to update: ${currentObjects.length}`);

console.log(`📊 Current files: ${currentObjects.length}`);
console.log(`📊 Existing in index: ${existingObjects.length}`);
console.log(`🗑️ Objects to delete: ${objectsToDelete.length}`);

// Delete removed objects
if (objectsToDelete.length > 0) {
console.log(`🗑️ Deleting ${objectsToDelete.length} objects...`);
await client.deleteObjects({
indexName: INDEX_NAME,
objectIDs: objectsToDelete,
waitForTasks: true,
});
}

// Update/add current objects
// Patch content field on existing Algolia records without overwriting other fields
if (currentObjects.length > 0) {
console.log(`🔄 Updating ${currentObjects.length} objects...`);
await client.saveObjects({
console.log(`🔄 Patching content field on ${currentObjects.length} records...`);
await client.partialUpdateObjects({
indexName: INDEX_NAME,
objects: currentObjects,
createIfNotExists: false,
waitForTasks: true,
});
}

// Configure searchable attributes and snippet/highlight settings.
// This runs on every invocation and overwrites these settings on the index.
console.log('⚙️ Configuring index settings...');
await client.setSettings({
indexName: INDEX_NAME,
indexSettings: {
// Attributes Algolia matches queries against; `content` is the stripped
// plain-text body produced by this script.
// NOTE(review): Algolia treats the order of this list as a ranking priority — confirm the order is intended.
searchableAttributes: [
'title',
'uri',
'seoDescription',
'content',
],
// Return a short excerpt of `content` around the match instead of the full text.
// NOTE(review): the `:30` suffix is the snippet length (in words per Algolia's syntax) — confirm against the settings reference.
attributesToSnippet: [
'content:30',
],
// Attributes for which Algolia returns highlighted matches.
attributesToHighlight: [
'title',
'content',
],
// Matched terms are wrapped in <mark>…</mark>; the consuming UI must render these tags.
highlightPreTag: '<mark>',
highlightPostTag: '</mark>',
},
});

console.log('✅ Index updated successfully!');
2 changes: 1 addition & 1 deletion tina/tina-lock.json

Large diffs are not rendered by default.