diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts new file mode 100644 index 0000000..c6396db --- /dev/null +++ b/ui/__tests__/scientific-rag.test.ts @@ -0,0 +1,77 @@ +import { describe, expect, it } from 'vitest'; + +import { + buildCitationKey, + buildRagMetadata, + detectScientificSection, + formatRetrievedDocuments, + parseSemanticScholarReferences, + semanticScholarReferenceToText, +} from '@/utils/server/scientific-rag'; + +describe('scientific RAG helpers', () => { + it('detects compound scientific sections before generic methods', () => { + expect(detectScientificSection('Materials and Methods\nWe collected samples')).toBe( + 'materials-and-methods', + ); + expect(detectScientificSection('Abstract\nThis paper studies retrieval')).toBe('abstract'); + }); + + it('builds stable citation keys for uploaded documents', () => { + expect( + buildCitationKey({ title: 'My Paper.pdf', page: 3, chunkIndex: 2 }), + ).toBe('doc:my-paper-pdf:p3:c2'); + }); + + it('builds stable citation keys for Semantic Scholar references', () => { + expect( + buildCitationKey({ + sourceType: 'semantic-scholar', + paperId: 'abc123', + title: 'Ignored when paper id exists', + chunkIndex: 1, + }), + ).toBe('scholar:abc123:ref:c1'); + }); + + it('converts Semantic Scholar references into indexable text', () => { + const text = semanticScholarReferenceToText({ + paperId: 'paper-1', + title: 'Retrieval for Science', + abstract: 'A study of citation-grounded retrieval.', + authors: [{ name: 'Ada Lovelace' }, 'Grace Hopper'], + year: 2026, + venue: 'ISAAC', + }); + + expect(text).toContain('Title: Retrieval for Science'); + expect(text).toContain('Authors: Ada Lovelace, Grace Hopper'); + expect(text).toContain('Semantic Scholar Paper ID: paper-1'); + }); + + it('parses saved Semantic Scholar references from form fields', () => { + const refs = parseSemanticScholarReferences([ + JSON.stringify([{ paperId: 'paper-1', title: 'A' }]), + ]); + + expect(refs).toEqual([{ paperId: 'paper-1', title: 'A' }]); + }); + + it('formats retrieval results with citation keys and distances', () => { + const formatted = formatRetrievedDocuments({ + documents: [['Chunk text']], + metadatas: [[buildRagMetadata({ title: 'Paper', page: 1, chunkIndex: 0 })]], + distances: [[0.123456]], + }); + + expect(formatted).toContain('[doc:paper:p1:c0]'); + expect(formatted).toContain('Distance: 0.1235'); + expect(formatted).toContain('Chunk text'); + }); + + it('handles empty retrieval results defensively', () => { + expect(formatRetrievedDocuments({ documents: [[]], metadatas: [[]] })).toBe( + 'No relevant documents were retrieved.', + ); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..0e8b7e7 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,23 +1,35 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { + if (req.method !== 'POST') { + return res.status(405).end(); + } + + const query = typeof req.body?.input === 'string' ? req.body.input.trim() : ''; + if (!query) { + return res.status(400).json({ error: 'Missing retrieval query' }); + } + + const requestedResults = Number(req.body?.nResults || 6); + const nResults = Math.min(Math.max(requestedResults, 1), 10); + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; - const embedder = new TransformersEmbeddingFunction(); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); - - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + const results = await collection.query({ + nResults, + queryTexts: [query], + }); res.status(200).json(results); } catch (error) { @@ -29,4 +41,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } res.status(500).json({ error: 'An unexpected error occurred :(' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..9b74cce 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -3,10 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; - +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; -import { v4 as uuidv4 } from 'uuid'; + +import { + buildRagMetadata, + detectScientificSection, + parseSemanticScholarReferences, + SCIENTIFIC_TEXT_SEPARATORS, + semanticScholarReferenceToText, + type ScientificReference, +} from '@/utils/server/scientific-rag'; export const config = { api: { @@ -29,25 +36,36 @@ export default async function handler( return res.status(400).json({ error: 'Failed to upload file' }); } + const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf; + const references = parseSemanticScholarReferences(fields.references); + + if (!pdfFile?.filepath && references.length === 0) { + return res.status(400).json({ + error: 'Upload a PDF or provide Semantic Scholar references', + }); + } + const client = new ChromaClient({ path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); - - const originalDocs = await loader.load(); + const originalDocs = []; - console.log(JSON.stringify(originalDocs)); + if (pdfFile?.filepath) { + const loader = new PDFLoader(pdfFile.filepath); + originalDocs.push(...(await loader.load())); + } + originalDocs.push(...semanticScholarReferencesToDocuments(references)); const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 700, + chunkOverlap: 120, + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); - - // Process the documents and perform other logic + const { ids, metadatas, documentContents } = processDocuments(docs); const embedder = new TransformersEmbeddingFunction(); @@ -65,6 +83,7 @@ export default async function handler( res.status(200).json({ message: 'Documents processed successfully', documentCount: ids.length, + semanticScholarReferenceCount: references.length, }); }); } catch (error) { @@ -75,30 +94,53 @@ export default async function handler( } } -function processDocuments(docs: any) { +function semanticScholarReferencesToDocuments(references: ScientificReference[]) { + return references.map((reference) => ({ + pageContent: semanticScholarReferenceToText(reference), + metadata: { + sourceType: 'semantic-scholar', + source: reference.url || reference.paperId || reference.title, + title: reference.title, + paperId: reference.paperId, + url: reference.url, + year: reference.year, + loc: { pageNumber: 'ref' }, + }, + })); +} + +function processDocuments(docs: any[]) { const ids = []; const metadatas = []; const documentContents = []; + const pageChunkCounts = new Map(); for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier - const id = uuidv4(); - ids.push(id); - - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; + const sourceType = document.metadata.sourceType || 'upload'; + const fallbackTitle = document.metadata.source + ? path.basename(document.metadata.source) + : 'Semantic Scholar reference'; + const titleFromMetadata = document.metadata.pdf?.info?.Title; + const title = titleFromMetadata || document.metadata.title || fallbackTitle; + const page = document.metadata.loc?.pageNumber || document.metadata.page || 'ref'; + const pageChunkKey = `${sourceType}:${title}:${page}`; + const chunkIndex = pageChunkCounts.get(pageChunkKey) || 0; + pageChunkCounts.set(pageChunkKey, chunkIndex + 1); + + const metadata = buildRagMetadata({ + title, + page, + source: document.metadata.source, + sourceType, + section: detectScientificSection(document.pageContent), + chunkIndex, + paperId: document.metadata.paperId, + url: document.metadata.url, + year: document.metadata.year, + }); - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; + ids.push(String(metadata.citationKey)); metadatas.push(metadata); - - // Add the page content to the documents array documentContents.push(document.pageContent); } diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..5ef7a59 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,7 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import { formatRetrievedDocuments } from '@/utils/server/scientific-rag'; +import { codeBlock, oneLine } from 'common-tags'; import { ChatBody, Message } from '@/types/chat'; @@ -14,41 +15,28 @@ export const config = { runtime: 'edge', }; -// Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +async function fetchAndFormatDocuments(lastMessageContent: string, req: Request) { try { - console.log("fetching documents") - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const url = new URL('/api/fetch-documents', req.url); + const response = await fetch(url, { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ input: lastMessageContent, nResults: 6 }), }); - + if (!response.ok) { throw new Error(`Error fetching documents: ${response.statusText}`); } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); - - return result; - + return formatRetrievedDocuments(data); } catch (error) { console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + return 'No relevant documents were retrieved.'; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -62,87 +50,79 @@ const handler = async (req: Request): Promise => { let promptToSend = codeBlock` ${oneLine` - You are a very enthusiastic AI assistant who loves - to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. + You are a careful scientific research assistant. Given the following + retrieved document context, answer the user's question using only that + context and output markdown. `} ${oneLine` - If you are unsure - and the answer is not explicitly written in the documentation, say - "Sorry, I don't know how to help with that." + Every factual claim that depends on retrieved context must cite the exact + bracketed citation key, for example [doc:paper-title:p3:c1] or + [scholar:paper-id:ref:c0]. Prefer lower-distance sources when multiple + chunks contain similar information. `} - + ${oneLine` - Always include citations from the documentation. + If the answer is not explicitly supported by the retrieved context, say + "Sorry, I don't know how to help with that from the available documents." `} `; if (!promptToSend) { - promptToSend = DEFAULT_SYSTEM_PROMPT; + promptToSend = prompt || DEFAULT_SYSTEM_PROMPT; } const lastMessage = messages[messages.length - 1]; + const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content, req); - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; } const prompt_tokens = encoding.encode(promptToSend); - let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); - console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - - messagesToSend = [ + messagesToSend = [ { - role: "user", + role: 'user', content: codeBlock` - Here is the relevant documentation: + Here is the retrieved scientific context: ${relevantDocuments} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` - Answer my next question using only the above documentation. - You must also follow the below rules when answering: + Answer my next question using only the context above. + Follow these rules: `} ${oneLine` - - Do not make up answers that are not provided in the documentation. + - Do not invent answers that are not supported by the retrieved context. `} ${oneLine` - - If you are unsure and the answer is not explicitly written - in the documentation context, say - "Sorry, I don't know how to help with that." + - Cite factual claims with the exact bracketed citation keys shown in the context. `} ${oneLine` - - Prefer splitting your response into multiple paragraphs. + - If no relevant context is available, say you do not know from the available documents. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - Prefer concise markdown with direct citations. `} `, }, { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} `, }, - ] - + ]; const stream = await OpenAIStream( model, diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..23a095e --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,174 @@ +export type ScientificSourceType = 'upload' | 'semantic-scholar'; + +export interface ScientificReference { + paperId?: string; + title?: string; + abstract?: string; + authors?: Array; + year?: number | string; + venue?: string; + url?: string; +} + +export interface RagMetadataInput { + title?: string; + page?: number | string; + source?: string; + sourceType?: ScientificSourceType; + section?: string; + chunkIndex?: number; + paperId?: string; + url?: string; + year?: number | string; +} + +export interface RetrievedDocuments { + documents?: string[][]; + metadatas?: Array>>; + distances?: number[][]; +} + +const SECTION_PATTERNS: Array<[string, RegExp]> = [ + ['materials-and-methods', /\bmaterials?\s+(and|&)\s+methods?\b/i], + ['abstract', /\babstract\b/i], + ['introduction', /\bintroduction\b/i], + ['background', /\bbackground\b/i], + ['methods', /\b(methods?|methodology)\b/i], + ['results', /\bresults?\b/i], + ['discussion', /\bdiscussion\b/i], + ['conclusion', /\bconclusions?\b/i], + ['limitations', /\blimitations?\b/i], + ['references', /\breferences?\b/i], +]; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nIntroduction', + '\nBackground', + '\nMaterials and Methods', + '\nMethods', + '\nMethodology', + '\nResults', + '\nDiscussion', + '\nConclusion', + '\nReferences', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +export function detectScientificSection(text = ''): string { + const firstLines = text.split('\n').slice(0, 6).join(' '); + const sample = firstLines || text.slice(0, 400); + + for (const [section, pattern] of SECTION_PATTERNS) { + if (pattern.test(sample)) { + return section; + } + } + + return 'body'; +} + +export function normalizeCitationPart(value: unknown, fallback = 'source'): string { + const normalized = String(value || fallback) + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/^-+|-+$/g, '') + .slice(0, 72); + + return normalized || fallback; +} + +export function buildCitationKey(input: RagMetadataInput): string { + const source = input.sourceType === 'semantic-scholar' ? 'scholar' : 'doc'; + const title = normalizeCitationPart(input.paperId || input.title || input.source, 'source'); + const page = input.page == null || input.page === '' ? 'ref' : `p${input.page}`; + const chunk = Number.isInteger(input.chunkIndex) ? `c${input.chunkIndex}` : 'c0'; + + return `${source}:${title}:${page}:${chunk}`; +} + +export function buildRagMetadata(input: RagMetadataInput): Record { + const metadata: Record = { + title: String(input.title || 'Untitled source'), + page: input.page == null || input.page === '' ? 'ref' : input.page, + source: String(input.source || input.paperId || input.url || 'unknown'), + sourceType: input.sourceType || 'upload', + section: input.section || 'body', + chunk: input.chunkIndex || 0, + }; + + const citationKey = buildCitationKey({ ...input, ...metadata }); + metadata.citationKey = citationKey; + + if (input.paperId) metadata.paperId = input.paperId; + if (input.url) metadata.url = input.url; + if (input.year) metadata.year = input.year; + + return metadata; +} + +function formatAuthors(authors: ScientificReference['authors']): string { + if (!Array.isArray(authors)) return ''; + + return authors + .map((author) => (typeof author === 'string' ? author : author?.name)) + .filter(Boolean) + .join(', '); +} + +export function semanticScholarReferenceToText(reference: ScientificReference): string { + const title = reference.title || 'Untitled Semantic Scholar reference'; + const authors = formatAuthors(reference.authors); + const parts = [ + `Title: ${title}`, + authors ? `Authors: ${authors}` : '', + reference.year ? `Year: ${reference.year}` : '', + reference.venue ? `Venue: ${reference.venue}` : '', + reference.abstract ? `Abstract: ${reference.abstract}` : '', + reference.url ? `URL: ${reference.url}` : '', + reference.paperId ? `Semantic Scholar Paper ID: ${reference.paperId}` : '', + ].filter(Boolean); + + return parts.join('\n'); +} + +export function parseSemanticScholarReferences(value: unknown): ScientificReference[] { + const raw = Array.isArray(value) ? value[0] : value; + if (!raw || typeof raw !== 'string') return []; + + try { + const parsed = JSON.parse(raw); + if (!Array.isArray(parsed)) return []; + + return parsed.filter((item) => item && typeof item === 'object'); + } catch { + return []; + } +} + +export function formatRetrievedDocuments(data: RetrievedDocuments): string { + const documents = data.documents?.[0] || []; + const metadatas = data.metadatas?.[0] || []; + const distances = data.distances?.[0] || []; + + if (!documents.length) { + return 'No relevant documents were retrieved.'; + } + + return documents + .map((content, index) => { + const metadata = metadatas[index] || {}; + const citationKey = metadata.citationKey || `source-${index + 1}`; + const title = metadata.title || 'Untitled source'; + const page = metadata.page || 'ref'; + const section = metadata.section || 'body'; + const distance = typeof distances[index] === 'number' ? `, Distance: ${distances[index].toFixed(4)}` : ''; + + return `Source ${index + 1} [${citationKey}] Title: ${title}, Page: ${page}, Section: ${section}${distance}\n${content}\n`; + }) + .join('\n'); +}