diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts new file mode 100644 index 0000000..eb6d9aa --- /dev/null +++ b/ui/__tests__/scientific-rag.test.ts @@ -0,0 +1,328 @@ +import { + buildCitationKey, + buildChunkMetadata, + buildEvidencePayload, + buildResearchQueries, + detectScientificSection, + fuseQueryResults, + parseBoundedInteger, + sectionWeight, +} from '@/utils/server/scientific-rag'; + +import { describe, expect, it } from 'vitest'; + +// --------------------------------------------------------------------------- +// parseBoundedInteger +// --------------------------------------------------------------------------- +describe('parseBoundedInteger', () => { + it('returns the fallback for invalid input', () => { + expect(parseBoundedInteger('bad', 8, 16)).toBe(8); + expect(parseBoundedInteger(undefined, 8, 16)).toBe(8); + expect(parseBoundedInteger(null, 8, 16)).toBe(8); + expect(parseBoundedInteger(0, 8, 16)).toBe(8); + expect(parseBoundedInteger(-5, 8, 16)).toBe(8); + }); + + it('clamps to max', () => { + expect(parseBoundedInteger(20, 8, 16)).toBe(16); + expect(parseBoundedInteger('100', 8, 16)).toBe(16); + }); + + it('accepts valid values within range', () => { + expect(parseBoundedInteger(10, 8, 16)).toBe(10); + expect(parseBoundedInteger('12', 8, 16)).toBe(12); + }); +}); + +// --------------------------------------------------------------------------- +// detectScientificSection +// --------------------------------------------------------------------------- +describe('detectScientificSection', () => { + it('detects abstract at chunk start', () => { + expect(detectScientificSection('Abstract\nThis paper proposes…')).toBe('abstract'); + }); + + it('detects methods section', () => { + expect(detectScientificSection('Methods\nWe trained a transformer…')).toBe('methods'); + }); + + it('detects results section', () => { + expect(detectScientificSection('Results\nThe model achieved 94% accuracy…')).toBe('results'); + }); + 
+ it('detects "materials and methods"', () => { + expect(detectScientificSection('Materials and Methods\nSamples were…')).toBe( + 'materials and methods', + ); + }); + + it('falls back to body for plain text', () => { + expect(detectScientificSection('Some generic paragraph without headings.')).toBe('body'); + }); +}); + +// --------------------------------------------------------------------------- +// sectionWeight +// --------------------------------------------------------------------------- +describe('sectionWeight', () => { + it('abstract has the highest weight', () => { + expect(sectionWeight('abstract')).toBeGreaterThan(sectionWeight('body')); + }); + + it('results and methods are weighted above introduction', () => { + expect(sectionWeight('results')).toBeGreaterThan(sectionWeight('introduction')); + expect(sectionWeight('methods')).toBeGreaterThan(sectionWeight('introduction')); + }); + + it('references have the lowest weight', () => { + expect(sectionWeight('references')).toBeLessThan(sectionWeight('body')); + }); + + it('returns body weight for unknown sections', () => { + expect(sectionWeight('unknown-section')).toBe(sectionWeight('body')); + }); +}); + +// --------------------------------------------------------------------------- +// buildCitationKey +// --------------------------------------------------------------------------- +describe('buildCitationKey', () => { + it('produces stable human-readable keys', () => { + expect(buildCitationKey({ chunkIndex: 0, page: 3, title: 'My Paper' })).toBe( + 'my-paper:p3:c1', + ); + }); + + it('handles special characters in titles', () => { + const key = buildCitationKey({ + chunkIndex: 2, + page: 7, + title: 'Attention Is All You Need!', + }); + expect(key).toBe('attention-is-all-you-need:p7:c3'); + }); +}); + +// --------------------------------------------------------------------------- +// buildChunkMetadata +// --------------------------------------------------------------------------- 
+describe('buildChunkMetadata', () => { + it('extracts title from PDF metadata and strips temp path', () => { + const meta = buildChunkMetadata( + { + pageContent: 'Results\nThe method achieves SOTA.', + metadata: { + loc: { pageNumber: 5 }, + pdf: { info: { Title: 'Scientific RAG Study' } }, + source: '/tmp/uploads/secret/paper.pdf', + }, + }, + 'paper.pdf', + 2, + 1, + ); + + expect(meta.title).toBe('Scientific RAG Study'); + expect(meta.page).toBe(5); + expect(meta.section).toBe('results'); + expect(meta.citationKey).toBe('scientific-rag-study:p5:c2'); + // Public source must not leak the temp directory path + expect(meta.source).not.toContain('/tmp/uploads'); + expect(meta.source).toBe('paper.pdf'); + // Section weight for results should be above 1 + expect(meta.sectionWeight).toBeGreaterThan(1); + }); + + it('falls back gracefully when metadata is missing', () => { + const meta = buildChunkMetadata( + { pageContent: 'Some body text.' }, + 'fallback.pdf', + 0, + ); + + expect(meta.title).toBe('fallback.pdf'); + expect(meta.section).toBe('body'); + expect(meta.page).toBe('unknown'); + expect(meta.citationKey).toMatch(/^fallback(-pdf)?:punknown:c1$/); + }); +}); + +// --------------------------------------------------------------------------- +// buildResearchQueries +// --------------------------------------------------------------------------- +describe('buildResearchQueries', () => { + it('returns normalised original as first variant', () => { + const queries = buildResearchQueries('What is retrieval-augmented generation?'); + expect(queries[0]).toBe('What is retrieval-augmented generation'); + }); + + it('extracts quoted phrases as separate variants', () => { + const queries = buildResearchQueries( + 'What does "retrieval augmented generation" improve in science?', + ); + expect(queries).toContain('retrieval augmented generation'); + }); + + it('produces keyword-only fallback variant', () => { + const queries = buildResearchQueries( + 'How does cross-encoder 
reranking improve scientific retrieval accuracy?', + ); + // Should contain a keyword-compact form without stop words + const hasKeywords = queries.some( + (q) => q.includes('cross-encoder') || q.includes('reranking'), + ); + expect(hasKeywords).toBe(true); + }); + + it('never returns more than maxQueries variants', () => { + const queries = buildResearchQueries('some very long query', 2); + expect(queries.length).toBeLessThanOrEqual(2); + }); + + it('deduplicates identical variants', () => { + const queries = buildResearchQueries('short query'); + const unique = new Set(queries); + expect(unique.size).toBe(queries.length); + }); +}); + +// --------------------------------------------------------------------------- +// fuseQueryResults +// --------------------------------------------------------------------------- +describe('fuseQueryResults', () => { + const sampleResults = { + documents: [ + ['Abstract chunk about RAG systems.', 'Methods chunk explaining setup.'], + ['Abstract chunk about RAG systems.'], + ], + metadatas: [ + [ + { + citationKey: 'rag-study:p1:c1', + page: 1, + section: 'abstract', + sectionWeight: 1.4, + source: 'rag-study.pdf', + title: 'RAG Study', + sourceId: 'DOC-ABC', + chunkIndex: 0, + }, + { + citationKey: 'rag-study:p3:c1', + page: 3, + section: 'methods', + sectionWeight: 1.2, + source: 'rag-study.pdf', + title: 'RAG Study', + sourceId: 'DOC-ABC', + chunkIndex: 1, + }, + ], + [ + { + citationKey: 'rag-study:p1:c1', + page: 1, + section: 'abstract', + sectionWeight: 1.4, + source: 'rag-study.pdf', + title: 'RAG Study', + sourceId: 'DOC-ABC', + chunkIndex: 0, + }, + ], + ], + distances: [[0.05, 0.25], [0.06]], + ids: [['id-a', 'id-b'], ['id-a']], + }; + + it('deduplicates chunks that appear in multiple query results', () => { + const fused = fuseQueryResults(sampleResults, 10); + expect(fused.length).toBe(2); + }); + + it('accumulates rank scores for repeated chunks', () => { + const fused = fuseQueryResults(sampleResults, 10); + const 
abstractChunk = fused.find((r) => r.citationKey === 'rag-study:p1:c1'); + const methodsChunk = fused.find((r) => r.citationKey === 'rag-study:p3:c1'); + expect(abstractChunk).toBeDefined(); + expect(methodsChunk).toBeDefined(); + // Abstract chunk appeared in two result sets so it should rank higher + expect(abstractChunk!.rankScore).toBeGreaterThan(methodsChunk!.rankScore); + }); + + it('ranks abstract above body sections when scores are close', () => { + const results = { + documents: [['Abstract: core claim.', 'Body paragraph.']], + metadatas: [ + [ + { citationKey: 'doc:p1:c1', page: 1, section: 'abstract', sectionWeight: 1.4, source: 'doc.pdf', title: 'Doc', sourceId: 'D1', chunkIndex: 0 }, + { citationKey: 'doc:p5:c1', page: 5, section: 'body', sectionWeight: 0.8, source: 'doc.pdf', title: 'Doc', sourceId: 'D1', chunkIndex: 1 }, + ], + ], + distances: [[0.1, 0.1]], + ids: [['a', 'b']], + }; + const fused = fuseQueryResults(results, 10); + expect(fused[0].citationKey).toBe('doc:p1:c1'); + }); + + it('respects the limit parameter', () => { + const fused = fuseQueryResults(sampleResults, 1); + expect(fused.length).toBe(1); + }); +}); + +// --------------------------------------------------------------------------- +// buildEvidencePayload +// --------------------------------------------------------------------------- +describe('buildEvidencePayload', () => { + const results = { + documents: [['Chunk about scientific retrieval.'.repeat(15)]], + metadatas: [ + [ + { + citationKey: 'paper:p1:c1', + page: 1, + section: 'abstract', + sectionWeight: 1.4, + source: 'paper.pdf', + title: 'Paper', + sourceId: 'D1', + chunkIndex: 0, + }, + ], + ], + distances: [[0.1]], + ids: [['id-1']], + }; + + it('includes citation key in evidence context', () => { + const payload = buildEvidencePayload(results); + expect(payload.evidenceContext).toContain('[paper:p1:c1]'); + }); + + it('truncates chunks to maxChunkChars', () => { + const payload = buildEvidencePayload(results, { 
maxChunkChars: 40 }); + expect(payload.citations[0].content.length).toBeLessThanOrEqual(43); // 40 + '...' + }); + + it('caps total evidence context to maxEvidenceChars', () => { + const payload = buildEvidencePayload(results, { maxEvidenceChars: 50 }); + expect(payload.evidenceContext.length).toBeLessThanOrEqual(50); + }); + + it('builds source manifest grouped by source document', () => { + const payload = buildEvidencePayload(results); + expect(payload.sourceManifest).toHaveLength(1); + expect(payload.sourceManifest[0]).toMatchObject({ + source: 'paper.pdf', + title: 'Paper', + citationKeys: ['paper:p1:c1'], + }); + }); + + it('includes sectionWeight in evidence context block header', () => { + const payload = buildEvidencePayload(results); + expect(payload.evidenceContext).toContain('Weight: 1.40'); + }); +}); diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..46376c4 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,71 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; -export default async function handler(req: NextApiRequest, res: NextApiResponse) { +import { + buildEvidencePayload, + buildResearchQueries, + parseBoundedInteger, +} from '@/utils/server/scientific-rag'; + +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; + +export default async function handler( + req: NextApiRequest, + res: NextApiResponse, +) { try { + if (req.method !== 'POST') { + res.setHeader('Allow', 'POST'); + return res.status(405).json({ error: 'Method not allowed' }); + } + + const query = + typeof req.body.input === 'string' ? 
req.body.input.trim() : ''; + + if (!query) { + return res.status(400).json({ error: 'Missing retrieval query' }); + } + + const nResults = parseBoundedInteger(req.body.nResults, 8, 16); + const maxEvidenceChars = parseBoundedInteger( + req.body.maxEvidenceChars, + 12000, + 30000, + ); + + // Expand the user query into multiple deterministic variants to improve + // recall across different phrasings. + const queryTexts = buildResearchQueries(query); + const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const query = req.body.input; - const embedder = new TransformersEmbeddingFunction(); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + // Run all query variants in a single Chroma call; each variant is a row + // in the returned matrix. + const results = await collection.query({ + nResults, + queryTexts, + include: ['documents', 'metadatas', 'distances'] as any, + }); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + // Fuse result sets with RRF + section weighting, then build a + // budget-capped evidence context for the LLM prompt. 
+ const evidence = buildEvidencePayload(results, { + maxEvidenceChars, + maxResults: nResults, + }); - res.status(200).json(results); + res.status(200).json({ + ...results, + queryTexts, + ...evidence, + }); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); @@ -27,6 +73,6 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse) } else { console.error('Unknown error:', error); } - res.status(500).json({ error: 'An unexpected error occurred :(' }); + res.status(500).json({ error: 'An unexpected error occurred' }); } -} \ No newline at end of file +} diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..e87ba7b 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -1,9 +1,16 @@ import type { NextApiRequest, NextApiResponse } from 'next'; +import { + type LoadedDocument, + type PrimitiveMetadata, + SCIENTIFIC_SEPARATORS, + buildChunkMetadata, +} from '@/utils/server/scientific-rag'; + import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; @@ -20,35 +27,46 @@ export default async function handler( ) { try { if (req.method !== 'POST') { - return res.status(405).end(); + res.setHeader('Allow', 'POST'); + return res.status(405).json({ error: 'Method not allowed' }); } const form = new IncomingForm(); - form.parse(req, async (err, fields, files) => { + form.parse(req, async (err, _fields, files) => { if (err) { return res.status(400).json({ error: 'Failed to upload file' }); } + const pdfFile = Array.isArray(files.pdf) ? 
files.pdf[0] : files.pdf; + + if (!pdfFile?.filepath) { + return res.status(400).json({ error: 'Missing PDF upload' }); + } + + const fallbackSource = + pdfFile.originalFilename ?? path.basename(pdfFile.filepath); + const client = new ChromaClient({ path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); - const loader = new PDFLoader(files.pdf[0].filepath); - + const loader = new PDFLoader(pdfFile.filepath); const originalDocs = await loader.load(); - console.log(JSON.stringify(originalDocs)); - - + // Section-aware splitter: tries scientific headings before falling back + // to paragraph and sentence breaks. const splitter = new RecursiveCharacterTextSplitter({ - chunkSize: 500, - chunkOverlap: 100, - }); + chunkSize: 900, + chunkOverlap: 180, + separators: SCIENTIFIC_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); - - // Process the documents and perform other logic - const { ids, metadatas, documentContents } = processDocuments(docs); + + const { ids, metadatas, documentContents } = processDocuments( + docs, + fallbackSource, + ); const embedder = new TransformersEmbeddingFunction(); const collection = await client.getOrCreateCollection({ @@ -75,30 +93,46 @@ export default async function handler( } } -function processDocuments(docs: any) { - const ids = []; - const metadatas = []; - const documentContents = []; +function processDocuments(docs: LoadedDocument[], fallbackSource: string) { + const ids: string[] = []; + const metadatas: PrimitiveMetadata[] = []; + const documentContents: string[] = []; - for (const document of docs) { - // Generate an ID for each document, or use some existing unique identifier + // Track how many chunks have been produced for each (source, page) pair so + // that citation keys are unique within a page. 
+ const pageChunkCounts = new Map(); + + for (let index = 0; index < docs.length; index++) { + const document = docs[index]; const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; - - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); + const source = + typeof document.metadata === 'object' && + document.metadata !== null && + 'source' in document.metadata && + typeof (document.metadata as Record).source === 'string' + ? (document.metadata as Record).source as string + : fallbackSource; + + const page = + typeof document.metadata === 'object' && + document.metadata !== null && + 'loc' in document.metadata && + typeof (document.metadata as Record).loc === 'object' && + (document.metadata as Record).loc !== null && + 'pageNumber' in ((document.metadata as Record).loc as Record) + ? ((document.metadata as Record).loc as Record).pageNumber + : 'unknown'; + + const pageKey = `${source}:${page}`; + const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 
0; + pageChunkCounts.set(pageKey, pageChunkIndex + 1); + + metadatas.push( + buildChunkMetadata(document, fallbackSource, index, pageChunkIndex), + ); - // Add the page content to the documents array documentContents.push(document.pageContent); } diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..60c7c7a 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -1,6 +1,9 @@ import { DEFAULT_SYSTEM_PROMPT, DEFAULT_TEMPERATURE } from '@/utils/app/const'; import { OpenAIError, OpenAIStream } from '@/utils/server'; -import { codeBlock, oneLine } from 'common-tags' +import type { + ScientificCitation, + SourceManifestEntry, +} from '@/utils/server/scientific-rag'; import { ChatBody, Message } from '@/types/chat'; @@ -9,46 +12,76 @@ import wasm from '../../node_modules/@dqbd/tiktoken/lite/tiktoken_bg.wasm?module import tiktokenModel from '@dqbd/tiktoken/encoders/cl100k_base.json'; import { Tiktoken, init } from '@dqbd/tiktoken/lite/init'; +import { codeBlock, oneLine } from 'common-tags'; export const config = { runtime: 'edge', }; -// Function to fetch and format documents -async function fetchAndFormatDocuments(lastMessageContent: string) { +type FetchDocumentsResponse = { + citations?: ScientificCitation[]; + evidenceContext?: string; + sourceManifest?: SourceManifestEntry[]; +}; + +/** Formats the source manifest into a numbered list for the LLM prompt. */ +function formatSourceManifest(sourceManifest: SourceManifestEntry[]): string { + if (sourceManifest.length === 0) { + return 'No source manifest was produced.'; + } + return sourceManifest + .map( + (s, i) => + `${i + 1}. ${s.title} (${s.source}) -> ${s.citationKeys.join(', ')}`, + ) + .join('\n'); +} + +/** + * Calls the fetch-documents API using the same origin as the RAG chat request + * so the endpoint works in any deployment environment without a hard-coded + * localhost URL. 
+ */ +async function fetchResearchEvidence( + req: Request, + lastMessageContent: string, +): Promise<{ + citations: ScientificCitation[]; + evidenceContext: string; + sourceManifest: SourceManifestEntry[]; +}> { try { - console.log("fetching documents") - const response = await fetch('http://localhost:3000/api/fetch-documents', { + const response = await fetch(new URL('/api/fetch-documents', req.url), { method: 'POST', headers: { 'Content-Type': 'application/json' }, - body: JSON.stringify({ input: lastMessageContent }), + body: JSON.stringify({ + input: lastMessageContent, + maxEvidenceChars: 12000, + nResults: 8, + }), }); - + if (!response.ok) { - throw new Error(`Error fetching documents: ${response.statusText}`); + throw new Error(`Evidence fetch failed: ${response.statusText}`); } - const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; - }).join(''); - - console.log(result); - - return result; + const data = (await response.json()) as FetchDocumentsResponse; + return { + citations: Array.isArray(data.citations) ? data.citations : [], + evidenceContext: + typeof data.evidenceContext === 'string' ? data.evidenceContext : '', + sourceManifest: Array.isArray(data.sourceManifest) + ? 
data.sourceManifest + : [], + }; } catch (error) { - console.error('Error fetching and formatting documents:', error); - throw error; // You may want to throw a more specific error object here + console.error('Error fetching research evidence:', error); + throw error; } } - - - - const handler = async (req: Request): Promise => { - try { const { model, messages, key, prompt, temperature } = (await req.json()) as ChatBody; @@ -60,94 +93,111 @@ const handler = async (req: Request): Promise => { tiktokenModel.pat_str, ); - let promptToSend = codeBlock` - ${oneLine` - You are a very enthusiastic AI assistant who loves - to help people! Given the following information from - relevant documentation, answer the user's question using - only that information, outputted in markdown format. - `} - - ${oneLine` - If you are unsure - and the answer is not explicitly written in the documentation, say - "Sorry, I don't know how to help with that." - `} - - ${oneLine` - Always include citations from the documentation. - `} - `; - - if (!promptToSend) { - promptToSend = DEFAULT_SYSTEM_PROMPT; - } + // Scientific research assistant system prompt with strict citation rules + const promptToSend = codeBlock` + ${oneLine` + You are a rigorous scientific research assistant. You answer questions + exclusively from the retrieved evidence provided below. Every factual + claim in your response must be backed by the exact citation key shown + in square brackets (e.g. [paper-title:p3:c2]). + `} + + ${oneLine` + If the retrieved evidence does not contain enough information to answer + the question, respond with: + "Sorry, I don't know how to help with that based on the uploaded documents." + `} + + ${oneLine` + Prioritise evidence from higher-weighted sections (Results, Methods, + Abstract) when sources disagree or overlap. 
+ `} + ` || DEFAULT_SYSTEM_PROMPT; const lastMessage = messages[messages.length - 1]; - const relevantDocuments = await fetchAndFormatDocuments(lastMessage.content); - + const { citations, evidenceContext, sourceManifest } = + await fetchResearchEvidence(req, lastMessage.content); + let temperatureToUse = temperature; if (temperatureToUse == null) { temperatureToUse = DEFAULT_TEMPERATURE; } const prompt_tokens = encoding.encode(promptToSend); - let tokenCount = prompt_tokens.length; let messagesToSend: Message[] = []; - encoding.free(); - console.log(model, promptToSend, temperatureToUse, key, messagesToSend); - - - messagesToSend = [ + messagesToSend = [ + { + role: 'user', + content: codeBlock` + Here is the retrieved evidence (ordered by relevance score, highest first): + ${evidenceContext || 'No matching evidence was retrieved from the uploaded documents.'} + `, + }, { - role: "user", + role: 'user', content: codeBlock` - Here is the relevant documentation: - ${relevantDocuments} + Here is the source manifest (maps citation keys to source documents): + ${formatSourceManifest(sourceManifest)} `, }, { - role: "user", + role: 'user', content: codeBlock` ${oneLine` - Answer my next question using only the above documentation. - You must also follow the below rules when answering: + Answer my next question using only the evidence and source manifest above. + Follow these rules strictly: + `} + ${oneLine` + - Cite every factual claim with the exact citation key in brackets, + e.g. [paper-title:p3:c2]. Only use keys that appear in the evidence + or source manifest above. `} ${oneLine` - - Do not make up answers that are not provided in the documentation. + - When evidence from multiple sections agrees, cite all relevant keys. `} ${oneLine` - - If you are unsure and the answer is not explicitly written - in the documentation context, say - "Sorry, I don't know how to help with that." 
+ - Prefer Results and Methods evidence over Introduction or Discussion + when the sections disagree. `} ${oneLine` - - Prefer splitting your response into multiple paragraphs. + - Do not fabricate information not present in the evidence. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - If the evidence is insufficient, say + "Sorry, I don't know how to help with that based on the uploaded documents." + `} + ${oneLine` + - Format your response in markdown with clear paragraphs. `} `, }, + ...(citations.length === 0 + ? [ + { + role: 'user' as const, + content: + 'No citations were retrieved. If the evidence context is empty, reply with the standard "I don\'t know" message.', + }, + ] + : []), { - role: "user", + role: 'user', content: codeBlock` Here is my question: ${oneLine`${lastMessage.content}`} - `, + `, }, - ] - + ]; const stream = await OpenAIStream( model, promptToSend, - 0, + temperatureToUse, key, messagesToSend, ); diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..b39d67b --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,657 @@ +/** + * Scientific RAG pipeline utilities (ISAAC-497) + * + * Provides section-aware chunking, multi-query retrieval with reciprocal rank + * fusion, stable citation key generation, and bounded evidence context building + * optimised for scientific and research document workflows. + */ + +// --------------------------------------------------------------------------- +// Types +// --------------------------------------------------------------------------- + +type Primitive = string | number | boolean; +type UnknownRecord = Record; + +export type PrimitiveMetadata = Record; + +export type LoadedDocument = { + pageContent: string; + metadata?: unknown; +}; + +/** Section labels recognised in scientific papers. 
*/ +export type ScientificSection = + | 'abstract' + | 'introduction' + | 'background' + | 'related work' + | 'methods' + | 'results' + | 'discussion' + | 'conclusion' + | 'references' + | 'body'; + +export type ScientificChunkMetadata = PrimitiveMetadata & { + /** Stable, human-readable key used as an in-text citation handle. */ + citationKey: string; + /** Sequential index of the chunk across the whole document. */ + chunkIndex: number; + /** Page number (1-based) or 'unknown'. */ + page: number | string; + /** Detected scientific section label. */ + section: ScientificSection; + /** Safe public filename, never a server-side temp path. */ + source: string; + /** Stable document fingerprint used to group chunks by source. */ + sourceId: string; + /** Document title extracted from PDF metadata or filename. */ + title: string; + /** Relevance weight based on the section (higher = more important). */ + sectionWeight: number; +}; + +export type ChromaQueryResults = { + documents?: unknown; + distances?: unknown; + ids?: unknown; + metadatas?: unknown; +}; + +export type FusedResult = { + citationKey: string; + content: string; + distance?: number; + id?: string; + metadata: Partial & PrimitiveMetadata; + /** Position in the fused ranking (0-based). */ + rank: number; + /** Combined RRF + distance + section-weight score. */ + rankScore: number; + /** Index of the query variant that first produced this chunk. 
*/ + sourceQueryIndex: number; +}; + +export type ScientificCitation = { + key: string; + title: string; + source: string; + page: number | string; + section: ScientificSection | string; + sectionWeight: number; + distance?: number; + rankScore: number; + content: string; +}; + +export type SourceManifestEntry = { + sourceId: string; + title: string; + source: string; + citationKeys: string[]; +}; + +export type EvidencePayload = { + citations: ScientificCitation[]; + evidenceContext: string; + results: FusedResult[]; + sourceManifest: SourceManifestEntry[]; +}; + +// --------------------------------------------------------------------------- +// Constants +// --------------------------------------------------------------------------- + +/** + * Scientific sections ordered from highest to lowest retrieval importance. + * Abstract, Results, and Methods are weighted most heavily because they + * contain the core claims and evidence of a paper. + */ +const SECTION_WEIGHTS: Record = { + abstract: 1.4, + results: 1.3, + 'materials and methods': 1.2, + methods: 1.2, + methodology: 1.2, + discussion: 1.1, + conclusion: 1.1, + introduction: 1.0, + background: 0.95, + 'related work': 0.9, + experiment: 1.15, + experiments: 1.15, + evaluation: 1.15, + limitations: 0.85, + references: 0.5, + body: 0.8, +}; + +const SCIENTIFIC_SECTIONS = Object.keys(SECTION_WEIGHTS) as ScientificSection[]; + +const STOP_WORDS = new Set([ + 'about', 'after', 'again', 'also', 'answer', 'based', 'before', + 'between', 'could', 'describe', 'does', 'explain', 'from', 'have', + 'how', 'into', 'paper', 'papers', 'please', 'research', 'should', + 'show', 'that', 'their', 'there', 'these', 'this', 'using', 'what', + 'when', 'where', 'which', 'with', +]); + +/** + * Text separators that respect scientific paper structure. + * The RecursiveCharacterTextSplitter tries each separator in order; + * section headers appear first so chunks tend to align with sections. 
/**
 * Text separators that respect scientific paper structure.
 *
 * The RecursiveCharacterTextSplitter tries each separator in order, so the
 * section headings come first (each one in Title Case and in UPPER CASE) and
 * chunks tend to align with sections. Paragraph, line, sentence and word
 * breaks follow as progressively finer fallbacks.
 */
export const SCIENTIFIC_SEPARATORS: string[] = [
  '\nAbstract\n',
  '\nABSTRACT\n',
  '\nIntroduction\n',
  '\nINTRODUCTION\n',
  '\nBackground\n',
  '\nBACKGROUND\n',
  '\nRelated Work\n',
  '\nRELATED WORK\n',
  '\nMethods\n',
  '\nMETHODS\n',
  '\nMaterials and Methods\n',
  '\nMATERIALS AND METHODS\n',
  '\nMethodology\n',
  '\nMETHODOLOGY\n',
  '\nExperiment\n',
  '\nEXPERIMENT\n',
  '\nExperiments\n',
  '\nEXPERIMENTS\n',
  '\nResults\n',
  '\nRESULTS\n',
  '\nEvaluation\n',
  '\nEVALUATION\n',
  '\nDiscussion\n',
  '\nDISCUSSION\n',
  '\nLimitations\n',
  '\nLIMITATIONS\n',
  '\nConclusion\n',
  '\nCONCLUSION\n',
  '\nReferences\n',
  '\nREFERENCES\n',
  '\n\n',
  '\n',
  '. ',
  ' ',
  '',
];

// ---------------------------------------------------------------------------
// Small utilities
// ---------------------------------------------------------------------------

/** Type guard: true for non-null objects that are not arrays. */
function isRecord(value: unknown): value is UnknownRecord {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Returns `value` when it is a string, number or boolean; otherwise `undefined`. */
function asPrimitive(value: unknown): Primitive | undefined {
  if (
    typeof value === 'string' ||
    typeof value === 'number' ||
    typeof value === 'boolean'
  ) {
    return value;
  }
  return undefined;
}

/**
 * Returns the first primitive stored under `keys`, checked in order.
 * The result may be an empty string; use `firstCleanString` to skip blanks.
 */
function firstPrimitive(
  record: UnknownRecord,
  keys: string[],
): Primitive | undefined {
  for (const key of keys) {
    const candidate = asPrimitive(record[key]);
    if (candidate !== undefined) return candidate;
  }
  return undefined;
}

/**
 * Returns the first value under `keys` that `asCleanString` accepts.
 * Blank strings are skipped, so an empty value under an early key does not
 * hide a usable value under a later one.
 */
function firstCleanString(
  record: UnknownRecord,
  keys: string[],
): string | undefined {
  for (const key of keys) {
    const text = asCleanString(record[key]);
    if (text !== undefined) return text;
  }
  return undefined;
}

/**
 * Returns the first value under `keys` that `asFiniteNumber` accepts,
 * skipping entries that are missing or do not parse.
 */
function firstFiniteNumber(
  record: UnknownRecord,
  keys: string[],
): number | undefined {
  for (const key of keys) {
    const parsed = asFiniteNumber(record[key]);
    if (parsed !== undefined) return parsed;
  }
  return undefined;
}

/**
 * Walks `path` through nested records and returns the primitive found at the
 * end, or `undefined` as soon as a step is not a record.
 */
function nestedPrimitive(
  record: UnknownRecord,
  path: string[],
): Primitive | undefined {
  let cursor: unknown = record;
  for (const part of path) {
    if (!isRecord(cursor)) return undefined;
    cursor = cursor[part];
  }
  return asPrimitive(cursor);
}

/**
 * Converts a primitive into a trimmed, non-empty string.
 *
 * @returns the trimmed string, the `String(...)` form of a number or boolean,
 *   or `undefined` for blank strings and every other kind of value.
 */
export function asCleanString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    const trimmed = value.trim();
    return trimmed.length > 0 ? trimmed : undefined;
  }
  if (typeof value === 'number' || typeof value === 'boolean') {
    return String(value);
  }
  return undefined;
}

/**
 * Parses a finite number from a number or a numeric string.
 *
 * Blank strings are rejected explicitly: `Number('')` is `0`, which would
 * otherwise turn a missing value into a real-looking zero.
 */
function asFiniteNumber(value: unknown): number | undefined {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : undefined;
  }
  if (typeof value === 'string' && value.trim() !== '') {
    const parsed = Number(value);
    if (Number.isFinite(parsed)) return parsed;
  }
  return undefined;
}

/**
 * Reduces a path or URL to its last segment: everything from the first `?`
 * or `#` is dropped, then both `/` and `\` are treated as separators.
 * Returns `value` unchanged when no segment remains.
 */
function publicBasename(value: string): string {
  const withoutQuery = value.split(/[?#]/, 1)[0] ?? '';
  const parts = withoutQuery.replace(/\\/g, '/').split('/').filter(Boolean);
  return parts[parts.length - 1] ?? value;
}

/**
 * Lower-cases `value`, replaces every run of non-alphanumerics with `-`, and
 * caps the result at 48 characters. Returns `fallback` when nothing is left.
 */
function slugify(value: string, fallback: string): string {
  const slug = value
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
    .slice(0, 48)
    // The cut can land right after a separator; never end a slug with '-'.
    .replace(/-+$/, '');
  return slug || fallback;
}

/**
 * FNV-1a 32-bit hash over the UTF-16 code units of `value`, returned as an
 * 8-character uppercase hex string.
 */
function hashString(value: string): string {
  let hash = 0x811c9dc5; // FNV offset basis
  for (let i = 0; i < value.length; i++) {
    hash ^= value.charCodeAt(i);
    hash = Math.imul(hash, 0x01000193); // FNV prime, 32-bit wrap-around
  }
  return (hash >>> 0).toString(16).toUpperCase().padStart(8, '0');
}

/** Collapses every whitespace run to a single space and trims both ends. */
function collapseWhitespace(value: string): string {
  return value.replace(/\s+/g, ' ').trim();
}

/**
 * Shortens `value` to at most `maxChars` characters, ending with `...` when
 * text was cut. Budgets of three or fewer yield only dots.
 */
function safeTruncate(value: string, maxChars: number): string {
  if (value.length <= maxChars) return value;
  if (maxChars <= 3) return '.'.repeat(Math.max(maxChars, 0));
  return `${value.slice(0, maxChars - 3).trimEnd()}...`;
}

// ---------------------------------------------------------------------------
// Public helpers
// ---------------------------------------------------------------------------

/**
 * Parses a positive integer request parameter and caps it at `max`.
 *
 * @param value - raw parameter; numbers and numeric strings are accepted
 * @param fallback - returned when `value` is missing, unparsable, or floors
 *   to less than 1
 * @param max - upper bound applied to accepted values
 * @returns an integer in `[1, max]`, or `fallback`
 */
export function parseBoundedInteger(
  value: unknown,
  fallback: number,
  max: number,
): number {
  const parsed = asFiniteNumber(value);
  if (parsed === undefined) return fallback;
  // Floor first so fractions such as 0.5 fall back instead of becoming 0.
  const whole = Math.floor(parsed);
  return whole >= 1 ? Math.min(whole, max) : fallback;
}

/** Whole-word matchers for section labels, built once per label. */
const SECTION_PHRASE_PATTERNS = new Map<string, RegExp>();

/** Returns the cached `\b<label>\b` pattern for a section label. */
function sectionPhrasePattern(section: string): RegExp {
  let pattern = SECTION_PHRASE_PATTERNS.get(section);
  if (!pattern) {
    const escaped = section.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    pattern = new RegExp(`\\b${escaped}\\b`);
    SECTION_PHRASE_PATTERNS.set(section, pattern);
  }
  return pattern;
}

/**
 * Normalises one line for heading comparison: lower-cased, whitespace
 * collapsed, a leading section number ("2.", "3.1", "iv)") removed and a
 * trailing ':' or '.' dropped.
 */
function headingLabel(line: string): string {
  return collapseWhitespace(line)
    .toLowerCase()
    .replace(/^(?:\d+(?:\.\d+)*|[ivx]+)[.)]?\s+/, '')
    .replace(/\s*[:.]$/, '');
}

/**
 * Identifies the scientific section of a chunk from its first ten lines.
 *
 * A line that consists only of a known heading wins, in document order, so
 * "Methods" followed by text that mentions "results" is still `methods`.
 * If no such line exists, the first label in SCIENTIFIC_SECTIONS order that
 * appears as a whole word anywhere in those lines is used.
 *
 * @returns the detected section, or `'body'` when nothing matches
 */
export function detectScientificSection(content: string): ScientificSection {
  const leadingLines = content.split('\n').slice(0, 10);

  for (const line of leadingLines) {
    const label = headingLabel(line);
    // 'body' is the fallback label, not a heading worth stopping on.
    if (
      label !== 'body' &&
      (SCIENTIFIC_SECTIONS as readonly string[]).includes(label)
    ) {
      return label as ScientificSection;
    }
  }

  const sample = leadingLines.join(' ').toLowerCase();
  const mentioned = SCIENTIFIC_SECTIONS.find((section) =>
    sectionPhrasePattern(section).test(sample),
  );
  return mentioned ?? 'body';
}

/**
 * Returns the importance weight for a section label (case-insensitive,
 * surrounding whitespace ignored). Unknown labels get the `body` weight.
 */
export function sectionWeight(section: string): number {
  const key = section.trim().toLowerCase();
  // Own-property check: labels such as 'constructor' must not resolve to
  // members inherited from Object.prototype.
  const known = Object.prototype.hasOwnProperty.call(SECTION_WEIGHTS, key);
  const weight = known ? SECTION_WEIGHTS[key] : undefined;
  return typeof weight === 'number' ? weight : SECTION_WEIGHTS.body;
}

/**
 * Builds a stable, human-readable citation key.
 * Format: `<title-slug>:p<page>:c<chunkIndex + 1>`
 */
export function buildCitationKey({
  chunkIndex,
  page,
  title,
}: {
  chunkIndex: number;
  page: number | string;
  title: string;
}): string {
  return `${slugify(title, 'document')}:p${page}:c${chunkIndex + 1}`;
}

/**
 * Produces the metadata record stored with a document chunk.
 *
 * The public `source` is reduced to a basename so directory paths are not
 * exposed. For source, title and page, blank or unparsable values are
 * skipped in favour of the next candidate key.
 *
 * @param document - loaded chunk; `metadata` is read only when it is a record
 * @param fallbackSource - source label used when the metadata names none
 * @param chunkIndex - index stored as `chunkIndex`
 * @param pageChunkIndex - index used inside the citation key; defaults to
 *   `chunkIndex`
 */
export function buildChunkMetadata(
  document: LoadedDocument,
  fallbackSource: string,
  chunkIndex: number,
  pageChunkIndex = chunkIndex,
): ScientificChunkMetadata {
  const meta = isRecord(document.metadata) ? document.metadata : {};

  const publicSource = publicBasename(
    firstCleanString(meta, [
      'originalFilename',
      'filename',
      'fileName',
      'sourceLabel',
      'source',
    ]) ?? fallbackSource,
  );

  const fallbackTitle = publicSource || fallbackSource || 'document';
  const title =
    firstCleanString(meta, ['title', 'documentTitle']) ??
    asCleanString(nestedPrimitive(meta, ['pdf', 'info', 'Title'])) ??
    fallbackTitle;

  const page =
    firstFiniteNumber(meta, ['page', 'pageNumber']) ??
    asFiniteNumber(nestedPrimitive(meta, ['loc', 'pageNumber'])) ??
    'unknown';

  const section = detectScientificSection(document.pageContent);
  const weight = sectionWeight(section);

  const citationKey = buildCitationKey({
    chunkIndex: pageChunkIndex,
    page,
    title,
  });

  const sourceId = `DOC-${hashString(
    `${title.toLowerCase()}|${publicSource.toLowerCase()}`,
  )}`;

  return {
    citationKey,
    chunkIndex,
    page,
    section,
    source: publicSource,
    sourceId,
    title,
    sectionWeight: weight,
  };
}

/**
 * Expands a user query into deterministic variants for multi-query
 * retrieval, improving recall across phrasing differences.
 *
 * Variants, in order (duplicates removed, at most `maxQueries` returned):
 *   1. The query with whitespace collapsed and trailing `?`, `!`, `.` removed
 *   2. Each double-quoted phrase of 8-120 characters, verbatim
 *   3. Keyword form: up to 10 unique terms of 4+ characters, stop words
 *      removed (emitted when at least 2 terms remain)
 *   4. Reduced keyword form: the first 6 of those terms (emitted when at
 *      least 4 terms remain)
 *
 * @returns an empty array when `maxQueries` is not a positive number
 */
export function buildResearchQueries(input: string, maxQueries = 4): string[] {
  const limit = Math.floor(maxQueries);
  if (!(limit > 0)) return [];

  // Collapse first so whitespace after the final '?' cannot shield it.
  const normalised = collapseWhitespace(input)
    .replace(/[?!.]+$/, '')
    .trimEnd();
  const queries: string[] = [normalised];

  const quotedPhrases = Array.from(input.matchAll(/"([^"]{8,120})"/g))
    .map((match) => collapseWhitespace(match[1]))
    .filter(Boolean);
  queries.push(...quotedPhrases);

  const terms = normalised
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, ' ')
    .split(/\s+/)
    // Require a letter or digit so runs of hyphens are not treated as terms.
    .filter(
      (term) =>
        term.length >= 4 && /[a-z0-9]/.test(term) && !STOP_WORDS.has(term),
    );
  const uniqueTerms = Array.from(new Set(terms)).slice(0, 10);

  if (uniqueTerms.length >= 2) queries.push(uniqueTerms.join(' '));
  if (uniqueTerms.length >= 4) queries.push(uniqueTerms.slice(0, 6).join(' '));

  return Array.from(new Set(queries.filter(Boolean))).slice(0, limit);
}

// ---------------------------------------------------------------------------
// Internal retrieval helpers
// ---------------------------------------------------------------------------

/** Returns row `index` of a two-dimensional array, or `[]` when absent. */
function matrixRow(value: unknown, index: number): unknown[] {
  if (!Array.isArray(value)) return [];
  const row = value[index];
  return Array.isArray(row) ? row : [];
}

/**
 * Normalises the metadata stored with a retrieved chunk.
 *
 * Stored `citationKey`, `sourceId` and `sectionWeight` values are kept when
 * present; otherwise they are derived from the title, source, page and
 * section. Both camelCase and snake_case key spellings are accepted.
 *
 * NOTE(review): the generic argument of `Partial` was missing in the reviewed
 * text; `ScientificChunkMetadata` is inferred from the returned fields.
 *
 * @param raw - metadata as returned by the vector store
 * @param fallbackIndex - chunk index used when the metadata stores none
 */
function normaliseMetadata(
  raw: unknown,
  fallbackIndex: number,
): Partial<ScientificChunkMetadata> & PrimitiveMetadata {
  const meta = isRecord(raw) ? raw : {};

  const title =
    firstCleanString(meta, ['title', 'documentTitle']) ?? 'Untitled source';

  const source = publicBasename(
    firstCleanString(meta, ['source', 'filename', 'fileName', 'sourcePath']) ??
      'unknown-source',
  );

  // Prefer a numeric page; keep non-numeric labels such as "iv" as text.
  const page =
    firstFiniteNumber(meta, ['page', 'pageNumber']) ??
    firstCleanString(meta, ['page', 'pageNumber']) ??
    'unknown';

  const chunkIndex =
    firstFiniteNumber(meta, ['chunkIndex', 'chunk_index']) ?? fallbackIndex;

  const section = (asCleanString(firstPrimitive(meta, ['section'])) ??
    'body') as ScientificSection;

  const weight =
    firstFiniteNumber(meta, ['sectionWeight', 'section_weight']) ??
    sectionWeight(section);

  const citationKey =
    firstCleanString(meta, ['citationKey', 'citation_key']) ??
    buildCitationKey({ chunkIndex, page, title });

  const sourceId =
    firstCleanString(meta, ['sourceId', 'source_id']) ??
    `DOC-${hashString(`${title.toLowerCase()}|${source.toLowerCase()}`)}`;

  return {
    citationKey,
    chunkIndex,
    page,
    section,
    source,
    sourceId,
    title,
    sectionWeight: weight,
  };
}

// ---------------------------------------------------------------------------
// Reciprocal Rank Fusion with section weighting
// ---------------------------------------------------------------------------

/**
 * Merges several query result sets with a rank-reciprocal fusion score.
 *
 * For a chunk at 0-based position `r` of a result set:
 *   score += queryWeight * sectionWeight / (r + 1)
 *          + 0.2 * sectionWeight / (1 + max(distance, 0))   [when known]
 * where `queryWeight` is 1.0 for the first result set and 0.88 for the rest.
 *
 * Chunks with the same citation key and content fingerprint are merged and
 * their scores accumulated. The merged entry keeps the `distance`, `rank`
 * and `sourceQueryIndex` of its best hit: the smallest distance, or the best
 * rank when no hit carries a distance.
 *
 * @param results - result matrices, one row per query
 * @param limit - maximum number of fused results; non-positive yields `[]`
 * @returns fused results sorted by descending `rankScore`
 */
export function fuseQueryResults(
  results: ChromaQueryResults,
  limit: number,
): FusedResult[] {
  const maxItems = Math.floor(limit);
  // A negative bound passed to slice() would drop items from the end.
  if (!(maxItems > 0)) return [];

  const queryCount = Array.isArray(results.documents)
    ? results.documents.length
    : 0;
  const fused = new Map<string, FusedResult>();

  for (let qi = 0; qi < queryCount; qi++) {
    const documents = matrixRow(results.documents, qi);
    const metadatas = matrixRow(results.metadatas, qi);
    const distances = matrixRow(results.distances, qi);
    const ids = matrixRow(results.ids, qi);

    // The first query is the user's own wording; variants count slightly less.
    const queryWeight = qi === 0 ? 1.0 : 0.88;

    for (let rank = 0; rank < documents.length; rank++) {
      const content = asCleanString(documents[rank]);
      if (!content) continue;

      const normalised = collapseWhitespace(content);
      const meta = normaliseMetadata(metadatas[rank], rank);
      const distance = asFiniteNumber(distances[rank]);
      const weight = meta.sectionWeight ?? 1.0;

      const citationKey =
        meta.citationKey ??
        `SRC-${hashString(`${meta.source}|${meta.page}|${normalised}`)}`;

      // Deduplication key: citation key + fingerprint of the first 400 chars.
      const dedupeKey = `${citationKey}|${hashString(normalised.slice(0, 400))}`;

      const rrfScore = (queryWeight * weight) / (rank + 1);
      const distanceBonus =
        distance === undefined
          ? 0
          : (0.2 * weight) / (1 + Math.max(distance, 0));
      const score = rrfScore + distanceBonus;

      const existing = fused.get(dedupeKey);
      if (!existing) {
        fused.set(dedupeKey, {
          citationKey,
          content: normalised,
          distance,
          id: asCleanString(ids[rank]),
          metadata: meta,
          rank,
          rankScore: score,
          sourceQueryIndex: qi,
        });
        continue;
      }

      existing.rankScore += score;

      const isCloser =
        distance !== undefined &&
        (existing.distance === undefined || distance < existing.distance);
      // Without any distance information, the better rank decides.
      const isBetterRanked =
        distance === undefined &&
        existing.distance === undefined &&
        rank < existing.rank;

      if (isCloser || isBetterRanked) {
        existing.distance = distance;
        existing.rank = rank;
        existing.sourceQueryIndex = qi;
      }
    }
  }

  return Array.from(fused.values())
    .sort((a, b) => b.rankScore - a.rankScore)
    .slice(0, maxItems);
}

// ---------------------------------------------------------------------------
// Evidence payload builder
// ---------------------------------------------------------------------------

/**
 * Converts raw query results into a structured evidence payload ready for
 * injection into the LLM prompt.
 *
 * `evidenceContext` never exceeds `maxEvidenceChars`: the blank-line joiners
 * between blocks count against the budget, and the first block that does not
 * fit is truncated (or dropped when no room is left) and ends the context.
 *
 * @param results - result matrices passed to `fuseQueryResults`
 * @param options.maxResults - maximum fused results (default 8)
 * @param options.maxChunkChars - per-citation content cap (default 1400)
 * @param options.maxEvidenceChars - total `evidenceContext` cap (default 12000)
 * @returns citations, the capped context string, the fused results, and one
 *   manifest entry per source document
 */
export function buildEvidencePayload(
  results: ChromaQueryResults,
  options: {
    maxChunkChars?: number;
    maxEvidenceChars?: number;
    maxResults?: number;
  } = {},
): EvidencePayload {
  const maxResults = options.maxResults ?? 8;
  const maxChunkChars = options.maxChunkChars ?? 1400;
  const maxEvidenceChars = options.maxEvidenceChars ?? 12000;

  const fusedResults = fuseQueryResults(results, maxResults);

  const citations: ScientificCitation[] = [];
  const manifest = new Map<string, SourceManifestEntry>();

  for (const fusedResult of fusedResults) {
    const { metadata } = fusedResult;

    const citation: ScientificCitation = {
      key: fusedResult.citationKey,
      title: asCleanString(metadata.title) ?? 'Untitled source',
      source: asCleanString(metadata.source) ?? 'unknown-source',
      page: metadata.page ?? 'unknown',
      section: asCleanString(metadata.section) ?? 'body',
      sectionWeight: asFiniteNumber(metadata.sectionWeight) ?? 1.0,
      distance: fusedResult.distance,
      rankScore: fusedResult.rankScore,
      content: safeTruncate(fusedResult.content, maxChunkChars),
    };
    citations.push(citation);

    // Group by the stored source id when there is one, so chunks of one
    // document stay together even if their title metadata differs.
    const sourceId =
      asCleanString(metadata.sourceId) ??
      `DOC-${hashString(
        `${citation.title.toLowerCase()}|${citation.source.toLowerCase()}`,
      )}`;

    const entry = manifest.get(sourceId);
    if (entry) {
      entry.citationKeys.push(citation.key);
    } else {
      manifest.set(sourceId, {
        sourceId,
        title: citation.title,
        source: citation.source,
        citationKeys: [citation.key],
      });
    }
  }

  const blockSeparator = '\n\n';
  const evidenceBlocks: string[] = [];
  let usedChars = 0;

  for (const citation of citations) {
    const distanceLabel =
      citation.distance === undefined
        ? ''
        : ` | Distance: ${citation.distance.toFixed(4)}`;
    const block = [
      `[${citation.key}] Title: ${citation.title} | Source: ${citation.source} | Page: ${citation.page} | Section: ${citation.section} | Weight: ${citation.sectionWeight.toFixed(2)}${distanceLabel}`,
      citation.content,
    ].join('\n');

    // Every block after the first also pays for the joiner in front of it.
    const separatorCost = evidenceBlocks.length > 0 ? blockSeparator.length : 0;
    const remaining = maxEvidenceChars - usedChars - separatorCost;

    if (block.length > remaining) {
      if (remaining > 0) evidenceBlocks.push(safeTruncate(block, remaining));
      break;
    }

    evidenceBlocks.push(block);
    usedChars += separatorCost + block.length;
  }

  return {
    citations,
    evidenceContext: evidenceBlocks.join(blockSeparator),
    results: fusedResults,
    sourceManifest: Array.from(manifest.values()),
  };
}