aietal · jamilahmadzai · May 25, 2026
diff --git a/ui/__tests__/research-rag.test.ts b/ui/__tests__/research-rag.test.ts
@@ -0,0 +1,137 @@
+import {
+  buildEvidencePayload,
+  buildResearchMetadata,
+  buildResearchQueries,
+  detectScientificSection,
+  fuseChromaResults,
+  parseBoundedInteger,
+} from '@/utils/server/research-rag';
+
+import { describe, expect, it } from 'vitest';
+
+describe('research RAG helpers', () => {
+  it('builds deterministic research query variants', () => {
+    expect(
+      buildResearchQueries(
+        'What does "retrieval augmented generation" improve in scientific workflows?',
+      ),
+    ).toEqual([
+      'What does "retrieval augmented generation" improve in scientific workflows',
+      'retrieval augmented generation',
+      'retrieval augmented generation improve scientific workflows',
+    ]);
+  });
+
+  it('detects scientific sections near chunk starts', () => {
+    expect(detectScientificSection('Abstract\nThis paper studies RAG.')).toBe(
+      'abstract',
+    );
+    expect(
+      detectScientificSection('Materials and Methods\nWe used a benchmark.'),
+    ).toBe('materials and methods');
+    expect(detectScientificSection('A general paragraph.')).toBe('body');
+  });
+
+  it('builds citation metadata without leaking temporary upload paths', () => {
+    const metadata = buildResearchMetadata(
+      {
+        pageContent: 'Results\nThe method improves grounded answers.',
+        metadata: {
+          loc: { pageNumber: 7 },
+          pdf: { info: { Title: 'Grounded Scientific RAG' } },
+          source: '/tmp/uploads/private/source-paper.pdf',
+        },
+      },
+      'source-paper.pdf',
+      3,
+      1,
+    );
+
+    expect(metadata).toMatchObject({
+      citationKey: 'grounded-scientific-rag:p7:c2',
+      page: 7,
+      section: 'results',
+      source: 'source-paper.pdf',
+      title: 'Grounded Scientific RAG',
+    });
+    expect(metadata.source).not.toContain('/tmp/uploads');
+  });
+
+  it('bounds integer request parameters', () => {
+    expect(parseBoundedInteger('20', 8, 16)).toBe(16);
+    expect(parseBoundedInteger(0, 8, 16)).toBe(8);
+    expect(parseBoundedInteger('bad', 8, 16)).toBe(8);
+  });
+
+  it('fuses duplicate chunks across query variants', () => {
+    const fused = fuseChromaResults(
+      {
+        documents: [
+          ['The answer is grounded in chunk one.', 'A second chunk.'],
+          ['The answer is grounded in chunk one.'],
+        ],
+        metadatas: [
+          [
+            {
+              citationKey: 'paper:p1:c1',
+              page: 1,
+              source: 'paper.pdf',
+              title: 'Paper',
+            },
+            {
+              citationKey: 'paper:p2:c1',
+              page: 2,
+              source: 'paper.pdf',
+              title: 'Paper',
+            },
+          ],
+          [
+            {
+              citationKey: 'paper:p1:c1',
+              page: 1,
+              source: 'paper.pdf',
+              title: 'Paper',
+            },
+          ],
+        ],
+        distances: [[0.05, 0.4], [0.06]],
+        ids: [['a', 'b'], ['a']],
+      },
+      4,
+    );
+
+    expect(fused).toHaveLength(2);
+    expect(fused[0].citationKey).toBe('paper:p1:c1');
+    expect(fused[0].rankScore).toBeGreaterThan(fused[1].rankScore);
+  });
+
+  it('formats bounded evidence and source manifests', () => {
+    const payload = buildEvidencePayload(
+      {
+        documents: [['Chunk about scientific retrieval.'.repeat(20)]],
+        metadatas: [
+          [
+            {
+              citationKey: 'paper:p1:c1',
+              page: 1,
+              section: 'abstract',
+              source: 'paper.pdf',
+              title: 'Paper',
+            },
+          ],
+        ],
+        distances: [[0.1]],
+      },
+      { maxChunkChars: 40, maxEvidenceChars: 200, maxResults: 2 },
+    );
+
+    expect(payload.citations).toHaveLength(1);
+    expect(payload.citations[0].content.length).toBeLessThanOrEqual(40);
+    expect(payload.evidenceContext).toContain('[paper:p1:c1]');
+    expect(payload.sourceManifest[0]).toMatchObject({
+      citationKeys: ['paper:p1:c1'],
+      source: 'paper.pdf',
+      title: 'Paper',
+    });
+  });
+});
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
@@ -1,25 +1,66 @@
-import type { NextApiRequest, NextApiResponse } from "next";
-import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
+import type { NextApiRequest, NextApiResponse } from 'next';
 
-export default async function handler(req: NextApiRequest, res: NextApiResponse) {
+import {
+  buildEvidencePayload,
+  buildResearchQueries,
+  parseBoundedInteger,
+} from '@/utils/server/research-rag';
+
+import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
+
+export default async function handler(
+  req: NextApiRequest,
+  res: NextApiResponse,
+) {
   try {
+    if (req.method !== 'POST') {
+      res.setHeader('Allow', 'POST');
+      return res.status(405).json({ error: 'Method not allowed' });
+    }
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
     });
 
-    const query = req.body.input;
+    // `req.body` may be null when no body is sent; guard before reading it.
+    const query =
+      typeof req.body?.input === 'string' ? req.body.input.trim() : '';
+    if (!query) {
+      return res.status(400).json({ error: 'Missing retrieval query' });
+    }
+
+    const nResults = parseBoundedInteger(req.body.nResults, 8, 16);
+    const maxEvidenceChars = parseBoundedInteger(
+      req.body.maxEvidenceChars,
+      12000,
+      30000,
+    );
+    const queryTexts = buildResearchQueries(query);
 
     const embedder = new TransformersEmbeddingFunction();
 
-    const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
+    const collection = await client.getOrCreateCollection({
+      name: 'default-collection',
+      embeddingFunction: embedder,
+    });
+
+    // Query deterministic research-focused variants, then fuse the result sets.
+    const results = await collection.query({
+      nResults,
+      queryTexts,
+      include: ['documents', 'metadatas', 'distances'] as any,
+    });
 
-  // query the collection
-  const results = await collection.query({
-      nResults: 4, 
-      queryTexts: [query]
-  }) 
+    const evidence = buildEvidencePayload(results, {
+      maxEvidenceChars,
+      maxResults: nResults,
+    });
 
-    res.status(200).json(results);
+    res.status(200).json({
+      ...results,
+      queryTexts,
+      ...evidence,
+    });
   } catch (error) {
     if (error instanceof Error) {
       console.error('Error message:', error.message);
@@ -29,4 +70,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
@@ -1,10 +1,16 @@
 import type { NextApiRequest, NextApiResponse } from 'next';
 
+import {
+  type LoadedDocument,
+  type PrimitiveMetadata,
+  RESEARCH_TEXT_SEPARATORS,
+  buildResearchMetadata,
+} from '@/utils/server/research-rag';
+
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import path from 'path';
 import { v4 as uuidv4 } from 'uuid';
 
@@ -33,22 +39,31 @@ export default async function handler(
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
+      const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
 
-      const originalDocs = await loader.load();
+      if (!pdfFile?.filepath) {
+        return res.status(400).json({ error: 'Missing PDF upload' });
+      }
 
-      console.log(JSON.stringify(originalDocs));
+      // Client-supplied names may include directories; keep the base name.
+      const uploadName = pdfFile.originalFilename || pdfFile.filepath;
+      const fallbackSource = path.basename(uploadName);
+      const loader = new PDFLoader(pdfFile.filepath);
+      const originalDocs = await loader.load();
 
       const splitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 500,
-        chunkOverlap: 100,
-      });      
+        chunkSize: 900,
+        chunkOverlap: 180,
+        separators: RESEARCH_TEXT_SEPARATORS,
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
- 
+
       // Process the documents and perform other logic
-      const { ids, metadatas, documentContents } = processDocuments(docs);
+      const { ids, metadatas, documentContents } = processDocuments(
+        docs,
+        fallbackSource,
+      );
 
       const embedder = new TransformersEmbeddingFunction();
       const collection = await client.getOrCreateCollection({
@@ -75,28 +90,41 @@ export default async function handler(
   }
 }
 
-function processDocuments(docs: any) {
-  const ids = [];
-  const metadatas = [];
-  const documentContents = [];
+function processDocuments(docs: LoadedDocument[], fallbackSource: string) {
+  const ids: string[] = [];
+  const metadatas: PrimitiveMetadata[] = [];
+  const documentContents: string[] = [];
+  const pageChunkCounts = new Map<string, number>();
 
-  for (const document of docs) {
+  for (let index = 0; index < docs.length; index += 1) {
+    const document = docs[index];
     // Generate an ID for each document, or use some existing unique identifier
     const id = uuidv4();
     ids.push(id);
 
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
-
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
-    metadatas.push(metadata);
+    const source =
+      typeof document.metadata === 'object' &&
+      document.metadata !== null &&
+      'source' in document.metadata &&
+      typeof document.metadata.source === 'string'
+        ? document.metadata.source
+        : fallbackSource;
+    const page =
+      typeof document.metadata === 'object' &&
+      document.metadata !== null &&
+      'loc' in document.metadata &&
+      typeof document.metadata.loc === 'object' &&
+      document.metadata.loc !== null &&
+      'pageNumber' in document.metadata.loc
+        ? document.metadata.loc.pageNumber
+        : 'unknown';
+    const pageKey = `${source}:${page}`;
+    const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0;
+    pageChunkCounts.set(pageKey, pageChunkIndex + 1);
+
+    metadatas.push(
+      buildResearchMetadata(document, fallbackSource, index, pageChunkIndex),
+    );
 
     // Add the page content to the documents array
     documentContents.push(document.pageContent);