aietal · dremonkey23 · May 20, 2026
diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts
@@ -0,0 +1,77 @@
+import { describe, expect, it } from 'vitest';
+
+import {
+  buildCitationKey,
+  buildRagMetadata,
+  detectScientificSection,
+  formatRetrievedDocuments,
+  parseSemanticScholarReferences,
+  semanticScholarReferenceToText,
+} from '@/utils/server/scientific-rag';
+
+describe('scientific RAG helpers', () => { // exercises the helpers imported from '@/utils/server/scientific-rag'
+  it('detects compound scientific sections before generic methods', () => {
+    expect(detectScientificSection('Materials and Methods\nWe collected samples')).toBe(
+      'materials-and-methods', // the compound label is expected, per the test title, not a generic methods one
+    );
+    expect(detectScientificSection('Abstract\nThis paper studies retrieval')).toBe('abstract');
+  });
+
+  it('builds stable citation keys for uploaded documents', () => {
+    expect(
+      buildCitationKey({ title: 'My Paper.pdf', page: 3, chunkIndex: 2 }),
+    ).toBe('doc:my-paper-pdf:p3:c2'); // expected key: slugged title, then page 3 and chunk 2
+  });
+
+  it('builds stable citation keys for Semantic Scholar references', () => {
+    expect(
+      buildCitationKey({
+        sourceType: 'semantic-scholar',
+        paperId: 'abc123',
+        title: 'Ignored when paper id exists',
+        chunkIndex: 1,
+      }),
+    ).toBe('scholar:abc123:ref:c1'); // the title does not appear in the expected key; the paperId does
+  });
+
+  it('converts Semantic Scholar references into indexable text', () => {
+    const text = semanticScholarReferenceToText({
+      paperId: 'paper-1',
+      title: 'Retrieval for Science',
+      abstract: 'A study of citation-grounded retrieval.',
+      authors: [{ name: 'Ada Lovelace' }, 'Grace Hopper'], // authors given both as a { name } object and as a bare string
+      year: 2026,
+      venue: 'ISAAC',
+    });
+
+    expect(text).toContain('Title: Retrieval for Science');
+    expect(text).toContain('Authors: Ada Lovelace, Grace Hopper');
+    expect(text).toContain('Semantic Scholar Paper ID: paper-1');
+  });
+
+  it('parses saved Semantic Scholar references from form fields', () => {
+    const refs = parseSemanticScholarReferences([
+      JSON.stringify([{ paperId: 'paper-1', title: 'A' }]), // input is an array holding one JSON-encoded array of references
+    ]);
+
+    expect(refs).toEqual([{ paperId: 'paper-1', title: 'A' }]);
+  });
+
+  it('formats retrieval results with citation keys and distances', () => {
+    const formatted = formatRetrievedDocuments({
+      documents: [['Chunk text']],
+      metadatas: [[buildRagMetadata({ title: 'Paper', page: 1, chunkIndex: 0 })]],
+      distances: [[0.123456]],
+    });
+
+    expect(formatted).toContain('[doc:paper:p1:c0]');
+    expect(formatted).toContain('Distance: 0.1235'); // 0.123456 is expected to be rendered with four decimal places
+    expect(formatted).toContain('Chunk text');
+  });
+
+  it('handles empty retrieval results defensively', () => {
+    expect(formatRetrievedDocuments({ documents: [[]], metadatas: [[]] })).toBe( // note: no distances key is passed
+      'No relevant documents were retrieved.',
+    );
+  });
+});
diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts
@@ -1,23 +1,35 @@
-import type { NextApiRequest, NextApiResponse } from "next";
-import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
+import type { NextApiRequest, NextApiResponse } from 'next';
+
+import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 
 export default async function handler(req: NextApiRequest, res: NextApiResponse) {
   try {
+    if (req.method !== 'POST') {
+      return res.status(405).end();
+    }
+
+    const query = typeof req.body?.input === 'string' ? req.body.input.trim() : '';
+    if (!query) {
+      return res.status(400).json({ error: 'Missing retrieval query' });
+    }
+
+    const requestedResults = Math.trunc(Number(req.body?.nResults || 6)); // whole number, or NaN for non-numeric input
+    const nResults = Number.isNaN(requestedResults) ? 6 : Math.min(Math.max(requestedResults, 1), 10); // NaN would pass through min/max unchanged, so fall back to the default
+
     const client = new ChromaClient({
-      path: "http://chroma-server:8000",
+      path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
     });
 
-    const query = req.body.input;
-
     const embedder = new TransformersEmbeddingFunction();
+    const collection = await client.getOrCreateCollection({
+      name: 'default-collection',
+      embeddingFunction: embedder,
+    });
 
-    const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
-
-  // query the collection
-  const results = await collection.query({
-      nResults: 4, 
-      queryTexts: [query]
-  }) 
+    const results = await collection.query({
+      nResults,
+      queryTexts: [query],
+    });
 
     res.status(200).json(results);
   } catch (error) {
@@ -29,4 +41,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
     }
     res.status(500).json({ error: 'An unexpected error occurred :(' });
   }
-}
+}
diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts
@@ -3,10 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next';
 import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
 import { IncomingForm } from 'formidable';
 import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
-import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";
-
+import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
 import path from 'path';
-import { v4 as uuidv4 } from 'uuid';
+
+import {
+  buildRagMetadata,
+  detectScientificSection,
+  parseSemanticScholarReferences,
+  SCIENTIFIC_TEXT_SEPARATORS,
+  semanticScholarReferenceToText,
+  type ScientificReference,
+} from '@/utils/server/scientific-rag';
 
 export const config = {
   api: {
@@ -29,25 +36,36 @@ export default async function handler(
         return res.status(400).json({ error: 'Failed to upload file' });
       }
 
+      const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
+      const references = parseSemanticScholarReferences(fields.references);
+
+      if (!pdfFile?.filepath && references.length === 0) {
+        return res.status(400).json({
+          error: 'Upload a PDF or provide Semantic Scholar references',
+        });
+      }
+
       const client = new ChromaClient({
         path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
       });
 
-      const loader = new PDFLoader(files.pdf[0].filepath);
-
-      const originalDocs = await loader.load();
+      const originalDocs = [];
 
-      console.log(JSON.stringify(originalDocs));
+      if (pdfFile?.filepath) {
+        const loader = new PDFLoader(pdfFile.filepath);
+        originalDocs.push(...(await loader.load()));
+      }
 
+      originalDocs.push(...semanticScholarReferencesToDocuments(references));
 
       const splitter = new RecursiveCharacterTextSplitter({
-        chunkSize: 500,
-        chunkOverlap: 100,
-      });      
+        chunkSize: 700,
+        chunkOverlap: 120,
+        separators: SCIENTIFIC_TEXT_SEPARATORS,
+      });
 
       const docs = await splitter.splitDocuments(originalDocs);
-
-      // Process the documents and perform other logic
+
       const { ids, metadatas, documentContents } = processDocuments(docs);
 
       const embedder = new TransformersEmbeddingFunction();
@@ -65,6 +83,7 @@ export default async function handler(
       res.status(200).json({
         message: 'Documents processed successfully',
         documentCount: ids.length,
+        semanticScholarReferenceCount: references.length,
       });
     });
   } catch (error) {
@@ -75,30 +94,53 @@ export default async function handler(
   }
 }
 
-function processDocuments(docs: any) {
+function semanticScholarReferencesToDocuments(references: ScientificReference[]) {
+  // Shape each reference as a { pageContent, metadata } document, one per reference.
+  return references.map((reference) => {
+    const pageContent = semanticScholarReferenceToText(reference);
+    const { paperId, title, url, year } = reference;
+    const source = url || paperId || title; // first non-empty identifier wins
+    // 'ref' is the page-number placeholder recorded for reference documents.
+    const loc = { pageNumber: 'ref' };
+    return {
+      pageContent,
+      metadata: { sourceType: 'semantic-scholar', source, title, paperId, url, year, loc },
+    };
+  });
+}
+
+function processDocuments(docs: any[]) {
   const ids = [];
   const metadatas = [];
   const documentContents = [];
+  const pageChunkCounts = new Map<string, number>();
 
   for (const document of docs) {
-    // Generate an ID for each document, or use some existing unique identifier
-    const id = uuidv4();
-    ids.push(id);
-
-    const fallbackTitle = path.basename(document.metadata.source);
-    const titleFromMetadata = document.metadata.pdf.info.Title;
-
-    const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
+    const sourceType = document.metadata.sourceType || 'upload';
+    const fallbackTitle = document.metadata.source
+      ? path.basename(document.metadata.source)
+      : 'Semantic Scholar reference';
+    const titleFromMetadata = document.metadata.pdf?.info?.Title;
+    const title = titleFromMetadata || document.metadata.title || fallbackTitle;
+    const page = document.metadata.loc?.pageNumber || document.metadata.page || 'ref';
+    const pageChunkKey = `${sourceType}:${title}:${page}`;
+    const chunkIndex = pageChunkCounts.get(pageChunkKey) || 0;
+    pageChunkCounts.set(pageChunkKey, chunkIndex + 1);
+
+    const metadata = buildRagMetadata({
+      title,
+      page,
+      source: document.metadata.source,
+      sourceType,
+      section: detectScientificSection(document.pageContent),
+      chunkIndex,
+      paperId: document.metadata.paperId,
+      url: document.metadata.url,
+      year: document.metadata.year,
+    });
 
-
-    const metadata = {
-      title: title,
-      page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
-      source: document.metadata.source, // Define this function to extract verse info
-    };
+    ids.push(String(metadata.citationKey));
     metadatas.push(metadata);
-
-    // Add the page content to the documents array
     documentContents.push(document.pageContent);
   }