Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 77 additions & 0 deletions ui/__tests__/scientific-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import { describe, expect, it } from 'vitest';

import {
buildCitationKey,
buildRagMetadata,
detectScientificSection,
formatRetrievedDocuments,
parseSemanticScholarReferences,
semanticScholarReferenceToText,
} from '@/utils/server/scientific-rag';

// Unit tests for the helpers exported from '@/utils/server/scientific-rag':
// section detection, citation keys, Semantic Scholar reference handling, and
// formatting of retrieval results.
describe('scientific RAG helpers', () => {
  it('detects compound scientific sections before generic methods', () => {
    // A "Materials and Methods" heading is expected to map to its own section id.
    const compoundHeading = detectScientificSection('Materials and Methods\nWe collected samples');
    expect(compoundHeading).toBe('materials-and-methods');

    const abstractHeading = detectScientificSection('Abstract\nThis paper studies retrieval');
    expect(abstractHeading).toBe('abstract');
  });

  it('builds stable citation keys for uploaded documents', () => {
    const uploadKey = buildCitationKey({ title: 'My Paper.pdf', page: 3, chunkIndex: 2 });

    expect(uploadKey).toBe('doc:my-paper-pdf:p3:c2');
  });

  it('builds stable citation keys for Semantic Scholar references', () => {
    // The expected key is derived from the paper id; the title does not appear in it.
    const scholarKey = buildCitationKey({
      sourceType: 'semantic-scholar',
      paperId: 'abc123',
      title: 'Ignored when paper id exists',
      chunkIndex: 1,
    });

    expect(scholarKey).toBe('scholar:abc123:ref:c1');
  });

  it('converts Semantic Scholar references into indexable text', () => {
    // Authors are supplied both as objects with a `name` and as bare strings.
    const indexable = semanticScholarReferenceToText({
      paperId: 'paper-1',
      title: 'Retrieval for Science',
      abstract: 'A study of citation-grounded retrieval.',
      authors: [{ name: 'Ada Lovelace' }, 'Grace Hopper'],
      year: 2026,
      venue: 'ISAAC',
    });

    const expectedFragments = [
      'Title: Retrieval for Science',
      'Authors: Ada Lovelace, Grace Hopper',
      'Semantic Scholar Paper ID: paper-1',
    ];
    for (const fragment of expectedFragments) {
      expect(indexable).toContain(fragment);
    }
  });

  it('parses saved Semantic Scholar references from form fields', () => {
    // The form field value is an array holding one JSON-encoded list of references.
    const encodedField = JSON.stringify([{ paperId: 'paper-1', title: 'A' }]);
    const parsed = parseSemanticScholarReferences([encodedField]);

    expect(parsed).toEqual([{ paperId: 'paper-1', title: 'A' }]);
  });

  it('formats retrieval results with citation keys and distances', () => {
    const chunkMetadata = buildRagMetadata({ title: 'Paper', page: 1, chunkIndex: 0 });
    const rendered = formatRetrievedDocuments({
      documents: [['Chunk text']],
      metadatas: [[chunkMetadata]],
      distances: [[0.123456]],
    });

    // The distance is expected to be rendered with four decimal places.
    expect(rendered).toContain('[doc:paper:p1:c0]');
    expect(rendered).toContain('Distance: 0.1235');
    expect(rendered).toContain('Chunk text');
  });

  it('handles empty retrieval results defensively', () => {
    const emptyResult = formatRetrievedDocuments({ documents: [[]], metadatas: [[]] });

    expect(emptyResult).toBe('No relevant documents were retrieved.');
  });
});
38 changes: 25 additions & 13 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,23 +1,35 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
import type { NextApiRequest, NextApiResponse } from 'next';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
try {
if (req.method !== 'POST') {
return res.status(405).end();
}

const query = typeof req.body?.input === 'string' ? req.body.input.trim() : '';
if (!query) {
return res.status(400).json({ error: 'Missing retrieval query' });
}

const requestedResults = Number(req.body?.nResults || 6);
const nResults = Math.min(Math.max(requestedResults, 1), 10);

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const query = req.body.input;

const embedder = new TransformersEmbeddingFunction();
const collection = await client.getOrCreateCollection({
name: 'default-collection',
embeddingFunction: embedder,
});

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });

// query the collection
const results = await collection.query({
nResults: 4,
queryTexts: [query]
})
const results = await collection.query({
nResults,
queryTexts: [query],
});

res.status(200).json(results);
} catch (error) {
Expand All @@ -29,4 +41,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
100 changes: 71 additions & 29 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@ import type { NextApiRequest, NextApiResponse } from 'next';
import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';

import {
buildRagMetadata,
detectScientificSection,
parseSemanticScholarReferences,
SCIENTIFIC_TEXT_SEPARATORS,
semanticScholarReferenceToText,
type ScientificReference,
} from '@/utils/server/scientific-rag';

export const config = {
api: {
Expand All @@ -29,25 +36,36 @@ export default async function handler(
return res.status(400).json({ error: 'Failed to upload file' });
}

const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;
const references = parseSemanticScholarReferences(fields.references);

if (!pdfFile?.filepath && references.length === 0) {
return res.status(400).json({
error: 'Upload a PDF or provide Semantic Scholar references',
});
}

const client = new ChromaClient({
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);

const originalDocs = await loader.load();
const originalDocs = [];

console.log(JSON.stringify(originalDocs));
if (pdfFile?.filepath) {
const loader = new PDFLoader(pdfFile.filepath);
originalDocs.push(...(await loader.load()));
}

originalDocs.push(...semanticScholarReferencesToDocuments(references));

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
chunkSize: 700,
chunkOverlap: 120,
separators: SCIENTIFIC_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

// Process the documents and perform other logic

const { ids, metadatas, documentContents } = processDocuments(docs);

const embedder = new TransformersEmbeddingFunction();
Expand All @@ -65,6 +83,7 @@ export default async function handler(
res.status(200).json({
message: 'Documents processed successfully',
documentCount: ids.length,
semanticScholarReferenceCount: references.length,
});
});
} catch (error) {
Expand All @@ -75,30 +94,53 @@ export default async function handler(
}
}

function processDocuments(docs: any) {
/**
 * Wraps Semantic Scholar references in document-shaped objects
 * (`pageContent` + `metadata`) so they can be pushed into the same list as
 * the pages loaded from an uploaded PDF.
 *
 * @param references - Parsed Semantic Scholar references.
 * @returns One document-like object per reference, in input order.
 */
function semanticScholarReferencesToDocuments(references: ScientificReference[]) {
  const documents = [];

  for (const entry of references) {
    const pageContent = semanticScholarReferenceToText(entry);
    const { paperId, title, url, year } = entry;

    documents.push({
      pageContent,
      metadata: {
        sourceType: 'semantic-scholar',
        // First truthy identifier wins: URL, then paper id, then title.
        source: url || paperId || title,
        title,
        paperId,
        url,
        year,
        // References have no page number; the fixed 'ref' marker stands in for one.
        loc: { pageNumber: 'ref' },
      },
    });
  }

  return documents;
}

function processDocuments(docs: any[]) {
const ids = [];
const metadatas = [];
const documentContents = [];
const pageChunkCounts = new Map<string, number>();

for (const document of docs) {
// Generate an ID for each document, or use some existing unique identifier
const id = uuidv4();
ids.push(id);

const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;
const sourceType = document.metadata.sourceType || 'upload';
const fallbackTitle = document.metadata.source
? path.basename(document.metadata.source)
: 'Semantic Scholar reference';
const titleFromMetadata = document.metadata.pdf?.info?.Title;
const title = titleFromMetadata || document.metadata.title || fallbackTitle;
const page = document.metadata.loc?.pageNumber || document.metadata.page || 'ref';
const pageChunkKey = `${sourceType}:${title}:${page}`;
const chunkIndex = pageChunkCounts.get(pageChunkKey) || 0;
pageChunkCounts.set(pageChunkKey, chunkIndex + 1);

const metadata = buildRagMetadata({
title,
page,
source: document.metadata.source,
sourceType,
section: detectScientificSection(document.pageContent),
chunkIndex,
paperId: document.metadata.paperId,
url: document.metadata.url,
year: document.metadata.year,
});


const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
source: document.metadata.source, // Define this function to extract verse info
};
ids.push(String(metadata.citationKey));
metadatas.push(metadata);

// Add the page content to the documents array
documentContents.push(document.pageContent);
}

Expand Down
Loading