Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 137 additions & 0 deletions ui/__tests__/research-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import {
buildEvidencePayload,
buildResearchMetadata,
buildResearchQueries,
detectScientificSection,
fuseChromaResults,
parseBoundedInteger,
} from '@/utils/server/research-rag';

import { describe, expect, it } from 'vitest';

/**
 * Unit tests for the research-oriented RAG helper functions: query variant
 * generation, section detection, citation metadata, request-parameter
 * bounding, result fusion and evidence payload formatting.
 */
describe('research RAG helpers', () => {
  it('builds deterministic research query variants', () => {
    const question =
      'What does "retrieval augmented generation" improve in scientific workflows?';

    // Expected variants, in the order the helper is required to return them.
    const expectedVariants = [
      'What does "retrieval augmented generation" improve in scientific workflows',
      'retrieval augmented generation',
      'retrieval augmented generation improve scientific workflows',
    ];

    expect(buildResearchQueries(question)).toEqual(expectedVariants);
  });

  it('detects scientific sections near chunk starts', () => {
    // Each case pairs a chunk of text with the section label it must yield.
    const cases = [
      ['Abstract\nThis paper studies RAG.', 'abstract'],
      ['Materials and Methods\nWe used a benchmark.', 'materials and methods'],
      ['A general paragraph.', 'body'],
    ] as const;

    for (const [chunkText, expectedSection] of cases) {
      expect(detectScientificSection(chunkText)).toBe(expectedSection);
    }
  });

  it('builds citation metadata without leaking temporary upload paths', () => {
    // A loaded PDF page whose `source` points at a private temporary path.
    const loadedPage = {
      pageContent: 'Results\nThe method improves grounded answers.',
      metadata: {
        loc: { pageNumber: 7 },
        pdf: { info: { Title: 'Grounded Scientific RAG' } },
        source: '/tmp/uploads/private/source-paper.pdf',
      },
    };
    const uploadedFileName = 'source-paper.pdf';

    const metadata = buildResearchMetadata(loadedPage, uploadedFileName, 3, 1);

    expect(metadata).toMatchObject({
      citationKey: 'grounded-scientific-rag:p7:c2',
      page: 7,
      section: 'results',
      source: 'source-paper.pdf',
      title: 'Grounded Scientific RAG',
    });
    // The temporary upload directory must not appear in the stored source.
    expect(metadata.source).not.toContain('/tmp/uploads');
  });

  it('bounds integer request parameters', () => {
    const fallback = 8;
    const upperBound = 16;

    // [raw request value, expected parsed result]
    const cases = [
      ['20', 16],
      [0, 8],
      ['bad', 8],
    ] as const;

    for (const [rawValue, expected] of cases) {
      expect(parseBoundedInteger(rawValue, fallback, upperBound)).toBe(expected);
    }
  });

  it('fuses duplicate chunks across query variants', () => {
    // Returns a fresh metadata object per call so no fixture is shared by reference.
    const paperChunk = (citationKey: string, page: number) => ({
      citationKey,
      page,
      source: 'paper.pdf',
      title: 'Paper',
    });

    // Two query variants; the second one returns the first chunk again.
    const rawResults = {
      documents: [
        ['The answer is grounded in chunk one.', 'A second chunk.'],
        ['The answer is grounded in chunk one.'],
      ],
      metadatas: [
        [paperChunk('paper:p1:c1', 1), paperChunk('paper:p2:c1', 2)],
        [paperChunk('paper:p1:c1', 1)],
      ],
      distances: [[0.05, 0.4], [0.06]],
      ids: [['a', 'b'], ['a']],
    };

    const fused = fuseChromaResults(rawResults, 4);

    expect(fused).toHaveLength(2);

    const [best, runnerUp] = fused;
    expect(best.citationKey).toBe('paper:p1:c1');
    expect(best.rankScore).toBeGreaterThan(runnerUp.rankScore);
  });

  it('formats bounded evidence and source manifests', () => {
    const longChunk = 'Chunk about scientific retrieval.'.repeat(20);
    const chunkMetadata = {
      citationKey: 'paper:p1:c1',
      page: 1,
      section: 'abstract',
      source: 'paper.pdf',
      title: 'Paper',
    };
    const limits = { maxChunkChars: 40, maxEvidenceChars: 200, maxResults: 2 };

    const payload = buildEvidencePayload(
      {
        documents: [[longChunk]],
        metadatas: [[chunkMetadata]],
        distances: [[0.1]],
      },
      limits,
    );

    const { citations, evidenceContext, sourceManifest } = payload;

    expect(citations).toHaveLength(1);
    // The cited content must be truncated to the per-chunk character limit.
    expect(citations[0].content.length).toBeLessThanOrEqual(40);
    expect(evidenceContext).toContain('[paper:p1:c1]');
    expect(sourceManifest[0]).toMatchObject({
      citationKeys: ['paper:p1:c1'],
      source: 'paper.pdf',
      title: 'Paper',
    });
  });
});
67 changes: 54 additions & 13 deletions ui/pages/api/fetch-documents.ts
Original file line number Diff line number Diff line change
@@ -1,25 +1,66 @@
import type { NextApiRequest, NextApiResponse } from "next";
import { ChromaClient, TransformersEmbeddingFunction } from "chromadb";
import type { NextApiRequest, NextApiResponse } from 'next';

export default async function handler(req: NextApiRequest, res: NextApiResponse) {
import {
buildEvidencePayload,
buildResearchQueries,
parseBoundedInteger,
} from '@/utils/server/research-rag';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';

export default async function handler(
req: NextApiRequest,
res: NextApiResponse,
) {
try {
if (req.method !== 'POST') {
res.setHeader('Allow', 'POST');
return res.status(405).json({ error: 'Method not allowed' });
}

const client = new ChromaClient({
path: "http://chroma-server:8000",
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const query = req.body.input;
const query =
typeof req.body.input === 'string' ? req.body.input.trim() : '';

if (!query) {
return res.status(400).json({ error: 'Missing retrieval query' });
}

const nResults = parseBoundedInteger(req.body.nResults, 8, 16);
const maxEvidenceChars = parseBoundedInteger(
req.body.maxEvidenceChars,
12000,
30000,
);
const queryTexts = buildResearchQueries(query);

const embedder = new TransformersEmbeddingFunction();

const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder });
const collection = await client.getOrCreateCollection({
name: 'default-collection',
embeddingFunction: embedder,
});

// Query deterministic research-focused variants, then fuse the result sets.
const results = await collection.query({
nResults,
queryTexts,
include: ['documents', 'metadatas', 'distances'] as any,
});

// query the collection
const results = await collection.query({
nResults: 4,
queryTexts: [query]
})
const evidence = buildEvidencePayload(results, {
maxEvidenceChars,
maxResults: nResults,
});

res.status(200).json(results);
res.status(200).json({
...results,
queryTexts,
...evidence,
});
} catch (error) {
if (error instanceof Error) {
console.error('Error message:', error.message);
Expand All @@ -29,4 +70,4 @@ export default async function handler(req: NextApiRequest, res: NextApiResponse)
}
res.status(500).json({ error: 'An unexpected error occurred :(' });
}
}
}
82 changes: 55 additions & 27 deletions ui/pages/api/inject-documents.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import type { NextApiRequest, NextApiResponse } from 'next';

import {
type LoadedDocument,
type PrimitiveMetadata,
RESEARCH_TEXT_SEPARATORS,
buildResearchMetadata,
} from '@/utils/server/research-rag';

import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb';
import { IncomingForm } from 'formidable';
import { PDFLoader } from 'langchain/document_loaders/fs/pdf';
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter';
import path from 'path';
import { v4 as uuidv4 } from 'uuid';

Expand Down Expand Up @@ -33,22 +39,31 @@ export default async function handler(
path: process.env.CHROMA_PATH || 'http://chroma-server:8000',
});

const loader = new PDFLoader(files.pdf[0].filepath);
const pdfFile = Array.isArray(files.pdf) ? files.pdf[0] : files.pdf;

const originalDocs = await loader.load();
if (!pdfFile?.filepath) {
return res.status(400).json({ error: 'Missing PDF upload' });
}

console.log(JSON.stringify(originalDocs));
const fallbackSource =
pdfFile.originalFilename ?? path.basename(pdfFile.filepath);
const loader = new PDFLoader(pdfFile.filepath);

const originalDocs = await loader.load();

const splitter = new RecursiveCharacterTextSplitter({
chunkSize: 500,
chunkOverlap: 100,
});
chunkSize: 900,
chunkOverlap: 180,
separators: RESEARCH_TEXT_SEPARATORS,
});

const docs = await splitter.splitDocuments(originalDocs);

// Process the documents and perform other logic
const { ids, metadatas, documentContents } = processDocuments(docs);
const { ids, metadatas, documentContents } = processDocuments(
docs,
fallbackSource,
);

const embedder = new TransformersEmbeddingFunction();
const collection = await client.getOrCreateCollection({
Expand All @@ -75,28 +90,41 @@ export default async function handler(
}
}

function processDocuments(docs: any) {
const ids = [];
const metadatas = [];
const documentContents = [];
function processDocuments(docs: LoadedDocument[], fallbackSource: string) {
const ids: string[] = [];
const metadatas: PrimitiveMetadata[] = [];
const documentContents: string[] = [];
const pageChunkCounts = new Map<string, number>();

for (const document of docs) {
for (let index = 0; index < docs.length; index += 1) {
const document = docs[index];
// Generate an ID for each document, or use some existing unique identifier
const id = uuidv4();
ids.push(id);

const fallbackTitle = path.basename(document.metadata.source);
const titleFromMetadata = document.metadata.pdf.info.Title;

const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle;


const metadata = {
title: title,
page: document.metadata.loc.pageNumber, // Define this function to extract chapter info
source: document.metadata.source, // Define this function to extract verse info
};
metadatas.push(metadata);
const source =
typeof document.metadata === 'object' &&
document.metadata !== null &&
'source' in document.metadata &&
typeof document.metadata.source === 'string'
? document.metadata.source
: fallbackSource;
const page =
typeof document.metadata === 'object' &&
document.metadata !== null &&
'loc' in document.metadata &&
typeof document.metadata.loc === 'object' &&
document.metadata.loc !== null &&
'pageNumber' in document.metadata.loc
? document.metadata.loc.pageNumber
: 'unknown';
const pageKey = `${source}:${page}`;
const pageChunkIndex = pageChunkCounts.get(pageKey) ?? 0;
pageChunkCounts.set(pageKey, pageChunkIndex + 1);

metadatas.push(
buildResearchMetadata(document, fallbackSource, index, pageChunkIndex),
);

// Add the page content to the documents array
documentContents.push(document.pageContent);
Expand Down
Loading