Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
328 changes: 328 additions & 0 deletions ui/__tests__/scientific-rag.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,328 @@
import {
buildCitationKey,
buildChunkMetadata,
buildEvidencePayload,
buildResearchQueries,
detectScientificSection,
fuseQueryResults,
parseBoundedInteger,
sectionWeight,
} from '@/utils/server/scientific-rag';

import { describe, expect, it } from 'vitest';

// ---------------------------------------------------------------------------
// parseBoundedInteger
// ---------------------------------------------------------------------------
describe('parseBoundedInteger', () => {
  // Every case calls parseBoundedInteger(value, 8, 16); only `value` varies.
  const fallback = 8;
  const max = 16;

  it('returns the fallback for invalid input', () => {
    // Non-numeric text, undefined, null, zero and a negative number are each
    // expected to produce the fallback.
    for (const rejected of ['bad', undefined, null, 0, -5]) {
      expect(parseBoundedInteger(rejected, fallback, max)).toBe(fallback);
    }
  });

  it('clamps to max', () => {
    // Values above the bound, given as a number or as a numeric string, are
    // expected to come back as the bound itself.
    for (const oversized of [20, '100']) {
      expect(parseBoundedInteger(oversized, fallback, max)).toBe(max);
    }
  });

  it('accepts valid values within range', () => {
    // In-range input is expected back as a number, including when it was
    // supplied as a string.
    const inRange = [
      { raw: 10, expected: 10 },
      { raw: '12', expected: 12 },
    ];
    for (const { raw, expected } of inRange) {
      expect(parseBoundedInteger(raw, fallback, max)).toBe(expected);
    }
  });
});

// ---------------------------------------------------------------------------
// detectScientificSection
// ---------------------------------------------------------------------------
describe('detectScientificSection', () => {
  // Each chunk starts with a heading line followed by body text; the expected
  // label is the lower-cased heading.

  it('detects abstract at chunk start', () => {
    const chunk = 'Abstract\nThis paper proposes…';
    expect(detectScientificSection(chunk)).toBe('abstract');
  });

  it('detects methods section', () => {
    const chunk = 'Methods\nWe trained a transformer…';
    expect(detectScientificSection(chunk)).toBe('methods');
  });

  it('detects results section', () => {
    const chunk = 'Results\nThe model achieved 94% accuracy…';
    expect(detectScientificSection(chunk)).toBe('results');
  });

  it('detects "materials and methods"', () => {
    // The multi-word heading is expected back in full, not shortened to
    // 'methods'.
    const chunk = 'Materials and Methods\nSamples were…';
    expect(detectScientificSection(chunk)).toBe('materials and methods');
  });

  it('falls back to body for plain text', () => {
    // No heading line at all: the expected label is 'body'.
    const chunk = 'Some generic paragraph without headings.';
    expect(detectScientificSection(chunk)).toBe('body');
  });
});

// ---------------------------------------------------------------------------
// sectionWeight
// ---------------------------------------------------------------------------
describe('sectionWeight', () => {
  // These tests pin only the relative ordering of weights, never absolute
  // values.

  it('abstract has the highest weight', () => {
    // NOTE(review): only the comparison against 'body' is asserted here; the
    // title's "highest" is not checked against the other sections.
    const abstractWeight = sectionWeight('abstract');
    const bodyWeight = sectionWeight('body');
    expect(abstractWeight).toBeGreaterThan(bodyWeight);
  });

  it('results and methods are weighted above introduction', () => {
    const introductionWeight = sectionWeight('introduction');
    for (const section of ['results', 'methods']) {
      expect(sectionWeight(section)).toBeGreaterThan(introductionWeight);
    }
  });

  it('references have the lowest weight', () => {
    // NOTE(review): as above, only the comparison against 'body' is asserted.
    const referencesWeight = sectionWeight('references');
    const bodyWeight = sectionWeight('body');
    expect(referencesWeight).toBeLessThan(bodyWeight);
  });

  it('returns body weight for unknown sections', () => {
    // An unrecognised section name is expected to weigh the same as 'body'.
    const unknownWeight = sectionWeight('unknown-section');
    expect(unknownWeight).toBe(sectionWeight('body'));
  });
});

// ---------------------------------------------------------------------------
// buildCitationKey
// ---------------------------------------------------------------------------
describe('buildCitationKey', () => {
  // Expected key shape in both cases: '<slugged-title>:p<page>:c<chunkIndex + 1>'.

  it('produces stable human-readable keys', () => {
    const key = buildCitationKey({ chunkIndex: 0, page: 3, title: 'My Paper' });
    expect(key).toBe('my-paper:p3:c1');
  });

  it('handles special characters in titles', () => {
    // The trailing '!' must not appear in the resulting key.
    const input = {
      chunkIndex: 2,
      page: 7,
      title: 'Attention Is All You Need!',
    };
    expect(buildCitationKey(input)).toBe('attention-is-all-you-need:p7:c3');
  });
});

// ---------------------------------------------------------------------------
// buildChunkMetadata
// ---------------------------------------------------------------------------
describe('buildChunkMetadata', () => {
  it('extracts title from PDF metadata and strips temp path', () => {
    // A chunk carrying loader-style metadata: page location, PDF info title,
    // and a source path that points into a temp upload directory.
    const pdfChunk = {
      pageContent: 'Results\nThe method achieves SOTA.',
      metadata: {
        loc: { pageNumber: 5 },
        pdf: { info: { Title: 'Scientific RAG Study' } },
        source: '/tmp/uploads/secret/paper.pdf',
      },
    };

    // NOTE(review): the meaning of the two trailing numeric arguments is not
    // visible from this file — confirm against buildChunkMetadata's signature.
    const meta = buildChunkMetadata(pdfChunk, 'paper.pdf', 2, 1);

    expect(meta.title).toBe('Scientific RAG Study');
    expect(meta.page).toBe(5);
    expect(meta.section).toBe('results');
    expect(meta.citationKey).toBe('scientific-rag-study:p5:c2');

    // The public source must be the bare file name, never the temp path.
    expect(meta.source).not.toContain('/tmp/uploads');
    expect(meta.source).toBe('paper.pdf');

    // A results chunk is expected to carry a weight above 1.
    expect(meta.sectionWeight).toBeGreaterThan(1);
  });

  it('falls back gracefully when metadata is missing', () => {
    // No `metadata` property at all on the chunk.
    const bareChunk = { pageContent: 'Some body text.' };

    const meta = buildChunkMetadata(bareChunk, 'fallback.pdf', 0);

    // Title falls back to the file name; section and page to their defaults.
    expect(meta.title).toBe('fallback.pdf');
    expect(meta.section).toBe('body');
    expect(meta.page).toBe('unknown');

    // The pattern accepts the slug with or without the '-pdf' suffix.
    expect(meta.citationKey).toMatch(/^fallback(-pdf)?:punknown:c1$/);
  });
});

// ---------------------------------------------------------------------------
// buildResearchQueries
// ---------------------------------------------------------------------------
describe('buildResearchQueries', () => {
  it('returns normalised original as first variant', () => {
    // The expected first variant is the input without its trailing '?'.
    const queries = buildResearchQueries('What is retrieval-augmented generation?');
    expect(queries[0]).toBe('What is retrieval-augmented generation');
  });

  it('extracts quoted phrases as separate variants', () => {
    // The double-quoted phrase must appear on its own, without the quotes.
    const queries = buildResearchQueries(
      'What does "retrieval augmented generation" improve in science?',
    );
    expect(queries).toContain('retrieval augmented generation');
  });

  it('produces keyword-only fallback variant', () => {
    const queries = buildResearchQueries(
      'How does cross-encoder reranking improve scientific retrieval accuracy?',
    );
    // queries[0] is the normalised original, which already contains every
    // keyword of the question. Searching it too would make this check pass
    // even when no keyword variant was generated, so only the derived
    // variants (index 1 onwards) are inspected.
    const derivedVariants = queries.slice(1);
    const hasKeywords = derivedVariants.some(
      (q) => q.includes('cross-encoder') || q.includes('reranking'),
    );
    expect(hasKeywords).toBe(true);
  });

  it('never returns more than maxQueries variants', () => {
    // The second argument caps the number of variants returned.
    const queries = buildResearchQueries('some very long query', 2);
    expect(queries.length).toBeLessThanOrEqual(2);
  });

  it('deduplicates identical variants', () => {
    // A Set drops repeats, so equal sizes mean no variant occurs twice.
    const queries = buildResearchQueries('short query');
    const unique = new Set(queries);
    expect(unique.size).toBe(queries.length);
  });
});

// ---------------------------------------------------------------------------
// fuseQueryResults
// ---------------------------------------------------------------------------
describe('fuseQueryResults', () => {
  // Document-level fields shared by both chunks of the sample paper.
  const ragStudyDoc = {
    source: 'rag-study.pdf',
    title: 'RAG Study',
    sourceId: 'DOC-ABC',
  };

  // Factories return a fresh object on every call so the two result lists
  // never share a metadata reference.
  const abstractMeta = () => ({
    citationKey: 'rag-study:p1:c1',
    page: 1,
    section: 'abstract',
    sectionWeight: 1.4,
    ...ragStudyDoc,
    chunkIndex: 0,
  });
  const methodsMeta = () => ({
    citationKey: 'rag-study:p3:c1',
    page: 3,
    section: 'methods',
    sectionWeight: 1.2,
    ...ragStudyDoc,
    chunkIndex: 1,
  });

  const abstractText = 'Abstract chunk about RAG systems.';
  const methodsText = 'Methods chunk explaining setup.';

  // Two result lists: the first returns both chunks, the second returns the
  // abstract chunk again (same id, slightly different distance).
  const sampleResults = {
    documents: [[abstractText, methodsText], [abstractText]],
    metadatas: [[abstractMeta(), methodsMeta()], [abstractMeta()]],
    distances: [[0.05, 0.25], [0.06]],
    ids: [['id-a', 'id-b'], ['id-a']],
  };

  it('deduplicates chunks that appear in multiple query results', () => {
    // Three hits across both lists, but only two distinct chunks.
    const fusedCount = fuseQueryResults(sampleResults, 10).length;
    expect(fusedCount).toBe(2);
  });

  it('accumulates rank scores for repeated chunks', () => {
    const fused = fuseQueryResults(sampleResults, 10);
    const byKey = (key: string) => fused.find((entry) => entry.citationKey === key);

    const abstractChunk = byKey('rag-study:p1:c1');
    const methodsChunk = byKey('rag-study:p3:c1');
    expect(abstractChunk).toBeDefined();
    expect(methodsChunk).toBeDefined();

    // The abstract chunk was returned by both lists, so its score must exceed
    // that of the methods chunk, which was returned by only one.
    expect(abstractChunk!.rankScore).toBeGreaterThan(methodsChunk!.rankScore);
  });

  it('ranks abstract above body sections when scores are close', () => {
    // Identical distances: only the section weight differs between the two.
    const abstractEntry = {
      citationKey: 'doc:p1:c1',
      page: 1,
      section: 'abstract',
      sectionWeight: 1.4,
      source: 'doc.pdf',
      title: 'Doc',
      sourceId: 'D1',
      chunkIndex: 0,
    };
    const bodyEntry = {
      citationKey: 'doc:p5:c1',
      page: 5,
      section: 'body',
      sectionWeight: 0.8,
      source: 'doc.pdf',
      title: 'Doc',
      sourceId: 'D1',
      chunkIndex: 1,
    };
    const results = {
      documents: [['Abstract: core claim.', 'Body paragraph.']],
      metadatas: [[abstractEntry, bodyEntry]],
      distances: [[0.1, 0.1]],
      ids: [['a', 'b']],
    };

    const [top] = fuseQueryResults(results, 10);
    expect(top.citationKey).toBe('doc:p1:c1');
  });

  it('respects the limit parameter', () => {
    // Two distinct chunks are available, but the limit of 1 keeps only one.
    const fusedCount = fuseQueryResults(sampleResults, 1).length;
    expect(fusedCount).toBe(1);
  });
});

// ---------------------------------------------------------------------------
// buildEvidencePayload
// ---------------------------------------------------------------------------
describe('buildEvidencePayload', () => {
  // One sentence repeated 15 times, so the chunk is long enough for the
  // truncation and capping limits below to apply.
  const longChunk = 'Chunk about scientific retrieval.'.repeat(15);

  const paperMeta = {
    citationKey: 'paper:p1:c1',
    page: 1,
    section: 'abstract',
    sectionWeight: 1.4,
    source: 'paper.pdf',
    title: 'Paper',
    sourceId: 'D1',
    chunkIndex: 0,
  };

  // A single result list holding a single chunk.
  const results = {
    documents: [[longChunk]],
    metadatas: [[paperMeta]],
    distances: [[0.1]],
    ids: [['id-1']],
  };

  it('includes citation key in evidence context', () => {
    const { evidenceContext } = buildEvidencePayload(results);
    expect(evidenceContext).toContain('[paper:p1:c1]');
  });

  it('truncates chunks to maxChunkChars', () => {
    const { citations } = buildEvidencePayload(results, { maxChunkChars: 40 });
    const [firstCitation] = citations;
    // Upper bound is 40 characters of content plus a 3-character '...' suffix.
    expect(firstCitation.content.length).toBeLessThanOrEqual(43); // 40 + '...'
  });

  it('caps total evidence context to maxEvidenceChars', () => {
    const { evidenceContext } = buildEvidencePayload(results, { maxEvidenceChars: 50 });
    expect(evidenceContext.length).toBeLessThanOrEqual(50);
  });

  it('builds source manifest grouped by source document', () => {
    const { sourceManifest } = buildEvidencePayload(results);
    // One source document in the input, so exactly one manifest entry.
    expect(sourceManifest).toHaveLength(1);
    expect(sourceManifest[0]).toMatchObject({
      source: 'paper.pdf',
      title: 'Paper',
      citationKeys: ['paper:p1:c1'],
    });
  });

  it('includes sectionWeight in evidence context block header', () => {
    const { evidenceContext } = buildEvidencePayload(results);
    // The fixture's weight of 1.4 is expected to be rendered with two decimals.
    expect(evidenceContext).toContain('Weight: 1.40');
  });
});
Loading