aietal · watcharaponthod-code · May 30, 2026
diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts
@@ -0,0 +1,328 @@
+import {
+  buildCitationKey,
+  buildChunkMetadata,
+  buildEvidencePayload,
+  buildResearchQueries,
+  detectScientificSection,
+  fuseQueryResults,
+  parseBoundedInteger,
+  sectionWeight,
+} from '@/utils/server/scientific-rag';
+
+import { describe, expect, it } from 'vitest';
+
+// ---------------------------------------------------------------------------
+// parseBoundedInteger — fallback on unusable input, clamp at max, in-range values
+// ---------------------------------------------------------------------------
+describe('parseBoundedInteger', () => {
+  const FALLBACK = 8;
+  const MAX = 16;
+  type Raw = Parameters<typeof parseBoundedInteger>[0];
+  const parse = (raw: Raw) => parseBoundedInteger(raw, FALLBACK, MAX);
+
+  it('returns the fallback for invalid input', () => {
+    for (const raw of ['bad', undefined, null, 0, -5]) expect(parse(raw)).toBe(FALLBACK);
+  });
+
+  it('clamps to max', () => {
+    for (const raw of [20, '100']) expect(parse(raw)).toBe(MAX);
+  });
+
+  it('accepts valid values within range', () => {
+    expect(parse(10)).toBe(10);
+    expect(parse('12')).toBe(12);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// detectScientificSection — leading heading gives the label; plain text is 'body'
+// ---------------------------------------------------------------------------
+describe('detectScientificSection', () => {
+  // Table of [test title, chunk text, expected section label]. The first four
+  // chunks open with a heading line; the last one has no heading at all.
+  const cases = [
+    ['detects abstract at chunk start', 'Abstract\nThis paper proposes…', 'abstract'],
+    ['detects methods section', 'Methods\nWe trained a transformer…', 'methods'],
+    ['detects results section', 'Results\nThe model achieved 94% accuracy…', 'results'],
+    [
+      'detects "materials and methods"',
+      'Materials and Methods\nSamples were…',
+      'materials and methods',
+    ],
+    ['falls back to body for plain text', 'Some generic paragraph without headings.', 'body'],
+  ] as const;
+
+  // Register one test per row, in table order, so titles and sequence match a
+  // hand-written list of `it` blocks.
+  for (const [title, chunk, section] of cases) {
+    it(title, () => {
+      expect(detectScientificSection(chunk)).toBe(section);
+    });
+  }
+});
+
+// ---------------------------------------------------------------------------
+// sectionWeight — relative ordering only; no absolute weight is pinned here
+// ---------------------------------------------------------------------------
+describe('sectionWeight', () => {
+  it('abstract is weighted above body', () => {
+    expect(sectionWeight('abstract')).toBeGreaterThan(sectionWeight('body'));
+  });
+
+  it('results and methods are weighted above introduction', () => {
+    expect(sectionWeight('results')).toBeGreaterThan(sectionWeight('introduction'));
+    expect(sectionWeight('methods')).toBeGreaterThan(sectionWeight('introduction'));
+  });
+
+  it('references are weighted below body', () => {
+    expect(sectionWeight('references')).toBeLessThan(sectionWeight('body'));
+  });
+
+  it('returns body weight for unknown sections', () => {
+    expect(sectionWeight('unknown-section')).toBe(sectionWeight('body'));
+  });
+});
+
+// ---------------------------------------------------------------------------
+// buildCitationKey — slug, page and chunk number combined into one key
+// ---------------------------------------------------------------------------
+describe('buildCitationKey', () => {
+  it('produces stable human-readable keys', () => {
+    // The expected key reads <title slug>:p<page>:c<n>, with chunkIndex 0
+    // showing up as c1.
+    const key = buildCitationKey({ chunkIndex: 0, page: 3, title: 'My Paper' });
+    expect(key).toBe('my-paper:p3:c1');
+  });
+
+  it('handles special characters in titles', () => {
+    const input = { chunkIndex: 2, page: 7, title: 'Attention Is All You Need!' };
+
+    // The expected slug is lower-cased and hyphenated, and the trailing "!"
+    // is gone.
+    expect(buildCitationKey(input)).toBe('attention-is-all-you-need:p7:c3');
+  });
+});
+
+// ---------------------------------------------------------------------------
+// buildChunkMetadata — populated PDF metadata vs. missing metadata
+// ---------------------------------------------------------------------------
+describe('buildChunkMetadata', () => {
+  it('extracts title from PDF metadata and strips temp path', () => {
+    // Document whose `source` points into a temp upload directory; its PDF
+    // info block carries the title the test expects back.
+    const doc = {
+      pageContent: 'Results\nThe method achieves SOTA.',
+      metadata: {
+        loc: { pageNumber: 5 },
+        pdf: { info: { Title: 'Scientific RAG Study' } },
+        source: '/tmp/uploads/secret/paper.pdf',
+      },
+    };
+    const { title, page, section, citationKey, source, sectionWeight: weight } =
+      buildChunkMetadata(doc, 'paper.pdf', 2, 1);
+
+    expect(title).toBe('Scientific RAG Study');
+    expect(page).toBe(5);
+    expect(section).toBe('results');
+    expect(citationKey).toBe('scientific-rag-study:p5:c2');
+    // The public source is the bare file name, never the temp directory path.
+    expect(source).not.toContain('/tmp/uploads');
+    expect(source).toBe('paper.pdf');
+    // A results chunk is expected to carry a weight above 1.
+    expect(weight).toBeGreaterThan(1);
+  });
+
+  it('falls back gracefully when metadata is missing', () => {
+    // No `metadata` key is supplied here.
+    const { title, section, page, citationKey } = buildChunkMetadata(
+      { pageContent: 'Some body text.' },
+      'fallback.pdf',
+      0,
+    );
+
+    expect(title).toBe('fallback.pdf');
+    expect(section).toBe('body');
+    expect(page).toBe('unknown');
+    expect(citationKey).toMatch(/^fallback(-pdf)?:punknown:c1$/);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// buildResearchQueries — variant order, quoted phrases, keyword fallback, cap, dedupe
+// ---------------------------------------------------------------------------
+describe('buildResearchQueries', () => {
+  it('returns normalised original as first variant', () => {
+    const queries = buildResearchQueries('What is retrieval-augmented generation?');
+    expect(queries[0]).toBe('What is retrieval-augmented generation');
+  });
+
+  it('extracts quoted phrases as separate variants', () => {
+    const queries = buildResearchQueries(
+      'What does "retrieval augmented generation" improve in science?',
+    );
+    expect(queries).toContain('retrieval augmented generation');
+  });
+
+  it('produces keyword-only fallback variant', () => {
+    const queries = buildResearchQueries(
+      'How does cross-encoder reranking improve scientific retrieval accuracy?',
+    );
+    // A derived variant must carry the keywords; queries[0] (normalised original) always does
+    const hasKeywords = queries
+      .slice(1)
+      .some((q) => q.includes('cross-encoder') || q.includes('reranking'));
+    expect(hasKeywords).toBe(true);
+  });
+
+  it('never returns more than maxQueries variants', () => {
+    const queries = buildResearchQueries('some very long query', 2);
+    expect(queries.length).toBeLessThanOrEqual(2);
+  });
+
+  it('deduplicates identical variants', () => {
+    const queries = buildResearchQueries('short query');
+    const unique = new Set(queries);
+    expect(unique.size).toBe(queries.length);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// fuseQueryResults — dedupe, score accumulation, section weighting, limit
+// ---------------------------------------------------------------------------
+describe('fuseQueryResults', () => {
+  // Fresh metadata record per call, so repeated chunks never share an object.
+  const ragStudyMeta = (
+    citationKey: string,
+    page: number,
+    section: string,
+    weight: number,
+    chunkIndex: number,
+  ) => ({
+    citationKey,
+    page,
+    section,
+    sectionWeight: weight,
+    source: 'rag-study.pdf',
+    title: 'RAG Study',
+    sourceId: 'DOC-ABC',
+    chunkIndex,
+  });
+
+  // Two result sets; the abstract chunk (id-a) appears in both of them.
+  const abstractText = 'Abstract chunk about RAG systems.';
+  const sampleResults = {
+    documents: [[abstractText, 'Methods chunk explaining setup.'], [abstractText]],
+    metadatas: [
+      [
+        ragStudyMeta('rag-study:p1:c1', 1, 'abstract', 1.4, 0),
+        ragStudyMeta('rag-study:p3:c1', 3, 'methods', 1.2, 1),
+      ],
+      [ragStudyMeta('rag-study:p1:c1', 1, 'abstract', 1.4, 0)],
+    ],
+    distances: [[0.05, 0.25], [0.06]],
+    ids: [['id-a', 'id-b'], ['id-a']],
+  };
+
+  it('deduplicates chunks that appear in multiple query results', () => {
+    expect(fuseQueryResults(sampleResults, 10)).toHaveLength(2);
+  });
+
+  it('accumulates rank scores for repeated chunks', () => {
+    const fused = fuseQueryResults(sampleResults, 10);
+    const [abstractChunk, methodsChunk] = ['rag-study:p1:c1', 'rag-study:p3:c1'].map((key) =>
+      fused.find((r) => r.citationKey === key),
+    );
+    expect(abstractChunk).toBeDefined();
+    expect(methodsChunk).toBeDefined();
+    // Returned by both result sets, so the abstract chunk must outscore the methods chunk.
+    expect(abstractChunk!.rankScore).toBeGreaterThan(methodsChunk!.rankScore);
+  });
+
+  it('ranks abstract above body sections when scores are close', () => {
+    // Same distance (0.1) for both chunks, but different section weights.
+    const abstractMeta = {
+      citationKey: 'doc:p1:c1',
+      page: 1,
+      section: 'abstract',
+      sectionWeight: 1.4,
+      source: 'doc.pdf',
+      title: 'Doc',
+      sourceId: 'D1',
+      chunkIndex: 0,
+    };
+    const bodyMeta = {
+      ...abstractMeta,
+      citationKey: 'doc:p5:c1',
+      page: 5,
+      section: 'body',
+      sectionWeight: 0.8,
+      chunkIndex: 1,
+    };
+    const results = {
+      documents: [['Abstract: core claim.', 'Body paragraph.']],
+      metadatas: [[abstractMeta, bodyMeta]],
+      distances: [[0.1, 0.1]],
+      ids: [['a', 'b']],
+    };
+    expect(fuseQueryResults(results, 10)[0].citationKey).toBe('doc:p1:c1');
+  });
+
+  it('respects the limit parameter', () => {
+    expect(fuseQueryResults(sampleResults, 1)).toHaveLength(1);
+  });
+});
+
+// ---------------------------------------------------------------------------
+// buildEvidencePayload — citation keys, truncation, size cap, source manifest
+// ---------------------------------------------------------------------------
+describe('buildEvidencePayload', () => {
+  // One long chunk (a sentence repeated 15 times) from a single source document.
+  const chunkMeta = {
+    citationKey: 'paper:p1:c1',
+    page: 1,
+    section: 'abstract',
+    sectionWeight: 1.4,
+    source: 'paper.pdf',
+    title: 'Paper',
+    sourceId: 'D1',
+    chunkIndex: 0,
+  };
+  const results = {
+    documents: [['Chunk about scientific retrieval.'.repeat(15)]],
+    metadatas: [[chunkMeta]],
+    distances: [[0.1]],
+    ids: [['id-1']],
+  };
+
+  it('includes citation key in evidence context', () => {
+    // The key is expected wrapped in square brackets.
+    const { evidenceContext } = buildEvidencePayload(results);
+    expect(evidenceContext).toContain('[paper:p1:c1]');
+  });
+
+  it('truncates chunks to maxChunkChars', () => {
+    const maxChunkChars = 40;
+    const { citations } = buildEvidencePayload(results, { maxChunkChars });
+    // Three extra characters are allowed for the '...' suffix.
+    expect(citations[0].content.length).toBeLessThanOrEqual(maxChunkChars + 3);
+  });
+
+  it('caps total evidence context to maxEvidenceChars', () => {
+    const maxEvidenceChars = 50;
+    const { evidenceContext } = buildEvidencePayload(results, { maxEvidenceChars });
+    expect(evidenceContext.length).toBeLessThanOrEqual(maxEvidenceChars);
+  });
+
+  it('builds source manifest grouped by source document', () => {
+    const { sourceManifest } = buildEvidencePayload(results);
+    const expectedEntry = { source: 'paper.pdf', title: 'Paper', citationKeys: ['paper:p1:c1'] };
+    expect(sourceManifest).toHaveLength(1);
+    expect(sourceManifest[0]).toMatchObject(expectedEntry);
+  });
+
+  it('includes sectionWeight in evidence context block header', () => {
+    // The fixture weight 1.4 is expected to be rendered as "1.40".
+    const { evidenceContext } = buildEvidencePayload(results);
+    expect(evidenceContext).toContain('Weight: 1.40');
+  });
+});