diff --git a/package.json b/package.json index ed2e817..426b148 100644 --- a/package.json +++ b/package.json @@ -36,14 +36,6 @@ "express": "^4.18.2", "js-yaml": "^4.1.1" }, - "peerDependencies": { - "playwright": ">=1.40.0" - }, - "peerDependenciesMeta": { - "playwright": { - "optional": true - } - }, "devDependencies": { "@types/blessed": "^0.1.7", "@types/better-sqlite3": "^7.6.13", diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts deleted file mode 100644 index 591f40c..0000000 --- a/src/cli/commands/add.ts +++ /dev/null @@ -1,179 +0,0 @@ -/** - * `ob add ` CLI command. - * - * Ingests content from a URL using the fast-path primary extractor, falling - * back to a Playwright headless-browser fetch when: - * - The Playwright fallback is explicitly enabled (via config or environment - * variable `OB_PLAYWRIGHT_FALLBACK=1`), AND - * - The primary extractor returns fewer characters than the configured - * minimum content length threshold. - * - * Usage: - * ob add [--playwright-fallback] [--min-content-length ] [--timeout ] - * - * The command writes the ingested content to stdout (or a file if `--output` - * is provided) in a format compatible with the downstream ingestion pipeline. - * It exits with a non-zero status code only on unrecoverable errors (e.g. an - * invalid URL); partial or empty content from a gracefully-degraded fetch is - * not treated as a fatal error. - */ - -import { IngestionService, type IngestionConfig } from '../../lib/ingestion/service.js'; -import { FetchExtractor } from '../../lib/ingestion/extractor-fetch.js'; -import { PlaywrightExtractor, type PlaywrightBrowserChannel } from '../../lib/ingestion/extractor-playwright.js'; -import type { ExtractResult } from '../../lib/ingestion/extractor.js'; -import * as fs from 'fs'; - -/** Options accepted by the `add` command. */ -export interface AddCommandOptions { - /** Enable the Playwright fallback (overrides config). */ - playwrightFallback?: boolean; - /** Minimum content length before fallback is triggered. */ - minContentLength?: number; - /** Playwright navigation timeout in milliseconds. */ - timeout?: number; - /** Browser channel to use for Playwright. */ - browser?: PlaywrightBrowserChannel; - /** Write output to this file path instead of stdout. */ - output?: string; - /** - * Injectable file-write function for testing. - * Defaults to `fs.writeFileSync` when not provided. - * @internal - */ - _writeFile?: (path: string, data: string, encoding: BufferEncoding) => void; -} - -/** - * Dependency-injectable runner for the `add` command. - * - * Separated from Commander.js setup so it can be unit-tested without starting - * a real CLI process. - * - * @param url The URL to ingest. - * @param options Command options. - * @returns The ingested {@link ExtractResult}. - */ -export async function runAdd( - url: string, - options: AddCommandOptions = {} -): Promise { - if (!url) { - throw new Error('URL is required'); - } - - const ingestionConfig: IngestionConfig = { - playwrightFallback: options.playwrightFallback ?? false, - minContentLength: options.minContentLength, - }; - - const playwrightExtractor = new PlaywrightExtractor({ - browser: options.browser ?? 'chromium', - timeoutMs: options.timeout ?? 30_000, - }); - - const service = new IngestionService( - new FetchExtractor(), - playwrightExtractor, - ingestionConfig - ); - - const result = await service.ingest(url); - - if (options.output) { - const writeFile = options._writeFile ?? fs.writeFileSync; - writeFile(options.output, result.text, 'utf8'); - } else { - process.stdout.write(result.text); - if (result.text.length > 0 && !result.text.endsWith('\n')) { - process.stdout.write('\n'); - } - } - - return result; -} - -// --------------------------------------------------------------------------- -// Commander.js registration (optional — used when loaded as a plugin command) -// --------------------------------------------------------------------------- - -/** - * Register the `add` command with a Commander.js program. - * - * This function follows the same pattern as other commands in `src/commands/`. - * - * @param program The root Commander.js Command instance. - */ -export function registerAddCommand(program: { - command: (name: string) => CommandBuilder; -}): void { - program - .command('add ') - .description('Ingest content from a URL (with optional Playwright fallback for JS-heavy pages)') - .option( - '--playwright-fallback', - 'Enable Playwright headless-browser fallback (requires `playwright` package)', - false - ) - .option( - '--min-content-length ', - 'Minimum character count before Playwright fallback is triggered', - '200' - ) - .option( - '--timeout ', - 'Playwright navigation timeout in milliseconds', - '30000' - ) - .option( - '--browser ', - 'Playwright browser channel: chromium | firefox | webkit', - 'chromium' - ) - .option('--output ', 'Write ingested content to file instead of stdout') - .action(async (url: string, opts: Record) => { - const parsePositiveInt = ( - raw: string | boolean | undefined, - name: string - ): number | undefined => { - if (raw === undefined || raw === false || raw === '') return undefined; - const n = parseInt(String(raw), 10); - if (isNaN(n) || n <= 0) { - process.stderr.write( - `[ob add] Error: --${name} must be a positive integer (received: ${String(raw)})\n` - ); - process.exit(1); - } - return n; - }; - - const options: AddCommandOptions = { - playwrightFallback: Boolean(opts['playwrightFallback']), - minContentLength: parsePositiveInt(opts['minContentLength'], 'min-content-length'), - timeout: parsePositiveInt(opts['timeout'], 'timeout'), - browser: opts['browser'] as PlaywrightBrowserChannel | undefined, - output: opts['output'] ? String(opts['output']) : undefined, - }; - - try { - await runAdd(url, options); - } catch (err) { - process.stderr.write( - `[ob add] Error: ${err instanceof Error ? err.message : String(err)}\n` - ); - process.exit(1); - } - }); -} - -// --------------------------------------------------------------------------- -// Minimal type shim for Commander.js .option / .action chaining -// (avoids requiring a Commander.js type import in this standalone module) -// --------------------------------------------------------------------------- - -interface CommandBuilder { - description: (desc: string) => CommandBuilder; - option: (flags: string, description: string, defaultValue?: string | boolean) => CommandBuilder; - // eslint-disable-next-line @typescript-eslint/no-explicit-any - action: (fn: (...args: any[]) => void | Promise) => CommandBuilder; -} diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts deleted file mode 100644 index c9fa0f4..0000000 --- a/src/lib/ingestion/extractor-fetch.ts +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Primary fast-path extractor. - * - * Fetches a URL using the built-in `fetch` API and extracts plain text by - * stripping HTML tags. This is the default extractor used by the ingestion - * service; it is fast but cannot handle pages that render content with - * client-side JavaScript. - */ - -import type { Extractor, ExtractResult } from './extractor.js'; - -/** Configuration for the primary fetch-based extractor. */ -export interface FetchExtractorConfig { - /** - * Request timeout in milliseconds. - * @default 15000 - */ - timeoutMs?: number; - /** - * User-Agent string to send with requests. - */ - userAgent?: string; -} - -const DEFAULT_TIMEOUT_MS = 15_000; -const DEFAULT_USER_AGENT = - 'Mozilla/5.0 (compatible; OpenBrain-Ingestion/1.0; +https://github.com/TheWizardsCode/ContextHub)'; - -/** - * Strips HTML tags and collapses whitespace from `html`. - * Returns the resulting plain text. - */ -export function htmlToText(html: string): string { - // Remove script and style blocks entirely (allow attributes/whitespace before closing >). - let text = html - .replace(/]*>/gi, ' ') - .replace(/]*>/gi, ' '); - // Replace block-level elements with newlines. - text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n'); - // Strip all remaining tags. - text = text.replace(/<[^>]+>/g, ' '); - // Decode common named HTML entities in a single pass to avoid double-decoding - // (e.g. &lt; must become <, not <). - // Note: numeric character references (e.g. A or A) are intentionally - // not decoded here since they are uncommon in plain-content pages and the - // purpose of this extractor is fast-path text extraction, not full HTML parsing. - text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => { - switch (match) { - case '&': return '&'; - case '<': return '<'; - case '>': return '>'; - case '"': return '"'; - case ''': return "'"; - case ' ': return ' '; - default: return match; - } - }); - // Collapse whitespace. - return text.replace(/[ \t]+/g, ' ').replace(/\n{3,}/g, '\n\n').trim(); -} - -/** - * Extracts the content of the first `` tag in `html`. - */ -export function extractTitle(html: string): string | undefined { - const match = /<title[^>]*>([^<]*)<\/title>/i.exec(html); - return match ? match[1].trim() : undefined; -} - -/** - * Primary extractor that uses HTTP fetch + HTML-to-text conversion. - * - * This extractor is fast and has no external runtime dependencies. It is - * the default first-pass extractor used by the ingestion service before - * attempting the Playwright fallback. - */ -export class FetchExtractor implements Extractor { - private readonly config: Required<FetchExtractorConfig>; - - constructor(config: FetchExtractorConfig = {}) { - this.config = { - timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, - userAgent: config.userAgent ?? DEFAULT_USER_AGENT, - }; - } - - async extract(url: string): Promise<ExtractResult> { - let html: string; - - try { - const controller = new AbortController(); - const timer = setTimeout( - () => controller.abort(), - this.config.timeoutMs - ); - try { - const response = await fetch(url, { - signal: controller.signal, - headers: { 'User-Agent': this.config.userAgent }, - }); - html = await response.text(); - } finally { - clearTimeout(timer); - } - } catch { - return { text: '', url }; - } - - return { - text: htmlToText(html), - html, - title: extractTitle(html), - url, - }; - } -} diff --git a/src/lib/ingestion/extractor-playwright.ts b/src/lib/ingestion/extractor-playwright.ts deleted file mode 100644 index e8d6a87..0000000 --- a/src/lib/ingestion/extractor-playwright.ts +++ /dev/null @@ -1,220 +0,0 @@ -/** - * Playwright-based fallback extractor. - * - * Uses a headless Chromium (or configured) browser to render JavaScript-heavy - * pages that the primary fast-path extractor cannot handle. - * - * Design principles: - * - Playwright is imported dynamically at runtime so the module does not fail - * to load when the `playwright` package is absent. - * - Each invocation uses a fresh browser context to avoid credential leakage - * (no cookies, local storage, or auth tokens are shared between runs). - * - All failures (launch error, navigation timeout, navigation error) are - * handled gracefully: the extractor returns an empty result and logs a - * warning rather than throwing. - */ - -import type { Extractor, ExtractResult } from './extractor.js'; - -/** Playwright browser channel to launch. */ -export type PlaywrightBrowserChannel = 'chromium' | 'firefox' | 'webkit'; - -/** Configuration for PlaywrightExtractor. */ -export interface PlaywrightExtractorConfig { - /** - * Which browser to use. - * @default 'chromium' - */ - browser?: PlaywrightBrowserChannel; - /** - * Navigation timeout in milliseconds. - * @default 30000 - */ - timeoutMs?: number; -} - -/** - * Classifies errors that the PlaywrightExtractor can encounter. - * Used in telemetry payloads. - */ -export type PlaywrightErrorType = - | 'launch_failed' - | 'timeout' - | 'navigation_error' - | null; - -/** Internal logger type so tests can inject a spy. */ -export type Logger = { - warn: (message: string, ...meta: unknown[]) => void; -}; - -const DEFAULT_TIMEOUT_MS = 30_000; - -/** - * Playwright-based URL extractor. - * - * Implements the same {@link Extractor} interface as the primary extractor so - * it can be used as a drop-in replacement or fallback in the ingestion service. - */ -export class PlaywrightExtractor implements Extractor { - private readonly config: Required<PlaywrightExtractorConfig>; - private readonly logger: Logger; - private readonly loadPlaywrightFn: () => Promise<BrowserModule>; - - /** - * @param config Browser configuration. - * @param logger Optional logger; defaults to `console`. - * @param loadPlaywrightFn Optional override for the playwright loader. - * Inject a stub here in tests to avoid a real import. - */ - constructor( - config: PlaywrightExtractorConfig = {}, - logger?: Logger, - loadPlaywrightFn?: () => Promise<BrowserModule> - ) { - this.config = { - browser: config.browser ?? 'chromium', - timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS, - }; - this.logger = logger ?? console; - this.loadPlaywrightFn = loadPlaywrightFn ?? loadPlaywright; - } - - /** - * Fetch and extract content from `url` using a headless browser. - * - * Returns an empty {@link ExtractResult} if Playwright is not installed, - * the browser fails to launch, or navigation times out. - */ - async extract(url: string): Promise<ExtractResult> { - let browserModule: BrowserModule | null = null; - - // Dynamic import guard: fail gracefully if playwright is not installed. - try { - browserModule = await this.loadPlaywrightFn(); - } catch { - this.logger.warn( - '[PlaywrightExtractor] playwright package is not installed. ' + - 'Install it with: npm install playwright' - ); - return { text: '', url }; - } - - const launcher = browserModule[this.config.browser]; - let browser: BrowserLike | null = null; - - try { - browser = await launcher.launch({ headless: true }); - } catch (err) { - this.logger.warn( - '[PlaywrightExtractor] Failed to launch browser:', - err instanceof Error ? err.message : err - ); - return { text: '', url }; - } - - try { - // Fresh context — no cookies, localStorage, or auth tokens. - const context = await browser.newContext({ - storageState: undefined, - }); - const page = await context.newPage(); - - try { - await page.goto(url, { - timeout: this.config.timeoutMs, - waitUntil: 'domcontentloaded', - }); - } catch (err) { - const isTimeout = - err instanceof Error && err.message.includes('Timeout'); - const errorType: PlaywrightErrorType = isTimeout - ? 'timeout' - : 'navigation_error'; - this.logger.warn( - `[PlaywrightExtractor] Navigation ${errorType} for ${url}:`, - err instanceof Error ? err.message : err - ); - return { text: '', url }; - } - - const html = await page.content(); - const title = await page.title().catch(() => undefined); - // evaluate() runs in the browser context where `document` is available. - // Cast through a minimal interface to avoid TypeScript's non-DOM lib - // complaint without resorting to an untyped `any`. - const text = await page.evaluate(() => { - const g = globalThis as unknown as { - document?: { body?: { innerText?: string } }; - }; - return g.document?.body?.innerText ?? ''; - }); - - await context.close(); - - return { text, html, title, url }; - } finally { - try { - await browser.close(); - } catch { - // Ignore close errors. - } - } - } -} - -// --------------------------------------------------------------------------- -// Internal browser-module loading abstraction (facilitates testing) -// --------------------------------------------------------------------------- - -/** Minimal shape of a Playwright browser launcher we use. */ -export interface BrowserTypeLike { - launch(options: { headless: boolean }): Promise<BrowserLike>; -} - -/** Minimal shape of a Playwright Browser we use. */ -export interface BrowserLike { - newContext(options: { storageState: undefined }): Promise<BrowserContextLike>; - close(): Promise<void>; -} - -/** Minimal shape of a Playwright BrowserContext we use. */ -export interface BrowserContextLike { - newPage(): Promise<PageLike>; - close(): Promise<void>; -} - -/** Minimal shape of a Playwright Page we use. */ -export interface PageLike { - goto(url: string, options: { timeout: number; waitUntil: string }): Promise<unknown>; - content(): Promise<string>; - title(): Promise<string>; - evaluate<T>(fn: () => T): Promise<T>; -} - -/** The subset of the playwright module that PlaywrightExtractor uses. */ -export interface BrowserModule { - chromium: BrowserTypeLike; - firefox: BrowserTypeLike; - webkit: BrowserTypeLike; -} - -/** - * Load the playwright module. - * - * Isolated into a separate function so callers can override it at construction - * time (via the `loadPlaywrightFn` constructor parameter) without needing to - * actually install playwright. - * - * The `any` cast is intentional: `playwright` is an optional peer dependency - * that may not be present at compile time, so its types are unavailable. - * The {@link BrowserModule} interface documents the exact subset we rely on. - * - * @internal - */ -export async function loadPlaywright(): Promise<BrowserModule> { - // playwright is an optional peer dependency; the cast is deliberate. - // eslint-disable-next-line @typescript-eslint/no-explicit-any - const mod = (await import('playwright' as any)) as BrowserModule; - return mod; -} diff --git a/src/lib/ingestion/extractor.ts b/src/lib/ingestion/extractor.ts deleted file mode 100644 index 6b6bef9..0000000 --- a/src/lib/ingestion/extractor.ts +++ /dev/null @@ -1,37 +0,0 @@ -/** - * Extractor interface for URL content retrieval. - * - * All extractors (primary fast-path and Playwright fallback) implement this - * interface so they can be swapped in the ingestion pipeline without changes - * to the service layer. - */ - -/** - * The result of extracting content from a URL. - */ -export interface ExtractResult { - /** The plain-text content extracted from the page. */ - text: string; - /** The raw HTML of the page (optional; used for downstream parsing). */ - html?: string; - /** The page title, if available. */ - title?: string; - /** The URL that was fetched (may differ from the requested URL after redirects). */ - url: string; -} - -/** - * Interface that all URL extractors must implement. - */ -export interface Extractor { - /** - * Fetch and extract content from the given URL. - * - * Implementations must not throw. Failures are surfaced via a result with - * an empty `text` field (and optionally an error logged internally). - * - * @param url The URL to fetch. - * @returns A promise resolving to the extracted content. - */ - extract(url: string): Promise<ExtractResult>; -} diff --git a/src/lib/ingestion/service.ts b/src/lib/ingestion/service.ts deleted file mode 100644 index 36549a1..0000000 --- a/src/lib/ingestion/service.ts +++ /dev/null @@ -1,189 +0,0 @@ -/** - * Ingestion service with Playwright fallback orchestration. - * - * Orchestrates URL content retrieval using a fast primary extractor and an - * optional Playwright-based fallback for JavaScript-heavy pages. - * - * The fallback is opt-in: it is only activated when: - * 1. `config.playwrightFallback` is `true` (or the `OB_PLAYWRIGHT_FALLBACK` - * environment variable is set to a truthy value), AND - * 2. The primary extractor returns fewer characters than - * `config.minContentLength` (default: 200). - * - * On every invocation, a structured telemetry entry is emitted via the - * provided logger. Telemetry never includes URL text, page content, or any - * user-identifiable data. - */ - -import type { Extractor, ExtractResult } from './extractor.js'; - -/** Configuration for the ingestion service. */ -export interface IngestionConfig { - /** - * Enable the Playwright fallback extractor. - * - * Can also be enabled by setting the `OB_PLAYWRIGHT_FALLBACK` environment - * variable to any truthy value (`1`, `true`, `yes`). - * - * @default false - */ - playwrightFallback?: boolean; - - /** - * Minimum character count that primary extractor content must reach before - * the Playwright fallback is considered unnecessary. - * - * @default 200 - */ - minContentLength?: number; -} - -// --------------------------------------------------------------------------- -// Telemetry -// --------------------------------------------------------------------------- - -/** Structured telemetry entry emitted on every ingestion run. */ -export interface PlaywrightFallbackTelemetry { - /** Fixed event identifier. */ - event: 'playwright_fallback'; - /** Whether the fallback was actually triggered. */ - triggered: boolean; - /** Character count returned by the primary extractor. */ - primaryContentLength: number; - /** Character count returned by PlaywrightExtractor (0 if not triggered or failed). */ - fallbackContentLength: number; - /** Wall-clock time of the Playwright fetch in milliseconds (0 if not triggered). */ - durationMs: number; - /** Whether PlaywrightExtractor returned usable content. */ - success: boolean; - /** Error classification, or null when no error occurred. */ - errorType: 'launch_failed' | 'timeout' | 'navigation_error' | null; - /** Always "playwright" for this fallback. */ - provider: 'playwright'; -} - -/** Minimal logger interface used by the service. */ -export interface ServiceLogger { - info: (message: string, meta?: PlaywrightFallbackTelemetry) => void; - warn: (message: string, ...args: unknown[]) => void; -} - -// --------------------------------------------------------------------------- -// IngestionService -// --------------------------------------------------------------------------- - -const DEFAULT_MIN_CONTENT_LENGTH = 200; - -/** - * Determines whether the Playwright fallback is enabled, considering both the - * programmatic config and the `OB_PLAYWRIGHT_FALLBACK` environment variable. - */ -function isFallbackEnabled(config: IngestionConfig): boolean { - if (config.playwrightFallback === true) return true; - const envVal = process.env['OB_PLAYWRIGHT_FALLBACK']; - return envVal === '1' || envVal === 'true' || envVal === 'yes'; -} - -/** - * Service that orchestrates URL ingestion with an optional Playwright - * fallback for JavaScript-heavy pages. - */ -export class IngestionService { - private readonly primaryExtractor: Extractor; - private readonly playwrightExtractor: Extractor; - private readonly config: IngestionConfig; - private readonly logger: ServiceLogger; - - constructor( - primaryExtractor: Extractor, - playwrightExtractor: Extractor, - config: IngestionConfig = {}, - logger?: ServiceLogger - ) { - this.primaryExtractor = primaryExtractor; - this.playwrightExtractor = playwrightExtractor; - this.config = config; - this.logger = logger ?? { - info: (msg, meta) => { - const line = meta ? `${msg} ${JSON.stringify(meta)}` : msg; - process.stdout.write(`${line}\n`); - }, - warn: (msg, ...args) => { - const parts = [msg, ...args.map(a => String(a))].join(' '); - process.stderr.write(`${parts}\n`); - }, - }; - } - - /** - * Ingest content from `url`. - * - * 1. Runs the primary extractor. - * 2. If the fallback is enabled and primary content is below the threshold, - * runs PlaywrightExtractor. - * 3. Emits a telemetry entry regardless of which path was taken. - * 4. Returns the best available result (fallback if it produced content, - * otherwise the primary result). - * - * @param url The URL to ingest. - * @returns The best {@link ExtractResult} available. - */ - async ingest(url: string): Promise<ExtractResult> { - const minLen = - this.config.minContentLength ?? DEFAULT_MIN_CONTENT_LENGTH; - - const primaryResult = await this.primaryExtractor.extract(url); - const primaryLen = primaryResult.text.length; - - if (!isFallbackEnabled(this.config) || primaryLen >= minLen) { - this.emitTelemetry({ - event: 'playwright_fallback', - triggered: false, - primaryContentLength: primaryLen, - fallbackContentLength: 0, - durationMs: 0, - success: false, - errorType: null, - provider: 'playwright', - }); - return primaryResult; - } - - // Fallback triggered. - const startMs = Date.now(); - let fallbackResult: ExtractResult = { text: '', url }; - let errorType: PlaywrightFallbackTelemetry['errorType'] = null; - - try { - fallbackResult = await this.playwrightExtractor.extract(url); - } catch (err) { - // PlaywrightExtractor should not throw, but guard anyway. - this.logger.warn( - '[IngestionService] Unexpected error from PlaywrightExtractor:', - err instanceof Error ? err.message : err - ); - errorType = 'navigation_error'; - } - - const durationMs = Date.now() - startMs; - const fallbackLen = fallbackResult.text.length; - const success = fallbackLen > 0; - - this.emitTelemetry({ - event: 'playwright_fallback', - triggered: true, - primaryContentLength: primaryLen, - fallbackContentLength: fallbackLen, - durationMs, - success, - errorType, - provider: 'playwright', - }); - - return success ? fallbackResult : primaryResult; - } - - private emitTelemetry(entry: PlaywrightFallbackTelemetry): void { - this.logger.info('[telemetry]', entry); - } -} diff --git a/tests/fixtures/playwright-fallback/js-heavy-page.html b/tests/fixtures/playwright-fallback/js-heavy-page.html deleted file mode 100644 index a61b2f6..0000000 --- a/tests/fixtures/playwright-fallback/js-heavy-page.html +++ /dev/null @@ -1,22 +0,0 @@ -<!DOCTYPE html> -<html lang="en"> -<head> - <meta charset="UTF-8"> - <title>JS-Heavy Page (fixture) - - - -
- - diff --git a/tests/fixtures/playwright-fallback/static-page.html b/tests/fixtures/playwright-fallback/static-page.html deleted file mode 100644 index 8bd1886..0000000 --- a/tests/fixtures/playwright-fallback/static-page.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - - Static Page (fixture) - - -

Static Page

-

This page has enough static content that the primary extractor can handle - it without needing the Playwright fallback. The text here is well over two - hundred characters so that a minContentLength check of 200 will pass and the - Playwright extractor will not be invoked at all for this fixture. The quick - brown fox jumped over the lazy dog.

- - diff --git a/tests/lib/ingestion/add-command.test.ts b/tests/lib/ingestion/add-command.test.ts deleted file mode 100644 index 9ff780f..0000000 --- a/tests/lib/ingestion/add-command.test.ts +++ /dev/null @@ -1,106 +0,0 @@ -/** - * Unit tests for src/cli/commands/add.ts - * - * Tests exercise the `runAdd` function using stubbed IngestionService - * instances. No real network requests or browser launches occur. - */ - -import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { runAdd } from '../../../src/cli/commands/add.js'; -import * as serviceModule from '../../../src/lib/ingestion/service.js'; -import type { ExtractResult } from '../../../src/lib/ingestion/extractor.js'; - -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -function mockIngestResult(text: string): ExtractResult { - return { text, url: 'https://example.com' }; -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -describe('runAdd', () => { - let ingestSpy: ReturnType; - let stdoutSpy: ReturnType; - - beforeEach(() => { - // Spy on IngestionService.prototype.ingest so we can control the result. - ingestSpy = vi - .spyOn(serviceModule.IngestionService.prototype, 'ingest') - .mockResolvedValue(mockIngestResult('Ingested content')); - - // Capture stdout writes. - stdoutSpy = vi.spyOn(process.stdout, 'write').mockImplementation(() => true); - }); - - afterEach(() => { - vi.restoreAllMocks(); - }); - - it('writes ingested content to stdout by default', async () => { - await runAdd('https://example.com'); - - expect(stdoutSpy).toHaveBeenCalledWith('Ingested content'); - }); - - it('returns the ExtractResult', async () => { - const result = await runAdd('https://example.com'); - expect(result.text).toBe('Ingested content'); - expect(result.url).toBe('https://example.com'); - }); - - it('throws when URL is empty', async () => { - await expect(runAdd('')).rejects.toThrow('URL is required'); - }); - - it('passes playwrightFallback option to IngestionService', async () => { - // We can inspect the IngestionService constructor call via the spy on ingest. - await runAdd('https://example.com', { playwrightFallback: true }); - expect(ingestSpy).toHaveBeenCalledWith('https://example.com'); - }); - - it('writes to output file when --output is specified', async () => { - const writeFileSpy = vi.fn(); - - await runAdd('https://example.com', { - output: '/tmp/output.txt', - _writeFile: writeFileSpy, - }); - - expect(writeFileSpy).toHaveBeenCalledWith('/tmp/output.txt', 'Ingested content', 'utf8'); - // Should not also write to stdout. - expect(stdoutSpy).not.toHaveBeenCalled(); - }); - - it('appends newline to stdout when content does not end with one', async () => { - ingestSpy.mockResolvedValueOnce(mockIngestResult('No newline content')); - const writes: string[] = []; - stdoutSpy.mockImplementation((chunk: unknown) => { - writes.push(String(chunk)); - return true; - }); - - await runAdd('https://example.com'); - - expect(writes.join('')).toMatch(/\n$/); - }); - - it('does not append extra newline when content already ends with one', async () => { - ingestSpy.mockResolvedValueOnce(mockIngestResult('Content with newline\n')); - const writes: string[] = []; - stdoutSpy.mockImplementation((chunk: unknown) => { - writes.push(String(chunk)); - return true; - }); - - await runAdd('https://example.com'); - - // Should be exactly one newline at the end, not two. - const combined = writes.join(''); - expect(combined.endsWith('\n\n')).toBe(false); - expect(combined.endsWith('\n')).toBe(true); - }); -}); diff --git a/tests/lib/ingestion/extractor-fetch.test.ts b/tests/lib/ingestion/extractor-fetch.test.ts deleted file mode 100644 index 1a680b4..0000000 --- a/tests/lib/ingestion/extractor-fetch.test.ts +++ /dev/null @@ -1,131 +0,0 @@ -/** - * Unit tests for src/lib/ingestion/extractor-fetch.ts - */ - -import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { htmlToText, extractTitle, FetchExtractor } from '../../../src/lib/ingestion/extractor-fetch.js'; - -// --------------------------------------------------------------------------- -// htmlToText -// --------------------------------------------------------------------------- - -describe('htmlToText', () => { - it('strips HTML tags', () => { - expect(htmlToText('

Hello world

')).toBe('Hello world'); - }); - - it('removes script blocks entirely', () => { - const html = '

Visible

After

'; - const text = htmlToText(html); - expect(text).not.toContain('var x'); - expect(text).toContain('Visible'); - expect(text).toContain('After'); - }); - - it('removes style blocks entirely', () => { - const html = '

Text

'; - const text = htmlToText(html); - expect(text).not.toContain('color'); - expect(text).toContain('Text'); - }); - - it('decodes common HTML entities', () => { - //   decodes to a regular space which is then trimmed by whitespace normalization. - expect(htmlToText('& < > " '  ')).toBe("& < > \" '"); - }); - - it('collapses whitespace', () => { - expect(htmlToText(' hello world ')).toBe('hello world'); - }); - - it('returns empty string for empty input', () => { - expect(htmlToText('')).toBe(''); - }); -}); - -// --------------------------------------------------------------------------- -// extractTitle -// --------------------------------------------------------------------------- - -describe('extractTitle', () => { - it('extracts a simple title', () => { - expect(extractTitle('My Page')).toBe('My Page'); - }); - - it('returns undefined when no title tag', () => { - expect(extractTitle('')).toBeUndefined(); - }); - - it('trims whitespace from title', () => { - expect(extractTitle(' Spaces ')).toBe('Spaces'); - }); -}); - -// --------------------------------------------------------------------------- -// FetchExtractor -// --------------------------------------------------------------------------- - -describe('FetchExtractor', () => { - const mockFetch = vi.fn(); - - beforeEach(() => { - vi.stubGlobal('fetch', mockFetch); - mockFetch.mockClear(); - }); - - afterEach(() => { - vi.restoreAllMocks(); - }); - - function mockResponse(html: string) { - return { - ok: true, - text: vi.fn(async () => html), - }; - } - - it('returns extracted text from a successful fetch', async () => { - const html = 'Test

Hello world

'; - mockFetch.mockResolvedValueOnce(mockResponse(html)); - - const extractor = new FetchExtractor(); - const result = await extractor.extract('https://example.com'); - - expect(result.url).toBe('https://example.com'); - expect(result.text).toContain('Hello world'); - expect(result.title).toBe('Test'); - expect(result.html).toBe(html); - }); - - it('returns empty text on network failure', async () => { - mockFetch.mockRejectedValueOnce(new Error('Network error')); - - const extractor = new FetchExtractor(); - const result = await extractor.extract('https://unreachable.example.com'); - - expect(result.text).toBe(''); - expect(result.url).toBe('https://unreachable.example.com'); - }); - - it('returns empty text on abort (timeout)', async () => { - mockFetch.mockRejectedValueOnce( - Object.assign(new Error('The operation was aborted'), { name: 'AbortError' }) - ); - - const extractor = new FetchExtractor({ timeoutMs: 1 }); - const result = await extractor.extract('https://slow.example.com'); - - expect(result.text).toBe(''); - }); - - it('passes the User-Agent header', async () => { - const html = '

content

'; - mockFetch.mockResolvedValueOnce(mockResponse(html)); - - const extractor = new FetchExtractor({ userAgent: 'TestAgent/1.0' }); - await extractor.extract('https://example.com'); - - const [, requestInit] = mockFetch.mock.calls[0] as [string, RequestInit]; - expect((requestInit.headers as Record)['User-Agent']).toBe('TestAgent/1.0'); - }); -}); diff --git a/tests/lib/ingestion/extractor-playwright.test.ts b/tests/lib/ingestion/extractor-playwright.test.ts deleted file mode 100644 index c069003..0000000 --- a/tests/lib/ingestion/extractor-playwright.test.ts +++ /dev/null @@ -1,291 +0,0 @@ -/** - * Unit tests for src/lib/ingestion/extractor-playwright.ts - * - * Tests exercise the PlaywrightExtractor class using stubbed browser - * objects injected via the constructor's `loadPlaywrightFn` parameter. - * No real browser is launched and playwright is never imported. - * - * Testing strategy: - * - `PlaywrightExtractor` accepts an optional `loadPlaywrightFn` in its - * constructor. Tests pass a factory that returns controlled fake browser - * implementations instead of a real playwright module. - * - This matches the "mock/stub" option described in the issue's CI/Testing - * Strategy section and avoids ESM spy limitations. - */ - -import { describe, it, expect, vi } from 'vitest'; -import { - PlaywrightExtractor, - type BrowserModule, - type BrowserTypeLike, - type BrowserLike, - type BrowserContextLike, - type PageLike, -} from '../../../src/lib/ingestion/extractor-playwright.js'; - -// --------------------------------------------------------------------------- -// Helpers: minimal browser stubs -// --------------------------------------------------------------------------- - -function buildPageStub(overrides: Partial = {}): PageLike { - return { - goto: vi.fn(async () => null), - content: vi.fn(async () => '

Playwright content

'), - title: vi.fn(async () => 'Playwright Page'), - evaluate: vi.fn(async () => 'Playwright content'), - ...overrides, - }; -} - -function buildContextStub(page: PageLike): BrowserContextLike { - return { - newPage: vi.fn(async () => page), - close: vi.fn(async () => undefined), - }; -} - -function buildBrowserStub(context: BrowserContextLike): BrowserLike { - return { - newContext: vi.fn(async () => context), - close: vi.fn(async () => undefined), - }; -} - -function buildLauncherStub(browser: BrowserLike): BrowserTypeLike { - return { - launch: vi.fn(async () => browser), - }; -} - -function buildBrowserModule(chromium: BrowserTypeLike): BrowserModule { - return { - chromium, - firefox: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), - webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), - }; -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -describe('PlaywrightExtractor', () => { - - // ------------------------------------------------------------------------- - // Happy path - // ------------------------------------------------------------------------- - - it('returns extracted text and html from a successful navigation', async () => { - const page = buildPageStub({ - content: vi.fn(async () => '

JS content

'), - title: vi.fn(async () => 'JS Page'), - evaluate: vi.fn(async () => 'JS content'), - }); - const launcher = buildLauncherStub(buildBrowserStub(buildContextStub(page))); - const extractor = new PlaywrightExtractor( - {}, - { warn: vi.fn() }, - async () => buildBrowserModule(launcher) - ); - - const result = await extractor.extract('https://example.com'); - - expect(result.text).toBe('JS content'); - expect(result.html).toContain('JS content'); - expect(result.title).toBe('JS Page'); - expect(result.url).toBe('https://example.com'); - }); - - it('uses a fresh browser context (storageState is undefined)', async () => { - const page = buildPageStub(); - const context = buildContextStub(page); - const browser = buildBrowserStub(context); - const launcher = buildLauncherStub(browser); - const extractor = new PlaywrightExtractor( - {}, - { warn: vi.fn() }, - async () => buildBrowserModule(launcher) - ); - - await extractor.extract('https://example.com'); - - expect(browser.newContext).toHaveBeenCalledWith({ storageState: undefined }); - }); - - it('closes the browser even when navigation succeeds', async () => { - const page = buildPageStub(); - const context = buildContextStub(page); - const browser = buildBrowserStub(context); - const launcher = buildLauncherStub(browser); - const extractor = new PlaywrightExtractor( - {}, - { warn: vi.fn() }, - async () => buildBrowserModule(launcher) - ); - - await extractor.extract('https://example.com'); - - expect(browser.close).toHaveBeenCalled(); - }); - - it('passes headless:true to browser.launch', async () => { - const page = buildPageStub(); - const context = buildContextStub(page); - const browser = buildBrowserStub(context); - const launcher = buildLauncherStub(browser); - const extractor = new PlaywrightExtractor( - {}, - { warn: vi.fn() }, - async () => buildBrowserModule(launcher) - ); - - await extractor.extract('https://example.com'); - - expect(launcher.launch).toHaveBeenCalledWith({ headless: true }); - }); - - it('passes configured timeout to page.goto', async () => { - const page = buildPageStub(); - const context = buildContextStub(page); - const browser = buildBrowserStub(context); - const launcher = buildLauncherStub(browser); - const extractor = new PlaywrightExtractor( - { timeoutMs: 5000 }, - { warn: vi.fn() }, - async () => buildBrowserModule(launcher) - ); - - await extractor.extract('https://example.com'); - - expect(page.goto).toHaveBeenCalledWith( - 'https://example.com', - expect.objectContaining({ timeout: 5000 }) - ); - }); - - // ------------------------------------------------------------------------- - // Graceful degradation: playwright not installed - // ------------------------------------------------------------------------- - - it('returns empty result and warns when playwright is not installed', async () => { - const warnSpy = vi.fn(); - const extractor = new PlaywrightExtractor( - {}, - { warn: warnSpy }, - async () => { throw new Error("Cannot find module 'playwright'"); } - ); - - const result = await extractor.extract('https://example.com'); - - expect(result.text).toBe(''); - expect(result.url).toBe('https://example.com'); - expect(warnSpy).toHaveBeenCalledWith( - expect.stringContaining('playwright package is not installed') - ); - }); - - // ------------------------------------------------------------------------- - // Graceful degradation: launch failure - // ------------------------------------------------------------------------- - - it('returns empty result and warns when browser fails to launch', async () => { - const warnSpy = vi.fn(); - const launcher: BrowserTypeLike = { - launch: vi.fn(async () => { throw new Error('Executable not found'); }), - }; - const extractor = new PlaywrightExtractor( - {}, - { warn: warnSpy }, - async () => buildBrowserModule(launcher) - ); - - const result = await extractor.extract('https://example.com'); - - expect(result.text).toBe(''); - expect(warnSpy).toHaveBeenCalledWith( - expect.stringContaining('Failed to launch browser'), - expect.any(String) - ); - }); - - // ------------------------------------------------------------------------- - // Graceful degradation: navigation timeout - // ------------------------------------------------------------------------- - - it('returns empty result and warns on navigation timeout', async () => { - const warnSpy = vi.fn(); - const page = buildPageStub({ - goto: vi.fn(async () => { throw new Error('Timeout exceeded'); }), - }); - const extractor = new PlaywrightExtractor( - {}, - { warn: warnSpy }, - async () => buildBrowserModule( - buildLauncherStub(buildBrowserStub(buildContextStub(page))) - ) - ); - - const result = await extractor.extract('https://example.com'); - - expect(result.text).toBe(''); - expect(warnSpy).toHaveBeenCalledWith( - expect.stringMatching(/timeout|navigation_error/i), - expect.any(String) - ); - }); - - // ------------------------------------------------------------------------- - // Graceful degradation: navigation error (non-timeout) - // ------------------------------------------------------------------------- - - it('returns empty result and warns on non-timeout navigation error', async () => { - const warnSpy = vi.fn(); - const page = buildPageStub({ - goto: vi.fn(async () => { throw new Error('net::ERR_CONNECTION_REFUSED'); }), - }); - const extractor = new PlaywrightExtractor( - {}, - { warn: warnSpy }, - async () => buildBrowserModule( - buildLauncherStub(buildBrowserStub(buildContextStub(page))) - ) - ); - - const result = await extractor.extract('https://example.com'); - - expect(result.text).toBe(''); - expect(warnSpy).toHaveBeenCalledWith( - expect.stringContaining('navigation_error'), - expect.any(String) - ); - }); - - // ------------------------------------------------------------------------- - // Browser channel selection - // ------------------------------------------------------------------------- - - it('uses firefox when configured', async () => { - const firefoxPage = buildPageStub({ - evaluate: vi.fn(async () => 'Firefox content'), - }); - const firefoxLauncher = buildLauncherStub( - buildBrowserStub(buildContextStub(firefoxPage)) - ); - - const mod: BrowserModule = { - chromium: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), - firefox: firefoxLauncher, - webkit: buildLauncherStub(buildBrowserStub(buildContextStub(buildPageStub()))), - }; - - const extractor = new PlaywrightExtractor( - { browser: 'firefox' }, - { warn: vi.fn() }, - async () => mod - ); - const result = await extractor.extract('https://example.com'); - - expect(firefoxLauncher.launch).toHaveBeenCalled(); - expect(result.text).toBe('Firefox content'); - }); -}); diff --git a/tests/lib/ingestion/extractor.test.ts b/tests/lib/ingestion/extractor.test.ts deleted file mode 100644 index 6af2292..0000000 --- a/tests/lib/ingestion/extractor.test.ts +++ /dev/null @@ -1,47 +0,0 @@ -/** - * Unit tests for src/lib/ingestion/extractor.ts - * - * The Extractor interface and ExtractResult type are structural contracts; - * these tests verify they are exported with the expected shape. - */ - -import { describe, it, expect } from 'vitest'; -import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js'; - -describe('Extractor interface', () => { - it('can be implemented by a simple stub', async () => { - const stub: Extractor = { - async extract(url: string): Promise { - return { text: 'hello', url }; - }, - }; - - const result = await stub.extract('https://example.com'); - expect(result.text).toBe('hello'); - expect(result.url).toBe('https://example.com'); - }); - - it('ExtractResult has the expected fields', () => { - const result: ExtractResult = { - text: 'content', - html: '

content

', - title: 'Page title', - url: 'https://example.com', - }; - - expect(result.text).toBe('content'); - expect(result.html).toBe('

content

'); - expect(result.title).toBe('Page title'); - expect(result.url).toBe('https://example.com'); - }); - - it('ExtractResult html and title fields are optional', () => { - const result: ExtractResult = { - text: 'content', - url: 'https://example.com', - }; - - expect(result.html).toBeUndefined(); - expect(result.title).toBeUndefined(); - }); -}); diff --git a/tests/lib/ingestion/service.test.ts b/tests/lib/ingestion/service.test.ts deleted file mode 100644 index 9e5bdb5..0000000 --- a/tests/lib/ingestion/service.test.ts +++ /dev/null @@ -1,287 +0,0 @@ -/** - * Unit tests for src/lib/ingestion/service.ts - * - * Covers: - * - Primary-only path (fallback disabled or threshold not met) - * - Fallback trigger (short primary content + enabled flag) - * - Fallback via OB_PLAYWRIGHT_FALLBACK env var - * - Fallback returns empty → primary result is returned - * - Telemetry emission on every invocation - * - Graceful handling when PlaywrightExtractor throws unexpectedly - */ - -import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest'; -import { IngestionService, type IngestionConfig, type PlaywrightFallbackTelemetry } from '../../../src/lib/ingestion/service.js'; -import type { Extractor, ExtractResult } from '../../../src/lib/ingestion/extractor.js'; - -// --------------------------------------------------------------------------- -// Helpers -// --------------------------------------------------------------------------- - -function makeExtractor(result: ExtractResult): Extractor { - return { extract: vi.fn(async () => result) }; -} - -function makeResult(text: string, url = 'https://example.com'): ExtractResult { - return { text, url }; -} - -function capturedTelemetry( - infoSpy: ReturnType -): PlaywrightFallbackTelemetry | undefined { - const call = infoSpy.mock.calls.find( - (c: unknown[]) => c[0] === '[telemetry]' - ); - return call ? (call[1] as PlaywrightFallbackTelemetry) : undefined; -} - -// --------------------------------------------------------------------------- -// Tests -// --------------------------------------------------------------------------- - -describe('IngestionService', () => { - const URL = 'https://example.com'; - let infoSpy: ReturnType; - let warnSpy: ReturnType; - let logger: { info: typeof infoSpy; warn: typeof warnSpy }; - - beforeEach(() => { - infoSpy = vi.fn(); - warnSpy = vi.fn(); - logger = { info: infoSpy, warn: warnSpy }; - // Ensure env var is not set between tests. - delete process.env['OB_PLAYWRIGHT_FALLBACK']; - }); - - afterEach(() => { - delete process.env['OB_PLAYWRIGHT_FALLBACK']; - }); - - // ------------------------------------------------------------------------- - // Fallback disabled - // ------------------------------------------------------------------------- - - it('returns primary result when fallback is disabled (default)', async () => { - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('playwright content', URL)); - - const svc = new IngestionService(primary, pw, {}, logger); - const result = await svc.ingest(URL); - - expect(result.text).toBe('short'); - expect(pw.extract).not.toHaveBeenCalled(); - }); - - it('returns primary result when fallback is disabled explicitly', async () => { - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('playwright content', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: false }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe('short'); - expect(pw.extract).not.toHaveBeenCalled(); - }); - - // ------------------------------------------------------------------------- - // Fallback not triggered (primary content is sufficient) - // ------------------------------------------------------------------------- - - it('does not trigger fallback when primary content meets threshold', async () => { - const longText = 'a'.repeat(300); - const primary = makeExtractor(makeResult(longText, URL)); - const pw = makeExtractor(makeResult('playwright content', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe(longText); - expect(pw.extract).not.toHaveBeenCalled(); - }); - - // ------------------------------------------------------------------------- - // Fallback triggered - // ------------------------------------------------------------------------- - - it('triggers fallback when primary content is below threshold', async () => { - const primary = makeExtractor(makeResult('too short', URL)); - const pw = makeExtractor(makeResult('Rich playwright content that is long enough', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe('Rich playwright content that is long enough'); - expect(pw.extract).toHaveBeenCalledWith(URL); - }); - - it('returns primary result when fallback also returns empty', async () => { - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true, minContentLength: 200 }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe('short'); - }); - - // ------------------------------------------------------------------------- - // Environment variable activation - // ------------------------------------------------------------------------- - - it('activates fallback via OB_PLAYWRIGHT_FALLBACK=1', async () => { - process.env['OB_PLAYWRIGHT_FALLBACK'] = '1'; - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('playwright result', URL)); - - const svc = new IngestionService(primary, pw, {}, logger); - const result = await svc.ingest(URL); - - expect(result.text).toBe('playwright result'); - expect(pw.extract).toHaveBeenCalled(); - }); - - it('activates fallback via OB_PLAYWRIGHT_FALLBACK=true', async () => { - process.env['OB_PLAYWRIGHT_FALLBACK'] = 'true'; - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('playwright result', URL)); - - const svc = new IngestionService(primary, pw, {}, logger); - const result = await svc.ingest(URL); - - expect(result.text).toBe('playwright result'); - }); - - it('does not activate fallback via OB_PLAYWRIGHT_FALLBACK=0', async () => { - process.env['OB_PLAYWRIGHT_FALLBACK'] = '0'; - const primary = makeExtractor(makeResult('short', URL)); - const pw = makeExtractor(makeResult('playwright result', URL)); - - const svc = new IngestionService(primary, pw, {}, logger); - const result = await svc.ingest(URL); - - expect(result.text).toBe('short'); - expect(pw.extract).not.toHaveBeenCalled(); - }); - - // ------------------------------------------------------------------------- - // Telemetry - // ------------------------------------------------------------------------- - - it('emits telemetry with triggered=false when fallback is disabled', async () => { - const primary = makeExtractor(makeResult('some text', URL)); - const pw = makeExtractor(makeResult('', URL)); - - const svc = new IngestionService(primary, pw, {}, logger); - await svc.ingest(URL); - - const telemetry = capturedTelemetry(infoSpy); - expect(telemetry).toBeDefined(); - expect(telemetry!.event).toBe('playwright_fallback'); - expect(telemetry!.triggered).toBe(false); - expect(telemetry!.primaryContentLength).toBe('some text'.length); - expect(telemetry!.fallbackContentLength).toBe(0); - expect(telemetry!.durationMs).toBe(0); - expect(telemetry!.success).toBe(false); - expect(telemetry!.errorType).toBeNull(); - expect(telemetry!.provider).toBe('playwright'); - }); - - it('emits telemetry with triggered=true when fallback runs', async () => { - const primary = makeExtractor(makeResult('short', URL)); - const fallbackText = 'long fallback content from playwright'; - const pw = makeExtractor(makeResult(fallbackText, URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true }, logger - ); - await svc.ingest(URL); - - const telemetry = capturedTelemetry(infoSpy); - expect(telemetry!.triggered).toBe(true); - expect(telemetry!.primaryContentLength).toBe('short'.length); - expect(telemetry!.fallbackContentLength).toBe(fallbackText.length); - expect(telemetry!.success).toBe(true); - expect(telemetry!.errorType).toBeNull(); - expect(telemetry!.durationMs).toBeGreaterThanOrEqual(0); - }); - - it('emits telemetry with success=false when fallback returns empty', async () => { - const primary = makeExtractor(makeResult('tiny', URL)); - const pw = makeExtractor(makeResult('', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true }, logger - ); - await svc.ingest(URL); - - const telemetry = capturedTelemetry(infoSpy); - expect(telemetry!.triggered).toBe(true); - expect(telemetry!.success).toBe(false); - expect(telemetry!.fallbackContentLength).toBe(0); - }); - - // ------------------------------------------------------------------------- - // Graceful handling of unexpected PlaywrightExtractor error - // ------------------------------------------------------------------------- - - it('handles unexpected PlaywrightExtractor throw gracefully', async () => { - const primary = makeExtractor(makeResult('short', URL)); - const pw: Extractor = { - extract: vi.fn(async () => { throw new Error('unexpected crash'); }), - }; - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true }, logger - ); - // Must not throw. - const result = await svc.ingest(URL); - - // Falls back to primary result. - expect(result.text).toBe('short'); - expect(warnSpy).toHaveBeenCalledWith( - expect.stringContaining('Unexpected error'), - expect.any(String) - ); - }); - - // ------------------------------------------------------------------------- - // Default minContentLength - // ------------------------------------------------------------------------- - - it('uses default minContentLength of 200 when not specified', async () => { - // Text of exactly 199 chars → should trigger fallback. - const shortText = 'a'.repeat(199); - const primary = makeExtractor(makeResult(shortText, URL)); - const pw = makeExtractor(makeResult('playwright result', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe('playwright result'); - expect(pw.extract).toHaveBeenCalled(); - }); - - it('does not trigger fallback when text is exactly at default threshold (200 chars)', async () => { - const atThreshold = 'a'.repeat(200); - const primary = makeExtractor(makeResult(atThreshold, URL)); - const pw = makeExtractor(makeResult('playwright result', URL)); - - const svc = new IngestionService( - primary, pw, { playwrightFallback: true }, logger - ); - const result = await svc.ingest(URL); - - expect(result.text).toBe(atThreshold); - expect(pw.extract).not.toHaveBeenCalled(); - }); -});