Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@
"express": "^4.18.2",
"js-yaml": "^4.1.1"
},
"peerDependencies": {
"playwright": ">=1.40.0"
},
"peerDependenciesMeta": {
"playwright": {
"optional": true
}
},
"devDependencies": {
"@types/blessed": "^0.1.7",
"@types/better-sqlite3": "^7.6.13",
Expand Down
179 changes: 179 additions & 0 deletions src/cli/commands/add.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
/**
* `ob add <url>` CLI command.
*
* Ingests content from a URL using the fast-path primary extractor, falling
* back to a Playwright headless-browser fetch when:
* - The Playwright fallback is explicitly enabled (via config or environment
* variable `OB_PLAYWRIGHT_FALLBACK=1`), AND
* - The primary extractor returns fewer characters than the configured
* minimum content length threshold.
*
* Usage:
* ob add <url> [--playwright-fallback] [--min-content-length <n>] [--timeout <ms>] [--browser <channel>] [--output <file>]
*
* The command writes the ingested content to stdout (or a file if `--output`
* is provided) in a format compatible with the downstream ingestion pipeline.
* It exits with a non-zero status code only on unrecoverable errors (e.g. an
* invalid URL); partial or empty content from a gracefully-degraded fetch is
* not treated as a fatal error.
*/

import { IngestionService, type IngestionConfig } from '../../lib/ingestion/service.js';
import { FetchExtractor } from '../../lib/ingestion/extractor-fetch.js';
import { PlaywrightExtractor, type PlaywrightBrowserChannel } from '../../lib/ingestion/extractor-playwright.js';
import type { ExtractResult } from '../../lib/ingestion/extractor.js';
import * as fs from 'fs';

/**
 * Options accepted by the `add` command.
 *
 * Every field is optional; `runAdd` supplies the defaults noted on each field.
 */
export interface AddCommandOptions {
/**
 * Enable the Playwright fallback.
 * `runAdd` treats an omitted value as `false` when building the ingestion config.
 */
playwrightFallback?: boolean;
/**
 * Minimum content length before fallback is triggered.
 * Passed through to the ingestion config unchanged (including `undefined`).
 */
minContentLength?: number;
/** Playwright navigation timeout in milliseconds. `runAdd` uses 30000 when omitted. */
timeout?: number;
/** Browser channel to use for Playwright. `runAdd` uses `'chromium'` when omitted. */
browser?: PlaywrightBrowserChannel;
/**
 * Write output to this file path instead of stdout.
 * When set to a non-empty path, the extracted text is written as UTF-8 and
 * nothing is printed to stdout.
 */
output?: string;
/**
 * Injectable file-write function for testing.
 * Defaults to `fs.writeFileSync` when not provided. Only used when `output` is set.
 * @internal
 */
_writeFile?: (path: string, data: string, encoding: BufferEncoding) => void;
}

/**
 * Core implementation of `ob add`, kept free of any Commander.js wiring so
 * that tests can call it directly with injected options.
 *
 * Builds an {@link IngestionService} from a fetch-based primary extractor and
 * a Playwright extractor, ingests `url`, and then emits the extracted text
 * either to the file named by `options.output` or to stdout.
 *
 * @param url     The URL to ingest. Must be a non-empty string.
 * @param options Command options; omitted fields fall back to defaults
 *                (fallback disabled, `chromium`, 30 s timeout).
 * @returns The {@link ExtractResult} produced by the ingestion service.
 * @throws Error with message `URL is required` when `url` is empty.
 */
export async function runAdd(
  url: string,
  options: AddCommandOptions = {}
): Promise<ExtractResult> {
  if (!url) {
    throw new Error('URL is required');
  }

  const { playwrightFallback, minContentLength, timeout, browser, output, _writeFile } = options;

  const serviceConfig: IngestionConfig = {
    playwrightFallback: playwrightFallback ?? false,
    minContentLength,
  };

  // Secondary extractor; the service decides whether it is ever used.
  const fallbackExtractor = new PlaywrightExtractor({
    browser: browser ?? 'chromium',
    timeoutMs: timeout ?? 30_000,
  });
  const primaryExtractor = new FetchExtractor();

  const ingestion = new IngestionService(primaryExtractor, fallbackExtractor, serviceConfig);
  const extracted = await ingestion.ingest(url);
  const { text } = extracted;

  if (!output) {
    // Stdout mode: make sure non-empty output ends with exactly one trailing
    // newline of our own only when the text does not already end with one.
    process.stdout.write(text);
    const lacksTrailingNewline = text.length > 0 && !text.endsWith('\n');
    if (lacksTrailingNewline) {
      process.stdout.write('\n');
    }
    return extracted;
  }

  // File mode: the text is written verbatim, with no newline adjustment.
  const persist = _writeFile ?? fs.writeFileSync;
  persist(output, text, 'utf8');
  return extracted;
}

// ---------------------------------------------------------------------------
// Commander.js registration (optional — used when loaded as a plugin command)
// ---------------------------------------------------------------------------

/**
 * Register the `add` command with a Commander.js program.
 *
 * The registered action converts Commander's raw option values into an
 * {@link AddCommandOptions} object and delegates to {@link runAdd}. Invalid
 * option values, and any error thrown by `runAdd`, are reported on stderr
 * with the `[ob add] Error:` prefix and terminate the process with status 1.
 *
 * Validation performed here:
 * - `--min-content-length` and `--timeout` must consist solely of decimal
 *   digits and denote a positive safe integer. Values such as `30s`, `1.5`
 *   or `1e3` are rejected rather than silently truncated.
 * - `--browser` must be one of the channels advertised in the help text
 *   (`chromium`, `firefox`, `webkit`).
 *
 * @param program The root Commander.js Command instance (only `.command()`
 *                is required of it).
 */
export function registerAddCommand(program: {
  command: (name: string) => CommandBuilder;
}): void {
  // Channels listed in the `--browser` help text below; keep the two in sync.
  const browserChannels: readonly string[] = ['chromium', 'firefox', 'webkit'];

  program
    .command('add <url>')
    .description('Ingest content from a URL (with optional Playwright fallback for JS-heavy pages)')
    .option(
      '--playwright-fallback',
      'Enable Playwright headless-browser fallback (requires `playwright` package)',
      false
    )
    .option(
      '--min-content-length <n>',
      'Minimum character count before Playwright fallback is triggered',
      '200'
    )
    .option(
      '--timeout <ms>',
      'Playwright navigation timeout in milliseconds',
      '30000'
    )
    .option(
      '--browser <channel>',
      'Playwright browser channel: chromium | firefox | webkit',
      'chromium'
    )
    .option('--output <file>', 'Write ingested content to file instead of stdout')
    .action(async (url: string, opts: Record<string, string | boolean>) => {
      /** Parse a strictly-decimal positive integer option, exiting on bad input. */
      const parsePositiveInt = (
        raw: string | boolean | undefined,
        name: string
      ): number | undefined => {
        if (raw === undefined || raw === false || raw === '') return undefined;
        const text = String(raw).trim();
        // `parseInt` would accept "30s" or "1.5" by stopping at the first
        // non-digit; require the whole token to be digits instead.
        const n = /^\d+$/.test(text) ? Number(text) : NaN;
        if (!Number.isSafeInteger(n) || n <= 0) {
          process.stderr.write(
            `[ob add] Error: --${name} must be a positive integer (received: ${String(raw)})\n`
          );
          process.exit(1);
        }
        return n;
      };

      /** Validate the browser channel at runtime instead of trusting a cast. */
      const parseBrowser = (
        raw: string | boolean | undefined
      ): PlaywrightBrowserChannel | undefined => {
        if (raw === undefined || raw === false || raw === '') return undefined;
        const channel = String(raw);
        if (!browserChannels.includes(channel)) {
          process.stderr.write(
            `[ob add] Error: --browser must be one of ${browserChannels.join(', ')} (received: ${channel})\n`
          );
          process.exit(1);
        }
        // Safe: membership in `browserChannels` was just checked.
        return channel as PlaywrightBrowserChannel;
      };

      const options: AddCommandOptions = {
        playwrightFallback: Boolean(opts['playwrightFallback']),
        minContentLength: parsePositiveInt(opts['minContentLength'], 'min-content-length'),
        timeout: parsePositiveInt(opts['timeout'], 'timeout'),
        browser: parseBrowser(opts['browser']),
        output: opts['output'] ? String(opts['output']) : undefined,
      };

      try {
        await runAdd(url, options);
      } catch (err) {
        process.stderr.write(
          `[ob add] Error: ${err instanceof Error ? err.message : String(err)}\n`
        );
        process.exit(1);
      }
    });
}

// ---------------------------------------------------------------------------
// Minimal type shim for Commander.js .option / .action chaining
// (avoids requiring a Commander.js type import in this standalone module)
// ---------------------------------------------------------------------------

/**
 * Structural subset of a Commander.js command used by `registerAddCommand`.
 *
 * Each method returns the builder itself so calls can be chained. Only the
 * three methods this module actually calls are declared.
 */
interface CommandBuilder {
/** Set the help-text description for the command. */
description: (desc: string) => CommandBuilder;
/** Declare an option; `defaultValue` is the value used when the flag is absent. */
option: (flags: string, description: string, defaultValue?: string | boolean) => CommandBuilder;
/** Register the handler invoked when the command runs; it may be async. */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
action: (fn: (...args: any[]) => void | Promise<void>) => CommandBuilder;
}
116 changes: 116 additions & 0 deletions src/lib/ingestion/extractor-fetch.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/**
* Primary fast-path extractor.
*
* Fetches a URL using the built-in `fetch` API and extracts plain text by
* stripping HTML tags. This is the default extractor used by the ingestion
* service; it is fast but cannot handle pages that render content with
* client-side JavaScript.
*/

import type { Extractor, ExtractResult } from './extractor.js';

/**
 * Configuration for the primary fetch-based extractor.
 *
 * Both fields are optional; `FetchExtractor` fills in defaults for any that
 * are omitted.
 */
export interface FetchExtractorConfig {
/**
 * Request timeout in milliseconds. When it elapses the in-flight request is
 * aborted.
 * @default 15000
 */
timeoutMs?: number;
/**
 * User-Agent string to send with requests (as the `User-Agent` header).
 * Defaults to the module's built-in OpenBrain-Ingestion identifier.
 */
userAgent?: string;
}

/** Fallback request timeout (15 seconds) used when `timeoutMs` is not configured. */
const DEFAULT_TIMEOUT_MS = 15_000;
/** Fallback `User-Agent` header value used when `userAgent` is not configured. */
const DEFAULT_USER_AGENT =
'Mozilla/5.0 (compatible; OpenBrain-Ingestion/1.0; +https://github.com/TheWizardsCode/ContextHub)';

/**
 * Converts an HTML document or fragment to plain text.
 *
 * Processing steps, in order:
 * 1. `<script>` and `<style>` elements are removed together with their contents.
 * 2. `<br>` tags (in any spelling: `<br>`, `<br/>`, `<br />`) and the closing
 *    tags of common block-level elements become newlines.
 * 3. All remaining tags are replaced by a space.
 * 4. The named entities `&amp; &lt; &gt; &quot; &#39; &nbsp;` are decoded in a
 *    single pass, so `&amp;lt;` yields the literal text `&lt;`.
 * 5. Whitespace is normalized: CR/CRLF become LF, runs of spaces/tabs become
 *    one space, spaces adjacent to a newline are dropped, three or more
 *    consecutive newlines collapse to two, and the result is trimmed.
 *
 * Other named entities and numeric character references (e.g. `&#65;`) are
 * intentionally left as-is; this is a fast-path extractor, not an HTML parser.
 *
 * @param html Raw HTML markup.
 * @returns The extracted plain text (possibly empty).
 */
export function htmlToText(html: string): string {
  // Remove script and style blocks entirely (allow attributes/whitespace
  // before the closing `>` of the end tag).
  let text = html
    .replace(/<script[\s\S]*?<\/script[^>]*>/gi, ' ')
    .replace(/<style[\s\S]*?<\/style[^>]*>/gi, ' ');

  // `<br>` is a void element and is almost never written as `</br>`, so it
  // must be matched as an opening/self-closing tag to produce a line break.
  text = text.replace(/<br\b[^>]*>/gi, '\n');

  // Closing tags of block-level elements end a line.
  text = text.replace(/<\/(p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n');

  // Strip all remaining tags.
  text = text.replace(/<[^>]+>/g, ' ');

  // Decode entities in ONE pass so that already-escaped ampersands are not
  // decoded twice (`&amp;lt;` must become `&lt;`, not `<`).
  const entities: Record<string, string> = {
    '&amp;': '&',
    '&lt;': '<',
    '&gt;': '>',
    '&quot;': '"',
    '&#39;': "'",
    '&nbsp;': ' ',
  };
  text = text.replace(/&(?:amp|lt|gt|quot|#39|nbsp);/g, (match) => entities[match] ?? match);

  return text
    .replace(/\r\n?/g, '\n') // normalize line endings first
    .replace(/[ \t]+/g, ' ') // collapse horizontal whitespace
    .replace(/ ?\n ?/g, '\n') // drop spaces hugging a newline so blank lines are truly empty
    .replace(/\n{3,}/g, '\n\n') // at most one blank line in a row
    .trim();
}

/**
 * Returns the trimmed text of the first `<title>…</title>` element in `html`.
 *
 * Matching is case-insensitive and tolerates attributes on the opening tag.
 * A title whose content contains any `<` character is not matched.
 *
 * @param html Raw HTML markup.
 * @returns The trimmed title text (which may be the empty string), or
 *          `undefined` when no matching title element is found.
 */
export function extractTitle(html: string): string | undefined {
  const found = html.match(/<title[^>]*>([^<]*)<\/title>/i);
  if (found === null) {
    return undefined;
  }
  return found[1].trim();
}

/**
 * Fast-path extractor: one HTTP GET via the global `fetch`, followed by a
 * regex-based HTML-to-text conversion.
 *
 * It needs nothing beyond the runtime's built-in `fetch`, and it never
 * throws from {@link FetchExtractor.extract}: any failure while fetching or
 * reading the body (including a timeout abort) yields an empty-text result.
 */
export class FetchExtractor implements Extractor {
  private readonly config: Required<FetchExtractorConfig>;

  /**
   * @param config Optional overrides; omitted fields use the module defaults.
   */
  constructor(config: FetchExtractorConfig = {}) {
    const timeoutMs = config.timeoutMs ?? DEFAULT_TIMEOUT_MS;
    const userAgent = config.userAgent ?? DEFAULT_USER_AGENT;
    this.config = { timeoutMs, userAgent };
  }

  /**
   * Fetch `url` and convert the response body to plain text.
   *
   * @param url The address to fetch.
   * @returns On success, the extracted text, the raw HTML, the page title
   *          (if any) and the requested URL. On any fetch/read failure,
   *          `{ text: '', url }`.
   */
  async extract(url: string): Promise<ExtractResult> {
    const html = await this.download(url);
    if (html === undefined) {
      return { text: '', url };
    }

    return {
      text: htmlToText(html),
      html,
      title: extractTitle(html),
      url,
    };
  }

  /**
   * Retrieve the response body for `url`, or `undefined` if anything fails.
   * The abort timer spans both the request and the body read and is always
   * cleared afterwards.
   */
  private async download(url: string): Promise<string | undefined> {
    try {
      const abort = new AbortController();
      const deadline = setTimeout(() => abort.abort(), this.config.timeoutMs);
      try {
        const reply = await fetch(url, {
          signal: abort.signal,
          headers: { 'User-Agent': this.config.userAgent },
        });
        return await reply.text();
      } finally {
        clearTimeout(deadline);
      }
    } catch {
      return undefined;
    }
  }
}
Loading
Loading