TheWizardsCode · SorraTheOrc · Apr 5, 2026 · Apr 4, 2026 · Apr 4, 2026 · Apr 4, 2026
diff --git a/package.json b/package.json
@@ -36,6 +36,14 @@
     "express": "^4.18.2",
     "js-yaml": "^4.1.1"
   },
+  "peerDependencies": {
+    "playwright": ">=1.40.0"
+  },
+  "peerDependenciesMeta": {
+    "playwright": {
+      "optional": true
+    }
+  },
   "devDependencies": {
     "@types/blessed": "^0.1.7",
     "@types/better-sqlite3": "^7.6.13",

diff --git a/src/cli/commands/add.ts b/src/cli/commands/add.ts
@@ -0,0 +1,179 @@
+/**
+ * `ob add <url>` CLI command.
+ *
+ * Ingests content from a URL using the fast-path primary extractor, falling
+ * back to a Playwright headless-browser fetch when:
+ *   - The Playwright fallback is explicitly enabled (via config or environment
+ *     variable `OB_PLAYWRIGHT_FALLBACK=1`), AND
+ *   - The primary extractor returns fewer characters than the configured
+ *     minimum content length threshold.
+ *
+ * Usage:
+ *   ob add <url> [--playwright-fallback] [--min-content-length <n>] [--timeout <ms>] [--browser <channel>] [--output <file>]
+ *
+ * The command writes the ingested content to stdout (or a file if `--output`
+ * is provided) in a format compatible with the downstream ingestion pipeline.
+ * It exits with a non-zero status code only on unrecoverable errors (e.g. an
+ * invalid URL); partial or empty content from a gracefully-degraded fetch is
+ * not treated as a fatal error.
+ */
+
+import { IngestionService, type IngestionConfig } from '../../lib/ingestion/service.js';
+import { FetchExtractor } from '../../lib/ingestion/extractor-fetch.js';
+import { PlaywrightExtractor, type PlaywrightBrowserChannel } from '../../lib/ingestion/extractor-playwright.js';
+import type { ExtractResult } from '../../lib/ingestion/extractor.js';
+import * as fs from 'fs';
+
+/**
+ * Options accepted by the `add` command.
+ *
+ * Every field is optional; `runAdd` supplies a default for each omitted value.
+ */
+export interface AddCommandOptions {
+  /**
+   * Enable the Playwright fallback (overrides config).
+   * Treated as `false` by `runAdd` when omitted.
+   */
+  playwrightFallback?: boolean;
+  /**
+   * Minimum content length before fallback is triggered.
+   * Forwarded unchanged to the ingestion config's `minContentLength`.
+   */
+  minContentLength?: number;
+  /**
+   * Playwright navigation timeout in milliseconds.
+   * `runAdd` uses 30 000 ms when omitted.
+   */
+  timeout?: number;
+  /**
+   * Browser channel to use for Playwright.
+   * `runAdd` uses `'chromium'` when omitted.
+   */
+  browser?: PlaywrightBrowserChannel;
+  /**
+   * Write output to this file path instead of stdout.
+   * When omitted (or empty) the extracted text is written to stdout.
+   */
+  output?: string;
+  /**
+   * Injectable file-write function for testing.
+   * Defaults to `fs.writeFileSync` when not provided.
+   * @internal
+   */
+  _writeFile?: (path: string, data: string, encoding: BufferEncoding) => void;
+}
+
+/**
+ * Dependency-injectable runner for the `add` command.
+ *
+ * Separated from Commander.js setup so it can be unit-tested without starting
+ * a real CLI process.
+ *
+ * The URL is validated before any extractor is constructed: a malformed URL is
+ * an unrecoverable input error and is reported by throwing, rather than being
+ * swallowed by the extractors and surfacing as silently empty output.
+ *
+ * @param url The URL to ingest. Must be a non-empty, absolute, parseable URL.
+ * @param options Command options.
+ * @returns The ingested {@link ExtractResult}.
+ * @throws Error with message `URL is required` when `url` is empty.
+ * @throws Error with message `Invalid URL: <url>` when `url` cannot be parsed.
+ */
+export async function runAdd(
+  url: string,
+  options: AddCommandOptions = {}
+): Promise<ExtractResult> {
+  if (!url) {
+    throw new Error('URL is required');
+  }
+
+  // Parse eagerly so typos such as a missing scheme fail loudly here instead
+  // of degrading into an empty extraction result further down the pipeline.
+  try {
+    void new URL(url);
+  } catch {
+    throw new Error(`Invalid URL: ${url}`);
+  }
+
+  const ingestionConfig: IngestionConfig = {
+    playwrightFallback: options.playwrightFallback ?? false,
+    minContentLength: options.minContentLength,
+  };
+
+  const playwrightExtractor = new PlaywrightExtractor({
+    browser: options.browser ?? 'chromium',
+    timeoutMs: options.timeout ?? 30_000,
+  });
+
+  const service = new IngestionService(
+    new FetchExtractor(),
+    playwrightExtractor,
+    ingestionConfig
+  );
+
+  const result = await service.ingest(url);
+
+  if (options.output) {
+    // File output is written verbatim through the injectable writer.
+    const writeFile = options._writeFile ?? fs.writeFileSync;
+    writeFile(options.output, result.text, 'utf8');
+  } else {
+    process.stdout.write(result.text);
+    // Terminate non-empty stdout output with a newline so shell prompts and
+    // piped consumers see a complete final line.
+    if (result.text.length > 0 && !result.text.endsWith('\n')) {
+      process.stdout.write('\n');
+    }
+  }
+
+  return result;
+}
+
+// ---------------------------------------------------------------------------
+// Commander.js registration (optional — used when loaded as a plugin command)
+// ---------------------------------------------------------------------------
+
+/**
+ * Register the `add` command with a Commander.js program.
+ *
+ * This function follows the same pattern as other commands in `src/commands/`.
+ *
+ * Option values are validated before `runAdd` is invoked. On any invalid
+ * value, or when `runAdd` throws, a message prefixed with `[ob add] Error:` is
+ * written to stderr and the process exits with status 1.
+ *
+ * @param program The root Commander.js Command instance.
+ */
+export function registerAddCommand(program: {
+  command: (name: string) => CommandBuilder;
+}): void {
+  // Channels advertised by the `--browser` help text below.
+  // NOTE(review): assumed to match `PlaywrightBrowserChannel` — confirm.
+  const browserChannels: readonly string[] = ['chromium', 'firefox', 'webkit'];
+
+  /** Reports a CLI error on stderr and terminates with a non-zero status. */
+  function fail(message: string): never {
+    process.stderr.write(`[ob add] Error: ${message}\n`);
+    return process.exit(1);
+  }
+
+  /** True when Commander supplied no usable value for an option. */
+  const isUnset = (raw: string | boolean | undefined): boolean =>
+    raw === undefined || raw === false || raw === '';
+
+  /**
+   * Parses a strictly positive decimal integer. Unlike bare `parseInt`, input
+   * with trailing garbage or a fractional part (`12abc`, `1.5`) is rejected
+   * rather than silently truncated.
+   */
+  const parsePositiveInt = (
+    raw: string | boolean | undefined,
+    name: string
+  ): number | undefined => {
+    if (isUnset(raw)) return undefined;
+    const text = String(raw).trim();
+    const n = /^\+?\d+$/.test(text) ? Number(text) : NaN;
+    if (!Number.isSafeInteger(n) || n <= 0) {
+      fail(`--${name} must be a positive integer (received: ${String(raw)})`);
+    }
+    return n;
+  };
+
+  /** Checks `--browser` against the supported channels before narrowing the type. */
+  const parseBrowser = (
+    raw: string | boolean | undefined
+  ): PlaywrightBrowserChannel | undefined => {
+    if (isUnset(raw)) return undefined;
+    const channel = String(raw);
+    if (!browserChannels.includes(channel)) {
+      fail(
+        `--browser must be one of: ${browserChannels.join(', ')} (received: ${channel})`
+      );
+    }
+    return channel as PlaywrightBrowserChannel;
+  };
+
+  program
+    .command('add <url>')
+    .description('Ingest content from a URL (with optional Playwright fallback for JS-heavy pages)')
+    .option(
+      '--playwright-fallback',
+      'Enable Playwright headless-browser fallback (requires `playwright` package)',
+      false
+    )
+    .option(
+      '--min-content-length <n>',
+      'Minimum character count before Playwright fallback is triggered',
+      '200'
+    )
+    .option(
+      '--timeout <ms>',
+      'Playwright navigation timeout in milliseconds',
+      '30000'
+    )
+    .option(
+      '--browser <channel>',
+      'Playwright browser channel: chromium | firefox | webkit',
+      'chromium'
+    )
+    .option('--output <file>', 'Write ingested content to file instead of stdout')
+    .action(async (url: string, opts: Record<string, string | boolean>) => {
+      const options: AddCommandOptions = {
+        playwrightFallback: Boolean(opts['playwrightFallback']),
+        minContentLength: parsePositiveInt(opts['minContentLength'], 'min-content-length'),
+        timeout: parsePositiveInt(opts['timeout'], 'timeout'),
+        browser: parseBrowser(opts['browser']),
+        output: opts['output'] ? String(opts['output']) : undefined,
+      };
+
+      try {
+        await runAdd(url, options);
+      } catch (err) {
+        fail(err instanceof Error ? err.message : String(err));
+      }
+    });
+}
+
+// ---------------------------------------------------------------------------
+// Minimal type shim for Commander.js .option / .action chaining
+// (avoids requiring a Commander.js type import in this standalone module)
+// ---------------------------------------------------------------------------
+
+/**
+ * Structural subset of the Commander.js command API that
+ * `registerAddCommand` relies on. Every method returns the builder itself so
+ * calls can be chained.
+ */
+interface CommandBuilder {
+  /** Attaches a description string to the command. */
+  description: (desc: string) => CommandBuilder;
+  /** Declares an option from its flag spec, help text, and optional default value. */
+  option: (flags: string, description: string, defaultValue?: string | boolean) => CommandBuilder;
+  /** Registers the command handler; its arguments are left untyped in this shim. */
+  // eslint-disable-next-line @typescript-eslint/no-explicit-any
+  action: (fn: (...args: any[]) => void | Promise<void>) => CommandBuilder;
+}
diff --git a/src/lib/ingestion/extractor-fetch.ts b/src/lib/ingestion/extractor-fetch.ts
@@ -0,0 +1,116 @@
+/**
+ * Primary fast-path extractor.
+ *
+ * Fetches a URL using the built-in `fetch` API and extracts plain text by
+ * stripping HTML tags.  This is the default extractor used by the ingestion
+ * service; it is fast but cannot handle pages that render content with
+ * client-side JavaScript.
+ */
+
+import type { Extractor, ExtractResult } from './extractor.js';
+
+/**
+ * Configuration for the primary fetch-based extractor.
+ *
+ * Both fields are optional; `FetchExtractor` fills in module defaults for any
+ * that are omitted.
+ */
+export interface FetchExtractorConfig {
+  /**
+   * Request timeout in milliseconds. The in-flight request is aborted once
+   * this much time has elapsed.
+   * @default 15000
+   */
+  timeoutMs?: number;
+  /**
+   * User-Agent string to send with requests.
+   * Falls back to the module's `DEFAULT_USER_AGENT` when omitted.
+   */
+  userAgent?: string;
+}
+
+/** Request timeout (15 seconds) used when `FetchExtractorConfig.timeoutMs` is omitted. */
+const DEFAULT_TIMEOUT_MS = 15_000;
+/** `User-Agent` header value used when `FetchExtractorConfig.userAgent` is omitted. */
+const DEFAULT_USER_AGENT =
+  'Mozilla/5.0 (compatible; OpenBrain-Ingestion/1.0; +https://github.com/TheWizardsCode/ContextHub)';
+
+/**
+ * Converts an HTML document (or fragment) to plain text.
+ *
+ * Processing order:
+ *   1. Line endings are normalised to `\n`.
+ *   2. `<script>` / `<style>` blocks and HTML comments are dropped entirely.
+ *   3. `<br>` elements and block-level closing tags become newlines.
+ *   4. All remaining tags are replaced by a space.
+ *   5. A small set of named entities is decoded in a single pass.
+ *   6. Whitespace is collapsed: runs of spaces/tabs become one space, spaces
+ *      adjacent to a newline are removed, and three or more consecutive
+ *      newlines become two.
+ *
+ * @param html Raw HTML markup.
+ * @returns The extracted plain text, trimmed of leading/trailing whitespace.
+ */
+export function htmlToText(html: string): string {
+  // Replacement text for each entity matched by the decoding regex below.
+  const entityText: Readonly<Record<string, string>> = {
+    '&amp;': '&',
+    '&lt;': '<',
+    '&gt;': '>',
+    '&quot;': '"',
+    '&#39;': "'",
+    '&nbsp;': ' ',
+  };
+
+  let text = html
+    // Normalise CRLF / lone CR so the newline collapsing below sees only `\n`.
+    .replace(/\r\n?/g, '\n')
+    // Remove script and style blocks entirely (allow attributes/whitespace before closing >).
+    .replace(/<script\b[\s\S]*?<\/script[^>]*>/gi, ' ')
+    .replace(/<style\b[\s\S]*?<\/style[^>]*>/gi, ' ')
+    // Remove comments as a unit; a `>` inside a comment would otherwise end the
+    // generic tag match early and leak the rest of the comment into the text.
+    .replace(/<!--[\s\S]*?-->/g, ' ');
+
+  // `<br>` is a void element with no closing tag, so it needs its own rule
+  // (covers `<br>`, `<br/>`, `<br />` and attribute-bearing variants).
+  text = text.replace(/<br\b[^>]*>/gi, '\n');
+  // Replace block-level closing tags with newlines.
+  text = text.replace(/<\/(?:p|div|li|h[1-6]|br|tr|td|th|blockquote)[^>]*>/gi, '\n');
+  // Strip all remaining tags.
+  text = text.replace(/<[^>]+>/g, ' ');
+
+  // Decode common named HTML entities in a single pass to avoid double-decoding
+  // (e.g. &amp;lt; must become &lt;, not <).
+  // Note: numeric character references (e.g. &#65; or &#x41;) are intentionally
+  // not decoded here since they are uncommon in plain-content pages and the
+  // purpose of this extractor is fast-path text extraction, not full HTML parsing.
+  text = text.replace(
+    /&(?:amp|lt|gt|quot|#39|nbsp);/g,
+    (entity) => entityText[entity] ?? entity
+  );
+
+  // Collapse whitespace. Spaces around a newline are dropped first; otherwise
+  // sequences such as "\n \n \n" would never match the blank-line collapse.
+  return text
+    .replace(/[ \t]+/g, ' ')
+    .replace(/ ?\n ?/g, '\n')
+    .replace(/\n{3,}/g, '\n\n')
+    .trim();
+}
+
+/**
+ * Looks up the first `<title>…</title>` pair in `html` whose content holds no
+ * nested tags and returns that content with surrounding whitespace trimmed.
+ *
+ * @param html Raw HTML markup to search.
+ * @returns The trimmed title text, or `undefined` when no such pair is found.
+ */
+export function extractTitle(html: string): string | undefined {
+  const titlePattern = /<title[^>]*>([^<]*)<\/title>/i;
+  const found = html.match(titlePattern);
+  if (found === null) {
+    return undefined;
+  }
+  return found[1].trim();
+}
+
+/**
+ * Primary extractor that uses HTTP fetch + HTML-to-text conversion.
+ *
+ * This extractor is fast and has no external runtime dependencies.  It is
+ * the default first-pass extractor used by the ingestion service before
+ * attempting the Playwright fallback.
+ *
+ * Failures never throw: network errors, timeouts and non-2xx HTTP responses
+ * all resolve to an empty result (`{ text: '', url }`).
+ */
+export class FetchExtractor implements Extractor {
+  /** Fully-resolved configuration with defaults applied. */
+  private readonly config: Required<FetchExtractorConfig>;
+
+  /**
+   * @param config Optional overrides; omitted fields use the module defaults.
+   */
+  constructor(config: FetchExtractorConfig = {}) {
+    this.config = {
+      timeoutMs: config.timeoutMs ?? DEFAULT_TIMEOUT_MS,
+      userAgent: config.userAgent ?? DEFAULT_USER_AGENT,
+    };
+  }
+
+  /**
+   * Fetches `url` and converts the response body to plain text.
+   *
+   * @param url The URL to fetch.
+   * @returns The extracted text, raw HTML and title on success; an empty
+   *   result containing only `text: ''` and `url` on any failure.
+   */
+  async extract(url: string): Promise<ExtractResult> {
+    let html: string;
+
+    try {
+      // Abort the request (including the body read) once the timeout elapses.
+      const controller = new AbortController();
+      const timer = setTimeout(
+        () => controller.abort(),
+        this.config.timeoutMs
+      );
+      try {
+        const response = await fetch(url, {
+          signal: controller.signal,
+          headers: { 'User-Agent': this.config.userAgent },
+        });
+        if (!response.ok) {
+          // A 4xx/5xx body is an error page, not the requested content.
+          // Returning it as text could also make the result look "long
+          // enough" and suppress the short-content fallback, so treat it
+          // like any other failed fetch. The unread body is cancelled so
+          // the underlying connection can be released.
+          await response.body?.cancel();
+          return { text: '', url };
+        }
+        html = await response.text();
+      } finally {
+        clearTimeout(timer);
+      }
+    } catch {
+      // Deliberate best-effort: the caller decides what to do with an empty
+      // result instead of handling exceptions.
+      return { text: '', url };
+    }
+
+    return {
+      text: htmlToText(html),
+      html,
+      title: extractTitle(html),
+      url,
+    };
+  }
+}