SimplyLiz · SimplyLiz · Mar 21, 2026 · Feb 25, 2026 · Feb 25, 2026 · Feb 25, 2026
diff --git a/.env.example b/.env.example
@@ -0,0 +1,14 @@
+# LLM provider API keys for benchmark comparisons (npm run bench:llm)
+# Copy to .env and uncomment the providers you want to test.
+
+# OpenAI (default model: gpt-4.1-mini)
+# OPENAI_API_KEY=sk-...
+# OPENAI_MODEL=gpt-4.1-mini
+
+# Anthropic (default model: claude-haiku-4-5-20251001)
+# ANTHROPIC_API_KEY=sk-ant-...
+# ANTHROPIC_MODEL=claude-haiku-4-5-20251001
+
+# Ollama (auto-detected when running locally — no env vars required)
+# OLLAMA_HOST=http://localhost:11434
+# OLLAMA_MODEL=llama3.2
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -5,7 +5,7 @@ updates:
     schedule:
       interval: weekly
       day: monday
-    target-branch: main
+    target-branch: develop
     open-pull-requests-limit: 10
     groups:
       production-deps:
@@ -24,5 +24,5 @@ updates:
     schedule:
       interval: weekly
       day: monday
-    target-branch: main
+    target-branch: develop
     open-pull-requests-limit: 10
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -2,12 +2,32 @@ name: CI
 
 on:
   push:
-    branches: [main]
+    branches: [main, develop]
     tags: ['v*.*.*']
   pull_request:
-    branches: [main]
+    branches: [main, develop]
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ci-${{ github.ref }}
+  cancel-in-progress: true
 
 jobs:
+  dependency-review:
+    if: github.event_name == 'pull_request'
+    runs-on: ubuntu-latest
+    continue-on-error: true
+    permissions:
+      contents: read
+      pull-requests: write
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/dependency-review-action@v4
+        with:
+          fail-on-severity: high
+
   audit:
     runs-on: ubuntu-latest
     steps:
@@ -50,10 +70,44 @@ jobs:
           else
             npm run test:coverage
           fi
+      - name: Upload coverage
+        if: matrix.node-version == 22
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          fail_ci_if_error: false
+      - name: Coverage report on PR
+        if: matrix.node-version == 22 && github.event_name == 'pull_request'
+        uses: davelosert/vitest-coverage-report-action@v2
+        continue-on-error: true
       - run: npx tsc --noEmit
 
+  bench:
+    needs: [test]
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+          cache: npm
+      - run: npm ci
+      - run: npm run bench:check
+
+  e2e:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v6
+      - uses: actions/setup-node@v6
+        with:
+          node-version: 22
+          cache: npm
+      - run: npm ci
+      - name: Build, pack, lint, and smoke test
+        run: npm run test:e2e
+
   publish:
-    needs: [audit, lint, test]
+    needs: [audit, lint, test, bench, e2e]
     if: startsWith(github.ref, 'refs/tags/v')
     runs-on: ubuntu-latest
     permissions:
@@ -77,9 +131,15 @@ jobs:
             exit 1
           fi
 
+      - name: Validate changelog entry  # fail the release when the tag has no CHANGELOG.md section
+        run: |
+          TAG_VERSION="${GITHUB_REF_NAME#v}"  # strip the leading "v" from the tag name
+          if ! grep -qF "## [${TAG_VERSION}]" CHANGELOG.md; then  # -F: literal match, version dots are not wildcards
+            echo "::error::No CHANGELOG.md entry found for version ${TAG_VERSION}"
+            exit 1
+          fi
+
       - run: npm publish --provenance --access public
-        env:
-          NODE_AUTH_TOKEN: ${{ secrets.NODE_AUTH_TOKEN }}
 
       - name: Extract release notes
         id: release_notes

diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml
@@ -0,0 +1,27 @@
+name: CodeQL
+
+on:
+  push:
+    branches: [main, develop]
+  pull_request:
+    branches: [main, develop]
+  schedule:
+    - cron: '0 6 * * 1'
+
+jobs:
+  analyze:
+    runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
+      security-events: write
+    steps:
+      - uses: actions/checkout@v6
+
+      - name: Initialize CodeQL
+        uses: github/codeql-action/init@v4
+        with:
+          languages: javascript-typescript
+
+      - name: Perform CodeQL Analysis
+        uses: github/codeql-action/analyze@v4
diff --git a/.gitignore b/.gitignore
@@ -5,9 +5,11 @@ venv/
 
 # build
 dist/
+coverage/
 build/
 *.egg-info/
 *.tsbuildinfo
+*.tgz
 
 # cache
 __pycache__/
@@ -33,6 +35,9 @@ __pycache__/
 .vscode/
 *.swp
 
+# demo
+demo/bundle.js
+
 # indexing / analysis artifacts
 .ckb/
 *.scip

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,77 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [Unreleased]
+
+## [1.3.0] - 2026-03-21
+
+### Added
+
+- **Quality benchmark overhaul** — replaced broken metrics (keywordRetention, factRetention, negationErrors) with five meaningful ones: task-based probes (~70 across 13 scenarios), information density, compressed-only quality score, negative compression detection, and summary coherence checks.
+- **Task-based probes** — hand-curated per-scenario checks that verify whether specific critical information (identifiers, code patterns, config values) survives compression. Probe failures surface real quality issues.
+- **LLM-as-judge scoring** (`--llm-judge` flag) — optional LLM evaluation of compression quality. Multi-provider support: OpenAI, Anthropic, Gemini (`@google/genai`), Ollama. Display-only, not used for regression testing.
+- **Gemini provider** for LLM benchmarks via `GEMINI_API_KEY` env var (default model: `gemini-2.5-flash`).
+- **Opt-in feature comparison** (`--features` flag) — runs quality benchmark with each opt-in feature enabled to measure their impact vs baseline.
+- **Quality history documentation** (`docs/quality-history.md`) — version-over-version quality tracking across v1.0.0, v1.1.0, v1.2.0 with opt-in feature impact analysis.
+- **Min-output-chars probes** to catch over-aggressive compression.
+- **Code block language aliases** in benchmarks (typescript/ts, python/py, yaml/yml).
+- New npm scripts: `bench:quality:judge`, `bench:quality:features`.
+
+### Changed
+
+- Coherence and negative compression regression thresholds now track increases from baseline, not just zero-to-nonzero transitions.
+- Information density regression check only applies when compression actually occurs (ratio > 1.01).
+- Quality benchmark table now shows: `Ratio EntRet CodeOK InfDen Probes Pass NegCp Coher CmpQ`.
+- `analyzeQuality()` accepts optional `CompressOptions` for feature testing.
+
+### Removed
+
+- `keywordRetention` metric (tautological — 100% on 12/13 scenarios).
+- `factRetention` and `factCount` metrics (fragile regex-based fact extractor).
+- `negationErrors` metric (noisy, rarely triggered).
+- `extractFacts()` and `analyzeSemanticFidelity()` functions.
+
+## [1.2.0] - 2026-03-20
+
+### Added
+
+- **Quality metrics** — `entity_retention`, `structural_integrity`, `reference_coherence`, and composite `quality_score` (0–1) computed automatically on every compression. Tracks identifier preservation, code fence survival, and reference coherence.
+- **Relevance threshold** (`relevanceThreshold`) — drops low-value messages to compact stubs instead of producing low-quality summaries. Consecutive stubs grouped. New stat: `messages_relevance_dropped`.
+- **Tiered budget strategy** (`budgetStrategy: 'tiered'`) — alternative to binary search that keeps recency window fixed and progressively compresses older content (tighten → stub → truncate).
+- **Entropy scorer** (`entropyScorer`) — plug in a small causal LM for information-theoretic sentence scoring. Modes: `'augment'` (weighted average with heuristic) or `'replace'` (entropy only).
+- **Conversation flow detection** (`conversationFlow: true`) — groups Q&A pairs, request→action→confirmation chains, corrections, and acknowledgments into compression units for more coherent summaries.
+- **Cross-message coreference** (`coreference: true`) — inlines entity definitions into compressed summaries when a preserved message references an entity defined only in a compressed message.
+- **Semantic clustering** (`semanticClustering: true`) — groups consecutive messages by topic using TF-IDF cosine similarity + entity overlap Jaccard, compresses each cluster as a unit.
+- **Compression depth** (`compressionDepth`) — `'gentle'` (default), `'moderate'` (tighter budgets), `'aggressive'` (entity-only stubs), `'auto'` (progressive escalation until `tokenBudget` fits).
+- **Discourse-aware summarization** (`discourseAware: true`) — experimental EDU-lite decomposition with dependency tracking. Without a custom ML scorer, enabling this option reduces the compression ratio by 8–28%; consider using the exported `segmentEDUs`/`scoreEDUs`/`selectEDUs` functions directly instead.
+- **ML token classifier** (`mlTokenClassifier`) — per-token keep/remove classification via user-provided model (LLMLingua-2 style). Includes `createMockTokenClassifier` for testing.
+- **Importance-weighted retention** (`importanceScoring: true`) — per-message importance scoring based on forward-reference density, decision/correction content signals, and recency. Default threshold raised to 0.65.
+- **Contradiction detection** (`contradictionDetection: true`) — detects later messages that correct earlier ones. Superseded messages compressed with provenance annotation.
+- **A/B comparison tool** (`npm run bench:compare`) — side-by-side comparison of default vs v2 features.
+- **V2 Features Comparison** section in benchmark output — per-feature and recommended combo vs default.
+- **Adversarial test suite** — 8 edge-case tests (pronoun-heavy, scattered entities, correction chains, code-interleaved prose, near-duplicates, 10k+ char messages, mixed SQL/JSON/bash, full round-trip with all features).
+- New modules: `entities.ts`, `entropy.ts`, `flow.ts`, `coreference.ts`, `cluster.ts`, `discourse.ts`, `ml-classifier.ts`.
+- New types: `ImportanceMap`, `ContradictionAnnotation`, `MLTokenClassifier`, `TokenClassification`, `FlowChain`, `MessageCluster`, `EDU`, `EntityDefinition`.
+- Comprehensive [V2 features documentation](docs/v2-features.md) with tradeoff analysis per feature.
+
+### Changed
+
+- Adaptive summary budgets scale with content density when `compressionDepth` is set to `'moderate'` or higher (entity-dense content gets up to 45% budget, sparse content down to 15%).
+- Default path (no v2 options) produces identical output to v1.1.0 — all new features are opt-in.
+- Quality metrics section added to benchmark reporter and generated docs.
+
+### Fixed
+
+- Flow chains no longer skip non-member messages between chain endpoints.
+- Semantic clusters restricted to consecutive indices to preserve round-trip ordering.
+- Flow chains exclude messages with code fences to prevent structural integrity loss.
+
+## [1.1.0] - 2026-03-19
+
+### Added
+
+- Reasoning chain detection in classifier — preserves chain-of-thought, step-by-step analysis, formal proofs, and multi-step logical arguments as hard T0 (verbatim). Uses a two-tier anchor system: strong anchors (explicit labels like `Reasoning:`, formal inference phrases) trigger on a single match; weak anchors (logical connectives like `therefore`, `hence`, `thus`) require 3+ distinct matches to fire. Defense-in-depth scoring boost in the summarizer ensures reasoning sentences survive even if classification is bypassed.
+
 ## [1.0.0] - 2025-02-24
 
 First stable release. Published as `context-compression-engine`.
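@@ -34,4 +105,7 @@ First stable release. Published as `context-compression-engine`.
 - Benchmark suite with synthetic and real-session scenarios
 - LLM benchmark with multi-provider support (Claude, GPT, Gemini, Grok, Ollama)
 
+[1.3.0]: https://github.com/SimplyLiz/ContextCompressionEngine/releases/tag/v1.3.0
+[1.2.0]: https://github.com/SimplyLiz/ContextCompressionEngine/releases/tag/v1.2.0
+[1.1.0]: https://github.com/SimplyLiz/ContextCompressionEngine/releases/tag/v1.1.0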
 [1.0.0]: https://github.com/SimplyLiz/ContextCompressionEngine/releases/tag/v1.0.0
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -13,6 +13,12 @@ npm run lint             # ESLint check
 npm run format           # Prettier write
 npm run format:check     # Prettier check
 npm run bench            # Run benchmark suite
+npm run bench:save       # Run, save baseline, regenerate docs/benchmark-results.md
+npm run bench:quality    # Run quality benchmark (probes, coherence, info density)
+npm run bench:quality:save   # Save quality baseline
+npm run bench:quality:check  # Compare against quality baseline
+npm run bench:quality:judge     # Run with LLM-as-judge (requires API key)
+npm run bench:quality:features  # Compare opt-in features vs baseline
 ```
 
 Run a single test file:
@@ -33,7 +39,9 @@ messages → classify → dedup → merge → summarize → size guard → resul
 
 - **classify** (`src/classify.ts`) — three-tier classification (T0 = preserve verbatim, T2 = compressible prose, T3 = filler/removable). Uses structural pattern detection (code fences, JSON, YAML, LaTeX), SQL/API-key anchors, and prose density scoring.
 - **dedup** (`src/dedup.ts`) — exact (djb2 hash + full comparison) and fuzzy (line-level Jaccard similarity) duplicate detection. Earlier duplicates are replaced with compact references.
-- **compress** (`src/compress.ts`) — orchestrator. Handles message merging, code-bearing message splitting (prose compressed, fences preserved inline), budget binary search over `recencyWindow`, and `forceConverge` hard-truncation.
+- **importance** (`src/importance.ts`) — per-message importance scoring: forward-reference density (how many later messages share entities), decision/correction content signals, and recency bonus. High-importance messages resist compression even outside recency window. Opt-in via `importanceScoring: true`.
+- **contradiction** (`src/contradiction.ts`) — detects later messages that correct/override earlier ones (topic-overlap gating + correction signal patterns like "actually", "don't use", "instead"). Superseded messages are compressed with provenance annotations. Opt-in via `contradictionDetection: true`.
+- **compress** (`src/compress.ts`) — orchestrator. Handles message merging, code-bearing message splitting (prose compressed, fences preserved inline), budget binary search over `recencyWindow`, and `forceConverge` hard-truncation (importance-aware ordering when `importanceScoring` is on).
 - **summarize** (internal in `compress.ts`) — deterministic sentence scoring: rewards technical identifiers (camelCase, snake_case), emphasis phrases, status words; penalizes filler. Paragraph-aware to keep topic boundaries.
 - **summarizer** (`src/summarizer.ts`) — LLM-powered summarization. `createSummarizer` wraps an LLM call with a prompt template. `createEscalatingSummarizer` adds three-level fallback: normal → aggressive → deterministic.
 - **expand** (`src/expand.ts`) — `uncompress()` restores originals from a `VerbatimMap` or lookup function. Supports recursive expansion for multi-round compression chains (max depth 10).
@@ -62,7 +70,7 @@ main ← develop ← feature branches
 - **TypeScript:** ES2020 target, NodeNext module resolution, strict mode, ESM-only
 - **Unused params** must be prefixed with `_` (ESLint enforced)
 - **Prettier:** 100 char width, 2-space indent, single quotes, trailing commas, semicolons
-- **Tests:** Vitest 4, test files in `tests/`, coverage via `@vitest/coverage-v8` (Node 20+ only)
-- **Node version:** ≥18 (.nvmrc: 22)
+- **Tests:** Vitest 4, test files in `tests/`, coverage via `@vitest/coverage-v8`
+- **Node version:** ≥20 (.nvmrc: 22)
 - **Always run `npm run format` before committing** — CI enforces `format:check`
 - **No author/co-author attribution** in commits, code, or docs
diff --git a/README.md b/README.md
@@ -36,7 +36,7 @@ The classifier is content-aware, not domain-specific. It preserves structured da
 
 ## Key findings
 
-The deterministic engine achieves **1.3-6.1x compression with zero latency and zero cost.** It scores sentences, packs a budget, strips filler — and in most scenarios, it compresses tighter than an LLM. LLM summarization is opt-in for cases where semantic understanding improves quality. See [Benchmarks](docs/benchmarks.md) for the full comparison.
+The deterministic engine achieves **1.3-6.1x compression with zero latency and zero cost.** It scores sentences, packs a budget, strips filler — and in most scenarios, it compresses tighter than an LLM. LLM summarization is opt-in for cases where semantic understanding improves quality. See [Benchmarks](docs/benchmarks.md) for methodology, [Benchmark Results](docs/benchmark-results.md) for the latest numbers, and [Quality History](docs/quality-history.md) for version-over-version quality tracking.
 
 ## Features