Merged
Commits
20 commits
0e4ce5a
feat: add quality metrics, entity retention, and relevance threshold
SimplyLiz Mar 20, 2026
d43d494
feat: add tiered budget strategy and adaptive summary budgets
SimplyLiz Mar 20, 2026
e3997bd
feat: add entropy scorer for information-theoretic sentence scoring
SimplyLiz Mar 20, 2026
18132e0
feat: add conversation flow detection for chain-aware compression
SimplyLiz Mar 20, 2026
db9d914
docs: update roadmap progress tracker (7/16 items complete)
SimplyLiz Mar 20, 2026
ea97220
feat: add progressive compression depth (gentle/moderate/aggressive/a…
SimplyLiz Mar 20, 2026
112cbb7
docs: update roadmap progress (8/16 items complete)
SimplyLiz Mar 20, 2026
e4d7a39
feat: add cross-message coreference and semantic clustering
SimplyLiz Mar 20, 2026
8cd87ca
feat: add EDU-lite discourse decomposition for coherent summaries
SimplyLiz Mar 20, 2026
610c5e8
feat: add adversarial test suite and update roadmap (14/16 complete)
SimplyLiz Mar 20, 2026
774f230
feat: add ML token classifier, A/B comparison tool — roadmap complete
SimplyLiz Mar 20, 2026
014d521
feat: wire quality metrics into bench reporter, fix flow+code interac…
SimplyLiz Mar 20, 2026
b042e41
docs: add comprehensive v2 feature documentation with tradeoffs
SimplyLiz Mar 20, 2026
4887eee
fix: three regressions in v2 features
SimplyLiz Mar 20, 2026
515d869
docs: mark discourseAware as experimental, document ratio regression
SimplyLiz Mar 20, 2026
967f541
fix: make adaptive budget opt-in, restore default path to v1 parity
SimplyLiz Mar 20, 2026
ddc584e
fix: V2 round-trip failures in flow chains and semantic clusters
SimplyLiz Mar 20, 2026
bcb97c1
chore: bump version to 1.2.0, save baseline, update changelog
SimplyLiz Mar 20, 2026
26273df
chore: re-save baseline after formatting (2-byte bundle delta)
SimplyLiz Mar 20, 2026
a75f1d4
chore: format benchmark-results.md
SimplyLiz Mar 20, 2026
36 changes: 32 additions & 4 deletions CHANGELOG.md
@@ -7,12 +7,40 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

## [1.2.0] - 2026-03-20

### Added

- **Importance-weighted retention** (`importanceScoring: true`) — per-message importance scoring based on forward-reference density (how many later messages share entities with this one), decision/correction content signals, and recency. Messages scoring above `importanceThreshold` (default 0.35) are preserved even outside the recency window. `forceConverge` truncates low-importance messages first. New stat: `messages_importance_preserved`.
- **Contradiction detection** (`contradictionDetection: true`) — detects later messages that correct or override earlier ones using topic-overlap gating (word-level Jaccard) and correction signal patterns (`actually`, `don't use`, `instead`, `scratch that`, etc.). Superseded messages are compressed with a provenance annotation (`[cce:superseded by ...]`) linking to the correction. New stat: `messages_contradicted`. New decision action: `contradicted`.
- New exports: `computeImportance`, `scoreContentSignals`, `DEFAULT_IMPORTANCE_THRESHOLD`, `analyzeContradictions` for standalone use outside `compress()`.
- New types: `ImportanceMap`, `ContradictionAnnotation`.
- **Quality metrics** — `entity_retention`, `structural_integrity`, `reference_coherence`, and composite `quality_score` (0–1) computed automatically on every compression. Tracks identifier preservation, code fence survival, and reference coherence.
- **Relevance threshold** (`relevanceThreshold`) — drops low-value messages to compact stubs instead of producing low-quality summaries. Consecutive stubs grouped. New stat: `messages_relevance_dropped`.
- **Tiered budget strategy** (`budgetStrategy: 'tiered'`) — alternative to binary search that keeps recency window fixed and progressively compresses older content (tighten → stub → truncate).
- **Entropy scorer** (`entropyScorer`) — plug in a small causal LM for information-theoretic sentence scoring. Modes: `'augment'` (weighted average with heuristic) or `'replace'` (entropy only).
- **Conversation flow detection** (`conversationFlow: true`) — groups Q&A pairs, request→action→confirmation chains, corrections, and acknowledgments into compression units for more coherent summaries.
- **Cross-message coreference** (`coreference: true`) — inlines entity definitions into compressed summaries when a preserved message references an entity defined only in a compressed message.
- **Semantic clustering** (`semanticClustering: true`) — groups consecutive messages by topic using TF-IDF cosine similarity + entity overlap Jaccard, compresses each cluster as a unit.
- **Compression depth** (`compressionDepth`) — `'gentle'` (default), `'moderate'` (tighter budgets), `'aggressive'` (entity-only stubs), `'auto'` (progressive escalation until `tokenBudget` fits).
- **Discourse-aware summarization** (`discourseAware: true`) — experimental EDU-lite decomposition with dependency tracking. Without a custom ML scorer it regresses compression ratio by 8–28%; prefer the exported `segmentEDUs`/`scoreEDUs`/`selectEDUs` helpers for standalone use.
- **ML token classifier** (`mlTokenClassifier`) — per-token keep/remove classification via user-provided model (LLMLingua-2 style). Includes `createMockTokenClassifier` for testing.
- **A/B comparison tool** (`npm run bench:compare`) — side-by-side comparison of default vs v2 features.
- **V2 Features Comparison** section in benchmark output — compares each v2 feature, and a recommended combination, against the default path.
- **Adversarial test suite** — 8 edge-case tests (pronoun-heavy, scattered entities, correction chains, code-interleaved prose, near-duplicates, 10k+ char messages, mixed SQL/JSON/bash, full round-trip with all features).
- New modules: `entities.ts`, `entropy.ts`, `flow.ts`, `coreference.ts`, `cluster.ts`, `discourse.ts`, `ml-classifier.ts`.
- New types: `ImportanceMap`, `ContradictionAnnotation`, `MLTokenClassifier`, `TokenClassification`, `FlowChain`, `MessageCluster`, `EDU`, `EntityDefinition`.
- Comprehensive [V2 features documentation](docs/v2-features.md) with tradeoff analysis per feature.
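For illustration, the topic-overlap gating behind contradiction detection can be sketched as a word-level Jaccard check combined with correction-signal patterns. This is a simplified sketch, not the library's implementation; all names below are hypothetical.

```typescript
// Hypothetical sketch of topic-overlap gating for contradiction detection.
const CORRECTION_SIGNALS = [/\bactually\b/i, /\bdon'?t use\b/i, /\binstead\b/i, /\bscratch that\b/i];

// Lowercased word set, ignoring very short tokens.
function wordSet(text: string): Set<string> {
  return new Set(text.toLowerCase().split(/\W+/).filter((w) => w.length > 2));
}

// Word-level Jaccard similarity between two messages.
function jaccard(a: string, b: string): number {
  const sa = wordSet(a);
  const sb = wordSet(b);
  if (sa.size === 0 && sb.size === 0) return 0;
  let inter = 0;
  for (const w of sa) if (sb.has(w)) inter++;
  return inter / (sa.size + sb.size - inter);
}

// A later message counts as a correction only if it overlaps the earlier
// message's topic AND carries an explicit correction signal.
function looksLikeCorrection(earlier: string, later: string, minOverlap = 0.2): boolean {
  return jaccard(earlier, later) >= minOverlap &&
    CORRECTION_SIGNALS.some((re) => re.test(later));
}
```

Requiring both topic overlap and a signal keeps unrelated `actually`-style phrases elsewhere in the conversation from triggering false positives.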

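A rough sketch of how quality metrics like these can be computed follows. It is illustrative only: the library's actual identifier patterns, coherence check, and weighting are not shown in this diff, and the equal weighting below is an assumption.

```typescript
// Illustrative quality metrics, loosely following the changelog description.
// The identifier regex and the equal weighting are assumptions.
const IDENT = /[a-z]+[A-Z][A-Za-z0-9]*|[A-Za-z0-9]+(?:_[A-Za-z0-9]+)+/g;

// Extract camelCase and snake_case identifiers from a message.
function identifiers(text: string): Set<string> {
  return new Set(text.match(IDENT) ?? []);
}

// entity_retention: fraction of original identifiers that survive compression.
function entityRetention(original: string, compressed: string): number {
  const before = identifiers(original);
  if (before.size === 0) return 1;
  const after = identifiers(compressed);
  let kept = 0;
  for (const id of before) if (after.has(id)) kept++;
  return kept / before.size;
}

// structural_integrity (code-fence survival): 1 when ``` fences stay balanced.
function structuralIntegrity(compressed: string): number {
  const fences = (compressed.match(/```/g) ?? []).length;
  return fences % 2 === 0 ? 1 : 0;
}

// quality_score: composite of the three component metrics.
function qualityScore(entity: number, structural: number, coherence: number): number {
  return (entity + structural + coherence) / 3;
}
```

On the baselines in this PR, most scenarios score 1.0 on all three components; the Tool-heavy and Agentic coding scenarios dip on entity retention (0.931 and 0.848), which pulls their composite scores below 1.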
### Changed

- Adaptive summary budgets scale with content density when `compressionDepth` is set to `'moderate'` or higher (entity-dense content gets up to 45% budget, sparse content down to 15%).
- Default path (no v2 options) produces identical output to v1.1.0 — all new features are opt-in.
- Quality metrics section added to benchmark reporter and generated docs.

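The density scaling in the first bullet can be pictured as a clamped linear interpolation between the 15% floor and the 45% ceiling. The sketch below is an assumption about the shape of the mapping; the library's actual density measure may differ.

```typescript
// Hypothetical sketch of an adaptive summary budget: entity-dense content
// gets up to 45% of the original size, sparse content as little as 15%.
const MIN_BUDGET = 0.15; // floor for sparse content
const MAX_BUDGET = 0.45; // ceiling for entity-dense content

// density is entities per word, normalized by an assumed cap of 0.2.
function adaptiveSummaryBudget(
  entityCount: number,
  wordCount: number,
  densityCap = 0.2,
): number {
  if (wordCount === 0) return MIN_BUDGET;
  const density = Math.min(entityCount / wordCount, densityCap) / densityCap;
  return MIN_BUDGET + density * (MAX_BUDGET - MIN_BUDGET);
}
```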
### Fixed

- Flow chains no longer skip non-member messages between chain endpoints.
- Semantic clusters restricted to consecutive indices to preserve round-trip ordering.
- Flow chains exclude messages with code fences to prevent structural integrity loss.

## [1.1.0] - 2026-03-19

31 changes: 31 additions & 0 deletions bench/baseline.ts
@@ -46,6 +46,13 @@ export interface RetentionResult {
structuralRetention: number;
}

export interface QualityResult {
entityRetention: number;
structuralIntegrity: number;
referenceCoherence: number;
qualityScore: number;
}

export interface AncsResult {
baselineRatio: number;
importanceRatio: number;
@@ -62,6 +69,7 @@ export interface BenchmarkResults {
fuzzyDedup: Record<string, FuzzyDedupResult>;
bundleSize: Record<string, BundleSizeResult>;
retention?: Record<string, RetentionResult>;
quality?: Record<string, QualityResult>;
ancs?: Record<string, AncsResult>;
}

@@ -1192,6 +1200,13 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string):
lines.push(`| Average compression | ${fix(avgR)}x |`);
lines.push(`| Best compression | ${fix(Math.max(...ratios))}x |`);
lines.push(`| Round-trip integrity | all PASS |`);
if (latest.results.quality && Object.keys(latest.results.quality).length > 0) {
const qualityEntries = Object.values(latest.results.quality);
const avgQ = qualityEntries.reduce((s, q) => s + q.qualityScore, 0) / qualityEntries.length;
lines.push(`| Average quality score | ${fix(avgQ, 3)} |`);
const avgER = qualityEntries.reduce((s, q) => s + q.entityRetention, 0) / qualityEntries.length;
lines.push(`| Average entity retention | ${(avgER * 100).toFixed(0)}% |`);
}
lines.push('');

// --- Pie chart: message outcome distribution ---
@@ -1219,6 +1234,22 @@ export function generateBenchmarkDocs(baselinesDir: string, outputPath: string):
lines.push('');
}

// --- Quality ---
if (latest.results.quality && Object.keys(latest.results.quality).length > 0) {
lines.push('## Quality Metrics');
lines.push('');
lines.push(
'| Scenario | Entity Retention | Structural Integrity | Reference Coherence | Quality Score |',
);
lines.push('| --- | --- | --- | --- | --- |');
for (const [name, q] of Object.entries(latest.results.quality)) {
lines.push(
`| ${name} | ${(q.entityRetention * 100).toFixed(0)}% | ${(q.structuralIntegrity * 100).toFixed(0)}% | ${(q.referenceCoherence * 100).toFixed(0)}% | ${q.qualityScore.toFixed(3)} |`,
);
}
lines.push('');
}

// --- Token budget ---
lines.push(...generateTokenBudgetSection(latest.results));
lines.push('');
98 changes: 82 additions & 16 deletions bench/baselines/current.json
@@ -1,6 +1,6 @@
{
"version": "1.1.0",
"generated": "2026-03-20T18:05:08.551Z",
"version": "1.2.0",
"generated": "2026-03-20T22:34:22.455Z",
"results": {
"basic": {
"Coding assistant": {
@@ -16,8 +16,8 @@
"preserved": 6
},
"Tool-heavy": {
"ratio": 1.4128440366972477,
"tokenRatio": 1.4043583535108959,
"ratio": 1.4009797060881735,
"tokenRatio": 1.3908872901678657,
"compressed": 2,
"preserved": 16
},
@@ -102,10 +102,10 @@
"deduped": 1
},
"Tool-heavy": {
"rw0Base": 1.4128440366972477,
"rw0Dup": 1.4128440366972477,
"rw4Base": 1.4128440366972477,
"rw4Dup": 1.4128440366972477,
"rw0Base": 1.4009797060881735,
"rw0Dup": 1.4009797060881735,
"rw4Base": 1.4009797060881735,
"rw4Dup": 1.4009797060881735,
"deduped": 0
},
"Short conversation": {
@@ -158,7 +158,7 @@
"Tool-heavy": {
"exact": 0,
"fuzzy": 0,
"ratio": 1.4128440366972477
"ratio": 1.4009797060881735
},
"Short conversation": {
"exact": 0,
@@ -199,18 +199,38 @@
"bytes": 10994,
"gzipBytes": 4452
},
"cluster.js": {
"bytes": 7587,
"gzipBytes": 2471
},
"compress.js": {
"bytes": 53439,
"gzipBytes": 11671
"bytes": 86117,
"gzipBytes": 16727
},
"contradiction.js": {
"bytes": 7700,
"gzipBytes": 2717
},
"coreference.js": {
"bytes": 4321,
"gzipBytes": 1500
},
"dedup.js": {
"bytes": 10260,
"gzipBytes": 2864
},
"discourse.js": {
"bytes": 6792,
"gzipBytes": 2495
},
"entities.js": {
"bytes": 8403,
"gzipBytes": 2665
},
"entropy.js": {
"bytes": 1979,
"gzipBytes": 832
},
"expand.js": {
"bytes": 2795,
"gzipBytes": 934
@@ -219,13 +239,21 @@
"bytes": 11923,
"gzipBytes": 2941
},
"flow.js": {
"bytes": 7967,
"gzipBytes": 2086
},
"importance.js": {
"bytes": 4759,
"gzipBytes": 1849
"gzipBytes": 1850
},
"index.js": {
"bytes": 854,
"gzipBytes": 405
"bytes": 1809,
"gzipBytes": 761
},
"ml-classifier.js": {
"bytes": 3096,
"gzipBytes": 1208
},
"summarizer.js": {
"bytes": 2542,
@@ -236,8 +264,46 @@
"gzipBytes": 31
},
"total": {
"bytes": 114084,
"gzipBytes": 31813
"bytes": 187862,
"gzipBytes": 50483
}
},
"quality": {
"Coding assistant": {
"entityRetention": 1,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 1
},
"Long Q&A": {
"entityRetention": 1,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 1
},
"Tool-heavy": {
"entityRetention": 0.931,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 0.972
},
"Deep conversation": {
"entityRetention": 1,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 1
},
"Structured content": {
"entityRetention": 1,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 1
},
"Agentic coding session": {
"entityRetention": 0.848,
"structuralIntegrity": 1,
"referenceCoherence": 1,
"qualityScore": 0.939
}
},
"retention": {