opendatalab · e06084 · Jun 13, 2026 · Jun 13, 2026
diff --git a/README.md b/README.md
@@ -113,10 +113,10 @@ Results from the [Dripper paper](https://arxiv.org/abs/2511.23119) (Table 2):
 | Extractor | Version | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS |
 |---|---|---|---|---|---|---|---|
 | **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 |
-| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 |
-| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 |
-| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 |
-| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 |
+| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 |
+| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 |
+| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 |
+| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 |
 
 Contributions of new extractor results are welcome — open a PR!
 
@@ -194,13 +194,23 @@ cp .env.example .env
 # Edit .env and set LLM_BASE_URL, LLM_API_KEY, LLM_MODEL
 ```
 
+When constructing an evaluator manually, pass the same LLM settings to both `llm_config` and `metric_config`; `llm_config` validates the API, while `metric_config` enables LLM-enhanced metric splitting.
+
 #### Run an Evaluation
 
 ```python
+import os
 from webmainbench import DataLoader, Evaluator, ExtractorFactory
 
 dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
+llm_config = {
+    "use_llm": True,
+    "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+    "llm_api_key": os.getenv("LLM_API_KEY", ""),
+    "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+}
+evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config)
+result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura"))
 
 m = result.overall_metrics
 
@@ -217,7 +227,16 @@ for name, result in results.items():
     print(f"{name}: {result.overall_metrics['overall']:.4f}")
 ```
 
-A complete example is available at `examples/multi_extractor_compare.py`.
+To reproduce the 545-sample fine-grained leaderboard:
+
+```bash
+export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1"
+export LLM_API_KEY="..."
+export LLM_MODEL="gpt-5-chat-latest"
+python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+```
+
+Complete examples are available at `examples/run_545_leaderboard.py` and `examples/multi_extractor_compare.py`.
 
 ## Dataset Format
 

diff --git a/README_zh.md b/README_zh.md
@@ -113,10 +113,10 @@ WebMainBench 支持两套互补的评测协议：
 | 抽取器 | 版本 | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS |
 |---|---|---|---|---|---|---|---|
 | **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 |
-| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 |
-| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 |
-| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 |
-| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 |
+| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 |
+| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 |
+| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 |
+| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 |
 
 欢迎提交新抽取器的评测结果 — 请提 PR！
 
@@ -194,13 +194,23 @@ cp .env.example .env
 # 编辑 .env，设置 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL
 ```
 
+手动构造评测器时，需要把同一份 LLM 配置同时传给 `llm_config` 和 `metric_config`；`llm_config` 用于校验 API，`metric_config` 用于启用 LLM 增强的指标拆分。
+
 #### 运行评测
 
 ```python
+import os
 from webmainbench import DataLoader, Evaluator, ExtractorFactory
 
 dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
+llm_config = {
+    "use_llm": True,
+    "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+    "llm_api_key": os.getenv("LLM_API_KEY", ""),
+    "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+}
+evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config)
+result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura"))
 
 m = result.overall_metrics
 
@@ -217,7 +227,16 @@ for name, result in results.items():
     print(f"{name}: {result.overall_metrics['overall']:.4f}")
 ```
 
-完整示例见 `examples/multi_extractor_compare.py`。
+复现 545 条子集细粒度榜单：
+
+```bash
+export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1"
+export LLM_API_KEY="..."
+export LLM_MODEL="gpt-5-chat-latest"
+python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+```
+
+完整示例见 `examples/run_545_leaderboard.py` 和 `examples/multi_extractor_compare.py`。
 
 ## 数据格式
 

diff --git a/examples/run_545_leaderboard.py b/examples/run_545_leaderboard.py
@@ -0,0 +1,59 @@
+"""Run the 545-sample fine-grained leaderboard.
+
+Required environment variables for LLM-enhanced formula splitting:
+    LLM_BASE_URL
+    LLM_API_KEY
+    LLM_MODEL
+
+Example:
+    python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+"""
+
+import os
+import sys
+from pathlib import Path
+
+from webmainbench import DataLoader, Evaluator
+
+
+METRICS = [
+    "overall",
+    "text_edit",
+    "code_edit",
+    "formula_edit",
+    "table_edit",
+    "table_TEDS",
+]
+
+
+def build_llm_config() -> dict:
+    config = {
+        "use_llm": os.getenv("USE_LLM", "true").lower() == "true",
+        "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+        "llm_api_key": os.getenv("LLM_API_KEY", ""),
+        "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+        "llm_timeout": float(os.getenv("LLM_TIMEOUT", "60")),
+    }
+    if os.getenv("LLM_CACHE_DIR"):
+        config["cache_dir"] = os.getenv("LLM_CACHE_DIR")
+    return config
+
+
+def main() -> None:
+    dataset_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("data/WebMainBench_545.jsonl")
+    extractors = sys.argv[2:] or ["magic-html", "trafilatura", "resiliparse", "trafilatura_txt"]
+    llm_config = build_llm_config()
+
+    dataset = DataLoader.load_jsonl(dataset_path)
+    evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config)
+    results = evaluator.compare_extractors(dataset, extractors)
+
+    print("| Extractor | " + " | ".join(METRICS) + " |")
+    print("|---|" + "|".join(["---:"] * len(METRICS)) + "|")
+    for name, result in results.items():
+        scores = [result.overall_metrics.get(metric, 0.0) for metric in METRICS]
+        print(f"| {name} | " + " | ".join(f"{score:.4f}" for score in scores) + " |")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/test_html_cleaner.py b/tests/test_html_cleaner.py
@@ -0,0 +1,45 @@
+from webmainbench.extractors.base import BaseExtractor, ExtractionResult
+from webmainbench.utils import clean_browser_annotation_artifacts
+
+
+class EchoExtractor(BaseExtractor):
+    def _setup(self):
+        pass
+
+    def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
+        return ExtractionResult(content=html)
+
+
+def test_clean_browser_annotation_artifacts_preserves_text():
+    html = (
+        '<p data-anno-uid="anno-1">'
+        '<marked-text data-anno-uid="anno-2">Hello</marked-text>'
+        '<span> </span>'
+        "<marked-tail data-anno-uid='anno-3'>world</marked-tail>"
+        "</p>"
+    )
+
+    cleaned = clean_browser_annotation_artifacts(html)
+
+    assert "Hello" in cleaned
+    assert "world" in cleaned
+    assert "marked-text" not in cleaned
+    assert "marked-tail" not in cleaned
+    assert "data-anno-uid" not in cleaned
+
+
+def test_base_extractor_cleans_annotation_artifacts_by_default():
+    extractor = EchoExtractor("echo")
+
+    result = extractor.extract('<p><marked-text data-anno-uid="x">Hello</marked-text></p>')
+
+    assert result.content == "<p>Hello</p>"
+
+
+def test_base_extractor_can_disable_annotation_cleanup():
+    extractor = EchoExtractor("echo", config={"clean_html_annotations": False})
+    html = '<p><marked-text data-anno-uid="x">Hello</marked-text></p>'
+
+    result = extractor.extract(html)
+
+    assert result.content == html
diff --git a/tests/test_metric_config.py b/tests/test_metric_config.py
@@ -0,0 +1,15 @@
+from webmainbench.metrics import MetricCalculator
+
+
+def test_metric_calculator_passes_config_to_default_metrics():
+    config = {
+        "use_llm": True,
+        "llm_base_url": "http://example.test/v1",
+        "llm_api_key": "test-key",
+        "llm_model": "test-model",
+    }
+
+    calculator = MetricCalculator(config)
+
+    assert calculator.metrics["formula_edit"].config == config
+    assert calculator.metrics["text_edit"].config == config
diff --git a/tests/test_trafilatura_config.py b/tests/test_trafilatura_config.py
@@ -0,0 +1,39 @@
+from webmainbench.extractors.trafilatura_extractor import (
+    TrafilaturaExtractor as MarkdownTrafilaturaExtractor,
+)
+from webmainbench.extractors.trafilatura_txt_extractor import (
+    TrafilaturaExtractor as TextTrafilaturaExtractor,
+)
+
+
+def test_trafilatura_markdown_defaults_match_standard_options():
+    extractor = MarkdownTrafilaturaExtractor("trafilatura")
+
+    assert extractor.inference_config.favor_precision is False
+    assert extractor.inference_config.favor_recall is False
+    assert extractor.inference_config.include_comments is True
+    assert extractor.inference_config.output_format == "markdown"
+
+
+def test_trafilatura_txt_defaults_to_extract_txt(monkeypatch):
+    calls = {}
+
+    def fake_extract(html, **kwargs):
+        calls["html"] = html
+        calls["kwargs"] = kwargs
+        return "plain text"
+
+    monkeypatch.setattr(
+        "webmainbench.extractors.trafilatura_txt_extractor.extract",
+        fake_extract,
+    )
+    extractor = TextTrafilaturaExtractor("trafilatura_txt")
+
+    result = extractor.extract("<html><body>plain text</body></html>", url="https://example.com")
+
+    assert result.content == "plain text"
+    assert calls["kwargs"]["url"] == "https://example.com"
+    assert calls["kwargs"]["favor_precision"] is False
+    assert calls["kwargs"]["favor_recall"] is False
+    assert calls["kwargs"]["include_comments"] is True
+    assert calls["kwargs"]["output_format"] == "txt"
diff --git a/webmainbench/evaluator/evaluator.py b/webmainbench/evaluator/evaluator.py
@@ -137,7 +137,8 @@ def _validate_llm_config(self, llm_config: Dict[str, Any] = None):
                 print("Validating LLM API configuration...")
                 client = OpenAI(
                     base_url=config.get('llm_base_url'),
-                    api_key=config.get('llm_api_key')
+                    api_key=config.get('llm_api_key'),
+                    timeout=config.get('llm_timeout', 60),
                 )
 
                 client.chat.completions.create(
@@ -615,4 +616,4 @@ def compare_extractors(self,
                 print(f"Error evaluating {extractor_name}: {e}")
                 continue
 
-        return results 
+        return results 
diff --git a/webmainbench/extractors/base.py b/webmainbench/extractors/base.py
@@ -7,6 +7,7 @@
 from typing import Dict, List, Any, Optional, Union
 import time
 import traceback
+from ..utils.html_cleaner import clean_browser_annotation_artifacts
 
 
 @dataclass
@@ -154,6 +155,9 @@ def extract(self, html: str, url: str = None) -> ExtractionResult:
                     extraction_time=time.time() - start_time
                 )
 
+            if self.config.get("clean_html_annotations", True):
+                html = clean_browser_annotation_artifacts(html)
+
             # Perform extraction
             result = self._extract_content(html, url)
             result.extraction_time = time.time() - start_time
@@ -213,4 +217,4 @@ def __str__(self) -> str:
         return f"{self.__class__.__name__}(name='{self.name}')"
 
     def __repr__(self) -> str:
-        return self.__str__() 
+        return self.__str__() 
diff --git a/webmainbench/extractors/trafilatura_extractor.py b/webmainbench/extractors/trafilatura_extractor.py
@@ -13,15 +13,15 @@
 @dataclass
 class TrafilaturaInferenceConfig:
     """Configuration for Trafilatura extractor."""
-    favor_precision: bool = True  # Favor precision: only extract the most core content, filter more redundancy (e.g. sidebars, ads), enabled by default
-    favor_recall: bool = True  # Favor recall: extract all potentially valid content as much as possible, minimize omissions, enabled by default
-    include_comments: bool = False  # Whether to keep comments, disabled by default
-    include_tables: bool = True  # Whether to keep extracted HTML tables, enabled by default
+    favor_precision: bool = False  # Match trafilatura.extract default
+    favor_recall: bool = False  # Match trafilatura.extract default
+    include_comments: bool = True  # Match trafilatura.extract default
+    include_tables: bool = True  # Match trafilatura.extract default
     include_images: bool = False  # Whether to keep extracted image information, disabled by default
     include_links: bool = False  # Whether to keep links, disabled by default
     with_metadata: bool = False  # Whether to keep metadata, disabled by default
     skip_elements: bool = False  # Whether to keep CSS-hidden elements, disabled by default
-    output_format: str = "markdown"  # Supports multiple output formats: "csv", "json", "html", "markdown", "txt", "xml", etc.
+    output_format: str = "markdown"  # Markdown benchmark variant; trafilatura's library default is "txt"
 
 
 @extractor("trafilatura")