diff --git a/README.md b/README.md
index 71d6779..24747ae 100644
--- a/README.md
+++ b/README.md
@@ -113,10 +113,10 @@ Results from the [Dripper paper](https://arxiv.org/abs/2511.23119) (Table 2):
| Extractor | Version | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS |
|---|---|---|---|---|---|---|---|
| **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 |
-| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 |
-| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 |
-| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 |
-| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 |
+| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 |
+| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 |
+| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 |
+| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 |
Contributions of new extractor results are welcome — open a PR!
@@ -194,13 +194,23 @@ cp .env.example .env
# Edit .env and set LLM_BASE_URL, LLM_API_KEY, LLM_MODEL
```
+When constructing an evaluator manually, pass the same LLM settings to both `llm_config` and `metric_config`; `llm_config` validates the API, while `metric_config` enables LLM-enhanced metric splitting.
+
#### Run an Evaluation
```python
+import os
from webmainbench import DataLoader, Evaluator, ExtractorFactory
dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
+llm_config = {
+ "use_llm": True,
+ "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+ "llm_api_key": os.getenv("LLM_API_KEY", ""),
+ "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+}
+evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config)
+result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura"))
m = result.overall_metrics
@@ -217,7 +227,16 @@ for name, result in results.items():
print(f"{name}: {result.overall_metrics['overall']:.4f}")
```
-A complete example is available at `examples/multi_extractor_compare.py`.
+To reproduce the 545-sample fine-grained leaderboard:
+
+```bash
+export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1"
+export LLM_API_KEY="..."
+export LLM_MODEL="gpt-5-chat-latest"
+python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+```
+
+Complete examples are available at `examples/run_545_leaderboard.py` and `examples/multi_extractor_compare.py`.
## Dataset Format
diff --git a/README_zh.md b/README_zh.md
index 0601bdc..db73068 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -113,10 +113,10 @@ WebMainBench 支持两套互补的评测协议:
| 抽取器 | 版本 | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS |
|---|---|---|---|---|---|---|---|
| **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 |
-| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 |
-| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 |
-| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 |
-| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 |
+| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 |
+| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 |
+| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 |
+| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 |
欢迎提交新抽取器的评测结果 — 请提 PR!
@@ -194,13 +194,23 @@ cp .env.example .env
# 编辑 .env,设置 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL
```
+手动构造评测器时,需要把同一份 LLM 配置同时传给 `llm_config` 和 `metric_config`;`llm_config` 用于校验 API,`metric_config` 用于启用 LLM 增强的指标拆分。
+
#### 运行评测
```python
+import os
from webmainbench import DataLoader, Evaluator, ExtractorFactory
dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl")
-result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura"))
+llm_config = {
+ "use_llm": True,
+ "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+ "llm_api_key": os.getenv("LLM_API_KEY", ""),
+ "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+}
+evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config)
+result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura"))
m = result.overall_metrics
@@ -217,7 +227,16 @@ for name, result in results.items():
print(f"{name}: {result.overall_metrics['overall']:.4f}")
```
-完整示例见 `examples/multi_extractor_compare.py`。
+复现 545 条子集细粒度榜单:
+
+```bash
+export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1"
+export LLM_API_KEY="..."
+export LLM_MODEL="gpt-5-chat-latest"
+python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+```
+
+完整示例见 `examples/run_545_leaderboard.py` 和 `examples/multi_extractor_compare.py`。
## 数据格式
diff --git a/examples/run_545_leaderboard.py b/examples/run_545_leaderboard.py
new file mode 100644
index 0000000..bf53f95
--- /dev/null
+++ b/examples/run_545_leaderboard.py
@@ -0,0 +1,59 @@
+"""Run the 545-sample fine-grained leaderboard.
+
+Required environment variables for LLM-enhanced formula splitting:
+ LLM_BASE_URL
+ LLM_API_KEY
+ LLM_MODEL
+
+Example:
+    python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl
+"""
+
+import os
+import sys
+from pathlib import Path
+
+from webmainbench import DataLoader, Evaluator
+
+
+METRICS = [  # column order of the printed leaderboard table
+    "overall",
+    "text_edit",
+    "code_edit",
+    "formula_edit",
+    "table_edit",
+    "table_TEDS",
+]
+
+
+def build_llm_config() -> dict:
+    """Collect LLM settings from USE_LLM / LLM_* env vars; cache_dir is added only when LLM_CACHE_DIR is set."""
+    cache_dir = os.getenv("LLM_CACHE_DIR")
+    return {
+        "use_llm": os.getenv("USE_LLM", "true").lower() == "true",
+        "llm_base_url": os.getenv("LLM_BASE_URL", ""),
+        "llm_api_key": os.getenv("LLM_API_KEY", ""),
+        "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"),
+        "llm_timeout": float(os.getenv("LLM_TIMEOUT", "60")),
+        **({"cache_dir": cache_dir} if cache_dir else {}),
+    }
+
+
+def main() -> None:
+    """Evaluate the chosen extractors on the dataset and print a Markdown score table."""
+    cli_args = sys.argv[1:]
+    dataset_path = Path(cli_args[0] if cli_args else "data/WebMainBench_545.jsonl")
+    extractors = cli_args[1:] or ["magic-html", "trafilatura", "resiliparse", "trafilatura_txt"]
+    settings = build_llm_config()
+    dataset = DataLoader.load_jsonl(dataset_path)
+    evaluator = Evaluator(llm_config=settings, metric_config=settings)
+    results = evaluator.compare_extractors(dataset, extractors)
+    print("| Extractor | " + " | ".join(METRICS) + " |")
+    print("|---|" + "|".join(["---:"] * len(METRICS)) + "|")
+    for name, result in results.items():
+        cells = " | ".join(f"{result.overall_metrics.get(metric, 0.0):.4f}" for metric in METRICS)
+        print(f"| {name} | {cells} |")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/tests/test_html_cleaner.py b/tests/test_html_cleaner.py
new file mode 100644
index 0000000..fae7f91
--- /dev/null
+++ b/tests/test_html_cleaner.py
@@ -0,0 +1,45 @@
+from webmainbench.extractors.base import BaseExtractor, ExtractionResult
+from webmainbench.utils import clean_browser_annotation_artifacts
+
+
+class EchoExtractor(BaseExtractor):  # test double: returns the HTML it is given as the content
+    def _setup(self):
+        pass  # nothing to initialise
+
+    def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
+        return ExtractionResult(content=html)
+
+
+def test_clean_browser_annotation_artifacts_preserves_text():
+    html = (  # NOTE(review): markup rebuilt after tag stripping garbled this fixture - confirm exact values
+        '<p data-anno-uid="anno-uid-1">'
+        '<marked-text data-anno-uid="anno-uid-2">Hello</marked-text>'
+        '<marked-tail data-anno-uid="anno-uid-3"> </marked-tail>'
+        "world"
+        "</p>"
+    )
+
+    cleaned = clean_browser_annotation_artifacts(html)
+
+    assert "Hello" in cleaned
+    assert "world" in cleaned
+    assert "marked-text" not in cleaned
+    assert "marked-tail" not in cleaned
+    assert "data-anno-uid" not in cleaned
+
+
+def test_base_extractor_cleans_annotation_artifacts_by_default():
+    extractor = EchoExtractor("echo")  # NOTE(review): markup below rebuilt after tag stripping - confirm
+
+    result = extractor.extract('<p data-anno-uid="anno-uid-1">Hello</p>')
+
+    assert result.content == "<p>Hello</p>"
+
+
+def test_base_extractor_can_disable_annotation_cleanup():
+    extractor = EchoExtractor("echo", config={"clean_html_annotations": False})
+    html = '<p data-anno-uid="anno-uid-1">Hello</p>'  # NOTE(review): rebuilt after tag stripping - confirm
+
+    result = extractor.extract(html)
+
+    assert result.content == html
diff --git a/tests/test_metric_config.py b/tests/test_metric_config.py
new file mode 100644
index 0000000..c6c3c56
--- /dev/null
+++ b/tests/test_metric_config.py
@@ -0,0 +1,15 @@
+from webmainbench.metrics import MetricCalculator
+
+
+def test_metric_calculator_passes_config_to_default_metrics():
+    # The default metrics must carry the same config the calculator was built with.
+    llm_settings = dict(
+        use_llm=True,
+        llm_base_url="http://example.test/v1",
+        llm_api_key="test-key",
+        llm_model="test-model",
+    )
+    metrics = MetricCalculator(llm_settings).metrics
+
+    for metric_name in ("formula_edit", "text_edit"):
+        assert metrics[metric_name].config == llm_settings
diff --git a/tests/test_trafilatura_config.py b/tests/test_trafilatura_config.py
new file mode 100644
index 0000000..22ac2f3
--- /dev/null
+++ b/tests/test_trafilatura_config.py
@@ -0,0 +1,39 @@
+from webmainbench.extractors.trafilatura_extractor import (
+ TrafilaturaExtractor as MarkdownTrafilaturaExtractor,
+)
+from webmainbench.extractors.trafilatura_txt_extractor import (
+ TrafilaturaExtractor as TextTrafilaturaExtractor,
+)
+
+
+def test_trafilatura_markdown_defaults_match_standard_options():
+    # Inspect the inference config a freshly built markdown extractor ends up with.
+    cfg = MarkdownTrafilaturaExtractor("trafilatura").inference_config
+
+    assert cfg.favor_precision is False and cfg.favor_recall is False
+    assert cfg.include_comments is True
+    assert cfg.output_format == "markdown"
+
+
+def test_trafilatura_txt_defaults_to_extract_txt(monkeypatch):
+    calls = {}  # records what the extractor passes to the patched extract()
+
+    def fake_extract(html, **kwargs):
+        calls["html"] = html
+        calls["kwargs"] = kwargs
+        return "plain text"
+
+    monkeypatch.setattr(  # swap the extract name bound inside the txt extractor module
+        "webmainbench.extractors.trafilatura_txt_extractor.extract",
+        fake_extract,
+    )
+    extractor = TextTrafilaturaExtractor("trafilatura_txt")
+
+    result = extractor.extract("plain text", url="https://example.com")
+
+    assert result.content == "plain text"
+    assert calls["kwargs"]["url"] == "https://example.com"
+    assert calls["kwargs"]["favor_precision"] is False
+    assert calls["kwargs"]["favor_recall"] is False
+    assert calls["kwargs"]["include_comments"] is True
+    assert calls["kwargs"]["output_format"] == "txt"
diff --git a/webmainbench/evaluator/evaluator.py b/webmainbench/evaluator/evaluator.py
index 699b01c..ebdc150 100644
--- a/webmainbench/evaluator/evaluator.py
+++ b/webmainbench/evaluator/evaluator.py
@@ -137,7 +137,8 @@ def _validate_llm_config(self, llm_config: Dict[str, Any] = None):
print("Validating LLM API configuration...")
client = OpenAI(
base_url=config.get('llm_base_url'),
- api_key=config.get('llm_api_key')
+ api_key=config.get('llm_api_key'),
+ timeout=config.get('llm_timeout', 60),
)
client.chat.completions.create(
@@ -615,4 +616,4 @@ def compare_extractors(self,
print(f"Error evaluating {extractor_name}: {e}")
continue
- return results
\ No newline at end of file
+ return results
diff --git a/webmainbench/extractors/base.py b/webmainbench/extractors/base.py
index bc4eddf..57e1cc1 100644
--- a/webmainbench/extractors/base.py
+++ b/webmainbench/extractors/base.py
@@ -7,6 +7,7 @@
from typing import Dict, List, Any, Optional, Union
import time
import traceback
+from ..utils.html_cleaner import clean_browser_annotation_artifacts
@dataclass
@@ -154,6 +155,9 @@ def extract(self, html: str, url: str = None) -> ExtractionResult:
extraction_time=time.time() - start_time
)
+ if self.config.get("clean_html_annotations", True):
+ html = clean_browser_annotation_artifacts(html)
+
# Perform extraction
result = self._extract_content(html, url)
result.extraction_time = time.time() - start_time
@@ -213,4 +217,4 @@ def __str__(self) -> str:
return f"{self.__class__.__name__}(name='{self.name}')"
def __repr__(self) -> str:
- return self.__str__()
\ No newline at end of file
+ return self.__str__()
diff --git a/webmainbench/extractors/trafilatura_extractor.py b/webmainbench/extractors/trafilatura_extractor.py
index 0fb907d..6c676f9 100644
--- a/webmainbench/extractors/trafilatura_extractor.py
+++ b/webmainbench/extractors/trafilatura_extractor.py
@@ -13,15 +13,15 @@
@dataclass
class TrafilaturaInferenceConfig:
"""Configuration for Trafilatura extractor."""
- favor_precision: bool = True # Favor precision: only extract the most core content, filter more redundancy (e.g. sidebars, ads), enabled by default
- favor_recall: bool = True # Favor recall: extract all potentially valid content as much as possible, minimize omissions, enabled by default
- include_comments: bool = False # Whether to keep comments, disabled by default
- include_tables: bool = True # Whether to keep extracted HTML tables, enabled by default
+ favor_precision: bool = False # Match trafilatura.extract default
+ favor_recall: bool = False # Match trafilatura.extract default
+ include_comments: bool = True # Match trafilatura.extract default
+ include_tables: bool = True # Match trafilatura.extract default
include_images: bool = False # Whether to keep extracted image information, disabled by default
include_links: bool = False # Whether to keep links, disabled by default
with_metadata: bool = False # Whether to keep metadata, disabled by default
skip_elements: bool = False # Whether to keep CSS-hidden elements, disabled by default
- output_format: str = "markdown" # Supports multiple output formats: "csv", "json", "html", "markdown", "txt", "xml", etc.
+ output_format: str = "markdown" # Markdown benchmark variant; trafilatura's library default is "txt"
@extractor("trafilatura")
diff --git a/webmainbench/extractors/trafilatura_txt_extractor.py b/webmainbench/extractors/trafilatura_txt_extractor.py
index 55292c4..eee8dfd 100644
--- a/webmainbench/extractors/trafilatura_txt_extractor.py
+++ b/webmainbench/extractors/trafilatura_txt_extractor.py
@@ -6,22 +6,22 @@
from dataclasses import dataclass
from .base import BaseExtractor, ExtractionResult
from .factory import extractor
-from trafilatura import extract,html2txt,baseline
+from trafilatura import extract
import re
@dataclass
class TrafilaturaInferenceConfig:
"""Configuration for Trafilatura extractor."""
- favor_precision: bool = True # Favor precision: only extract the most core content, filter more redundancy (e.g. sidebars, ads), enabled by default
- favor_recall: bool = True # Favor recall: extract all potentially valid content as much as possible, minimize omissions, enabled by default
- include_comments: bool = False # Whether to keep comments, disabled by default
- include_tables: bool = True # Whether to keep extracted HTML tables, enabled by default
+ favor_precision: bool = False # Match trafilatura.extract default
+ favor_recall: bool = False # Match trafilatura.extract default
+ include_comments: bool = True # Match trafilatura.extract default
+ include_tables: bool = True # Match trafilatura.extract default
include_images: bool = False # Whether to keep extracted image information, disabled by default
include_links: bool = False # Whether to keep links, disabled by default
with_metadata: bool = False # Whether to keep metadata, disabled by default
skip_elements: bool = False # Whether to keep CSS-hidden elements, disabled by default
- output_format: str = "markdown" # Supports multiple output formats: "csv", "json", "html", "markdown", "txt", "xml", etc.
+ output_format: str = "txt" # Plain text benchmark variant; matches trafilatura.extract default
@extractor("trafilatura_txt")
@@ -58,26 +58,18 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult:
ExtractionResult instance
"""
try:
- # Perform content extraction using configuration parameters
- # content = extract(
- # html,
- # url=url,
- # favor_precision=self.inference_config.favor_precision,
- # favor_recall=self.inference_config.favor_recall,
- # include_comments=self.inference_config.include_comments,
- # include_tables=self.inference_config.include_tables,
- # include_images=self.inference_config.include_images,
- # include_links=self.inference_config.include_links,
- # with_metadata=self.inference_config.with_metadata,
- # output_format=self.inference_config.output_format # Pass in output format
- #
- # )
-
- # Extract content to txt with maximum recall
- # content = html2txt(html)
-
- # Extract txt result with more accurate output
- postbody, content, len_text = baseline(html)
+ content = extract(
+ html,
+ url=url,
+ favor_precision=self.inference_config.favor_precision,
+ favor_recall=self.inference_config.favor_recall,
+ include_comments=self.inference_config.include_comments,
+ include_tables=self.inference_config.include_tables,
+ include_images=self.inference_config.include_images,
+ include_links=self.inference_config.include_links,
+ with_metadata=self.inference_config.with_metadata,
+ output_format=self.inference_config.output_format,
+ )
# Create content_list (simple paragraph split)
content_list = []
diff --git a/webmainbench/metrics/base.py b/webmainbench/metrics/base.py
index 6be5a65..0a86947 100644
--- a/webmainbench/metrics/base.py
+++ b/webmainbench/metrics/base.py
@@ -119,8 +119,7 @@ def batch_calculate(self, predicted_list: List[Any],
results.append(result)
return results
- @staticmethod
- def split_content(text: str, content_list: List[Dict[str, Any]] = None, field_name: str = None) -> Dict[str, str]:
+ def split_content(self, text: str, content_list: List[Dict[str, Any]] = None, field_name: str = None) -> Dict[str, str]:
"""
Unified content splitting method that divides text into 4 parts: code, formula, table, and remaining text.
@@ -138,7 +137,7 @@ def split_content(text: str, content_list: List[Dict[str, Any]] = None, field_na
return extracted_content
# Extract from markdown text, passing the field name
- return BaseMetric._extract_from_markdown(text or "", field_name=field_name)
+ return BaseMetric._extract_from_markdown(text or "", field_name=field_name, config=self.config)
@staticmethod
def _extract_from_content_list(content_list: List[Dict[str, Any]]) -> Dict[str, str]:
@@ -191,22 +190,23 @@ def _recursive_extract(items):
}
@staticmethod
- def _extract_from_markdown(text: str, field_name: str = None) -> Dict[str, str]:
+ def _extract_from_markdown(text: str, field_name: str = None, config: Dict[str, Any] = None) -> Dict[str, str]:
"""Extract various types of content from markdown text"""
if not text:
return {'code': '', 'formula': '', 'table': '', 'text': ''}
# Load LLM config
from ..config import LLM_CONFIG
+ splitter_config = {**LLM_CONFIG, **(config or {})}
# Directly create concrete extractor instances
from .code_extractor import CodeSplitter
from .formula_extractor import FormulaSplitter
from .table_extractor import TableSplitter
- code_extractor = CodeSplitter(LLM_CONFIG)
- formula_extractor = FormulaSplitter(LLM_CONFIG)
- table_extractor = TableSplitter(LLM_CONFIG)
+ code_extractor = CodeSplitter(splitter_config)
+ formula_extractor = FormulaSplitter(splitter_config)
+ table_extractor = TableSplitter(splitter_config)
# Extract each type of content
code_content = code_extractor.extract(text, field_name)
diff --git a/webmainbench/metrics/base_content_splitter.py b/webmainbench/metrics/base_content_splitter.py
index 051a639..d33948a 100644
--- a/webmainbench/metrics/base_content_splitter.py
+++ b/webmainbench/metrics/base_content_splitter.py
@@ -35,7 +35,8 @@ def __init__(self, config: Dict[str, Any] = None):
if self.use_llm and self.config.get('llm_base_url') and self.config.get('llm_api_key'):
self.client = OpenAI(
base_url=self.config.get('llm_base_url', ""),
- api_key=self.config.get('llm_api_key', "")
+ api_key=self.config.get('llm_api_key', ""),
+ timeout=self.config.get('llm_timeout', 60),
)
else:
self.client = None
diff --git a/webmainbench/metrics/calculator.py b/webmainbench/metrics/calculator.py
index b07f9db..3a90732 100644
--- a/webmainbench/metrics/calculator.py
+++ b/webmainbench/metrics/calculator.py
@@ -26,11 +26,11 @@ def __init__(self, config: Dict[str, Any] = None):
def _setup_default_metrics(self) -> None:
"""Setup default metrics."""
# Register new content-type metrics
- self.add_metric("code_edit", CodeEditMetric("code_edit"))
- self.add_metric("formula_edit", FormulaEditMetric("formula_edit"))
- self.add_metric("table_edit", TableEditMetric("table_edit"))
- self.add_metric("table_TEDS", TableTEDSMetric("table_TEDS"))
- self.add_metric("text_edit", TextEditMetric("text_edit"))
+ self.add_metric("code_edit", CodeEditMetric("code_edit", self.config))
+ self.add_metric("formula_edit", FormulaEditMetric("formula_edit", self.config))
+ self.add_metric("table_edit", TableEditMetric("table_edit", self.config))
+ self.add_metric("table_TEDS", TableTEDSMetric("table_TEDS", self.config))
+ self.add_metric("text_edit", TextEditMetric("text_edit", self.config))
def add_metric(self, name: str, metric: BaseMetric) -> None:
"""
@@ -298,4 +298,4 @@ def get_metric_info(self, metric_name: str) -> Optional[Dict[str, Any]]:
"""Get information about a specific metric."""
if metric_name in self.metrics:
return self.metrics[metric_name].get_info()
- return None
\ No newline at end of file
+ return None
diff --git a/webmainbench/utils/__init__.py b/webmainbench/utils/__init__.py
index 0e46cb8..42127eb 100644
--- a/webmainbench/utils/__init__.py
+++ b/webmainbench/utils/__init__.py
@@ -4,6 +4,7 @@
from .helpers import setup_logging, validate_config, format_results
from .main_html import extract_main_html, HTML2TextWrapper
+from .html_cleaner import clean_browser_annotation_artifacts
__all__ = [
"setup_logging",
@@ -11,4 +12,5 @@
"format_results",
"extract_main_html",
"HTML2TextWrapper",
-]
\ No newline at end of file
+ "clean_browser_annotation_artifacts",
+]
diff --git a/webmainbench/utils/html_cleaner.py b/webmainbench/utils/html_cleaner.py
new file mode 100644
index 0000000..d599416
--- /dev/null
+++ b/webmainbench/utils/html_cleaner.py
@@ -0,0 +1,25 @@
+"""
+HTML cleanup helpers.
+"""
+
+import re
+
+
+_ANNOTATION_TAG_RE = re.compile(
+    r"</?(?:marked-tail|marked-text|marked-inline)\b[^>]*>",
+    re.IGNORECASE,
+)
+_ANNO_ATTR_RE = re.compile(
+    r"\s+data-anno-uid(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\s>]+))?",
+    re.IGNORECASE,
+)
+
+
+def clean_browser_annotation_artifacts(html: str) -> str:
+    """Remove browser annotation plugin artifacts while preserving page text."""
+    if not html:
+        return html
+
+    # Strip opening/closing wrapper tags (inner text stays), then data-anno-uid attributes.
+    html = _ANNOTATION_TAG_RE.sub("", html)
+    return _ANNO_ATTR_RE.sub("", html)