diff --git a/README.md b/README.md index 71d6779..24747ae 100644 --- a/README.md +++ b/README.md @@ -113,10 +113,10 @@ Results from the [Dripper paper](https://arxiv.org/abs/2511.23119) (Table 2): | Extractor | Version | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS | |---|---|---|---|---|---|---|---| | **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 | -| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 | -| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 | -| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 | -| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 | +| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 | +| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 | +| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 | +| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 | Contributions of new extractor results are welcome — open a PR! @@ -194,13 +194,23 @@ cp .env.example .env # Edit .env and set LLM_BASE_URL, LLM_API_KEY, LLM_MODEL ``` +When constructing an evaluator manually, pass the same LLM settings to both `llm_config` and `metric_config`; `llm_config` validates the API, while `metric_config` enables LLM-enhanced metric splitting. 
+ #### Run an Evaluation ```python +import os from webmainbench import DataLoader, Evaluator, ExtractorFactory dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl") -result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura")) +llm_config = { + "use_llm": True, + "llm_base_url": os.getenv("LLM_BASE_URL", ""), + "llm_api_key": os.getenv("LLM_API_KEY", ""), + "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"), +} +evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config) +result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura")) m = result.overall_metrics @@ -217,7 +227,16 @@ for name, result in results.items(): print(f"{name}: {result.overall_metrics['overall']:.4f}") ``` -A complete example is available at `examples/multi_extractor_compare.py`. +To reproduce the 545-sample fine-grained leaderboard: + +```bash +export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1" +export LLM_API_KEY="..." +export LLM_MODEL="gpt-5-chat-latest" +python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl +``` + +Complete examples are available at `examples/run_545_leaderboard.py` and `examples/multi_extractor_compare.py`. 
## Dataset Format diff --git a/README_zh.md b/README_zh.md index 0601bdc..db73068 100644 --- a/README_zh.md +++ b/README_zh.md @@ -113,10 +113,10 @@ WebMainBench 支持两套互补的评测协议: | 抽取器 | 版本 | overall | text\_edit | code\_edit | formula\_edit | table\_edit | table\_TEDS | |---|---|---|---|---|---|---|---| | **mineru-html** | 4.1.1 | **0.8256** | 0.8621 | 0.9093 | 0.9399 | 0.6780 | 0.7388 | -| magic-html | 0.1.5 | 0.5141 | 0.7791 | 0.4117 | 0.7204 | 0.2611 | 0.3984 | -| trafilatura (md) | 2.0.0 | 0.3858 | 0.6887 | 0.1305 | 0.6242 | 0.1653 | 0.3203 | -| resiliparse | 0.14.5 | 0.2954 | 0.7381 | 0.0641 | 0.6747 | 0.0000 | 0.0000 | -| trafilatura (txt) | 2.0.0 | 0.2657 | 0.7126 | 0.0000 | 0.6162 | 0.0000 | 0.0000 | +| magic-html | 0.1.5 | 0.4996 | 0.7800 | 0.4150 | 0.6385 | 0.2638 | 0.4006 | +| trafilatura (md) | 2.0.0 | 0.4013 | 0.7826 | 0.1801 | 0.6237 | 0.1202 | 0.2999 | +| resiliparse | 0.14.5 | 0.2898 | 0.7435 | 0.0422 | 0.6631 | 0.0000 | 0.0000 | +| trafilatura (txt) | 2.0.0 | 0.3718 | 0.7819 | 0.0000 | 0.6389 | 0.1278 | 0.3106 | 欢迎提交新抽取器的评测结果 — 请提 PR! 
@@ -194,13 +194,23 @@ cp .env.example .env # 编辑 .env,设置 LLM_BASE_URL、LLM_API_KEY、LLM_MODEL ``` +手动构造评测器时,需要把同一份 LLM 配置同时传给 `llm_config` 和 `metric_config`;`llm_config` 用于校验 API,`metric_config` 用于启用 LLM 增强的指标拆分。 + #### 运行评测 ```python +import os from webmainbench import DataLoader, Evaluator, ExtractorFactory dataset = DataLoader.load_jsonl("data/WebMainBench_545.jsonl") -result = Evaluator().evaluate(dataset, ExtractorFactory.create("trafilatura")) +llm_config = { + "use_llm": True, + "llm_base_url": os.getenv("LLM_BASE_URL", ""), + "llm_api_key": os.getenv("LLM_API_KEY", ""), + "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"), +} +evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config) +result = evaluator.evaluate(dataset, ExtractorFactory.create("trafilatura")) m = result.overall_metrics @@ -217,7 +227,16 @@ for name, result in results.items(): print(f"{name}: {result.overall_metrics['overall']:.4f}") ``` -完整示例见 `examples/multi_extractor_compare.py`。 +复现 545 条子集细粒度榜单: + +```bash +export LLM_BASE_URL="https://your-openai-compatible-endpoint/v1" +export LLM_API_KEY="..." +export LLM_MODEL="gpt-5-chat-latest" +python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl +``` + +完整示例见 `examples/run_545_leaderboard.py` 和 `examples/multi_extractor_compare.py`。 ## 数据格式 diff --git a/examples/run_545_leaderboard.py b/examples/run_545_leaderboard.py new file mode 100644 index 0000000..bf53f95 --- /dev/null +++ b/examples/run_545_leaderboard.py @@ -0,0 +1,59 @@ +"""Run the 545-sample fine-grained leaderboard. 
+ +Required environment variables for LLM-enhanced formula splitting: + LLM_BASE_URL + LLM_API_KEY + LLM_MODEL + +Example: + python examples/run_545_leaderboard.py data/WebMainBench_545.jsonl +""" + +import os +import sys +from pathlib import Path + +from webmainbench import DataLoader, Evaluator + + +METRICS = [ + "overall", + "text_edit", + "code_edit", + "formula_edit", + "table_edit", + "table_TEDS", +] + + +def build_llm_config() -> dict: + config = { + "use_llm": os.getenv("USE_LLM", "true").lower() == "true", + "llm_base_url": os.getenv("LLM_BASE_URL", ""), + "llm_api_key": os.getenv("LLM_API_KEY", ""), + "llm_model": os.getenv("LLM_MODEL", "deepseek-chat"), + "llm_timeout": float(os.getenv("LLM_TIMEOUT", "60")), + } + if os.getenv("LLM_CACHE_DIR"): + config["cache_dir"] = os.getenv("LLM_CACHE_DIR") + return config + + +def main() -> None: + dataset_path = Path(sys.argv[1]) if len(sys.argv) > 1 else Path("data/WebMainBench_545.jsonl") + extractors = sys.argv[2:] or ["magic-html", "trafilatura", "resiliparse", "trafilatura_txt"] + llm_config = build_llm_config() + + dataset = DataLoader.load_jsonl(dataset_path) + evaluator = Evaluator(llm_config=llm_config, metric_config=llm_config) + results = evaluator.compare_extractors(dataset, extractors) + + print("| Extractor | " + " | ".join(METRICS) + " |") + print("|---|" + "|".join(["---:"] * len(METRICS)) + "|") + for name, result in results.items(): + scores = [result.overall_metrics.get(metric, 0.0) for metric in METRICS] + print(f"| {name} | " + " | ".join(f"{score:.4f}" for score in scores) + " |") + + +if __name__ == "__main__": + main() diff --git a/tests/test_html_cleaner.py b/tests/test_html_cleaner.py new file mode 100644 index 0000000..fae7f91 --- /dev/null +++ b/tests/test_html_cleaner.py @@ -0,0 +1,45 @@ +from webmainbench.extractors.base import BaseExtractor, ExtractionResult +from webmainbench.utils import clean_browser_annotation_artifacts + + +class EchoExtractor(BaseExtractor): + def _setup(self): + 
pass + + def _extract_content(self, html: str, url: str = None) -> ExtractionResult: + return ExtractionResult(content=html) + + +def test_clean_browser_annotation_artifacts_preserves_text(): + html = ( + '

' + 'Hello' + ' ' + "world" + "

" + ) + + cleaned = clean_browser_annotation_artifacts(html) + + assert "Hello" in cleaned + assert "world" in cleaned + assert "marked-text" not in cleaned + assert "marked-tail" not in cleaned + assert "data-anno-uid" not in cleaned + + +def test_base_extractor_cleans_annotation_artifacts_by_default(): + extractor = EchoExtractor("echo") + + result = extractor.extract('

Hello

') + + assert result.content == "

Hello

" + + +def test_base_extractor_can_disable_annotation_cleanup(): + extractor = EchoExtractor("echo", config={"clean_html_annotations": False}) + html = '

Hello

' + + result = extractor.extract(html) + + assert result.content == html diff --git a/tests/test_metric_config.py b/tests/test_metric_config.py new file mode 100644 index 0000000..c6c3c56 --- /dev/null +++ b/tests/test_metric_config.py @@ -0,0 +1,15 @@ +from webmainbench.metrics import MetricCalculator + + +def test_metric_calculator_passes_config_to_default_metrics(): + config = { + "use_llm": True, + "llm_base_url": "http://example.test/v1", + "llm_api_key": "test-key", + "llm_model": "test-model", + } + + calculator = MetricCalculator(config) + + assert calculator.metrics["formula_edit"].config == config + assert calculator.metrics["text_edit"].config == config diff --git a/tests/test_trafilatura_config.py b/tests/test_trafilatura_config.py new file mode 100644 index 0000000..22ac2f3 --- /dev/null +++ b/tests/test_trafilatura_config.py @@ -0,0 +1,39 @@ +from webmainbench.extractors.trafilatura_extractor import ( + TrafilaturaExtractor as MarkdownTrafilaturaExtractor, +) +from webmainbench.extractors.trafilatura_txt_extractor import ( + TrafilaturaExtractor as TextTrafilaturaExtractor, +) + + +def test_trafilatura_markdown_defaults_match_standard_options(): + extractor = MarkdownTrafilaturaExtractor("trafilatura") + + assert extractor.inference_config.favor_precision is False + assert extractor.inference_config.favor_recall is False + assert extractor.inference_config.include_comments is True + assert extractor.inference_config.output_format == "markdown" + + +def test_trafilatura_txt_defaults_to_extract_txt(monkeypatch): + calls = {} + + def fake_extract(html, **kwargs): + calls["html"] = html + calls["kwargs"] = kwargs + return "plain text" + + monkeypatch.setattr( + "webmainbench.extractors.trafilatura_txt_extractor.extract", + fake_extract, + ) + extractor = TextTrafilaturaExtractor("trafilatura_txt") + + result = extractor.extract("plain text", url="https://example.com") + + assert result.content == "plain text" + assert calls["kwargs"]["url"] == 
"https://example.com" + assert calls["kwargs"]["favor_precision"] is False + assert calls["kwargs"]["favor_recall"] is False + assert calls["kwargs"]["include_comments"] is True + assert calls["kwargs"]["output_format"] == "txt" diff --git a/webmainbench/evaluator/evaluator.py b/webmainbench/evaluator/evaluator.py index 699b01c..ebdc150 100644 --- a/webmainbench/evaluator/evaluator.py +++ b/webmainbench/evaluator/evaluator.py @@ -137,7 +137,8 @@ def _validate_llm_config(self, llm_config: Dict[str, Any] = None): print("Validating LLM API configuration...") client = OpenAI( base_url=config.get('llm_base_url'), - api_key=config.get('llm_api_key') + api_key=config.get('llm_api_key'), + timeout=config.get('llm_timeout', 60), ) client.chat.completions.create( @@ -615,4 +616,4 @@ def compare_extractors(self, print(f"Error evaluating {extractor_name}: {e}") continue - return results \ No newline at end of file + return results diff --git a/webmainbench/extractors/base.py b/webmainbench/extractors/base.py index bc4eddf..57e1cc1 100644 --- a/webmainbench/extractors/base.py +++ b/webmainbench/extractors/base.py @@ -7,6 +7,7 @@ from typing import Dict, List, Any, Optional, Union import time import traceback +from ..utils.html_cleaner import clean_browser_annotation_artifacts @dataclass @@ -154,6 +155,9 @@ def extract(self, html: str, url: str = None) -> ExtractionResult: extraction_time=time.time() - start_time ) + if self.config.get("clean_html_annotations", True): + html = clean_browser_annotation_artifacts(html) + # Perform extraction result = self._extract_content(html, url) result.extraction_time = time.time() - start_time @@ -213,4 +217,4 @@ def __str__(self) -> str: return f"{self.__class__.__name__}(name='{self.name}')" def __repr__(self) -> str: - return self.__str__() \ No newline at end of file + return self.__str__() diff --git a/webmainbench/extractors/trafilatura_extractor.py b/webmainbench/extractors/trafilatura_extractor.py index 0fb907d..6c676f9 100644 --- 
a/webmainbench/extractors/trafilatura_extractor.py +++ b/webmainbench/extractors/trafilatura_extractor.py @@ -13,15 +13,15 @@ @dataclass class TrafilaturaInferenceConfig: """Configuration for Trafilatura extractor.""" - favor_precision: bool = True # Favor precision: only extract the most core content, filter more redundancy (e.g. sidebars, ads), enabled by default - favor_recall: bool = True # Favor recall: extract all potentially valid content as much as possible, minimize omissions, enabled by default - include_comments: bool = False # Whether to keep comments, disabled by default - include_tables: bool = True # Whether to keep extracted HTML tables, enabled by default + favor_precision: bool = False # Match trafilatura.extract default + favor_recall: bool = False # Match trafilatura.extract default + include_comments: bool = True # Match trafilatura.extract default + include_tables: bool = True # Match trafilatura.extract default include_images: bool = False # Whether to keep extracted image information, disabled by default include_links: bool = False # Whether to keep links, disabled by default with_metadata: bool = False # Whether to keep metadata, disabled by default skip_elements: bool = False # Whether to keep CSS-hidden elements, disabled by default - output_format: str = "markdown" # Supports multiple output formats: "csv", "json", "html", "markdown", "txt", "xml", etc. 
+ output_format: str = "markdown" # Markdown benchmark variant; trafilatura's library default is "txt" @extractor("trafilatura") diff --git a/webmainbench/extractors/trafilatura_txt_extractor.py b/webmainbench/extractors/trafilatura_txt_extractor.py index 55292c4..eee8dfd 100644 --- a/webmainbench/extractors/trafilatura_txt_extractor.py +++ b/webmainbench/extractors/trafilatura_txt_extractor.py @@ -6,22 +6,22 @@ from dataclasses import dataclass from .base import BaseExtractor, ExtractionResult from .factory import extractor -from trafilatura import extract,html2txt,baseline +from trafilatura import extract import re @dataclass class TrafilaturaInferenceConfig: """Configuration for Trafilatura extractor.""" - favor_precision: bool = True # Favor precision: only extract the most core content, filter more redundancy (e.g. sidebars, ads), enabled by default - favor_recall: bool = True # Favor recall: extract all potentially valid content as much as possible, minimize omissions, enabled by default - include_comments: bool = False # Whether to keep comments, disabled by default - include_tables: bool = True # Whether to keep extracted HTML tables, enabled by default + favor_precision: bool = False # Match trafilatura.extract default + favor_recall: bool = False # Match trafilatura.extract default + include_comments: bool = True # Match trafilatura.extract default + include_tables: bool = True # Match trafilatura.extract default include_images: bool = False # Whether to keep extracted image information, disabled by default include_links: bool = False # Whether to keep links, disabled by default with_metadata: bool = False # Whether to keep metadata, disabled by default skip_elements: bool = False # Whether to keep CSS-hidden elements, disabled by default - output_format: str = "markdown" # Supports multiple output formats: "csv", "json", "html", "markdown", "txt", "xml", etc. 
+ output_format: str = "txt" # Plain text benchmark variant; matches trafilatura.extract default @extractor("trafilatura_txt") @@ -58,26 +58,18 @@ def _extract_content(self, html: str, url: str = None) -> ExtractionResult: ExtractionResult instance """ try: - # Perform content extraction using configuration parameters - # content = extract( - # html, - # url=url, - # favor_precision=self.inference_config.favor_precision, - # favor_recall=self.inference_config.favor_recall, - # include_comments=self.inference_config.include_comments, - # include_tables=self.inference_config.include_tables, - # include_images=self.inference_config.include_images, - # include_links=self.inference_config.include_links, - # with_metadata=self.inference_config.with_metadata, - # output_format=self.inference_config.output_format # Pass in output format - # - # ) - - # Extract content to txt with maximum recall - # content = html2txt(html) - - # Extract txt result with more accurate output - postbody, content, len_text = baseline(html) + content = extract( + html, + url=url, + favor_precision=self.inference_config.favor_precision, + favor_recall=self.inference_config.favor_recall, + include_comments=self.inference_config.include_comments, + include_tables=self.inference_config.include_tables, + include_images=self.inference_config.include_images, + include_links=self.inference_config.include_links, + with_metadata=self.inference_config.with_metadata, + output_format=self.inference_config.output_format, + ) # Create content_list (simple paragraph split) content_list = [] diff --git a/webmainbench/metrics/base.py b/webmainbench/metrics/base.py index 6be5a65..0a86947 100644 --- a/webmainbench/metrics/base.py +++ b/webmainbench/metrics/base.py @@ -119,8 +119,7 @@ def batch_calculate(self, predicted_list: List[Any], results.append(result) return results - @staticmethod - def split_content(text: str, content_list: List[Dict[str, Any]] = None, field_name: str = None) -> Dict[str, str]: + def 
split_content(self, text: str, content_list: List[Dict[str, Any]] = None, field_name: str = None) -> Dict[str, str]: """ Unified content splitting method that divides text into 4 parts: code, formula, table, and remaining text. @@ -138,7 +137,7 @@ def split_content(text: str, content_list: List[Dict[str, Any]] = None, field_na return extracted_content # Extract from markdown text, passing the field name - return BaseMetric._extract_from_markdown(text or "", field_name=field_name) + return BaseMetric._extract_from_markdown(text or "", field_name=field_name, config=self.config) @staticmethod def _extract_from_content_list(content_list: List[Dict[str, Any]]) -> Dict[str, str]: @@ -191,22 +190,23 @@ def _recursive_extract(items): } @staticmethod - def _extract_from_markdown(text: str, field_name: str = None) -> Dict[str, str]: + def _extract_from_markdown(text: str, field_name: str = None, config: Dict[str, Any] = None) -> Dict[str, str]: """Extract various types of content from markdown text""" if not text: return {'code': '', 'formula': '', 'table': '', 'text': ''} # Load LLM config from ..config import LLM_CONFIG + splitter_config = {**LLM_CONFIG, **(config or {})} # Directly create concrete extractor instances from .code_extractor import CodeSplitter from .formula_extractor import FormulaSplitter from .table_extractor import TableSplitter - code_extractor = CodeSplitter(LLM_CONFIG) - formula_extractor = FormulaSplitter(LLM_CONFIG) - table_extractor = TableSplitter(LLM_CONFIG) + code_extractor = CodeSplitter(splitter_config) + formula_extractor = FormulaSplitter(splitter_config) + table_extractor = TableSplitter(splitter_config) # Extract each type of content code_content = code_extractor.extract(text, field_name) diff --git a/webmainbench/metrics/base_content_splitter.py b/webmainbench/metrics/base_content_splitter.py index 051a639..d33948a 100644 --- a/webmainbench/metrics/base_content_splitter.py +++ b/webmainbench/metrics/base_content_splitter.py @@ -35,7 +35,8 
@@ def __init__(self, config: Dict[str, Any] = None): if self.use_llm and self.config.get('llm_base_url') and self.config.get('llm_api_key'): self.client = OpenAI( base_url=self.config.get('llm_base_url', ""), - api_key=self.config.get('llm_api_key', "") + api_key=self.config.get('llm_api_key', ""), + timeout=self.config.get('llm_timeout', 60), ) else: self.client = None diff --git a/webmainbench/metrics/calculator.py b/webmainbench/metrics/calculator.py index b07f9db..3a90732 100644 --- a/webmainbench/metrics/calculator.py +++ b/webmainbench/metrics/calculator.py @@ -26,11 +26,11 @@ def __init__(self, config: Dict[str, Any] = None): def _setup_default_metrics(self) -> None: """Setup default metrics.""" # Register new content-type metrics - self.add_metric("code_edit", CodeEditMetric("code_edit")) - self.add_metric("formula_edit", FormulaEditMetric("formula_edit")) - self.add_metric("table_edit", TableEditMetric("table_edit")) - self.add_metric("table_TEDS", TableTEDSMetric("table_TEDS")) - self.add_metric("text_edit", TextEditMetric("text_edit")) + self.add_metric("code_edit", CodeEditMetric("code_edit", self.config)) + self.add_metric("formula_edit", FormulaEditMetric("formula_edit", self.config)) + self.add_metric("table_edit", TableEditMetric("table_edit", self.config)) + self.add_metric("table_TEDS", TableTEDSMetric("table_TEDS", self.config)) + self.add_metric("text_edit", TextEditMetric("text_edit", self.config)) def add_metric(self, name: str, metric: BaseMetric) -> None: """ @@ -298,4 +298,4 @@ def get_metric_info(self, metric_name: str) -> Optional[Dict[str, Any]]: """Get information about a specific metric.""" if metric_name in self.metrics: return self.metrics[metric_name].get_info() - return None \ No newline at end of file + return None diff --git a/webmainbench/utils/__init__.py b/webmainbench/utils/__init__.py index 0e46cb8..42127eb 100644 --- a/webmainbench/utils/__init__.py +++ b/webmainbench/utils/__init__.py @@ -4,6 +4,7 @@ from .helpers import 
setup_logging, validate_config, format_results from .main_html import extract_main_html, HTML2TextWrapper +from .html_cleaner import clean_browser_annotation_artifacts __all__ = [ "setup_logging", @@ -11,4 +12,5 @@ "format_results", "extract_main_html", "HTML2TextWrapper", -] \ No newline at end of file + "clean_browser_annotation_artifacts", +] diff --git a/webmainbench/utils/html_cleaner.py b/webmainbench/utils/html_cleaner.py new file mode 100644 index 0000000..d599416 --- /dev/null +++ b/webmainbench/utils/html_cleaner.py @@ -0,0 +1,25 @@ +""" +HTML cleanup helpers. +""" + +import re + + +_ANNOTATION_TAG_RE = re.compile( + r"</?marked-(?:text|tail)\b[^>]*>", + re.IGNORECASE, +) +_ANNO_ATTR_RE = re.compile( + r"\s+data-anno-uid(?:\s*=\s*(?:\"[^\"]*\"|'[^']*'|[^\s>]+))?", + re.IGNORECASE, +) + + +def clean_browser_annotation_artifacts(html: str) -> str: + """Remove browser annotation plugin artifacts while preserving page text.""" + if not html: + return html + + html = _ANNOTATION_TAG_RE.sub("", html) + html = _ANNO_ATTR_RE.sub("", html) + return html