From 85dc9c06d6567cd77970ed68e0218629608ea100 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 16:03:36 +0800 Subject: [PATCH 1/3] =?UTF-8?q?feat:=E6=B8=85=E7=90=86=E5=85=83=E7=B4=A0?= =?UTF-8?q?=E5=B1=9E=E6=80=A7=EF=BC=8C=E4=BF=9D=E7=95=99=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=9A=84=E6=9C=89=E6=95=88src=EF=BC=88=E6=8E=92=E9=99=A4base64?= =?UTF-8?q?=EF=BC=89=E3=80=81alt=EF=BC=8C=E4=BB=A5=E5=8F=8A=E6=89=80?= =?UTF-8?q?=E6=9C=89=E5=85=83=E7=B4=A0=E7=9A=84class=E5=92=8Cid"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../simplify_html/simplify_html.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index b47de995..94e5d629 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -417,23 +417,34 @@ def is_meaningful_content(element) -> bool: def clean_attributes(element): - """清理元素属性,只保留图片的有效src以及所有元素的class和id.""" + """清理元素属性,保留图片的有效src(排除base64)、alt,以及所有元素的class和id.""" if element.tag == 'img': + # 获取图片相关属性 src = element.get('src', '').strip() + alt = element.get('alt', '').strip() class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 - if src: + + element.attrib.clear() # 清除所有属性 + + # 保留非base64的src + if src and not src.startswith('data:image/'): element.set('src', src) + # 保留alt(如果非空) + if alt: + element.set('alt', alt) + # 保留class和id(如果非空) if class_attr: element.set('class', class_attr) if id_attr: element.set('id', id_attr) else: - # 对于其他元素,只保留class和id + # 非图片元素:只保留class和id class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 + + element.attrib.clear() # 清除所有属性 + if class_attr: element.set('class', class_attr) if id_attr: From 6cffbb6f93c4f44b24fde8964fdf44e03b3cbdb8 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 19:15:51 +0800 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20=E7=B2=BE=E7=AE=80=E6=8E=A7?= =?UTF-8?q?=E5=88=B6=E6=98=AF=E5=90=A6=E8=8E=B7=E5=8F=96XPATH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/input/pre_data_json.py | 1 + llm_web_kit/main_html_parser/parser/tag_simplifier.py | 3 ++- .../llm_web_kit/main_html_parser/parser/test_tag_simplifier.py | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index c8d602c0..2153073f 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -15,6 +15,7 @@ class PreDataJsonKey: TYPICAL_RAW_HTML = 'typical_raw_html' TYPICAL_RAW_TAG_HTML = 'typical_raw_tag_html' + IS_XPATH = True XPATH_MAPPING = 'xpath_mapping' TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典 diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index 1d705c1b..eede6cfb 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,11 +19,12 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') + is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html) + simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 3d656811..ea5d57e9 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -57,7 +57,7 @@ def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() - data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html, PreDataJsonKey.IS_XPATH: False} pre_data = PreDataJson(data_dict) pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '') From c96bbf9a242919d996ce0ccfb895fac5e30d2023 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 19:44:34 +0800 Subject: [PATCH 3/3] =?UTF-8?q?feat:=20=E7=B2=BE=E7=AE=80=E6=8E=A7?= =?UTF-8?q?=E5=88=B6=E6=98=AF=E5=90=A6=E8=8E=B7=E5=8F=96XPATH?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- llm_web_kit/input/pre_data_json.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index 2153073f..7902d093 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -15,7 +15,7 @@ class PreDataJsonKey: TYPICAL_RAW_HTML = 'typical_raw_html' TYPICAL_RAW_TAG_HTML = 'typical_raw_tag_html' - IS_XPATH = True + IS_XPATH = 'is_xpath' XPATH_MAPPING = 'xpath_mapping' TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典