From 85dc9c06d6567cd77970ed68e0218629608ea100 Mon Sep 17 00:00:00 2001 From: houlinfeng Date: Mon, 9 Jun 2025 16:03:36 +0800 Subject: [PATCH] =?UTF-8?q?feat:=E6=B8=85=E7=90=86=E5=85=83=E7=B4=A0?= =?UTF-8?q?=E5=B1=9E=E6=80=A7=EF=BC=8C=E4=BF=9D=E7=95=99=E5=9B=BE=E7=89=87?= =?UTF-8?q?=E7=9A=84=E6=9C=89=E6=95=88src=EF=BC=88=E6=8E=92=E9=99=A4base64?= =?UTF-8?q?=EF=BC=89=E3=80=81alt=EF=BC=8C=E4=BB=A5=E5=8F=8A=E6=89=80?= =?UTF-8?q?=E6=9C=89=E5=85=83=E7=B4=A0=E7=9A=84class=E5=92=8Cid"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../simplify_html/simplify_html.py | 21 ++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py index b47de995..94e5d629 100644 --- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py +++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py @@ -417,23 +417,34 @@ def is_meaningful_content(element) -> bool: def clean_attributes(element): - """清理元素属性,只保留图片的有效src以及所有元素的class和id.""" + """清理元素属性,保留图片的有效src(排除base64)、alt,以及所有元素的class和id.""" if element.tag == 'img': + # 获取图片相关属性 src = element.get('src', '').strip() + alt = element.get('alt', '').strip() class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 - if src: + + element.attrib.clear() # 清除所有属性 + + # 保留非base64的src + if src and not src.startswith('data:image/'): element.set('src', src) + # 保留alt(如果非空) + if alt: + element.set('alt', alt) + # 保留class和id(如果非空) if class_attr: element.set('class', class_attr) if id_attr: element.set('id', id_attr) else: - # 对于其他元素,只保留class和id + # 非图片元素:只保留class和id class_attr = element.get('class', '').strip() id_attr = element.get('id', '').strip() - element.attrib.clear() # 先清除所有属性 + + element.attrib.clear() # 清除所有属性 + if class_attr: element.set('class', class_attr) if id_attr: