diff --git a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
index efb4ac31..44801c60 100644
--- a/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
+++ b/llm_web_kit/main_html_parser/simplify_html/simplify_html.py
@@ -18,7 +18,18 @@
# 需要删除的标签
tags_to_remove = {
- 'head', 'header', 'footer', 'nav', 'aside', 'style', 'script', 'select', 'noscript', 'link', 'meta', 'iframe', 'frame'
+ 'head',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'style',
+ 'script',
+ 'noscript',
+ 'link',
+ 'meta',
+ 'iframe',
+ 'frame'
}
# 需要保留的特殊标签(即使它们是行内标签)
@@ -31,7 +42,7 @@
# 需要删除的属性名模式(特定前缀/后缀)
ATTR_SUFFIX_TO_REMOVE = {
- '-nav', '_nav',
+ # '-nav', '_nav',  # NOTE: no live entries remain, so this {} is now an empty dict, not a set
# '-footer', '_footer', # 有特例,可能dl列表一组最后一项添加了自定义footer属性,先注释
# '-header', '_header', # 有特例,可能自定义的header中有标题,先注释
}
@@ -543,9 +554,9 @@ def should_remove_element(element) -> bool:
if part in ATTR_PATTERNS_TO_REMOVE:
return True
# 检查是否包含特定前缀/后缀
- for pattern in ATTR_SUFFIX_TO_REMOVE:
- if pattern in part:
- return True
+ # for pattern in ATTR_SUFFIX_TO_REMOVE:
+ # if part.endswith(pattern):
+ # return True
# 检查id属性
id_name = element.get('id', '')
@@ -556,9 +567,9 @@ def should_remove_element(element) -> bool:
if part in ATTR_PATTERNS_TO_REMOVE:
return True
# 检查是否包含特定前缀/后缀
- for pattern in ATTR_SUFFIX_TO_REMOVE:
- if pattern in part:
- return True
+ # for pattern in ATTR_SUFFIX_TO_REMOVE:
+ # if part.endswith(pattern):
+ # return True
# 检查style属性
style_attr = element.get('style', '')
@@ -647,9 +658,9 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html
content_type = para.get('content_type', 'block_element')
# 公共处理步骤
- clean_attributes(root)
+ # clean_attributes(root)
simplify_list(root)
- remove_inline_tags(root)
+ # remove_inline_tags(root)
# 跳过无意义内容
if not is_meaningful_content(root):
diff --git a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json
index f2bf5c1b..abb40deb 100644
--- a/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json
+++ b/tests/llm_web_kit/main_html_parser/parser/assets/input_layout_batch_parser/template_www.wdi.it_llm.json
@@ -5,9 +5,9 @@
"item_id 4": "No",
"item_id 5": "No",
"item_id 6": "No",
- "item_id 7": "Yes",
+ "item_id 7": "No",
"item_id 8": "No",
- "item_id 9": "No",
+ "item_id 9": "Yes",
"item_id 10": "No",
"item_id 11": "No",
"item_id 12": "No",
@@ -31,5 +31,9 @@
"item_id 30": "No",
"item_id 31": "No",
"item_id 32": "No",
- "item_id 33": "No"
- }
\ No newline at end of file
+ "item_id 33": "No",
+ "item_id 34": "No",
+ "item_id 35": "No",
+ "item_id 36": "No",
+ "item_id 37": "No"
+}
\ No newline at end of file