Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 21 additions & 10 deletions llm_web_kit/main_html_parser/simplify_html/simplify_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,18 @@

# 需要删除的标签
tags_to_remove = {
- 'head', 'header', 'footer', 'nav', 'aside', 'style', 'script', 'select', 'noscript', 'link', 'meta', 'iframe', 'frame'
+ 'head',
+ 'header',
+ 'footer',
+ 'nav',
+ 'aside',
+ 'style',
+ 'script',
+ 'noscript',
+ 'link',
+ 'meta',
+ 'iframe',
+ 'frame'
}

# 需要保留的特殊标签(即使它们是行内标签)
Expand All @@ -31,7 +42,7 @@

# 需要删除的属性名模式(特定前缀/后缀)
ATTR_SUFFIX_TO_REMOVE = {
- '-nav', '_nav',
+ # '-nav', '_nav',
# '-footer', '_footer', # 有特例,可能dl列表一组最后一项添加了自定义footer属性,先注释
# '-header', '_header', # 有特例,可能自定义的header中有标题,先注释
}
Expand Down Expand Up @@ -543,9 +554,9 @@ def should_remove_element(element) -> bool:
if part in ATTR_PATTERNS_TO_REMOVE:
return True
# 检查是否包含特定前缀/后缀
- for pattern in ATTR_SUFFIX_TO_REMOVE:
- if pattern in part:
- return True
+ # for pattern in ATTR_SUFFIX_TO_REMOVE:
+ # if part.endswith(pattern):
+ # return True

# 检查id属性
id_name = element.get('id', '')
Expand All @@ -556,9 +567,9 @@ def should_remove_element(element) -> bool:
if part in ATTR_PATTERNS_TO_REMOVE:
return True
# 检查是否包含特定前缀/后缀
- for pattern in ATTR_SUFFIX_TO_REMOVE:
- if pattern in part:
- return True
+ # for pattern in ATTR_SUFFIX_TO_REMOVE:
+ # if part.endswith(pattern):
+ # return True

# 检查style属性
style_attr = element.get('style', '')
Expand Down Expand Up @@ -647,9 +658,9 @@ def process_paragraphs(paragraphs: List[Dict[str, str]], uid_map: Dict[str, html
content_type = para.get('content_type', 'block_element')

# 公共处理步骤
- clean_attributes(root)
+ # clean_attributes(root)
simplify_list(root)
- remove_inline_tags(root)
+ # remove_inline_tags(root)

# 跳过无意义内容
if not is_meaningful_content(root):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@
"item_id 4": "No",
"item_id 5": "No",
"item_id 6": "No",
- "item_id 7": "Yes",
+ "item_id 7": "No",
"item_id 8": "No",
- "item_id 9": "No",
+ "item_id 9": "Yes",
"item_id 10": "No",
"item_id 11": "No",
"item_id 12": "No",
Expand All @@ -31,5 +31,9 @@
"item_id 30": "No",
"item_id 31": "No",
"item_id 32": "No",
- "item_id 33": "No"
- }
+ "item_id 33": "No",
+ "item_id 34": "No",
+ "item_id 35": "No",
+ "item_id 36": "No",
+ "item_id 37": "No"
+ }