Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 12 additions & 16 deletions llm_web_kit/extractor/html/recognizer/code/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,22 +161,18 @@ def _detect_and_remove_subling_lineno(node: HtmlElement, depth: int = 4):
if depth == 0 or node is None or node.getparent() is None:
return

found = False
# 认为只有代码元素左侧的元素可能是行号,避免对无关的表格进行损坏
ele_before = None
for child in node.getparent():
if child == node:
if ele_before is None:
break
has_lineno, _ = _detect_lineno('\n'.join(ele_before.itertext()), False)
if has_lineno:
node.getparent().remove(ele_before)
found = True
break
ele_before = child

if not found:
_detect_and_remove_subling_lineno(node.getparent(), depth - 1)
parent = node.getparent()
ele_before = node.getprevious()

if ele_before is not None:
text = '\n'.join(ele_before.itertext())
has_lineno, _ = _detect_lineno(text, False)
if has_lineno:
parent.remove(ele_before)
return # 删除后立即返回,不再递归

# 继续递归父节点
_detect_and_remove_subling_lineno(parent, depth - 1)


def get_full_text(sub_tree: HtmlElement) -> tuple[bool, str, str]:
Expand Down
84 changes: 63 additions & 21 deletions llm_web_kit/extractor/html/recognizer/code/tag_code.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import deque
from typing import Optional

from lxml.html import HtmlElement
Expand All @@ -12,13 +13,29 @@


def __is_all_chars_in_code_element(node: HtmlElement) -> bool:
    """Return whether all significant text under ``node`` lies inside ``<code>``.

    Whitespace and digit characters are ignored on both sides of the
    comparison; every other character must match, in document order.

    Args:
        node: Element whose text content is inspected.

    Returns:
        True if ``node`` is itself a ``<code>`` element, or if the filtered
        text of ``node`` is identical to the filtered text of its ``<code>``
        descendants; False otherwise.
    """
    # Function-scope import keeps this block self-contained.
    from itertools import zip_longest

    if node.tag == 'code':
        return True

    def significant_chars(chunks):
        """Lazily yield the non-space, non-digit characters of each chunk."""
        return (
            ch
            for chunk in chunks
            for ch in chunk
            if not ch.isspace() and not ch.isdigit()
        )

    full_chars = significant_chars(node.itertext())
    code_chars = significant_chars(node.xpath('.//code//text()'))

    # Plain zip() pulls one item from the first stream before it notices the
    # second one is exhausted, so a single trailing character outside <code>
    # would be dropped silently and the check would wrongly succeed.
    # zip_longest pads the shorter stream with a sentinel that never equals a
    # character, which turns any length mismatch into an explicit inequality.
    exhausted = object()
    return all(
        full_ch == code_ch
        for full_ch, code_ch in zip_longest(full_chars, code_chars, fillvalue=exhausted)
    )


def __get_code_nodes(html_el: HtmlElement) -> list[HtmlElement]:
Expand All @@ -42,6 +59,7 @@
nodes.append(code_node)
else:
nodes.extend(__get_code_nodes(code_node))

return nodes


Expand Down Expand Up @@ -85,9 +103,26 @@


def __group_code(nodes: list[HtmlElement]) -> list[HtmlElement]:
"""从 HtmlElement 列表中提取包含 <code> 标签的根节点。

Args:
nodes: 输入的 HtmlElement 列表
Returns:
包含 <code> 标签的根节点列表
"""
root_nodes: list[HtmlElement] = []
processed = set()
nodes_deque = deque(nodes)

def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlElement], int]:
"""查找父节点中第一个 <code> 标签数量不同的节点。

Args:
code_node: 当前节点
code_tags: 当前节点的 <code> 标签数量
Returns:
(父节点, 父节点的 <code> 标签数量),若无符合条件的父节点则返回 (None, 0)
"""
parent: Optional[HtmlElement] = code_node.getparent()
while parent is not None:
new_code_tags = len(parent.xpath('.//code'))
Expand All @@ -97,33 +132,40 @@
return parent, new_code_tags
return None, 0

while len(nodes):
code_node = nodes[0]
def get_descendants(node: HtmlElement) -> set:
"""获取节点的所有后代节点的 id 集合。

Args:
node: 当前节点
Returns:
后代节点的 id 集合
"""
descendants = set()
for child in node.iterdescendants():
descendants.add(id(child))
return descendants

while nodes_deque:
code_node = nodes_deque.popleft()
if id(code_node) in processed:
continue

code_tags = len(code_node.xpath('.//code'))

parent, new_code_tags = next_parent(code_node, code_tags)
while parent is not None:
if not __is_all_chars_in_code_element(parent):
break

if len(parent.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0:
break

code_node = parent
code_tags = new_code_tags

parent, new_code_tags = next_parent(code_node, code_tags)

root_path: str = code_node.getroottree().getpath(code_node)
root_nodes.append(code_node)

new_nodes: list[HtmlElement] = []
for node in nodes:
node_path: str = node.getroottree().getpath(node)
if node_path.startswith(root_path):
continue
new_nodes.append(node)
nodes = new_nodes
processed.add(id(code_node))
descendants = get_descendants(code_node)
processed.update(descendants)

return root_nodes

Expand Down
Loading