ccprocessor · yogacc33 · May 26, 2025 · May 21, 2025 · May 21, 2025 · May 21, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/code/common.py b/llm_web_kit/extractor/html/recognizer/code/common.py
@@ -161,22 +161,18 @@ def _detect_and_remove_subling_lineno(node: HtmlElement, depth: int = 4):
     if depth == 0 or node is None or node.getparent() is None:
         return
 
-    found = False
-    # 认为只有代码元素左侧的元素可能是行号，避免对无关的表格进行损坏
-    ele_before = None
-    for child in node.getparent():
-        if child == node:
-            if ele_before is None:
-                break
-            has_lineno, _ = _detect_lineno('\n'.join(ele_before.itertext()), False)
-            if has_lineno:
-                node.getparent().remove(ele_before)
-                found = True
-            break
-        ele_before = child
-
-    if not found:
-        _detect_and_remove_subling_lineno(node.getparent(), depth - 1)
+    parent = node.getparent()
+    ele_before = node.getprevious()
+
+    if ele_before is not None:
+        text = '\n'.join(ele_before.itertext())
+        has_lineno, _ = _detect_lineno(text, False)
+        if has_lineno:
+            parent.remove(ele_before)
+            return  # return right after the removal; do not recurse any further
+
+    # Nothing was removed at this level: continue with the parent node
+    _detect_and_remove_subling_lineno(parent, depth - 1)
 
 
 def get_full_text(sub_tree: HtmlElement) -> tuple[bool, str, str]:

diff --git a/llm_web_kit/extractor/html/recognizer/code/tag_code.py b/llm_web_kit/extractor/html/recognizer/code/tag_code.py
@@ -1,3 +1,4 @@
+from collections import deque
 from typing import Optional
 
 from lxml.html import HtmlElement
@@ -12,13 +13,29 @@
 
 
 def __is_all_chars_in_code_element(node: HtmlElement) -> bool:
-    full_text = ''.join([x for x in ''.join(node.itertext(None)) if not x.isspace() and not x.isdigit()])
-    code_text = ''
-    for s in node.xpath('.//code//text()'):
-        for c in s:
-            if not c.isspace() and not c.isdigit():
-                code_text += c
-    return full_text == code_text
+    """Return True if all non-space, non-digit text under node is inside <code>."""
+    # A <code> element is accepted outright, without comparing any text.
+    if node.tag == 'code':
+        return True
+
+    # Both streams are lazy so the comparison can stop at the first mismatch.
+    full_chars = (
+        c for text in node.itertext()
+        for c in text
+        if not c.isspace() and not c.isdigit()
+    )
+    code_chars = (
+        c for text in node.xpath('.//code//text()')
+        for c in text
+        if not c.isspace() and not c.isdigit()
+    )
+
+    # Drive the loop from full_chars: zip() would pull (and lose) one of its
+    # chars once code_chars ran dry, hiding trailing text outside <code>.
+    for f in full_chars:
+        if f != next(code_chars, None):
+            return False
+    return next(code_chars, None) is None
 
 
 def __get_code_nodes(html_el: HtmlElement) -> list[HtmlElement]:
@@ -42,6 +59,7 @@
             nodes.append(code_node)
         else:
             nodes.extend(__get_code_nodes(code_node))
+
     return nodes
 
 
@@ -85,9 +103,26 @@
 
 
 def __group_code(nodes: list[HtmlElement]) -> list[HtmlElement]:
+    """Extract the root nodes that contain <code> tags from a list of HtmlElement.
+
+    Args:
+        nodes: the input list of HtmlElement
+    Returns:
+        the list of root nodes that contain <code> tags
+    """
     root_nodes: list[HtmlElement] = []
+    processed = set()
+    nodes_deque = deque(nodes)
 
     def next_parent(code_node: HtmlElement, code_tags: int) -> tuple[Optional[HtmlElement], int]:
+        """Find the first ancestor whose number of <code> tags differs.
+
+        Args:
+            code_node: the current node
+            code_tags: the number of <code> tags of the current node
+        Returns:
+            (parent, the parent's number of <code> tags), or (None, 0) if no ancestor qualifies
+        """
         parent: Optional[HtmlElement] = code_node.getparent()
         while parent is not None:
             new_code_tags = len(parent.xpath('.//code'))
@@ -97,33 +132,40 @@
                 return parent, new_code_tags
         return None, 0
 
-    while len(nodes):
-        code_node = nodes[0]
+    def get_descendants(node: HtmlElement) -> set:
+        """Collect the ids of all descendant nodes of a node.
+
+        Args:
+            node: the current node
+        Returns:
+            the set of id() values of the descendant nodes
+        """
+        descendants = set()
+        for child in node.iterdescendants():
+            descendants.add(id(child))
+        return descendants
+
+    while nodes_deque:
+        code_node = nodes_deque.popleft()
+        if id(code_node) in processed:
+            continue
+
         code_tags = len(code_node.xpath('.//code'))
 
         parent, new_code_tags = next_parent(code_node, code_tags)
         while parent is not None:
             if not __is_all_chars_in_code_element(parent):
                 break
-
             if len(parent.xpath(f'.//{CCTag.CC_CODE}|.//{CCTag.CC_CODE_INLINE}')) > 0:
                 break
-
             code_node = parent
             code_tags = new_code_tags
-
             parent, new_code_tags = next_parent(code_node, code_tags)
 
-        root_path: str = code_node.getroottree().getpath(code_node)
         root_nodes.append(code_node)
-
-        new_nodes: list[HtmlElement] = []
-        for node in nodes:
-            node_path: str = node.getroottree().getpath(node)
-            if node_path.startswith(root_path):
-                continue
-            new_nodes.append(node)
-        nodes = new_nodes
+        processed.add(id(code_node))
+        descendants = get_descendants(code_node)
+        processed.update(descendants)
 
     return root_nodes