ccprocessor · ddfinshes · May 21, 2025 · May 21, 2025 · May 21, 2025 · May 22, 2025
diff --git a/llm_web_kit/extractor/html/recognizer/code/classes.py b/llm_web_kit/extractor/html/recognizer/code/classes.py
@@ -4,13 +4,21 @@
     replace_node_by_cccode
 from llm_web_kit.extractor.html.recognizer.recognizer import CCTag
 
+no_code_tags = ['audio', 'td', 'span','ul', 'li', 'body', 'p', 'h1', 'h2']
+
 
 def modify_tree(root: HtmlElement) -> None:
+
     for maybe_code_root in root.xpath('.//*[@class]'):
         assert isinstance(maybe_code_root, HtmlElement)
+
         if not any(['code' in class_name for class_name in maybe_code_root.classes]):
             continue
-
+        # 应对list或者audio被识别为code的情况
+        if maybe_code_root.tag in no_code_tags:
+            continue
+        if maybe_code_root.tag == 'div' and any([child.tag in no_code_tags for child in maybe_code_root.iterchildren()]) or maybe_code_root.iterchildren() is None:
+            continue
         if len(maybe_code_root.xpath(f'.//{CCTag.CC_CODE}')) > 0:
             continue
 
@@ -20,12 +28,15 @@ def modify_tree(root: HtmlElement) -> None:
 def detect(root: HtmlElement) -> bool:
     for maybe_code_root in root.xpath('.//*[@class]'):
         assert isinstance(maybe_code_root, HtmlElement)
+
         if not any(['code' in class_name for class_name in maybe_code_root.classes]):
             continue
-
+        if maybe_code_root.tag in no_code_tags:
+            continue
+        if maybe_code_root.tag == 'div' and any([child.tag in no_code_tags for child in maybe_code_root.iterchildren()]):
+            continue
         if len(maybe_code_root.xpath(f'.//{CCTag.CC_CODE}')) > 0:
             continue
-
         return True
 
     return False
diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py
@@ -124,6 +124,7 @@ def __extract_list_item_text_recusive(el: HtmlElement):
             is_sub_sup = el.tag == 'sub' or el.tag == 'sup'
             paragraph = []
             result = {}
+
             if el.tag == CCTag.CC_MATH_INLINE and el.text and el.text.strip():
                 paragraph.append({'c': f'${el.text}$', 't': ParagraphTextType.EQUATION_INLINE})
             elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip():
@@ -146,11 +147,15 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                     'items': []
                 }
                 for child in el.getchildren():
-                    child_list['items'].append(__extract_list_item_text_recusive(child))
-                result['child_list'] = child_list
+                    child_item = __extract_list_item_text_recusive(child)
+                    if len(child_item) != 0:
+                        child_list['items'].append(child_item)
+                if child_list['items']:
+                    result['child_list'] = child_list
             else:
                 if el.text and el.text.strip():
-                    paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
+                    if el.text != '-':
+                        paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
                     el.text = None
                 for child in el.getchildren():
                     p = __extract_list_item_text_recusive(child)
@@ -160,7 +165,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                             result['child_list'] = p['child_list']
                         # 添加子元素的文本内容
                         if 'c' in p:
-                            paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
+                            if p['c'] != '' and p['c'] != '-':
+                                paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
             if el.tag != 'li' and el.tail and el.tail.strip():
                 if is_sub_sup:
                     # 如果尾部文本跟在sub/sup后面，直接附加到最后一个文本段落中
@@ -171,13 +177,16 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                 else:
                     paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
             if paragraph:
+                # NOTE: item['c'].strip() discards the \n\n that was appended earlier when handling <br> tags
                 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
             return result
-        list_item_tags = ('li', 'dd', 'dt')
+        # 'ul' must be accepted here too ('div' is added as well); without 'ul', directly nested lists such as <ul><ul></ul></ul> yield no items
+        list_item_tags = ('li', 'dd', 'dt', 'ul', 'div')
         if child.tag in list_item_tags:
             paragraph = __extract_list_item_text_recusive(child)
             if len(paragraph) > 0:
                 text_paragraph.append(paragraph)
+
         return text_paragraph
 
     def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> list:
@@ -190,6 +199,7 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
         Returns:
             list: 包含列表项内容的列表，即items
         """
+
         content_list = []
         # 处理根元素文本
         if ele.text and ele.text.strip():

diff --git a/llm_web_kit/extractor/html/recognizer/text.py b/llm_web_kit/extractor/html/recognizer/text.py
@@ -40,6 +40,18 @@
     '☁'    # 云符号
 ]
 
+# Other punctuation marks (quotation marks, opening brackets, CJK full stop and comma)
+other_symbols = [
+    '“',
+    '‘',
+    '[',
+    '(',
+    '”',
+    '’',
+    '。',
+    '，'
+]
+
 PARAGRAPH_SEPARATOR = '\n\n'
 
 # 需要保留的html实体，例如：'>' 直接在markdown中无法渲染，需要替换为html实体
@@ -50,8 +62,9 @@
     'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
     'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
     'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
-    'textarea', 'time', 'var', 'u', 's', 'code', 'cccode-inline', 'ccmath-inline',
-    'marked-tail', 'marked-text'
+    'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline',
+    'marked-tail', 'marked-text', 'font', 'nobr', 'bdi', 'mjx-container',
+    'mjx-assistive-mml', 'strike', 'wbr'
 }
 
 
@@ -93,9 +106,16 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, H
         new_html_lst = []
         for html_element, raw_html_element in main_html_lst:
             # 如果是字符串则转换为 HtmlElement
+
             if self.is_cc_html(html_element):
                 new_html_lst.append((html_element, raw_html_element))
+
             else:
+                # html_element = element_to_html_unescaped(html_element) # str
+                # if '<sup&gt;' in html_element:
+                #     print('-------------------------------------')
+                # html_element = html.fromstring(html_element) # html_to_element
+                # html_element = html_to_element(html_element)
                 lst = list(self.__extract_paragraphs(html_element))
                 new_lst = self.__to_cctext_lst(lst)
                 new_html_lst.extend(new_lst)
@@ -108,7 +128,9 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]
             lst: List[Tuple[HtmlElement | str, HtmlElement | str]]: Element和raw_html组成的列表
         """
         new_lst = []
+
         for el, raw_html in lst:
+
             # 如果是字符串则转换为 HtmlElement
             el_element = html_to_element(el) if isinstance(el, str) else el
             raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html
@@ -120,20 +142,45 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]
         return new_lst
 
     def replace_entities(self, text, entities_map):
-        """使用正则表达式同时替换文本中的多个特定字符为其对应的HTML实体。
+        """替换文本中指定字符为对应的HTML实体，但跳过HTML标签内的字符。
 
         :param text: 需要处理的文本。
-        :param entities_map: 一个字典，键是需要替换的字符，值是对应的HTML实体名
+        :param entities_map: 字典，键是要替换的字符，值是对应的HTML实体名。
         :return: 替换后的文本。
         """
-        # 创建正则表达式模式，匹配所有需要替换的字符
-        rx = re.compile('|'.join(re.escape(str(key)) for key in entities_map.keys()))
+        if not entities_map:
+            return text  # 如果字典为空，直接返回原文本
+
+        # 构建匹配需要替换字符的正则表达式
+        entities_pattern = '|'.join(re.escape(str(key)) for key in entities_map.keys())
+        rx_entity = re.compile(entities_pattern)
 
-        def one_xlat(match):
-            """回调函数，用于将匹配到的字符替换为对应的HTML实体。"""
-            return f'&{entities_map[match.group(0)]};'
+        # 构建匹配HTML标签的正则表达式
+        rx_tag = re.compile(r'<[^>]*>')
 
-        return rx.sub(one_xlat, text)
+        result = []
+        last_pos = 0
+
+        # 遍历所有HTML标签
+        for tag_match in rx_tag.finditer(text):
+            start, end = tag_match.start(), tag_match.end()
+
+            # 提取非标签部分并进行替换
+            non_tag_part = text[last_pos:start]
+            replaced = rx_entity.sub(lambda m: f'&{entities_map[m.group(0)]};', non_tag_part)
+            result.append(replaced)
+
+            # 保留HTML标签不变
+            result.append(text[start:end])
+
+            last_pos = end
+
+        # 处理最后剩余的非标签部分
+        non_tag_part = text[last_pos:]
+        replaced = rx_entity.sub(lambda m: f'&{entities_map[m.group(0)]};', non_tag_part)
+        result.append(replaced)
+
+        return ''.join(result)
 
     def __combine_text(self, text1:str, text2:str, lang='en') -> str:
         """将两段文本合并，中间加空格.
@@ -149,7 +196,8 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
             txt = text1 + text2
             return self.replace_entities(txt.strip(), entities_map)
         else:
-            words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
+            # 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
+            words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols or text2[0] in other_symbols or text1 and text1[-1] in other_symbols else ' '
             txt = text1 + words_sep + text2
             return self.replace_entities(txt.strip(), entities_map)
 
@@ -169,7 +217,6 @@ def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
         para_text = []
 
         def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
-
             # 标记当前元素是否是sub或sup类型
             is_sub_sup = el.tag == 'sub' or el.tag == 'sup'
 
@@ -187,6 +234,8 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
                 text += PARAGRAPH_SEPARATOR  # TODO 这个地方直接加换行是错误点做法，需要利用数据结构来保证段落。
             elif el.tag == 'sub' or el.tag == 'sup':
                 text = process_sub_sup_tags(el, text, recursive=False)
+            elif el.tag == 'audio':  # 避免audio被识别为paragraph
+                pass
             else:
                 if el.text and el.text.strip():
                     text = self.__combine_text(text, el.text.strip())