diff --git a/llm_web_kit/extractor/html/recognizer/code/classes.py b/llm_web_kit/extractor/html/recognizer/code/classes.py index b71af41a..a228e736 100644 --- a/llm_web_kit/extractor/html/recognizer/code/classes.py +++ b/llm_web_kit/extractor/html/recognizer/code/classes.py @@ -4,13 +4,21 @@ replace_node_by_cccode from llm_web_kit.extractor.html.recognizer.recognizer import CCTag +no_code_tags = ['audio', 'td', 'span','ul', 'li', 'body', 'p', 'h1', 'h2'] + def modify_tree(root: HtmlElement) -> None: + for maybe_code_root in root.xpath('.//*[@class]'): assert isinstance(maybe_code_root, HtmlElement) + if not any(['code' in class_name for class_name in maybe_code_root.classes]): continue - + # 应对list或者audio被识别为code的情况 + if maybe_code_root.tag in no_code_tags: + continue + if maybe_code_root.tag == 'div' and any([child.tag in no_code_tags for child in maybe_code_root.iterchildren()]) or maybe_code_root.iterchildren() is None: + continue if len(maybe_code_root.xpath(f'.//{CCTag.CC_CODE}')) > 0: continue @@ -20,12 +28,15 @@ def modify_tree(root: HtmlElement) -> None: def detect(root: HtmlElement) -> bool: for maybe_code_root in root.xpath('.//*[@class]'): assert isinstance(maybe_code_root, HtmlElement) + if not any(['code' in class_name for class_name in maybe_code_root.classes]): continue - + if maybe_code_root.tag in no_code_tags: + continue + if maybe_code_root.tag == 'div' and any([child.tag in no_code_tags for child in maybe_code_root.iterchildren()]): + continue if len(maybe_code_root.xpath(f'.//{CCTag.CC_CODE}')) > 0: continue - return True return False diff --git a/llm_web_kit/extractor/html/recognizer/list.py b/llm_web_kit/extractor/html/recognizer/list.py index 8ee3b136..0feb7965 100644 --- a/llm_web_kit/extractor/html/recognizer/list.py +++ b/llm_web_kit/extractor/html/recognizer/list.py @@ -124,6 +124,7 @@ def __extract_list_item_text_recusive(el: HtmlElement): is_sub_sup = el.tag == 'sub' or el.tag == 'sup' paragraph = [] result = {} + if el.tag == CCTag.CC_MATH_INLINE and el.text and el.text.strip(): paragraph.append({'c': f'${el.text}$', 't': ParagraphTextType.EQUATION_INLINE}) elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip(): @@ -146,11 +147,15 @@ def __extract_list_item_text_recusive(el: HtmlElement): 'items': [] } for child in el.getchildren(): - child_list['items'].append(__extract_list_item_text_recusive(child)) - result['child_list'] = child_list + child_item = __extract_list_item_text_recusive(child) + if len(child_item) != 0: + child_list['items'].append(child_item) + if child_list['items']: + result['child_list'] = child_list else: if el.text and el.text.strip(): - paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT}) + if el.text != '-': + paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT}) el.text = None for child in el.getchildren(): p = __extract_list_item_text_recusive(child) @@ -160,7 +165,8 @@ def __extract_list_item_text_recusive(el: HtmlElement): result['child_list'] = p['child_list'] # 添加子元素的文本内容 if 'c' in p: - paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)}) + if p['c'] != '' and p['c'] != '-': + paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)}) if el.tag != 'li' and el.tail and el.tail.strip(): if is_sub_sup: # 如果尾部文本跟在sub/sup后面,直接附加到最后一个文本段落中 @@ -171,13 +177,16 @@ def __extract_list_item_text_recusive(el: HtmlElement): else: paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT}) if paragraph: + # item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph) return result - list_item_tags = ('li', 'dd', 'dt') + # 这里也需要加上ul,不然会导致