Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
58 commits
Select commit Hold shift + click to select a range
b12770c
timeout_change
ddfinshes May 21, 2025
82e7018
Merge branch 'dev' of https://github.com/ddfinshes/llm-webkit-mirror …
ddfinshes May 21, 2025
4375f1a
timeout-chagened-right
ddfinshes May 21, 2025
9189918
timeout-exception
ddfinshes May 22, 2025
932eb0d
timeout-exception
ddfinshes May 22, 2025
863fbe5
Fix docstrings
ddfinshes May 22, 2025
fa69f09
Merge branch 'ccprocessor:dev' into dev
ddfinshes May 22, 2025
0133cc9
timeout change
ddfinshes May 22, 2025
557d436
timeout change
ddfinshes May 22, 2025
872a4b0
final timeout change
ddfinshes May 22, 2025
eb4988e
finall
ddfinshes May 22, 2025
88ba1c2
finall
ddfinshes May 22, 2025
a2a1959
finall
ddfinshes May 22, 2025
92639e6
finall
ddfinshes May 22, 2025
e00e760
Merge branch 'ccprocessor:dev' into dev
ddfinshes May 23, 2025
1258b41
finall
ddfinshes May 23, 2025
d90f726
finall
ddfinshes May 23, 2025
926a8f6
finall
ddfinshes May 23, 2025
173f0d9
finall
ddfinshes May 23, 2025
977e708
finall
ddfinshes May 23, 2025
254be11
finall
ddfinshes May 23, 2025
53c8322
finall
ddfinshes May 23, 2025
3250875
finall
ddfinshes May 23, 2025
8332378
finall
ddfinshes May 23, 2025
72a328a
finall
ddfinshes May 23, 2025
45ba792
finall
ddfinshes May 23, 2025
2eeb24c
finall
ddfinshes May 23, 2025
8e6c184
finall
ddfinshes May 23, 2025
1fbff3d
finall
ddfinshes May 23, 2025
c74fa27
Merge branch 'ccprocessor:dev' into dev
ddfinshes May 27, 2025
97910e2
text-marked
ddfinshes May 27, 2025
cc2bc3d
text-marked
ddfinshes May 27, 2025
ff94025
text-marked
ddfinshes May 27, 2025
05d7208
text-marked
ddfinshes May 27, 2025
8d3393b
text-marked
ddfinshes May 28, 2025
662c6af
text-marked
ddfinshes May 28, 2025
b13fc92
code-pre
ddfinshes May 28, 2025
ab936ef
code-pre
ddfinshes May 28, 2025
3349b0f
code-pre
ddfinshes May 28, 2025
153e4a1
code-pre
ddfinshes May 29, 2025
a87fb7a
code-pre
ddfinshes May 29, 2025
238e6a7
code-pre
ddfinshes May 29, 2025
89074ab
code-pre
ddfinshes May 29, 2025
b73c6dd
text-marked
ddfinshes May 29, 2025
c67d212
text-marked
ddfinshes May 29, 2025
fb91da9
Merge branch 'ccprocessor:dev' into dev
ddfinshes Jun 3, 2025
ce54387
Merge branch 'ccprocessor:dev' into dev
ddfinshes Jun 6, 2025
a181def
recognizer-bug
ddfinshes Jun 6, 2025
17dc2ea
recognizer-bug
ddfinshes Jun 6, 2025
6d53ee8
extract_modify
ddfinshes Jun 9, 2025
b1616cd
extract_modify
ddfinshes Jun 9, 2025
993e4f8
extract_modify
ddfinshes Jun 9, 2025
175cda5
extract_modify
ddfinshes Jun 9, 2025
01d7f4f
extract_modify
ddfinshes Jun 9, 2025
68fd003
extract_modify
ddfinshes Jun 9, 2025
c9fa615
extract_modify
ddfinshes Jun 13, 2025
7474f3c
extract_modify
ddfinshes Jun 13, 2025
54c8f94
extract_modify
ddfinshes Jun 13, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 14 additions & 3 deletions llm_web_kit/extractor/html/recognizer/code/classes.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,21 @@
replace_node_by_cccode
from llm_web_kit.extractor.html.recognizer.recognizer import CCTag

# Tags that are never accepted as code roots even when one of their class
# names contains 'code'; a <div> with a direct child of one of these tags is
# skipped as well (guards against lists or audio being recognised as code).
no_code_tags = [
    'audio', 'td', 'span', 'ul', 'li',
    'body', 'p', 'h1', 'h2',
]


def modify_tree(root: HtmlElement) -> None:

for maybe_code_root in root.xpath('.//*[@class]'):
assert isinstance(maybe_code_root, HtmlElement)

if not any(['code' in class_name for class_name in maybe_code_root.classes]):
continue

# 应对list或者audio被识别为code的情况
if maybe_code_root.tag in no_code_tags:
continue
if maybe_code_root.tag == 'div' and any([child.tag in no_code_tags for child in maybe_code_root.iterchildren()]) or maybe_code_root.iterchildren() is None:
continue
if len(maybe_code_root.xpath(f'.//{CCTag.CC_CODE}')) > 0:
continue

Expand All @@ -20,12 +28,15 @@ def modify_tree(root: HtmlElement) -> None:
def detect(root: HtmlElement) -> bool:
    """Report whether ``root`` holds an element that qualifies as a code block.

    Every descendant carrying a ``class`` attribute is examined. A candidate
    qualifies when at least one of its class names contains ``'code'`` and
    none of the exclusions below applies.

    :param root: tree to search.
    :return: ``True`` as soon as one qualifying element is found, otherwise
        ``False``.
    """
    for candidate in root.xpath('.//*[@class]'):
        assert isinstance(candidate, HtmlElement)

        mentions_code = any('code' in class_name for class_name in candidate.classes)
        if not mentions_code:
            continue

        # Tags such as lists or audio are never code, whatever their class says.
        if candidate.tag in no_code_tags:
            continue

        # A <div> that directly wraps one of those tags is not code either.
        if candidate.tag == 'div' and any(
            child.tag in no_code_tags for child in candidate.iterchildren()
        ):
            continue

        # Skip containers that already hold a recognised code node.
        if candidate.xpath(f'.//{CCTag.CC_CODE}'):
            continue

        return True

    return False
20 changes: 15 additions & 5 deletions llm_web_kit/extractor/html/recognizer/list.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def __extract_list_item_text_recusive(el: HtmlElement):
is_sub_sup = el.tag == 'sub' or el.tag == 'sup'
paragraph = []
result = {}

if el.tag == CCTag.CC_MATH_INLINE and el.text and el.text.strip():
paragraph.append({'c': f'${el.text}$', 't': ParagraphTextType.EQUATION_INLINE})
elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip():
Expand All @@ -146,11 +147,15 @@ def __extract_list_item_text_recusive(el: HtmlElement):
'items': []
}
for child in el.getchildren():
child_list['items'].append(__extract_list_item_text_recusive(child))
result['child_list'] = child_list
child_item = __extract_list_item_text_recusive(child)
if len(child_item) != 0:
child_list['items'].append(child_item)
if child_list['items']:
result['child_list'] = child_list
else:
if el.text and el.text.strip():
paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
if el.text != '-':
paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
el.text = None
for child in el.getchildren():
p = __extract_list_item_text_recusive(child)
Expand All @@ -160,7 +165,8 @@ def __extract_list_item_text_recusive(el: HtmlElement):
result['child_list'] = p['child_list']
# 添加子元素的文本内容
if 'c' in p:
paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
if p['c'] != '' and p['c'] != '-':
paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
if el.tag != 'li' and el.tail and el.tail.strip():
if is_sub_sup:
# 如果尾部文本跟在sub/sup后面,直接附加到最后一个文本段落中
Expand All @@ -171,13 +177,16 @@ def __extract_list_item_text_recusive(el: HtmlElement):
else:
paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
if paragraph:
# item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效
result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
return result
list_item_tags = ('li', 'dd', 'dt')
# 这里也需要加上ul,不然会导致<ul><ul><ul/><ul/>的结构的list提取不到
list_item_tags = ('li', 'dd', 'dt', 'ul', 'div')
if child.tag in list_item_tags:
paragraph = __extract_list_item_text_recusive(child)
if len(paragraph) > 0:
text_paragraph.append(paragraph)

return text_paragraph

def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> list:
Expand All @@ -190,6 +199,7 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
Returns:
list: 包含列表项内容的列表,即items
"""

content_list = []
# 处理根元素文本
if ele.text and ele.text.strip():
Expand Down
73 changes: 61 additions & 12 deletions llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,18 @@
'☁' # 云符号
]

# Other punctuation marks: when the right-hand text starts with one of these,
# or the left-hand text ends with one, __combine_text joins the two pieces
# without inserting a space.
other_symbols = [
    '“', '‘', '[', '(',
    '”', '’', '。', ',',
]

PARAGRAPH_SEPARATOR = '\n\n'

# 需要保留的html实体,例如:'>' 直接在markdown中无法渲染,需要替换为html实体
Expand All @@ -50,8 +62,9 @@
'a', 'abbr', 'acronym', 'b', 'bdo', 'big', 'br', 'button', 'cite', 'code',
'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
'textarea', 'time', 'var', 'u', 's', 'code', 'cccode-inline', 'ccmath-inline',
'marked-tail', 'marked-text'
'textarea', 'time', 'var', 'u', 's', 'cccode-inline', 'ccmath-inline',
'marked-tail', 'marked-text', 'font', 'nobr', 'bdi', 'mjx-container',
'mjx-assistive-mml', 'strike', 'wbr'
}


Expand Down Expand Up @@ -93,9 +106,16 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, H
new_html_lst = []
for html_element, raw_html_element in main_html_lst:
# 如果是字符串则转换为 HtmlElement

if self.is_cc_html(html_element):
new_html_lst.append((html_element, raw_html_element))

else:
# html_element = element_to_html_unescaped(html_element) # str
# if '<sup&gt;' in html_element:
# print('-------------------------------------')
# html_element = html.fromstring(html_element) # html_to_element
# html_element = html_to_element(html_element)
lst = list(self.__extract_paragraphs(html_element))
new_lst = self.__to_cctext_lst(lst)
new_html_lst.extend(new_lst)
Expand All @@ -108,7 +128,9 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]
lst: List[Tuple[HtmlElement | str, HtmlElement | str]]: Element和raw_html组成的列表
"""
new_lst = []

for el, raw_html in lst:

# 如果是字符串则转换为 HtmlElement
el_element = html_to_element(el) if isinstance(el, str) else el
raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html
Expand All @@ -120,20 +142,45 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]
return new_lst

def replace_entities(self, text, entities_map):
    """Replace selected characters with HTML entities, skipping HTML tags.

    Each occurrence of a key of ``entities_map`` that lies outside a
    ``<...>`` span is rewritten as ``&<name>;``. Text inside a tag is copied
    through unchanged so attribute values and tag delimiters stay intact.

    :param text: the text to process.
    :param entities_map: dict whose keys are the strings to replace and whose
        values are the matching HTML entity names (without ``&`` and ``;``).
    :return: the text with the replacements applied.
    """
    if not entities_map:
        # An empty alternation would match the empty string at every
        # position, so there is nothing sensible to substitute.
        return text

    # One alternation that matches any of the strings to be replaced.
    rx_entity = re.compile('|'.join(re.escape(str(key)) for key in entities_map))

    def to_entity(match):
        """Turn a matched string into its ``&name;`` entity."""
        return f'&{entities_map[match.group(0)]};'

    # Splitting on a capturing group keeps the tags in the result: even
    # indices hold the text between tags, odd indices hold the tags.
    pieces = re.split(r'(<[^>]*>)', text)
    pieces[::2] = [rx_entity.sub(to_entity, piece) for piece in pieces[::2]]

    return ''.join(pieces)

def __combine_text(self, text1:str, text2:str, lang='en') -> str:
"""将两段文本合并,中间加空格.
Expand All @@ -149,7 +196,8 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
txt = text1 + text2
return self.replace_entities(txt.strip(), entities_map)
else:
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols else ' '
# 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
words_sep = '' if text2[0] in string.punctuation or text2[0] in special_symbols or text2[0] in other_symbols or text1 and text1[-1] in other_symbols else ' '
txt = text1 + words_sep + text2
return self.replace_entities(txt.strip(), entities_map)

Expand All @@ -169,7 +217,6 @@ def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
para_text = []

def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:

# 标记当前元素是否是sub或sup类型
is_sub_sup = el.tag == 'sub' or el.tag == 'sup'

Expand All @@ -187,6 +234,8 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
text += PARAGRAPH_SEPARATOR # TODO 这个地方直接加换行是错误点做法,需要利用数据结构来保证段落。
elif el.tag == 'sub' or el.tag == 'sup':
text = process_sub_sup_tags(el, text, recursive=False)
elif el.tag == 'audio': # 避免audio被识别为paragraph
pass
else:
if el.text and el.text.strip():
text = self.__combine_text(text, el.text.strip())
Expand Down
Loading