Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 5 additions & 15 deletions llm_web_kit/extractor/html/recognizer/cc_math/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,23 +286,11 @@ def equation_type_to_tag(self, type_math_type: List[Tuple[str, str]]) -> List[Tu

def mml_to_latex(self, mml_code):
# Remove any attributes from the math tag
mml_code = re.sub(r'(<math.*?>)', r'\1', mml_code)
mml_ns = mml_code.replace('<math>', '<math xmlns="http://www.w3.org/1998/Math/MathML">') # Required.

mml_ns = re.sub(r'<math.*?>', '<math xmlns="http://www.w3.org/1998/Math/MathML">', mml_code)
# mml_ns = mml_code
mml_ns = mml_ns.replace('&quot;', '"')
mml_ns = mml_ns.replace("'\\\"", '"').replace("\\\"'", '"')

# 很多网页中标签内容就是错误
# pattern = r"(<[^<>]*?\s)(mathbackground|mathsize|mathvariant|mathfamily|class|separators|style|id|rowalign|columnspacing|rowlines|columnlines|frame|framespacing|equalrows|equalcolumns|align|linethickness|lspace|rspace|mathcolor|rowspacing|displaystyle|style|columnalign|open|close|right|left)(?=\s|>)(?![\"'][^<>]*?>)"
# def replace_attr(match):
# tag_start = match.group(1) # 标签开始部分和空格
# attr_name = match.group(2) # 属性名
# return f'{tag_start}{attr_name}=\"\" '
# # 替换文本
# mml_ns = re.sub(pattern, replace_attr, mml_ns, re.S)
# mml_ns = re.sub(pattern, replace_attr, mml_ns, re.S)
# mml_ns = re.sub(pattern, replace_attr, mml_ns, re.S)

pattern = r'"([^"]+?)\''
mml_ns = re.sub(pattern, r'"\1"', mml_ns)
mml_ns = re.sub(r'<mspace[^>]*>.*?</mspace>', '', mml_ns, flags=re.DOTALL)
Expand All @@ -313,9 +301,11 @@ def mml_to_latex(self, mml_code):
# 提前修复已知的一些利用XSLT方法转换的错误
mml_str = self.fix_mathml_superscript(mml_str)
mml_element = etree.fromstring(mml_str)
# 使用兼容的元素进行转换
print(f'Processing MathML: {etree.tostring(mml_element, encoding="unicode", pretty_print=True)}')
mmldom = transform(mml_element)
print(f'After XSLT transformation: {str(mmldom)}')
latex_code = str(mmldom)
print(f'latex_code: {latex_code}')
return latex_code

def fix_mathml_superscript(self, mathml_str):
Expand Down
10 changes: 10 additions & 0 deletions llm_web_kit/extractor/html/recognizer/cc_math/mmltex/README2
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
This file is not part of the original source code.

Links to the SourceForge project and to the archived project web page (found through research):

https://sourceforge.net/projects/xsltml/files/xsltml/

https://web.archive.org/web/20160109063934/http://www.raleigh.ru/MathML/mmltex/index.php

English version of the archived page (via Google Translate):
https://translate.google.com/translate?sl=ru&tl=en&u=https%3A%2F%2Fweb.archive.org%2Fweb%2F20160114170851%2Fhttp%3A%2F%2Fwww.raleigh.ru%2FMathML%2Fmmltex%2Findex.php
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,6 @@
<xsl:when test='starts-with($content,")")'><xsl:value-of select='"\right)"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, ')')"/></xsl:call-template></xsl:when>
<xsl:when test='starts-with($content,"[")'><xsl:value-of select='"\left["' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '[')"/></xsl:call-template></xsl:when>
<xsl:when test='starts-with($content,"]")'><xsl:value-of select='"\right]"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, ']')"/></xsl:call-template></xsl:when>
<xsl:when test='starts-with($content,"{")'><xsl:value-of select='"\left\{"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '{')"/></xsl:call-template></xsl:when>
<xsl:when test='starts-with($content,"}")'><xsl:value-of select='"\right\}"' /><xsl:call-template name="replaceEntities"><xsl:with-param name="content" select="substring-after($content, '}')"/></xsl:call-template></xsl:when>


<xsl:otherwise>
<xsl:value-of select="substring($content,1,1)"/>
Expand Down
15 changes: 14 additions & 1 deletion llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,17 @@


def modify_tree(cm: CCMATH, math_render: str, o_html: str, node: HtmlElement, parent: HtmlElement):
"""
这段代码主要用于将 HTML 中的 MathML 数学公式节点,识别并替换为自定义的 cc 数学标签(如 <ccmath-interline>),以便后续结构化处理。
其核心流程如下:
1.查找 MathML节点中的 LaTeX 注释:优先查找 annotation 标签(application/x-tex),如果有则提取 LaTeX 公式。
2.判断数学公式类型:通过 CCMATH.get_equation_type 判断公式类型(如行内/行间、LaTeX/MathML)。
3.封装为 cc 标签:将提取到的公式内容用 wrap_math_md 包装,并用 build_cc_element 构造自定义 cc 标签节点,替换原有的 MathML 节点。
4.兼容 alttext 属性:如果节点有 alttext 属性,也会优先用其内容。
5.MathML 转 LaTeX:如果没有 LaTeX 注释,则尝试将 MathML 转为 LaTeX,再封装为 cc 标签。
6.异常处理:如有异常,抛出自定义异常。
这样做的目的是将原始 HTML 里的数学公式统一转换为项目自定义的结构化标签,便于后续内容抽取和处理。
"""
try:
annotation_tags = node.xpath('.//*[local-name()="annotation"][@encoding="application/x-tex"]')
math_type = MathType.MATHML
Expand Down Expand Up @@ -52,7 +63,9 @@
mathml = re.sub(r'</(\w+):', '</', mathml) # remove any /prefix:mi
mathml = re.sub(r'([^\s])\s+([^\s])', r'\1 \2', mathml) # remove extra spaces

# print("Before mml_to_latex:", mathml)
latex = cm.mml_to_latex(mathml)
# print("After mml_to_latex:", latex)
text = cm.wrap_math_md(latex)
if text:
# Set the html of the new span tag to the text
Expand All @@ -66,4 +79,4 @@
html = '<math xmlns="http://www.w3.org/1998/Math/MathML"><mi>a</mi><mo>&#x2260;</mo><mn>0</mn></math>'
element = html_to_element(html)
cm = CCMATH()
modify_tree(cm, 'mathjax', html, element, element)
modify_tree(cm, 'mathjax', html, element, element)

Check warning on line 82 in llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/extractor/html/recognizer/cc_math/tag_math.py#L82

Added line #L82 was not covered by tests
2 changes: 1 addition & 1 deletion llm_web_kit/extractor/html/recognizer/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
'dfn', 'em', 'i', 'img', 'input', 'kbd', 'label', 'map', 'object', 'q',
'samp', 'script', 'select', 'small', 'span', 'strong', 'sub', 'sup',
'textarea', 'time', 'var', 'u', 's', 'code', 'cccode-inline', 'ccmath-inline',
'marked-tail', 'marked-text'
'marked-tail', 'marked-text','math','mspace'
}


Expand Down

Large diffs are not rendered by default.

Loading
Loading