Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 47 additions & 25 deletions llm_web_kit/html_layout/html_layout_cosin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from llm_web_kit.exception.exception import LayoutClusteringParserException

TAGS_TO_IGNORE = ['script', 'style', 'meta', 'link', 'br', 'noscript'] # , 'b', 'i', 'strong'
TAGS_IGNORE_ATTR = ['a', 'i', 'b', 'li', 'tr', 'td', 'img', 'p', 'body']

Expand Down Expand Up @@ -38,10 +40,11 @@
return root


def get_feature(html_source: str) -> Dict:
def get_feature(html_source: str, is_ignore_tag: bool = True) -> Dict:
"""获取DOM有效tag和attr
Args:
html_source: html源码字符串
is_ignore_tag: bool 是否忽略TAGS_TO_IGNORE可忽略的标签
Returns:
dict:
{
Expand All @@ -50,7 +53,7 @@
}
"""
doc = __html_to_valid_element(html_source)
return __recursive_extract_tags(doc)
return __recursive_extract_tags(doc, is_ignore_tag)


def __parse_tag_attr(tag_attrs_lst: List[set]) -> Dict:
Expand All @@ -66,7 +69,7 @@
return {'tags': tags, 'attrs': attrs}


def __recursive_extract_tags(doc: HtmlElement) -> Dict:
def __recursive_extract_tags(doc: HtmlElement, is_ignore_tag: bool = True) -> Dict:
"""递归获取标签及属性
Args:
doc: lxml.html.HtmlElement
Expand All @@ -84,29 +87,30 @@
for el in el_lst:
parent_tag_attr = set()
for child in el.getchildren():
if isinstance(child, HtmlComment) or child.tag is None or child.tag.lower() in TAGS_TO_IGNORE:
if isinstance(child, HtmlComment) or child.tag is None:
continue

Check warning on line 91 in llm_web_kit/html_layout/html_layout_cosin.py

View check run for this annotation

Codecov / codecov/patch

llm_web_kit/html_layout/html_layout_cosin.py#L91

Added line #L91 was not covered by tests
tag = child.tag.lower()
if is_ignore_tag and tag in TAGS_TO_IGNORE:
continue
next_el.append(child)
if tag in TAGS_IGNORE_ATTR:
parent_tag_attr.add(f'<{tag}>')
else:
next_el.append(child)
tag = child.tag.lower()
if tag in TAGS_IGNORE_ATTR:
parent_tag_attr.add(f'<{tag}>')
else:
attrs_str = ' '.join([f'{k}="{v}"' for k, v in child.attrib.items() if k in ['class', 'id']])
parent_tag_attr.add(f'<{tag} {attrs_str}>' if attrs_str else f'<{tag}>')
attrs_str = ' '.join([f'{k}="{v}"' for k, v in child.attrib.items() if k in ['class', 'id']])
parent_tag_attr.add(f'<{tag} {attrs_str}>' if attrs_str else f'<{tag}>')

el_tag_attr.append(parent_tag_attr)

layer_tag_attr = __parse_tag_attr(el_tag_attr)
if layer_tag_attr.get('tags'):
tag_attr['tags'].update({layer_n: layer_tag_attr['tags']})
tag_attr['tags'][layer_n] = layer_tag_attr['tags']
if layer_tag_attr.get('attrs'):
tag_attr['attrs'].update({layer_n: layer_tag_attr['attrs']})
tag_attr['attrs'][layer_n] = layer_tag_attr['attrs']
if next_el:
return __get_children(next_el, layer_n + 1, tag_attr)

tag_attr = defaultdict(dict)
layer_n = 1
__get_children([doc], layer_n, tag_attr)
__get_children([doc], 1, tag_attr)
return dict(tag_attr) if tag_attr.get('tags') else None


Expand Down Expand Up @@ -283,16 +287,34 @@
"attrs": {1: ["nav", "content", "footer"], 2: [...]}
}
layer_n: 相似度计算DOM树层级深度,默认为5
k: tags 和 attrs 权重占比,默认1:1
k: tags 和 attrs 权重占比,k表示tags权重,(1-k)为attrs权重,默认0.7:0.3
Return:
np.float32: cosine similarity
"""
tag_sim = cosine_similarity(
__parse_vectors([__simp_tags(feature1['tags'], layer_n), __simp_tags(feature2['tags'], layer_n)]))[0][1]
if not feature1.get('attrs') or not feature2.get('attrs'):
k = 1
return round(tag_sim * k, 8)
else:
attr_sim = cosine_similarity(
__parse_vectors([__simp_tags(feature1['attrs'], layer_n), __simp_tags(feature2['attrs'], layer_n)]))[0][1]
return round(tag_sim * k + attr_sim * (1 - k), 8)
if not isinstance(feature1, dict) or not isinstance(feature2, dict):
raise LayoutClusteringParserException('feature1 and feature2 must be dictionaries')

def process_feature(feat):
return {
'tags': __simp_tags(feat.get('tags', {}), layer_n),
'attrs': __simp_tags(feat.get('attrs', {}), layer_n)
}

f1 = process_feature(feature1)
f2 = process_feature(feature2)

f1_tag = f1.get('tags', {})
f2_tag = f2.get('tags', {})
if not f1_tag or not f2_tag:
return 0.0
tag_vecs = __parse_vectors([f1_tag, f2_tag])
tag_sim = cosine_similarity(tag_vecs)[0][1]

f1_attr = f1.get('attrs', {})
f2_attr = f2.get('attrs', {})
if not f1_attr or not f2_attr:
return round(tag_sim, 8)

attr_vecs = __parse_vectors([f1_attr, f2_attr])
attr_sim = cosine_similarity(attr_vecs)[0][1]
return round(tag_sim * k + attr_sim * (1 - k), 8)
23 changes: 22 additions & 1 deletion tests/llm_web_kit/html_layout/test_html_layout_cosin.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@
{'input1': 'assets/feature.html', 'input2': 'assets/cosin.html', 'expected': 0.22013982},
{'input1': 'assets/feature1.html', 'input2': 'assets/feature2.html', 'layer_n': 12, 'expected': 0.9357468},
{'input1': 'assets/feature1.html', 'input2': 'assets/data_structure.html', 'layer_n': 12, 'expected': 0},

]
TEST_SIMIL_FEATURES = [
({}, {}, 0),
(None, None, 62102000),
({'tags': {}}, {}, 0),
(
{'tags': {1: ['<body>/div'], 2: ['<div>[2]/div', '<div>[1]/div', '<div>[3]/ul']}},
{'tags': {'1': ['<body>/div'], '2': ['<div>[2]/div', '<div>[1]/div', '<div>[3]/ul']},
'attrs': {'1': ['nav', 'content', 'footer'], '2': ['foot', 'container']}},
1.0,
)
]
TEST_CLUSTER_HTMLS = [
{'input': ['assets/feature1.html', 'assets/cosin.html'], 'expected': [-1]},
Expand Down Expand Up @@ -47,6 +57,11 @@ def test_get_feature(self):
features = get_feature(raw_html)
if features is not None:
self.assertEqual(len(features), test_case['expected'])

layer_n, total_n = sum_tags(get_feature(raw_html, is_ignore_tag=False)['tags'])
ignore_layer_n, ignore_total_n = sum_tags(get_feature(raw_html)['tags'])
self.assertEqual(len(layer_n) == len(ignore_layer_n), True)
self.assertEqual(total_n > ignore_total_n, True)
else:
self.assertEqual(features, test_case['expected'])

Expand All @@ -63,6 +78,12 @@ def test_similarity(self):
feature2 = get_feature(base_dir.joinpath(TEST_SIMIL_HTML['input2']).read_text(encoding='utf-8'))
cosin = similarity(feature1, feature2, layer_n=TEST_SIMIL_HTML.get('layer_n', 5))
self.assertEqual('{:.2f}'.format(TEST_SIMIL_HTML['expected']), '{:.2f}'.format(cosin))
for TEST_SIMIL_FEATURE in TEST_SIMIL_FEATURES:
try:
cosin = similarity(TEST_SIMIL_FEATURE[0], TEST_SIMIL_FEATURE[1])
self.assertEqual(TEST_SIMIL_FEATURE[2], cosin)
except Exception as e:
self.assertEqual(e.error_code, TEST_SIMIL_FEATURE[2])

def test_sum_tags(self):
for TEST_SUM_TAG in TEST_SUM_TAGS:
Expand Down