diff --git a/llm_web_kit/input/pre_data_json.py b/llm_web_kit/input/pre_data_json.py index c8d602c0..7902d093 100644 --- a/llm_web_kit/input/pre_data_json.py +++ b/llm_web_kit/input/pre_data_json.py @@ -15,6 +15,7 @@ class PreDataJsonKey: TYPICAL_RAW_HTML = 'typical_raw_html' TYPICAL_RAW_TAG_HTML = 'typical_raw_tag_html' + IS_XPATH = 'is_xpath' XPATH_MAPPING = 'xpath_mapping' TYPICAL_SIMPLIFIED_HTML = 'typical_simplified_html' # 模型打标字典 diff --git a/llm_web_kit/main_html_parser/parser/tag_simplifier.py b/llm_web_kit/main_html_parser/parser/tag_simplifier.py index 1d705c1b..eede6cfb 100644 --- a/llm_web_kit/main_html_parser/parser/tag_simplifier.py +++ b/llm_web_kit/main_html_parser/parser/tag_simplifier.py @@ -19,11 +19,12 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson: """ # 获取输入数据 typical_raw_html = pre_data.get(PreDataJsonKey.TYPICAL_RAW_HTML, '') + is_xpath = pre_data.get(PreDataJsonKey.IS_XPATH, True) # layout_file_list = pre_data.get(PreDataJsonKey.LAYOUT_FILE_LIST, []) # 执行HTML标签简化逻辑 try: - simplified_html, original_html, _ = simplify_html(typical_raw_html) + simplified_html, original_html, _ = simplify_html(typical_raw_html, is_xpath=is_xpath) except TagSimplifiedParserException as e1: raise e1 except Exception as e2: diff --git a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py index 3d656811..ea5d57e9 100644 --- a/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py +++ b/tests/llm_web_kit/main_html_parser/parser/test_tag_simplifier.py @@ -57,7 +57,7 @@ def test_tag_simplifier4(self): file_path = base_dir / 'assets/test_html_data/1.html' with open(file_path, 'r', encoding='utf-8') as file: raw_html = file.read() - data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html} + data_dict = {PreDataJsonKey.TYPICAL_RAW_HTML: raw_html, PreDataJsonKey.IS_XPATH: False} pre_data = PreDataJson(data_dict) pre_data_result = HtmlTagSimplifierParser({}).parse(pre_data) simplifier_raw_html = pre_data_result.get(PreDataJsonKey.TYPICAL_SIMPLIFIED_HTML, '')