ccprocessor · dt-yy · Jun 18, 2025
diff --git a/README.md b/README.md
@@ -104,9 +104,9 @@ from llm_web_kit.simple import extract_pure_html_to_md
 import traceback
 from loguru import logger
 
-def extract(url:str, main_html:str, raw_html) -> str:
+def extract(url:str, html:str) -> str:
     try:
-        nlp_md = extract_html_to_md(url, main_html, clip_html=False, raw_html)
+        nlp_md = extract_html_to_md(url, html, clip_html=False)
         return nlp_md
     except Exception as e:
         logger.exception(e)

diff --git a/llm_web_kit/simple.py b/llm_web_kit/simple.py
@@ -35,14 +35,12 @@ def get_extractor(extractor_type: str):
             raise ValueError(f'Invalid extractor type: {extractor_type}')
 
 
-def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str) -> DataJson:
+def __extract_main_html_by_no_clip_html(url:str, html_content: str) -> DataJson:
     extractor = NoClipHTMLFIleFormatorExtractor(load_pipe_tpl('noclip_html'))
-    if raw_html == '':
-        raw_html = html_content
     input_data_dict = {
         'track_id': str(uuid.uuid4()),
         'url': url,
-        'html': raw_html,
+        'html': html_content,
         'main_html': html_content,
         'dataset_name': 'llm-web-kit-pure-quickstart',
         'data_source_category': 'HTML',
@@ -76,21 +74,21 @@ def __extract_html(url:str, html_content: str) -> DataJson:
     return result
 
 
-def extract_html_to_md(url:str, html_content: str, clip_html=True, raw_html='') -> str:
+def extract_html_to_md(url:str, html_content: str, clip_html=True) -> str:
     """extract html to markdown without images."""
     if clip_html:
         result = __extract_html(url, html_content)
     else:
-        result = __extract_main_html_by_no_clip_html(url, html_content, raw_html)
+        result = __extract_main_html_by_no_clip_html(url, html_content)
     return result.get_content_list().to_nlp_md()
 
 
-def extract_html_to_mm_md(url:str, html_content: str, clip_html=True, raw_html='') -> str:
+def extract_html_to_mm_md(url:str, html_content: str, clip_html=True) -> str:
     """extract html to markdown with images."""
     if clip_html:
         result = __extract_html(url, html_content)
     else:
-        result = __extract_main_html_by_no_clip_html(url, html_content, raw_html)
+        result = __extract_main_html_by_no_clip_html(url, html_content)
     return result.get_content_list().to_mm_md()