Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,9 +104,9 @@ from llm_web_kit.simple import extract_pure_html_to_md
import traceback
from loguru import logger

def extract(url:str, main_html:str, raw_html) -> str:
def extract(url:str, html:str) -> str:
try:
nlp_md = extract_html_to_md(url, main_html, clip_html=False, raw_html)
nlp_md = extract_html_to_md(url, html, clip_html=False)
return nlp_md
except Exception as e:
logger.exception(e)
Expand Down
14 changes: 6 additions & 8 deletions llm_web_kit/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,12 @@ def get_extractor(extractor_type: str):
raise ValueError(f'Invalid extractor type: {extractor_type}')


def __extract_main_html_by_no_clip_html(url:str, html_content: str, raw_html:str) -> DataJson:
def __extract_main_html_by_no_clip_html(url:str, html_content: str) -> DataJson:
extractor = NoClipHTMLFIleFormatorExtractor(load_pipe_tpl('noclip_html'))
if raw_html == '':
raw_html = html_content
input_data_dict = {
'track_id': str(uuid.uuid4()),
'url': url,
'html': raw_html,
'html': html_content,
'main_html': html_content,
'dataset_name': 'llm-web-kit-pure-quickstart',
'data_source_category': 'HTML',
Expand Down Expand Up @@ -76,21 +74,21 @@ def __extract_html(url:str, html_content: str) -> DataJson:
return result


def extract_html_to_md(url:str, html_content: str, clip_html=True, raw_html='') -> str:
def extract_html_to_md(url:str, html_content: str, clip_html=True) -> str:
"""extract html to markdown without images."""
if clip_html:
result = __extract_html(url, html_content)
else:
result = __extract_main_html_by_no_clip_html(url, html_content, raw_html)
result = __extract_main_html_by_no_clip_html(url, html_content)
return result.get_content_list().to_nlp_md()


def extract_html_to_mm_md(url:str, html_content: str, clip_html=True, raw_html='') -> str:
def extract_html_to_mm_md(url:str, html_content: str, clip_html=True) -> str:
"""extract html to markdown with images."""
if clip_html:
result = __extract_html(url, html_content)
else:
result = __extract_main_html_by_no_clip_html(url, html_content, raw_html)
result = __extract_main_html_by_no_clip_html(url, html_content)
return result.get_content_list().to_mm_md()


Expand Down