bioinfor_script_modules/67_Extract_Url_From_Str.py at main · MaybeBio/bioinfor_script_modules · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Extract links from free text, e.g. the abstract or body of a paper.

import re
from typing import Dict, List

# Deliberately loose URL matcher: accepts http(s)://, ftp:// and bare "www."
# prefixes. A match runs until whitespace or one of , ; > ) is reached.
_URL_PATTERN = re.compile(
    r'(https?://[^\s,;>)]+|www\.[^\s,;>)]+|ftp://[^\s,;>)]+)'
)

# Sentence punctuation / closing quotes and brackets that the loose pattern
# can swallow at the end of a URL (e.g. `"https://x.org/a".` or `[www.x.org]`).
_TRAILING_PUNCTUATION = ".:!?'\"]}"

# (domain substring, category label), checked in order; first hit wins.
_CATEGORY_BY_DOMAIN = (
    ("github.com", "GitHub"),
    ("gitlab.com", "GitLab"),
    ("zenodo.org", "Zenodo"),
    ("figshare.com", "Figshare"),
    ("huggingface.co", "HuggingFace"),
)


def extract_urls_from_text(text: str) -> List[Dict[str, str]]:
    """
    Extract URLs from free text and tag each with a coarse category.

    Args:
        text: Text to scan (for example an abstract). A falsy value such as
            ``""`` or ``None`` yields an empty list.

    Returns:
        One dict per distinct URL, in order of first appearance, with keys:

        - ``"url"``: the matched URL with trailing punctuation removed.
        - ``"source"``: always ``"abstract_mining"``.
        - ``"category"``: ``"GitHub"``, ``"GitLab"``, ``"Zenodo"``,
          ``"Figshare"``, ``"HuggingFace"``, or ``"General"`` when none of
          the known domains occurs in the URL.
    """
    if not text:
        return []

    results = []
    seen = set()  # URLs already emitted, for de-duplication

    for url in _URL_PATTERN.findall(text):
        # Drop punctuation that belongs to the surrounding sentence, not the URL.
        url = url.rstrip(_TRAILING_PUNCTUATION)

        if url in seen:
            continue
        seen.add(url)

        # Host names are case-insensitive, so compare against a lowered copy
        # while returning the URL exactly as written in the text.
        lowered = url.lower()
        category = next(
            (label for domain, label in _CATEGORY_BY_DOMAIN if domain in lowered),
            "General",
        )

        results.append({
            "url": url,
            "source": "abstract_mining",  # marks where the URL was mined from
            "category": category,
        })

    return results