-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy path67_Extract_Url_From_Str.py
More file actions
47 lines (37 loc) · 1.26 KB
/
67_Extract_Url_From_Str.py
File metadata and controls
47 lines (37 loc) · 1.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# Extract links from free text, e.g. the abstract or body of a paper.
import re
from typing import Dict, List
from urllib.parse import urlsplit
# Compiled once at import time. A URL run ends at whitespace, at the ASCII
# delimiters , ; > ) and at full-width CJK punctuation, which commonly
# follows a URL directly (no space) in Chinese text and would otherwise drag
# the rest of the sentence into the match.
_URL_PATTERN = re.compile(
    r'(https?://[^\s,;>)，。；：、！？）]+'
    r'|www\.[^\s,;>)，。；：、！？）]+'
    r'|ftp://[^\s,;>)，。；：、！？）]+)'
)

# Sentence punctuation and quotes that the pattern lets through but that are
# almost always prose around the URL rather than part of it.
_TRAILING_PUNCTUATION = '.:!?\'"'

# (registered domain, category) pairs, checked in order.
_CATEGORY_BY_DOMAIN = (
    ("github.com", "GitHub"),
    ("gitlab.com", "GitLab"),
    ("zenodo.org", "Zenodo"),
    ("figshare.com", "Figshare"),
    ("huggingface.co", "HuggingFace"),
)


def _classify_url(url: str) -> str:
    """Return the hosting-service category for *url*, or "General"."""
    # A scheme-less "www." match needs a "//" prefix, otherwise urlsplit
    # parses the whole string as a path and reports no host.
    target = "//" + url if url.startswith("www.") else url
    try:
        host = (urlsplit(target).hostname or "").rstrip(".")
    except ValueError:
        # urlsplit rejects malformed bracketed hosts such as "http://[foo".
        return "General"
    # Compare against the host only (urlsplit lower-cases it), so that
    # "notgithub.com" or a "github.com" inside a path/query is not a hit.
    for domain, category in _CATEGORY_BY_DOMAIN:
        if host == domain or host.endswith("." + domain):
            return category
    return "General"


def extract_urls_from_text(text: str) -> List[Dict[str, str]]:
    """Extract the distinct URLs found in *text* and tag each with a category.

    Recognises ``http://``, ``https://``, ``ftp://`` and bare ``www.`` links.
    Trailing sentence punctuation and quotes are removed, duplicates are
    dropped (first occurrence wins, order preserved), and each URL is
    classified by its host as GitHub, GitLab, Zenodo, Figshare, HuggingFace
    or General.

    Args:
        text: Free text to scan. A falsy value (``""`` or ``None``) yields
            an empty list.

    Returns:
        A list of dicts with the keys ``"url"``, ``"source"`` (always
        ``"abstract_mining"``) and ``"category"``.
    """
    if not text:
        return []

    results = []
    seen = set()  # cleaned URLs already emitted
    for raw in _URL_PATTERN.findall(text):
        url = raw.rstrip(_TRAILING_PUNCTUATION)
        # Stripping can leave only a bare scheme or "www" (e.g. "Awww...");
        # such leftovers no longer satisfy the pattern and are not URLs.
        if url in seen or not _URL_PATTERN.fullmatch(url):
            continue
        seen.add(url)
        results.append({
            "url": url,
            "source": "abstract_mining",  # provenance marker
            "category": _classify_url(url),
        })
    return results