-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathamLibrary_DataAddFunctions.py
More file actions
40 lines (37 loc) · 2.06 KB
/
amLibrary_DataAddFunctions.py
File metadata and controls
40 lines (37 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
##############
#### Library of Functions that add new data ####
#################
## Calls a few generic sources to derive new data from existing data (i.e., value-add enrichment)
##############
## This file is being populated from the older amStorehouse code; it will later be split into a separate common file so other services can call it too
#############
import requests
from urllib.parse import urlparse #to get URL info if no OG
from bs4 import BeautifulSoup
######### Get Site Name from Opengraph Data ##########
def url_to_sitename(url_in):  # Only handles the Open Graph `og:site_name` case
    """Return a display name for the site that serves ``url_in``.

    The page is fetched with ``requests`` (15 second timeout) and parsed with
    BeautifulSoup's ``html.parser``. If the page has a
    ``<meta property="og:site_name" content="...">`` tag, its ``content``
    value is returned as-is. Otherwise the name is derived from the URL's
    network location: a leading ``www.`` is dropped and the remainder is
    title-cased (e.g. ``www.bbc.com`` -> ``Bbc.Com``).

    Args:
        url_in: Absolute URL of the page to inspect.

    Returns:
        The site name, or ``""`` if fetching or parsing raised an error.
        This function is best-effort and does not raise ordinary exceptions.
    """
    try:  # overall guard: any fetch/parse failure degrades to ""
        response = requests.get(url_in, timeout=15)
        soup = BeautifulSoup(response.text, 'html.parser')

        # find() returns None when no tag matches, so test for that explicitly
        # instead of relying on a catch-all except around the attribute lookup.
        og_tag = soup.find("meta", property="og:site_name")
        if og_tag is not None and "content" in og_tag.attrs:
            return og_tag.attrs["content"]

        # No usable Open Graph tag: fall back to the domain name.
        # Approach from https://stackoverflow.com/a/41919945/9231911
        print("ran into exception with bs4")
        domain = urlparse(url_in).netloc
        # Strip only a *leading* "www." (host names are case-insensitive);
        # str.replace would also mangle hosts such as "awww.example.com".
        if domain.lower().startswith('www.'):
            domain = domain[len('www.'):]
        # TODO: also try the "twitter:site" meta tag before using the domain.
        return domain.title()
    except Exception:
        # Deliberately broad: callers rely on "" rather than an exception.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit through.
        print ("🚫URL to Sitename crapped")
        return ""
## Testing
# print(url_to_sitename('https://www.bbc.com/news/business-55722542'))
# print(url_to_sitename('https://www.youtube.com/watch?v=XV9SBK1vzu4'))
# print(url_to_sitename('https://www.theatlantic.com/health/archive/2021/01/coronavirus-vaccine-masks-how-much-longer/617747/'))
# print(url_to_sitename('https://www.globaltimes.cn/page/202101/1213707.shtml')) #No sitename in PG
# print(url_to_sitename('https://www.w3schools.com/python/python_try_except.asp')) #No sitename in PG
# print(url_to_sitename('https://www.aa.com.tr/en/middle-east/israel-worried-by-us-plans-to-lift-icc-sanctions/2124920')) #No sitename in PG
# print(url_to_sitename('https://www.seattletimes.com/nation-world/mutated-virus-may-reinfect-people-already-stricken-once-with-covid-19-sparking-debate-and-concerns/')) #Requests.get not working