33"""
44import datetime
55import uuid
6-
6+ import os
77import requests
8+
89from bs4 import BeautifulSoup
910from progress .bar import Bar
1011from threadsafe .safe_csv import SafeDictWriter
1112
12- from .utils import join_local_path
13+ from .config import get_data_directory
1314from .validators import validate_link
1415from .log import debug
1516
1617
def parse_links(html: str) -> list[str]:
    """Extract all valid links from an HTML page.

    Args:
        html: Raw HTML text to parse.

    Returns:
        Every ``href`` value from the page's anchor tags that passes
        ``validate_link``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # href=True skips anchors that lack an href attribute entirely,
    # which would otherwise raise KeyError on tag['href'] below.
    tags = soup.find_all('a', href=True)
    return [tag['href'] for tag in tags if validate_link(tag['href'])]
2625
2726
28- def parse_meta_tags (soup : BeautifulSoup ):
29- """Retrieve all meta elements from HTML object.
30-
31- Returns:
32- list: List containing content from meta tags
27+ def parse_meta_tags (soup : BeautifulSoup ) -> list [object ]:
28+ """
29+ Parses all meta tags.
3330 """
3431 meta_tags = soup .find_all ('meta' )
3532 content_list = list ()
@@ -38,23 +35,23 @@ def parse_meta_tags(soup: BeautifulSoup):
3835 return content_list
3936
4037
def get_links(url: str) -> list[str]:
    """Fetch a page and return all valid links found on it.

    Args:
        url: Address of the page to scrape.

    Returns:
        All hrefs on the fetched page that pass ``validate_link``
        (via ``parse_links``).

    Raises:
        requests.exceptions.RequestException: On connection failure,
            or if the request exceeds the timeout.
    """
    # requests has no default timeout; without one the crawler can
    # hang indefinitely on an unresponsive host.
    resp = requests.get(url, timeout=30)
    return parse_links(resp.text)
4545
4646
47- default_url = 'https://thehiddenwiki.org'
48-
49-
50- def collect_data (user_url : str ):
51- url = user_url if user_url is not None else default_url
47+ def collect_data (url : str = 'https://thehiddenwiki.org' ):
5248 print (f"Gathering data for { url } " )
5349 links = get_links (url )
5450 current_time = datetime .datetime .now ().isoformat ()
5551 file_name = f'torbot_{ current_time } .csv'
56- file_path = join_local_path (file_name )
57- with open (file_path , 'w+' ) as outcsv :
52+ data_directory = get_data_directory ()
53+ local_file_path = os .path .join (data_directory , file_name )
54+ with open (local_file_path , 'w+' ) as outcsv :
5855 fieldnames = ['ID' , 'Title' , 'Metadata' , 'Content' ]
5956 writer = SafeDictWriter (outcsv , fieldnames = fieldnames )
6057 bar = Bar ('Processing...' , max = len (links ))
@@ -71,8 +68,9 @@ def collect_data(user_url: str):
7168 }
7269 writer .writerow (entry )
7370 except requests .exceptions .RequestException as e :
71+ print (f"Failed to connect to [{ link } ]." )
7472 debug (e )
75- debug (f"Failed to connect to [{ link } ]." )
7673 bar .next ()
7774 bar .finish ()
78- print (f'Data has been saved to { file_path } .' )
75+
76+ print (f'Data has been saved to { local_file_path } .' )
0 commit comments