crawler_task.py
from url import Url
# Parser for XML/HTML; it also handles unicode/utf-8 decoding, which is
# otherwise managed when fetching the website via urllib.
from bs4 import BeautifulSoup
from logger import warn, info, debug, error


class CrawlerTask:
    def __init__(self, url, domain=None, protocol=None):
        self.crawl_url = Url(url, domain=domain, protocol=protocol)

    def urls_from_content(self, content):
        """Retrieves the URLs from a request's content.

        Keyword arguments:
        content -- page content, as retrieved via urllib's urlopen or any
                   source compatible with lxml.
        """
        if not content:
            return []
        bs = BeautifulSoup(content, 'lxml')
        urls = []
        for anchor in bs.find_all('a'):
            href = anchor.get('href')
            if not href:
                # Anchors without a usable href cannot be turned into URLs.
                error("Failed to process {}".format(href))
            else:
                urls.append(Url(href,
                                domain=self.crawl_url.netloc(),
                                protocol=self.crawl_url.proto()))
        return urls

    def url(self):
        """Retrieves the crawler's URL."""
        return self.crawl_url

    def crawl(self):
        """Retrieves the URLs present in the page at the crawler's URL."""
        return {
            'urls': self.urls_from_content(self.crawl_url.content()),
            'parent': self.crawl_url.url()
        }

    def __call__(self):
        return self.crawl()

    def __str__(self):
        return self.crawl_url.url()
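
A minimal usage sketch. The Url interface (content(), url(), netloc(), proto()) is assumed from its use in this file, not confirmed elsewhere; urls_from_content can also be exercised offline by passing static HTML, which avoids any network fetch.

# Hypothetical example; example.com and the HTML snippet are placeholders.
task = CrawlerTask('https://example.com/', protocol='https')

# Offline check of the link extraction: the hrefless anchor is skipped.
html = '<html><body><a href="/about">About</a><a>no href</a></body></html>'
for found in task.urls_from_content(html):
    print(found)

# Full crawl: fetches the page via Url.content() and returns a dict of the
# form {'urls': [...], 'parent': '<the crawled url>'}.
result = task()  # equivalent to task.crawl()
print(result['parent'])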