path: root/commcrawler/scrapy.py
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
import tldextract

from django.conf import settings

MAX_LINKS = 500


class DefaultSpider:
    """Mixin holding the crawl state and parsing logic shared by the
    spider classes built dynamically in create_spider below."""
    name = None
    start_urls = None
    allowed_domains = []
    excluded_domains = []
    target_id = None
    # URLs already followed; re-initialised per spider in create_spider.
    links_reached = set()

    def _parse_image(self, response, result):
        # Count <img> tags; the key is only set when at least one image
        # exists, matching the original per-tag counting loop.
        images = response.css('img')
        if images:
            result["nb_images"] = result.get("nb_images", 0) + len(images)

    def parse(self, response):
        result = {
            "url": response.url,
            "target_id": self.target_id
        }
        for domain in self.excluded_domains:
            if domain in response.url:
                # Excluded domain: flag the page and stop processing it,
                # so the same result is not yielded a second time below.
                result["offline"] = True
                yield result
                return
        try:
            self._parse_image(response, result)
            for link in LinkExtractor().extract_links(response):
                url = link.url
                if url is None or url in self.links_reached:
                    continue
                for domain in self.allowed_domains:
                    if domain in url:
                        self.links_reached.add(url)
                        if len(self.links_reached) < MAX_LINKS:
                            yield response.follow(url, self.parse)
                        else:
                            print("MAX", self.allowed_domains,
                                  self.links_reached)
                        break
        except NotSupported:
            print("No response", response.url)
        yield result


def get_domain(url):
    """Return the registered domain of a URL, e.g.
    'https://blog.example.com/page' -> 'example.com'."""
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, target, excluded_domains=None):
    """Build a Spider subclass on the fly for a single crawl target,
    restricted to the registered domains of its start URLs."""
    if not excluded_domains:
        excluded_domains = []
    return type(
        name, (DefaultSpider, scrapy.Spider),
        {"name": name, "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
         "target_id": target.pk, "links_reached": set(),
         "excluded_domains": excluded_domains}
    )


def launch_crawl(crawl_item, excluded_domains=None):
    """Run one spider per target of crawl_item and export the scraped
    items to result.json."""
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    scrap_settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json'
    })
    process = CrawlerProcess(settings=scrap_settings)
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider(
                "Target{}".format(target.pk),
                [target.url], target,
                excluded_domains
            )
        )
    # Blocks until every scheduled crawl has finished.
    process.start()
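

# Hypothetical entry point sketching how launch_crawl might be driven from
# the command line. It assumes a Crawl model exposing the `targets` relation
# used above, importable from commcrawler.models, and a
# "commcrawler.settings" settings module; neither name comes from this file.
if __name__ == "__main__":
    import os

    import django

    os.environ.setdefault("DJANGO_SETTINGS_MODULE", "commcrawler.settings")
    django.setup()

    from commcrawler.models import Crawl  # hypothetical model / import path

    crawl_item = Crawl.objects.first()
    if crawl_item is not None:
        launch_crawl(crawl_item, excluded_domains=["facebook.com"])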