author     Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-05 12:52:31 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-05 12:52:31 +0200
commit     72dfec0c3532941a46f77b3c0a6a49e16e6a2864 (patch)
tree       8fcb33e87b357c796ca8f2e3325298272900745d /commcrawler/scrapy.py
parent     c6b3188e49049cf689658654a1458a3276304782 (diff)
Manage excluded domains
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--   commcrawler/scrapy.py   50
1 file changed, 40 insertions, 10 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e821a31..d218648 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,13 +1,21 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.exceptions import NotSupported
 from scrapy.linkextractors import LinkExtractor
 import tldextract
 
 from django.conf import settings
 
+MAX_LINKS = 500
+
 
 class DefaultSpider:
+    name = None
+    start_urls = None
+    allowed_domains = []
+    excluded_domains = []
     target_id = None
+    links_reached = set()
 
     def _parse_image(self, response, result):
         for __ in response.css('img'):
@@ -20,10 +28,27 @@ class DefaultSpider:
             "url": response.url,
             "target_id": self.target_id
         }
-        self._parse_image(response, result)
-        for link in LinkExtractor().extract_links(response):
-            if link.url is not None:
-                yield response.follow(link.url, self.parse)
+        for domain in self.excluded_domains:
+            if domain in response.url:
+                result["offline"] = True
+                yield result
+        try:
+            self._parse_image(response, result)
+            for link in LinkExtractor().extract_links(response):
+                url = link.url
+                if url is None or url in self.links_reached:
+                    continue
+                for domain in self.allowed_domains:
+                    if domain in url:
+                        self.links_reached.add(link.url)
+                        if len(self.links_reached) < MAX_LINKS:
+                            yield response.follow(link.url, self.parse)
+                        else:
+                            print("MAX", self.allowed_domains,
+                                  self.links_reached)
+        except NotSupported:
+            print("No response", response.url)
+        yield result
 
 
 def get_domain(url):
@@ -31,16 +56,19 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, target=None):
+def create_spider(name, urls, target, excluded_domains=None):
+    if not excluded_domains:
+        excluded_domains = []
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
-         "target_id": target.pk}
+         "target_id": target.pk, "links_reached": set(),
+         "excluded_domains": excluded_domains}
     )
 
 
-def launch_crawl(crawl_item):
+def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     scrap_settings.update({
         'FEED_FORMAT': 'json',
@@ -49,8 +77,10 @@
     process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
-            create_spider("Target{}".format(target.pk),
-                          [target.url],
-                          target)
+            create_spider(
+                "Target{}".format(target.pk),
+                [target.url], target,
+                excluded_domains
+            )
         )
     process.start()
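For context, the new excluded_domains argument is threaded from launch_crawl() through create_spider() into each generated spider, where any response whose URL contains an excluded domain is flagged "offline" in the result. A minimal caller sketch is shown below; the run_crawl() wrapper and the example domain list are illustrative assumptions, not part of this commit, while launch_crawl() and the commcrawler.scrapy module path come from the patched file itself.

# Hypothetical caller sketch; only launch_crawl() and its signature are from the commit.
from commcrawler.scrapy import launch_crawl

def run_crawl(crawl_item):
    # Plain list of domain strings to skip; where the project sources this
    # (settings, a model, a fixture) is an assumption left open here.
    excluded = ["facebook.com", "twitter.com"]
    # launch_crawl() forwards the list to create_spider(), so every
    # generated "Target<pk>" spider shares the same exclusion list.
    launch_crawl(crawl_item, excluded_domains=excluded)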