From 72dfec0c3532941a46f77b3c0a6a49e16e6a2864 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Mon, 5 Aug 2019 12:52:31 +0200
Subject: Manage excluded domains

---
 commcrawler/scrapy.py | 50 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'commcrawler/scrapy.py')

diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e821a31..d218648 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,13 +1,21 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.exceptions import NotSupported
 from scrapy.linkextractors import LinkExtractor
 import tldextract
 
 from django.conf import settings
 
+MAX_LINKS = 500
+
 
 class DefaultSpider:
+    name = None
+    start_urls = None
+    allowed_domains = []
+    excluded_domains = []
     target_id = None
+    links_reached = set()
 
     def _parse_image(self, response, result):
         for __ in response.css('img'):
@@ -20,10 +28,27 @@ class DefaultSpider:
             "url": response.url,
             "target_id": self.target_id
         }
-        self._parse_image(response, result)
-        for link in LinkExtractor().extract_links(response):
-            if link.url is not None:
-                yield response.follow(link.url, self.parse)
+        for domain in self.excluded_domains:
+            if domain in response.url:
+                result["offline"] = True
+                yield result
+        try:
+            self._parse_image(response, result)
+            for link in LinkExtractor().extract_links(response):
+                url = link.url
+                if url is None or url in self.links_reached:
+                    continue
+                for domain in self.allowed_domains:
+                    if domain in url:
+                        self.links_reached.add(link.url)
+                        if len(self.links_reached) < MAX_LINKS:
+                            yield response.follow(link.url, self.parse)
+                        else:
+                            print("MAX", self.allowed_domains,
+                                  self.links_reached)
+        except NotSupported:
+            print("No response", response.url)
+        yield result
 
 
 def get_domain(url):
@@ -31,16 +56,19 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, target=None):
+def create_spider(name, urls, target, excluded_domains=None):
+    if not excluded_domains:
+        excluded_domains = []
     return type(
         name, (DefaultSpider, scrapy.Spider),
        {"name": name, "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
-         "target_id": target.pk}
+         "target_id": target.pk, "links_reached": set(),
+         "excluded_domains": excluded_domains}
    )
 
 
-def launch_crawl(crawl_item):
+def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     scrap_settings.update({
        'FEED_FORMAT': 'json',
@@ -49,8 +77,10 @@ def launch_crawl(crawl_item):
     process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
-            create_spider("Target{}".format(target.pk),
-                          [target.url],
-                          target)
+            create_spider(
+                "Target{}".format(target.pk),
+                [target.url], target,
+                excluded_domains
+            )
         )
     process.start()
-- 
cgit v1.2.3
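
A minimal usage sketch (not part of the patch), assuming a saved crawl item with a related targets queryset, as launch_crawl iterates over; the excluded-domain values below are illustrative only:

    # Hypothetical caller for the patched API in commcrawler/scrapy.py.
    # crawl_item is assumed to be a Django model instance whose "targets"
    # relation yields objects exposing "pk" and "url", as create_spider expects.
    from commcrawler.scrapy import launch_crawl

    excluded = ["facebook.com", "twitter.com"]  # illustrative domain list
    launch_crawl(crawl_item, excluded_domains=excluded)

Every spider produced by create_spider then carries the same excluded_domains list, and responses whose URL matches one of those domains are flagged "offline" in the crawl output rather than followed further.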