import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
import tldextract
from django.conf import settings

# Hard cap on the number of links a single spider is allowed to follow.
MAX_LINKS = 500


class DefaultSpider:
    """Mixin holding the crawl logic; combined with scrapy.Spider in create_spider()."""

    name = None
    start_urls = None
    allowed_domains = []
    excluded_domains = []
    target_id = None
    links_reached = set()

    def _parse_image(self, response, result):
        # Count the <img> tags on the page; the key is only present when
        # at least one image was found.
        for __ in response.css('img'):
            result["nb_images"] = result.get("nb_images", 0) + 1

    def parse(self, response):
        result = {
            "url": response.url,
            "target_id": self.target_id,
        }

        # Flag pages whose URL matches one of the excluded domains.
        if any(domain in response.url for domain in self.excluded_domains):
            result["offline"] = True
            yield result

        try:
            self._parse_image(response, result)

            # Follow every not-yet-seen link that belongs to an allowed domain,
            # up to MAX_LINKS per spider.
            for link in LinkExtractor().extract_links(response):
                url = link.url
                if url is None or url in self.links_reached:
                    continue
                for domain in self.allowed_domains:
                    if domain in url:
                        self.links_reached.add(url)
                        if len(self.links_reached) < MAX_LINKS:
                            yield response.follow(url, self.parse)
                        else:
                            print("MAX", self.allowed_domains, self.links_reached)
        except NotSupported:
            # Non-text responses (e.g. binary files) cannot be parsed.
            print("No response", response.url)

        yield result


def get_domain(url):
    """Return the registered domain (e.g. 'example.com') of a URL."""
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, target, excluded_domains=None):
    """Build a spider class bound to one target, its start URLs and its domain."""
    if not excluded_domains:
        excluded_domains = []
    return type(
        name,
        (DefaultSpider, scrapy.Spider),
        {
            "name": name,
            "start_urls": urls,
            "allowed_domains": [get_domain(url) for url in urls],
            "target_id": target.pk,
            "links_reached": set(),
            "excluded_domains": excluded_domains,
        },
    )


def launch_crawl(crawl_item, excluded_domains=None):
    """Run one spider per target of crawl_item and write the scraped items to result.json."""
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    scrap_settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    })
    process = CrawlerProcess(settings=scrap_settings)
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider(
                "Target{}".format(target.pk),
                [target.url],
                target,
                excluded_domains,
            )
        )
    # Blocks until every queued spider has finished.
    process.start()
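

# Example usage sketch (the model and import path below are hypothetical, not
# defined in this module): launch_crawl() only requires an object whose
# `targets` relation yields items exposing `url` and `pk`, e.g. a Django
# `Crawl` model with a related `Target` model.
#
#     from myapp.models import Crawl  # hypothetical import path
#
#     crawl = Crawl.objects.get(pk=1)
#     launch_crawl(crawl, excluded_domains=["facebook.com", "twitter.com"])
#
# Note that CrawlerProcess.start() starts the Twisted reactor, which can only
# be started once per Python process, so launch_crawl() cannot be called twice
# in the same process.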