import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.linkextractors import LinkExtractor
import tldextract
from django.conf import settings


class DefaultSpider:
    """Shared crawling logic, combined with scrapy.Spider by create_spider()."""

    target_id = None

    def _parse_image(self, response, result):
        # Count the <img> tags found on the page.
        result["nb_images"] = len(response.css('img'))

    def parse(self, response):
        result = {
            "url": response.url,
            "target_id": self.target_id,
        }
        self._parse_image(response, result)
        # Yield the scraped item so the feed exporter writes it to the output file.
        yield result
        # Follow every link found on the page and parse it the same way.
        for link in LinkExtractor().extract_links(response):
            if link.url is not None:
                yield response.follow(link.url, self.parse)


def get_domain(url):
    # Reduce a URL to its registered domain,
    # e.g. "https://blog.example.com/x" -> "example.com".
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, target=None):
    # Build a spider class dynamically so each target gets its own name,
    # start URLs, and allowed domains.
    return type(
        name,
        (DefaultSpider, scrapy.Spider),
        {
            "name": name,
            "start_urls": urls,
            "allowed_domains": [get_domain(url) for url in urls],
            "target_id": target.pk if target is not None else None,
        },
    )


def launch_crawl(crawl_item):
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    # Export every yielded item as JSON to result.json.
    scrap_settings.update({
        'FEED_FORMAT': 'json',
        'FEED_URI': 'result.json',
    })
    process = CrawlerProcess(settings=scrap_settings)
    # Register one spider per target; they all run in the same reactor.
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider("Target{}".format(target.pk), [target.url], target)
        )
    # Blocks until every spider has finished.
    process.start()
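
# Usage sketch (illustrative, not part of the original code): launch_crawl()
# starts the Twisted reactor, and process.start() blocks until every spider
# finishes, so it can only run once per Python process. A one-shot Django
# management command is a natural fit. The `myapp` package, the module paths,
# and the `Crawl` model (with a `targets` related manager whose objects carry
# `pk` and `url`, as launch_crawl() above expects) are all assumptions.
from django.core.management.base import BaseCommand

from myapp.crawler import launch_crawl  # hypothetical module path
from myapp.models import Crawl          # hypothetical model


class Command(BaseCommand):
    help = "Crawl every target attached to one Crawl item"

    def add_arguments(self, parser):
        parser.add_argument("crawl_id", type=int)

    def handle(self, *args, **options):
        crawl_item = Crawl.objects.get(pk=options["crawl_id"])
        launch_crawl(crawl_item)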