import tldextract
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor

from django.conf import settings
from django.db import transaction, IntegrityError
from django.utils import timezone

from . import models

"""
nb_external_link
nb_internal_link
nb_images
nb_facebook
nb_twitter
nb_instagram
nb_youtube
nb_dailymotion
nb_vimeo
nb_video
nb_audio
nb_internal_pdf
nb_external_pdf
nb_internal_office
nb_external_office
redirection
CrawlLink
"""

MAX_LINKS = 500


class DefaultSpider:
    name = None
    start_urls = None
    allowed_domains = []
    excluded_domains = []
    crawl_id = None
    target_id = None
    links_reached = set()

    def start_requests(self):
        q = {
            "crawl_id": self.crawl_id,
            "target_id": self.target_id,
            "status": "F",
        }
        # a finished result already exists for this crawl/target: nothing to do
        if models.CrawlResult.objects.filter(**q).count():
            return
        q.pop("status")
        if models.CrawlResult.objects.filter(**q).count():
            # delete a previous interrupted attempt
            res = models.CrawlResult.objects.get(**q)
            res.delete()
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def _parse_image(self, response, result):
        """Collect the src of every <img> tag into result["images"]."""
        if "images" not in result:
            result["images"] = []
        for img in response.css('img'):
            attributes = img.attrib
            if "src" not in attributes:
                continue
            src = attributes["src"]
            # keep only absolute or root-relative sources, once each
            is_a_real_src = src.startswith("http") or src.startswith("/")
            if not src or not is_a_real_src or src in result["images"]:
                continue
            result["images"].append(src)

    def parse(self, response):
        result = {
            "url": response.url,
        }
        for domain in self.excluded_domains:
            if domain in response.url:
                result["is_online"] = False
        if result.get("is_online", None) is False:
            yield result
        else:
            result["is_online"] = True
            try:
                self._parse_image(response, result)
                for link in LinkExtractor().extract_links(response):
                    url = link.url
                    if url is None or url in self.links_reached:
                        continue
                    for domain in self.allowed_domains:
                        if domain in url:
                            self.links_reached.add(link.url)
                            if len(self.links_reached) < MAX_LINKS:
                                yield response.follow(link.url, self.parse)
                            else:
                                print("MAX", self.allowed_domains,
                                      self.links_reached)
            except NotSupported:
                print("No response", response.url)
            yield result

    def closed(self, reason):
        DbPipeline().close(self)


class DbPipeline:
    BASE_KEYS = ["url", "crawl_id", "target_id"]
    NB_KEYS = ["external_link", "internal_link", "images", "facebook",
               "twitter", "instagram", "youtube", "dailymotion", "vimeo",
               "video", "audio", "internal_pdf", "external_pdf",
               "internal_office", "external_office"]

    def _get_result_pk(self, spider):
        """
        Atomic creation

        :param spider: current spider
        :return: result_pk, created
        """
        pks = {
            "crawl_id": spider.crawl_id,
            "target_id": spider.target_id,
        }
        created = False
        try:
            result = models.CrawlResult.objects.get(**pks)
        except models.CrawlResult.DoesNotExist:
            try:
                with transaction.atomic():
                    result = models.CrawlResult.objects.create(**pks)
                created = True
            except IntegrityError:
                # another worker created it first: reuse the existing row
                result = models.CrawlResult.objects.get(**pks)
        return result.pk, created

    def _update(self, result_pk, item, result_created):
        """
        Atomic update
        """
        with transaction.atomic():
            result = models.CrawlResult.objects.select_for_update().get(
                pk=result_pk)
            crawl_result = result.crawl_result
            if crawl_result:
                crawl_result = crawl_result[0]
            else:
                crawl_result = {}
            if "urls" not in crawl_result:
                crawl_result["urls"] = []
            url = item.pop("url")
            if url in crawl_result["urls"]:
                return
            crawl_result["urls"].append(url)
            for k, value in item.items():
                if k == "is_online":
                    if result_created:
                        # only update on the first link
                        result.is_online = value
                elif k in self.NB_KEYS:
                    if k not in crawl_result:
                        crawl_result[k] = []
                    for subvalue in value:
                        if subvalue in crawl_result[k]:
                            continue
                        crawl_result[k].append(subvalue)
                    setattr(result, "nb_" + k, len(crawl_result[k]))
            result.crawl_result = [crawl_result]
            result.save()
        return True

    def process_item(self, item, spider):
        result_pk, created = self._get_result_pk(spider)
        self._update(result_pk, item, created)
        return item

    def close(self, spider):
        result_pk, created = self._get_result_pk(spider)
        with transaction.atomic():
            result = models.CrawlResult.objects.select_for_update().get(
                pk=result_pk)
            result.status = "F"
            result.duration = timezone.now() - result.started
            result.save()


def get_domain(url):
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, crawl, target, excluded_domains=None):
    if not excluded_domains:
        excluded_domains = []
    return type(
        name,
        (DefaultSpider, scrapy.Spider),
        {"name": name,
         "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
         "crawl_id": crawl.pk,
         "target_id": target.pk,
         "links_reached": set(),
         "excluded_domains": excluded_domains}
    )


def launch_crawl(crawl_item, excluded_domains=None):
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    process = CrawlerProcess(settings=scrap_settings)
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider(
                "Crawl{}Target{}".format(crawl_item.pk, target.pk),
                [target.url],
                crawl_item,
                target,
                excluded_domains
            )
        )
    process.start()
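

# Usage sketch (not part of the pipeline): how launch_crawl() might be driven
# from Django code. The `models.Crawl` name, its fields, and the example
# excluded domains are assumptions inferred from how crawl_item is used above;
# only SCRAPPY_SETTINGS and the CrawlResult model are referenced by this module.
#
#     # e.g. in a management command or background task:
#     crawl = models.Crawl.objects.get(pk=crawl_pk)
#     launch_crawl(crawl, excluded_domains=["facebook.com", "twitter.com"])
#
# CrawlerProcess.start() blocks until every spider has finished, so calling it
# from a background worker rather than a request/response cycle is preferable.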