import datetime
import tldextract
from urllib.parse import urldefrag

import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor

from django.conf import settings
from django.db import transaction, IntegrityError
from django.utils import timezone

from . import models

"""
nb_facebook
nb_twitter
nb_instagram
nb_youtube
nb_dailymotion
nb_vimeo
nb_video
nb_audio
nb_internal_pdf
nb_external_pdf
nb_internal_office
nb_external_office
redirection
CrawlLink
"""


def clean_url(url):
    url, __ = urldefrag(url)  # remove anchors
    return url


MAX_LINKS = None  # if None, no maximum
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)


class DefaultSpider:
    name = None
    start_urls = None
    allowed_domains = []
    excluded_domains = []
    crawl_id = None
    target_id = None
    crawl_result = None
    links_reached = set()

    def start_requests(self):
        q = {
            "crawl_id": self.crawl_id,
            "target_id": self.target_id,
            "status__in": ["F", "T"],
        }
        # do not restart a crawl that is already finished ("F") or timed out ("T")
        if models.CrawlResult.objects.filter(**q).count():
            return []
        q.pop("status__in")
        if models.CrawlResult.objects.filter(**q).count():
            # delete a previous interrupted attempt
            res = models.CrawlResult.objects.get(**q)
            res.delete()
        for url in self.start_urls:
            yield scrapy.Request(url, self.parse)

    def _parse_image(self, response, result):
        if "images" not in result:
            result["images"] = []
        for img in response.css('img'):
            attributes = img.attrib
            if "src" not in attributes:
                continue
            src = attributes["src"]
            is_a_real_src = src.startswith("http") or src.startswith("/")
            if not src or not is_a_real_src or src in result["images"]:
                continue
            result["images"].append(src)

    def timeout(self):
        if not self.crawl_result:
            q = {
                "crawl_id": self.crawl_id,
                "target_id": self.target_id,
            }
            if not models.CrawlResult.objects.filter(**q).count():
                return
            self.crawl_result = models.CrawlResult.objects.get(**q)
        duration = timezone.now() - self.crawl_result.started
        if duration < TIMEOUT:
            return
        with transaction.atomic():
            result = models.CrawlResult.objects.select_for_update().get(
                pk=self.crawl_result.pk)
            result.status = "T"
            result.save()
        return True

    def parse(self, response):
        result = {
            "url": response.url,
        }
        if self.timeout():
            return []
        for domain in self.excluded_domains:
            if domain in response.url:
                result["is_online"] = False
        if result.get("is_online", None) is False:
            yield result
        else:
            result["is_online"] = True
            try:
                self._parse_image(response, result)
                for link in LinkExtractor().extract_links(response):
                    url = clean_url(link.url)
                    if url is None or url in self.links_reached:
                        continue
                    is_internal = False
                    for domain in self.allowed_domains:
                        if domain in url:
                            is_internal = True
                            self.links_reached.add(link.url)
                            if not MAX_LINKS or \
                                    len(self.links_reached) < MAX_LINKS:
                                yield response.follow(link.url, self.parse)
                            else:
                                print("MAX", self.allowed_domains,
                                      self.links_reached)
                    if not is_internal:
                        if "external_link" not in result:
                            result["external_link"] = []
                        result["external_link"].append(url)
            except NotSupported:
                # non-text responses cannot be parsed for links
                print("No response", response.url)
            yield result

    def closed(self, reason):
        DbPipeline().close(self)


class DbPipeline:
    BASE_KEYS = ["url", "crawl_id", "target_id"]
    NB_KEYS = [
        "external_link", "internal_link", "images",
        "facebook", "twitter", "instagram",
        "youtube", "dailymotion", "vimeo",
        "video", "audio",
        "internal_pdf", "external_pdf",
        "internal_office", "external_office",
    ]

    def _get_result_pk(self, spider):
        """
        Atomic creation

        :param spider: current spider
        :return: result_pk, created
        """
        pks = {
            "crawl_id": spider.crawl_id,
            "target_id": spider.target_id,
        }
        created = False
        try:
            result = models.CrawlResult.objects.get(**pks)
        except models.CrawlResult.DoesNotExist:
            try:
                with transaction.atomic():
                    result = models.CrawlResult.objects.create(**pks)
                created = True
            except IntegrityError:
                # another process created the result concurrently
                result = models.CrawlResult.objects.get(**pks)
        return result.pk, created

    def _update(self, result_pk, item, result_created):
        """
        Atomic update
        """
        with transaction.atomic():
            result = models.CrawlResult.objects.select_for_update().get(
                pk=result_pk)
            crawl_result = result.crawl_result
            if crawl_result:
                crawl_result = crawl_result[0]
            else:
                crawl_result = {}
            if "urls" not in crawl_result:
                crawl_result["urls"] = []
            url = item.pop("url")
            if url in crawl_result["urls"]:
                return
            crawl_result["urls"].append(url)
            result.nb_internal_links = len(crawl_result["urls"]) - 1
            for k, value in item.items():
                if k == "is_online":
                    if result_created:
                        # only update on the first link
                        result.is_online = value
                elif k in self.NB_KEYS:
                    if k not in crawl_result:
                        crawl_result[k] = []
                    for subvalue in value:
                        if subvalue in crawl_result[k]:
                            continue
                        crawl_result[k].append(subvalue)
                    setattr(result, "nb_" + k, len(crawl_result[k]))
            result.crawl_result = [crawl_result]
            result.save()
        return True

    def process_item(self, item, spider):
        result_pk, created = self._get_result_pk(spider)
        self._update(result_pk, item, created)
        return item

    def close(self, spider):
        result_pk, created = self._get_result_pk(spider)
        with transaction.atomic():
            result = models.CrawlResult.objects.select_for_update().get(
                pk=result_pk)
            if result.status == "P":
                result.status = "F"
            result.duration = timezone.now() - result.started
            result.save()


def get_domain(url):
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


def create_spider(name, urls, crawl, target, excluded_domains=None):
    if not excluded_domains:
        excluded_domains = []
    return type(
        name, (DefaultSpider, scrapy.Spider),
        {"name": name,
         "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
         "crawl_id": crawl.pk,
         "target_id": target.pk,
         "links_reached": set(),
         "excluded_domains": excluded_domains}
    )


def launch_crawl(crawl_item, excluded_domains=None):
    scrap_settings = settings.SCRAPPY_SETTINGS.copy()
    process = CrawlerProcess(settings=scrap_settings)
    for target in crawl_item.targets.all():
        process.crawl(
            create_spider(
                "Crawl{}Target{}".format(crawl_item.pk, target.pk),
                [target.url], crawl_item, target, excluded_domains
            )
        )
    process.start()
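
# Usage sketch: ``launch_crawl`` is meant to be called from a Django context
# (e.g. a management command or a background task) once a crawl object and its
# targets exist.  The model name ``Crawl`` below is an assumption; the only
# requirements visible in this module are that ``crawl_item`` exposes ``pk``
# and a ``targets`` relation whose items have ``pk`` and ``url``:
#
#     crawl = models.Crawl.objects.get(pk=crawl_pk)  # hypothetical lookup
#     launch_crawl(crawl, excluded_domains=["facebook.com", "twitter.com"])
#
# ``CrawlerProcess.start()`` runs the Twisted reactor and blocks until all
# spiders have finished; it can only be started once per Python process.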