author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 03:53:32 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 03:55:49 +0200 |
commit | d737e04553f464966f54739ba37f9f06dab44586 (patch) | |
tree | 2b68891ff1629b55b820312fdd3a17ce91ac5722 /commcrawler/scrapy.py | |
parent | 9fbd94f70d4b819b45eef720425242c0d69b032d (diff) | |
Save crawling results in the database
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 200 |
1 file changed, 168 insertions(+), 32 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index d218648..b0c4fe4 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,10 +1,36 @@
+import tldextract
+
 import scrapy
 from scrapy.crawler import CrawlerProcess
 from scrapy.exceptions import NotSupported
 from scrapy.linkextractors import LinkExtractor
-import tldextract
 
 from django.conf import settings
+from django.db import transaction, IntegrityError
+from django.utils import timezone
+
+from . import models
+
+"""
+nb_external_link
+nb_internal_link
+nb_images
+nb_facebook
+nb_twitter
+nb_instagram
+nb_youtube
+nb_dailymotion
+nb_vimeo
+nb_video
+nb_audio
+nb_internal_pdf
+nb_external_pdf
+nb_internal_office
+nb_external_office
+redirection
+
+CrawlLink
+"""
 
 MAX_LINKS = 500
 
@@ -14,41 +40,154 @@ class DefaultSpider:
     start_urls = None
     allowed_domains = []
     excluded_domains = []
+    crawl_id = None
     target_id = None
     links_reached = set()
 
+    def start_requests(self):
+        q = {
+            "crawl_id": self.crawl_id,
+            "target_id": self.target_id,
+            "status": "F"
+        }
+        if models.CrawlResult.objects.filter(**q).count():
+            return []
+        q.pop("status")
+        if models.CrawlResult.objects.filter(**q).count():
+            # delete a previous interrupted attempt
+            res = models.CrawlResult.objects.get(**q)
+            res.delete()
+
+        for url in self.start_urls:
+            yield scrapy.Request(url, self.parse)
+
     def _parse_image(self, response, result):
-        for __ in response.css('img'):
-            if 'nb_images' not in result:
-                result["nb_images"] = 0
-            result["nb_images"] += 1
+        if "images" not in result:
+            result["images"] = []
+        for img in response.css('img'):
+            attributes = img.attrib
+            if "src" not in attributes:
+                continue
+            src = attributes["src"]
+            is_a_real_src = src.startswith("http") or src.startswith("/")
+            if not src or not is_a_real_src or src in result["images"]:
+                continue
+            result["images"].append(src)
 
     def parse(self, response):
         result = {
             "url": response.url,
-            "target_id": self.target_id
         }
         for domain in self.excluded_domains:
             if domain in response.url:
-                result["offline"] = True
-                yield result
+                result["is_online"] = False
+        if result.get("is_online", None) is False:
+            yield result
+        else:
+            result["is_online"] = True
+            try:
+                self._parse_image(response, result)
+                for link in LinkExtractor().extract_links(response):
+                    url = link.url
+                    if url is None or url in self.links_reached:
+                        continue
+                    for domain in self.allowed_domains:
+                        if domain in url:
+                            self.links_reached.add(link.url)
+                            if len(self.links_reached) < MAX_LINKS:
+                                yield response.follow(link.url, self.parse)
+                            else:
+                                print("MAX", self.allowed_domains,
+                                      self.links_reached)
+            except NotSupported:
+                print("No response", response.url)
+            yield result
+
+    def closed(self, reason):
+        result = {
+            "crawl_id": self.crawl_id,
+            "target_id": self.target_id,
+        }
+        DbPipeline().close(self)
+
+
+class DbPipeline:
+    BASE_KEYS = ["url", "crawl_id", "target_id"]
+    NB_KEYS = ["external_link", "internal_link", "images",
+               "facebook", "twitter", "instagram", "youtube",
+               "dailymotion", "vimeo", "video", "audio",
+               "internal_pdf", "external_pdf", "internal_office",
+               "external_office", ]
+
+    def _get_result_pk(self, spider):
+        """
+        Atomic creation
+        :param spider: current spider
+        :return: result_pk, created
+        """
+        pks = {
+            "crawl_id": spider.crawl_id,
+            "target_id": spider.target_id,
+        }
+        created = False
         try:
-            self._parse_image(response, result)
-            for link in LinkExtractor().extract_links(response):
-                url = link.url
-                if url is None or url in self.links_reached:
-                    continue
-                for domain in self.allowed_domains:
-                    if domain in url:
-                        self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
-                            yield response.follow(link.url, self.parse)
-                        else:
-                            print("MAX", self.allowed_domains,
-                                  self.links_reached)
-        except NotSupported:
-            print("No response", response.url)
-        yield result
+            result = models.CrawlResult.objects.get(**pks)
+        except models.CrawlResult.DoesNotExist:
+            try:
+                with transaction.atomic():
+                    result = models.CrawlResult.objects.create(**pks)
+                    created = True
+            except IntegrityError:
+                result = models.CrawlResult.objects.get(**pks)
+        return result.pk, created
+
+    def _update(self, result_pk, item, result_created):
+        """
+        Atomic update
+        """
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=result_pk)
+            crawl_result = result.crawl_result
+            if crawl_result:
+                crawl_result = crawl_result[0]
+            else:
+                crawl_result = {}
+            if "urls" not in crawl_result:
+                crawl_result["urls"] = []
+            url = item.pop("url")
+            if url in crawl_result["urls"]:
+                return
+            crawl_result["urls"].append(url)
+            for k, value in item.items():
+                if k == "is_online":
+                    if result_created:  # only update on the first link
+                        result.is_online = value
+                elif k in self.NB_KEYS:
+                    if k not in crawl_result:
+                        crawl_result[k] = []
+                    for subvalue in value:
+                        if subvalue in crawl_result[k]:
+                            continue
+                        crawl_result[k].append(subvalue)
+                    setattr(result, "nb_" + k, len(crawl_result[k]))
+            result.crawl_result = [crawl_result]
+            result.save()
+        return True
+
+    def process_item(self, item, spider):
+        result_pk, created = self._get_result_pk(spider)
+        self._update(result_pk, item, created)
+        return item
+
+    def close(self, spider):
+        result_pk, created = self._get_result_pk(spider)
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=result_pk)
+            result.status = "F"
+            result.duration = timezone.now() - result.started
+            result.save()
 
 
 def get_domain(url):
@@ -56,30 +195,27 @@
     ext = tldextract.extract(url)
     return '{}.{}'.format(ext.domain, ext.suffix)
 
-def create_spider(name, urls, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None):
     if not excluded_domains:
         excluded_domains = []
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
-         "target_id": target.pk, "links_reached": set(),
+         "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
          "excluded_domains": excluded_domains}
     )
 
 
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
-    scrap_settings.update({
-        'FEED_FORMAT': 'json',
-        'FEED_URI': 'result.json'
-    })
    process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider(
-                "Target{}".format(target.pk),
-                [target.url], target,
+                "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+                [target.url],
+                crawl_item, target,
                 excluded_domains
             )
         )
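The last hunk removes the `FEED_FORMAT`/`FEED_URI` feed export, so the items yielded by `DefaultSpider.parse()` presumably reach the database through `DbPipeline` registered as a Scrapy item pipeline. The `SCRAPPY_SETTINGS` dict read by `launch_crawl()` is not part of this diff, so the snippet below is only a sketch of what that wiring could look like; the dotted path and the priority value are assumptions.

```python
# Hypothetical excerpt of the Django settings read by launch_crawl().
# Only the SCRAPPY_SETTINGS name comes from the diff; the pipeline path
# "commcrawler.scrapy.DbPipeline" and the priority 300 are assumed here.
SCRAPPY_SETTINGS = {
    # Register DbPipeline so that every item yielded by the spider goes
    # through DbPipeline.process_item(), which upserts a CrawlResult row.
    "ITEM_PIPELINES": {
        "commcrawler.scrapy.DbPipeline": 300,
    },
}
```

With a pipeline in place, each item is persisted as it is scraped instead of being buffered into a `result.json` feed, which matches the removal of the feed settings in this commit.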
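`DbPipeline` assumes a `models.CrawlResult` with a crawl/target pair it can `get()` or `create()` atomically, a JSON-like `crawl_result` field holding a single-element list, one `nb_*` counter per entry in `NB_KEYS`, and `status`/`started`/`duration` fields updated in `close()`. The real model lives in `commcrawler/models.py` and is not shown in this commit; the sketch below is only inferred from those accesses, and every field type, default and constraint is an assumption.

```python
# Hypothetical shape of CrawlResult, inferred from what DbPipeline reads
# and writes in this commit -- not the actual commcrawler/models.py.
from django.contrib.postgres.fields import JSONField  # Django 2.x API
from django.db import models
from django.utils import timezone


class CrawlResult(models.Model):
    crawl = models.ForeignKey("Crawl", on_delete=models.CASCADE)
    target = models.ForeignKey("Target", on_delete=models.CASCADE)
    status = models.CharField(max_length=1, default="P")  # "F" once finished
    is_online = models.BooleanField(default=False)
    started = models.DateTimeField(default=timezone.now)
    duration = models.DurationField(null=True, blank=True)
    # DbPipeline stores a single-element list: [{"urls": [...], "images": [...], ...}]
    crawl_result = JSONField(default=list, blank=True)
    # One counter per entry in DbPipeline.NB_KEYS, e.g.:
    nb_images = models.PositiveIntegerField(default=0)
    nb_internal_link = models.PositiveIntegerField(default=0)
    nb_external_link = models.PositiveIntegerField(default=0)

    class Meta:
        # _get_result_pk() relies on (crawl, target) being unique so the
        # create() inside transaction.atomic() raises IntegrityError when
        # two processes race, and the loser falls back to get().
        unique_together = ("crawl", "target")
```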