author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-07 03:53:32 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-07 03:55:49 +0200
commit     d737e04553f464966f54739ba37f9f06dab44586 (patch)
tree       2b68891ff1629b55b820312fdd3a17ce91ac5722 /commcrawler/scrapy.py
parent     9fbd94f70d4b819b45eef720425242c0d69b032d (diff)
download   Comm-on-net-d737e04553f464966f54739ba37f9f06dab44586.tar.bz2
           Comm-on-net-d737e04553f464966f54739ba37f9f06dab44586.zip
Save crawling results in the database
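
Note: the items yielded by DefaultSpider.parse() only reach DbPipeline.process_item() if the pipeline is registered in Scrapy's ITEM_PIPELINES setting. That registration is not part of this diff and may already sit in the project's SCRAPPY_SETTINGS; a minimal sketch of the usual wiring, assuming SCRAPPY_SETTINGS is the dict that launch_crawl() copies from the Django settings:

    # Sketch, not part of this commit: register the new pipeline in the dict
    # that launch_crawl() reads via settings.SCRAPPY_SETTINGS.
    SCRAPPY_SETTINGS = {
        # ... existing Scrapy options ...
        "ITEM_PIPELINES": {
            # dotted path to the pipeline class; the integer is its run order
            "commcrawler.scrapy.DbPipeline": 300,
        },
    }
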
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py  200
1 file changed, 168 insertions(+), 32 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index d218648..b0c4fe4 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,10 +1,36 @@
+import tldextract
+
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
-import tldextract
from django.conf import settings
+from django.db import transaction, IntegrityError
+from django.utils import timezone
+
+from . import models
+
+"""
+nb_external_link
+nb_internal_link
+nb_images
+nb_facebook
+nb_twitter
+nb_instagram
+nb_youtube
+nb_dailymotion
+nb_vimeo
+nb_video
+nb_audio
+nb_internal_pdf
+nb_external_pdf
+nb_internal_office
+nb_external_office
+redirection
+
+CrawlLink
+"""
MAX_LINKS = 500
@@ -14,41 +40,154 @@ class DefaultSpider:
start_urls = None
allowed_domains = []
excluded_domains = []
+ crawl_id = None
target_id = None
links_reached = set()
+ def start_requests(self):
+ q = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ "status": "F"
+ }
+ if models.CrawlResult.objects.filter(**q).count():
+ return []
+ q.pop("status")
+ if models.CrawlResult.objects.filter(**q).count():
+ # delete a previous interrupted attempt
+ res = models.CrawlResult.objects.get(**q)
+ res.delete()
+
+ for url in self.start_urls:
+ yield scrapy.Request(url, self.parse)
+
def _parse_image(self, response, result):
- for __ in response.css('img'):
- if 'nb_images' not in result:
- result["nb_images"] = 0
- result["nb_images"] += 1
+ if "images" not in result:
+ result["images"] = []
+ for img in response.css('img'):
+ attributes = img.attrib
+ if "src" not in attributes:
+ continue
+ src = attributes["src"]
+ is_a_real_src = src.startswith("http") or src.startswith("/")
+ if not src or not is_a_real_src or src in result["images"]:
+ continue
+ result["images"].append(src)
def parse(self, response):
result = {
"url": response.url,
- "target_id": self.target_id
}
for domain in self.excluded_domains:
if domain in response.url:
- result["offline"] = True
- yield result
+ result["is_online"] = False
+ if result.get("is_online", None) is False:
+ yield result
+ else:
+ result["is_online"] = True
+ try:
+ self._parse_image(response, result)
+ for link in LinkExtractor().extract_links(response):
+ url = link.url
+ if url is None or url in self.links_reached:
+ continue
+ for domain in self.allowed_domains:
+ if domain in url:
+ self.links_reached.add(link.url)
+ if len(self.links_reached) < MAX_LINKS:
+ yield response.follow(link.url, self.parse)
+ else:
+ print("MAX", self.allowed_domains,
+ self.links_reached)
+ except NotSupported:
+ print("No response", response.url)
+ yield result
+
+ def closed(self, reason):
+ result = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ }
+ DbPipeline().close(self)
+
+
+class DbPipeline:
+ BASE_KEYS = ["url", "crawl_id", "target_id"]
+ NB_KEYS = ["external_link", "internal_link", "images",
+ "facebook", "twitter", "instagram", "youtube",
+ "dailymotion", "vimeo", "video", "audio",
+ "internal_pdf", "external_pdf", "internal_office",
+ "external_office", ]
+
+ def _get_result_pk(self, spider):
+ """
+ Atomic creation
+ :param spider: current spider
+ :return: result_pk, created
+ """
+ pks = {
+ "crawl_id": spider.crawl_id,
+ "target_id": spider.target_id,
+ }
+ created = False
try:
- self._parse_image(response, result)
- for link in LinkExtractor().extract_links(response):
- url = link.url
- if url is None or url in self.links_reached:
- continue
- for domain in self.allowed_domains:
- if domain in url:
- self.links_reached.add(link.url)
- if len(self.links_reached) < MAX_LINKS:
- yield response.follow(link.url, self.parse)
- else:
- print("MAX", self.allowed_domains,
- self.links_reached)
- except NotSupported:
- print("No response", response.url)
- yield result
+ result = models.CrawlResult.objects.get(**pks)
+ except models.CrawlResult.DoesNotExist:
+ try:
+ with transaction.atomic():
+ result = models.CrawlResult.objects.create(**pks)
+ created = True
+ except IntegrityError:
+ result = models.CrawlResult.objects.get(**pks)
+ return result.pk, created
+
+ def _update(self, result_pk, item, result_created):
+ """
+ Atomic update
+ """
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=result_pk)
+ crawl_result = result.crawl_result
+ if crawl_result:
+ crawl_result = crawl_result[0]
+ else:
+ crawl_result = {}
+ if "urls" not in crawl_result:
+ crawl_result["urls"] = []
+ url = item.pop("url")
+ if url in crawl_result["urls"]:
+ return
+ crawl_result["urls"].append(url)
+ for k, value in item.items():
+ if k == "is_online":
+ if result_created: # only update on the first link
+ result.is_online = value
+ elif k in self.NB_KEYS:
+ if k not in crawl_result:
+ crawl_result[k] = []
+ for subvalue in value:
+ if subvalue in crawl_result[k]:
+ continue
+ crawl_result[k].append(subvalue)
+ setattr(result, "nb_" + k, len(crawl_result[k]))
+ result.crawl_result = [crawl_result]
+ result.save()
+ return True
+
+ def process_item(self, item, spider):
+ result_pk, created = self._get_result_pk(spider)
+ self._update(result_pk, item, created)
+ return item
+
+ def close(self, spider):
+ result_pk, created = self._get_result_pk(spider)
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=result_pk)
+ result.status = "F"
+ result.duration = timezone.now() - result.started
+ result.save()
def get_domain(url):
@@ -56,30 +195,27 @@ def get_domain(url):
return '{}.{}'.format(ext.domain, ext.suffix)
-def create_spider(name, urls, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None):
if not excluded_domains:
excluded_domains = []
return type(
name, (DefaultSpider, scrapy.Spider),
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
- "target_id": target.pk, "links_reached": set(),
+ "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
"excluded_domains": excluded_domains}
)
def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
- scrap_settings.update({
- 'FEED_FORMAT': 'json',
- 'FEED_URI': 'result.json'
- })
process = CrawlerProcess(settings=scrap_settings)
for target in crawl_item.targets.all():
process.crawl(
create_spider(
- "Target{}".format(target.pk),
- [target.url], target,
+ "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+ [target.url],
+ crawl_item, target,
excluded_domains
)
)
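
A hedged usage sketch (not shown in this commit) of launching a crawl from Django code; the Crawl model name is an assumption inferred from the crawl_item argument, while CrawlResult, the "F" status and the duration come from the code above:

    # Hypothetical invocation, e.g. from a management command or a scheduled task.
    from commcrawler import models
    from commcrawler.scrapy import launch_crawl

    crawl = models.Crawl.objects.get(pk=1)  # assumed model name for crawl_item
    launch_crawl(crawl, excluded_domains=["facebook.com", "twitter.com"])
    # Each (crawl, target) pair ends up as one CrawlResult row; when the
    # corresponding spider closes, the row is marked status "F" with a duration.
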