From 7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Tue, 13 Aug 2019 12:22:29 +0200
Subject: Crawl time out is now a db parameter

---
 commcrawler/migrations/0004_crawl_time_out.py | 20 ++++++++++++++++++++
 commcrawler/models.py                         |  4 ++++
 commcrawler/scrapy.py                         | 16 ++++++++++------
 3 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 commcrawler/migrations/0004_crawl_time_out.py

(limited to 'commcrawler')

diff --git a/commcrawler/migrations/0004_crawl_time_out.py b/commcrawler/migrations/0004_crawl_time_out.py
new file mode 100644
index 0000000..e7dae3a
--- /dev/null
+++ b/commcrawler/migrations/0004_crawl_time_out.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-13 10:15
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0003_auto_20190809_1607'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='time_out',
+            field=models.PositiveIntegerField(default=10, help_text='0 for no delay', verbose_name='Maximum delay for crawling a target (minutes)'),
+        ),
+    ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 7a80b3b..d6d766e 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -34,6 +34,10 @@ class Crawl(models.Model):
         ('F', _("Finished"))
     )
     name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
+    time_out = models.PositiveIntegerField(
+        verbose_name=_("Maximum delay for crawling a target (minutes)"),
+        default=10, help_text=_("0 for no delay")
+    )
     created = models.DateTimeField(
         verbose_name=_("Creation"), default=datetime.datetime.now)
     started = models.DateTimeField(
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b729a1c..0967650 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -36,7 +36,6 @@ DATE_MONTH_DELAY = 1
 DATE_MONTH_FUTUR_DELAY = 2
 
 MAX_LINKS = None  # if None no max
-TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
 ONLY_FIRST_PAGE = True
 
@@ -51,6 +50,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    timeout = None
     redirect = None
 
     def start_requests(self):
@@ -120,7 +120,7 @@ class DefaultSpider:
                     append_to_results(result, content_type, url)
                     return True
 
-    def timeout(self):
+    def is_timeout(self):
         if not self.crawl_result:
             q = {
                 "crawl_id": self.crawl_id,
@@ -130,7 +130,7 @@ class DefaultSpider:
                 return
             self.crawl_result = models.CrawlResult.objects.get(**q)
         duration = timezone.now() - self.crawl_result.started
-        if duration < TIMEOUT:
+        if not self.timeout or duration < self.timeout:
             return
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
@@ -144,7 +144,7 @@ class DefaultSpider:
         result = {
             "url": response.url,
         }
-        self.timeout()
+        self.is_timeout()
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -190,7 +190,7 @@ class DefaultSpider:
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
                     if not is_file and \
-                            not self.timeout() and (
+                            not self.is_timeout() and (
                                 not MAX_LINKS or
                                 len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
@@ -301,12 +301,16 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
         excluded_domains = []
+    timeout = datetime.timedelta(minutes=crawl.time_out) \
+        if crawl.time_out else None
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
-         "excluded_domains": excluded_domains, "redirect": redirect}
+         "excluded_domains": excluded_domains,
+         "timeout": timeout,
+         "redirect": redirect}
     )
--
cgit v1.2.3
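
A minimal usage sketch of the change above, assuming only the models and helpers shown in this patch: once the migration has been applied (python manage.py migrate commcrawler), create_spider() derives the spider's timeout from the Crawl row rather than from the settings.CRAWL_TIMEOUT constant. The lookups below, including a models.Target model and pre-existing rows, are assumptions for illustration and are not part of this commit.

    import datetime

    from commcrawler import models
    from commcrawler.scrapy import create_spider

    # time_out is now stored per crawl, in minutes; 0 disables the limit.
    crawl = models.Crawl.objects.get(name="my-crawl")   # assumed existing row
    target = models.Target.objects.first()              # assumed target model/row

    SpiderCls = create_spider(
        "my_spider", ["https://www.example.org/"], crawl, target)

    # create_spider() turns crawl.time_out into a timedelta (or None when 0);
    # DefaultSpider.is_timeout() compares the elapsed crawl duration against it.
    assert SpiderCls.timeout == (
        datetime.timedelta(minutes=crawl.time_out) if crawl.time_out else None)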