From 7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Tue, 13 Aug 2019 12:22:29 +0200
Subject: Crawl time out is now a db parameter

---
 commcrawler/migrations/0004_crawl_time_out.py | 20 ++++++++++++++++++++
 commcrawler/models.py                         |  4 ++++
 commcrawler/scrapy.py                         | 16 ++++++++++------
 3 files changed, 34 insertions(+), 6 deletions(-)
 create mode 100644 commcrawler/migrations/0004_crawl_time_out.py

(limited to 'commcrawler')

diff --git a/commcrawler/migrations/0004_crawl_time_out.py b/commcrawler/migrations/0004_crawl_time_out.py
new file mode 100644
index 0000000..e7dae3a
--- /dev/null
+++ b/commcrawler/migrations/0004_crawl_time_out.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-13 10:15
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0003_auto_20190809_1607'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='time_out',
+            field=models.PositiveIntegerField(default=10, help_text='0 for no delay', verbose_name='Maximum delay for crawling a target (minutes)'),
+        ),
+    ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 7a80b3b..d6d766e 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -34,6 +34,10 @@ class Crawl(models.Model):
         ('F', _("Finished"))
     )
     name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
+    time_out = models.PositiveIntegerField(
+        verbose_name=_("Maximum delay for crawling a target (minutes)"),
+        default=10, help_text=_("0 for no delay")
+    )
     created = models.DateTimeField(
         verbose_name=_("Creation"), default=datetime.datetime.now)
     started = models.DateTimeField(
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b729a1c..0967650 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -36,7 +36,6 @@ DATE_MONTH_DELAY = 1
 DATE_MONTH_FUTUR_DELAY = 2
 
 MAX_LINKS = None  # if None no max
-TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
 ONLY_FIRST_PAGE = True
 
@@ -51,6 +50,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    timeout = None
     redirect = None
 
     def start_requests(self):
@@ -120,7 +120,7 @@ class DefaultSpider:
                     append_to_results(result, content_type, url)
                     return True
 
-    def timeout(self):
+    def is_timeout(self):
         if not self.crawl_result:
             q = {
                 "crawl_id": self.crawl_id,
@@ -130,7 +130,7 @@ class DefaultSpider:
                 return
             self.crawl_result = models.CrawlResult.objects.get(**q)
         duration = timezone.now() - self.crawl_result.started
-        if duration < TIMEOUT:
+        if not self.timeout or duration < self.timeout:
             return
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
@@ -144,7 +144,7 @@ class DefaultSpider:
         result = {
             "url": response.url,
         }
-        self.timeout()
+        self.is_timeout()
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -190,7 +190,7 @@ class DefaultSpider:
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
                     if not is_file and \
-                            not self.timeout() and (
+                            not self.is_timeout() and (
                                 not MAX_LINKS or
                                 len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
@@ -301,12 +301,16 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
         excluded_domains = []
+    timeout = datetime.timedelta(minutes=crawl.time_out) \
+        if crawl.time_out else None
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
-         "excluded_domains": excluded_domains, "redirect": redirect}
+         "excluded_domains": excluded_domains,
+         "timeout": timeout,
+         "redirect": redirect}
     )
--
cgit v1.2.3
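
A minimal usage sketch of the change above, assuming only the models and helpers shown in this patch: once the migration has been applied (python manage.py migrate commcrawler), create_spider() derives the spider's timeout from the Crawl row rather than from the settings.CRAWL_TIMEOUT constant. The lookups below, including a models.Target model and pre-existing rows, are assumptions for illustration and are not part of this commit.

    import datetime

    from commcrawler import models
    from commcrawler.scrapy import create_spider

    # time_out is now stored per crawl, in minutes; 0 disables the limit.
    crawl = models.Crawl.objects.get(name="my-crawl")   # assumed existing row
    target = models.Target.objects.first()              # assumed target model/row

    SpiderCls = create_spider(
        "my_spider", ["https://www.example.org/"], crawl, target)

    # create_spider() turns crawl.time_out into a timedelta (or None when 0);
    # DefaultSpider.is_timeout() compares the elapsed crawl duration against it.
    assert SpiderCls.timeout == (
        datetime.timedelta(minutes=crawl.time_out) if crawl.time_out else None)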