author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-13 12:22:29 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-13 12:22:29 +0200
commit     7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7 (patch)
tree       6f01b83665230d6378a607c81cce821c025d1b16 /commcrawler
parent     00d0be457177da30ee5c83456e0ffac58aed497a (diff)
download   Comm-on-net-7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7.tar.bz2
           Comm-on-net-7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7.zip
Crawl time out is now a db parameter
Diffstat (limited to 'commcrawler')
-rw-r--r--  commcrawler/migrations/0004_crawl_time_out.py  20
-rw-r--r--  commcrawler/models.py                           4
-rw-r--r--  commcrawler/scrapy.py                          16
3 files changed, 34 insertions, 6 deletions
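
In short, the per-target crawl timeout now lives on the Crawl record instead of the CRAWL_TIMEOUT setting, so it can be tuned per crawl without touching the deployment. A minimal sketch of adjusting it from a Django shell (the crawl name and the 30-minute value are illustrative):

    from commcrawler.models import Crawl

    crawl = Crawl.objects.get(name="example-crawl")  # hypothetical crawl name
    crawl.time_out = 30  # minutes; 0 disables the per-target timeout
    crawl.save()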
diff --git a/commcrawler/migrations/0004_crawl_time_out.py b/commcrawler/migrations/0004_crawl_time_out.py
new file mode 100644
index 0000000..e7dae3a
--- /dev/null
+++ b/commcrawler/migrations/0004_crawl_time_out.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-13 10:15
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('commcrawler', '0003_auto_20190809_1607'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='crawl',
+ name='time_out',
+ field=models.PositiveIntegerField(default=10, help_text='0 for no delay', verbose_name='Maximum delay for crawling a target (minutes)'),
+ ),
+ ]
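
The migration adds the field with default=10, so existing crawls pick up a 10-minute timeout once it is applied. Applying it is the usual migrate step; a sketch using Django's management API (assumes the project's normal settings module is configured):

    from django.core.management import call_command

    # Apply the commcrawler migrations, including 0004_crawl_time_out.
    call_command("migrate", "commcrawler")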
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 7a80b3b..d6d766e 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -34,6 +34,10 @@ class Crawl(models.Model):
('F', _("Finished"))
)
name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
+ time_out = models.PositiveIntegerField(
+ verbose_name=_("Maximum delay for crawling a target (minutes)"),
+ default=10, help_text=_("0 for no delay")
+ )
created = models.DateTimeField(
verbose_name=_("Creation"), default=datetime.datetime.now)
started = models.DateTimeField(
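
Since the timeout is now an ordinary editable model field, it can be surfaced wherever Crawl objects are managed. A minimal sketch of exposing it in the Django admin (a hypothetical admin.py; the project may already register Crawl differently):

    # Hypothetical registration; adjust to the project's existing admin setup.
    from django.contrib import admin
    from commcrawler.models import Crawl

    @admin.register(Crawl)
    class CrawlAdmin(admin.ModelAdmin):
        # Show and edit the per-crawl timeout (in minutes) alongside the name.
        list_display = ("name", "time_out", "created", "started")
        list_editable = ("time_out",)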
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b729a1c..0967650 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -36,7 +36,6 @@ DATE_MONTH_DELAY = 1
DATE_MONTH_FUTUR_DELAY = 2
MAX_LINKS = None # if None no max
-TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
ONLY_FIRST_PAGE = True
@@ -51,6 +50,7 @@ class DefaultSpider:
target_id = None
crawl_result = None
links_reached = set()
+ timeout = None
redirect = None
def start_requests(self):
@@ -120,7 +120,7 @@ class DefaultSpider:
append_to_results(result, content_type, url)
return True
- def timeout(self):
+ def is_timeout(self):
if not self.crawl_result:
q = {
"crawl_id": self.crawl_id,
@@ -130,7 +130,7 @@ class DefaultSpider:
return
self.crawl_result = models.CrawlResult.objects.get(**q)
duration = timezone.now() - self.crawl_result.started
- if duration < TIMEOUT:
+ if not self.timeout or duration < self.timeout:
return
with transaction.atomic():
result = models.CrawlResult.objects.select_for_update().get(
@@ -144,7 +144,7 @@ class DefaultSpider:
result = {
"url": response.url,
}
- self.timeout()
+ self.is_timeout()
for domain in self.excluded_domains:
if domain in response.url:
result["is_online"] = False
@@ -190,7 +190,7 @@ class DefaultSpider:
self.links_reached.add(url)
is_file = self._parse_internal_files(url, result)
if not is_file and \
- not self.timeout() and (
+ not self.is_timeout() and (
not MAX_LINKS or
len(self.links_reached) < MAX_LINKS):
yield response.follow(link.url, self.parse)
@@ -301,12 +301,16 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
redirect=None):
if not excluded_domains:
excluded_domains = []
+ timeout = datetime.timedelta(minutes=crawl.time_out) \
+ if crawl.time_out else None
return type(
name, (DefaultSpider, scrapy.Spider),
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
"crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
- "excluded_domains": excluded_domains, "redirect": redirect}
+ "excluded_domains": excluded_domains,
+ "timeout": timeout,
+ "redirect": redirect}
)
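
With these changes create_spider() reads the timeout from the crawl record rather than from settings.CRAWL_TIMEOUT, and a value of 0 turns the check off. A minimal usage sketch (the crawl name, spider name and URL are illustrative, and the Target lookup is an assumption about the project's models):

    import datetime
    from commcrawler import models
    from commcrawler.scrapy import create_spider

    crawl = models.Crawl.objects.get(name="example-crawl")  # hypothetical crawl name
    target = models.Target.objects.first()                  # assumed Target model/lookup
    spider_cls = create_spider(
        "example_spider", ["https://example.org"], crawl, target)
    # crawl.time_out == 0 gives spider_cls.timeout = None, so is_timeout() never
    # cuts the target short; any other value becomes a timedelta in minutes.
    assert spider_cls.timeout == (
        datetime.timedelta(minutes=crawl.time_out) if crawl.time_out else None)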