path: root/commcrawler/scrapy.py
author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-13 12:22:29 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-13 12:22:29 +0200
commit     7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7 (patch)
tree       6f01b83665230d6378a607c81cce821c025d1b16 /commcrawler/scrapy.py
parent     00d0be457177da30ee5c83456e0ffac58aed497a (diff)
Crawl time out is now a db parameter
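
The patch only touches commcrawler/scrapy.py; the schema side of the change is not part of this diff. As a point of reference, here is a minimal sketch of what the new database parameter could look like, assuming a Crawl model with a nullable time_out value expressed in minutes (the field name and the minutes unit are inferred from crawl.time_out and datetime.timedelta(minutes=...) in the create_spider() hunk below; field type, verbose name and help text are assumptions, not code from this repository):

# Sketch only, not models.py from this repository: it spells out what the
# diff implies, i.e. a Crawl model exposing a nullable "time_out" value that
# create_spider() interprets as minutes.
from django.db import models

class Crawl(models.Model):
    time_out = models.PositiveIntegerField(
        "crawl timeout (minutes)", blank=True, null=True,
        help_text="Leave empty to let the crawl run without a time limit.")

With the value nullable, an empty time_out now means "no timeout", which is exactly what the new guard in is_timeout() checks.
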
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py  16
1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b729a1c..0967650 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -36,7 +36,6 @@ DATE_MONTH_DELAY = 1
 DATE_MONTH_FUTUR_DELAY = 2
 MAX_LINKS = None  # if None no max
-TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
 ONLY_FIRST_PAGE = True
@@ -51,6 +50,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    timeout = None
     redirect = None
 
     def start_requests(self):
@@ -120,7 +120,7 @@ class DefaultSpider:
             append_to_results(result, content_type, url)
             return True
 
-    def timeout(self):
+    def is_timeout(self):
         if not self.crawl_result:
             q = {
                 "crawl_id": self.crawl_id,
@@ -130,7 +130,7 @@ class DefaultSpider:
                 return
             self.crawl_result = models.CrawlResult.objects.get(**q)
         duration = timezone.now() - self.crawl_result.started
-        if duration < TIMEOUT:
+        if not self.timeout or duration < self.timeout:
             return
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
@@ -144,7 +144,7 @@ class DefaultSpider:
         result = {
             "url": response.url,
         }
-        self.timeout()
+        self.is_timeout()
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -190,7 +190,7 @@ class DefaultSpider:
                 self.links_reached.add(url)
                 is_file = self._parse_internal_files(url, result)
                 if not is_file and \
-                        not self.timeout() and (
+                        not self.is_timeout() and (
                             not MAX_LINKS or
                             len(self.links_reached) < MAX_LINKS):
                     yield response.follow(link.url, self.parse)
@@ -301,12 +301,16 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
         excluded_domains = []
+    timeout = datetime.timedelta(minutes=crawl.time_out) \
+        if crawl.time_out else None
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
-         "excluded_domains": excluded_domains, "redirect": redirect}
+         "excluded_domains": excluded_domains,
+         "timeout": timeout,
+         "redirect": redirect}
     )
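
For readers who want the new behaviour in isolation, here is a self-contained sketch (not code from the repository) of the rule the patch introduces: crawl.time_out, in minutes, is turned into a timedelta once per generated spider, and an empty value disables the check entirely. resolve_timeout() and has_timed_out() are hypothetical stand-ins for the inline expression in create_spider() and the comparison in DefaultSpider.is_timeout().

# Illustrative stand-ins only; the real logic lives inline in create_spider()
# and DefaultSpider.is_timeout() above.
import datetime

def resolve_timeout(time_out):
    # Mirrors the new create_spider() expression: minutes -> timedelta,
    # empty/None -> no timeout at all.
    return datetime.timedelta(minutes=time_out) if time_out else None

def has_timed_out(started, now, timeout):
    # Mirrors is_timeout(): without a timeout the crawl never aborts,
    # otherwise it aborts once the elapsed duration reaches the limit.
    duration = now - started
    return bool(timeout) and duration >= timeout

started = datetime.datetime(2019, 8, 13, 12, 0)
now = datetime.datetime(2019, 8, 13, 12, 45)
print(has_timed_out(started, now, resolve_timeout(30)))    # True: 45 min >= 30 min
print(has_timed_out(started, now, resolve_timeout(None)))  # False: no limit configured
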