author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-13 12:22:29 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-13 12:22:29 +0200 |
commit | 7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7 (patch) | |
tree | 6f01b83665230d6378a607c81cce821c025d1b16 /commcrawler/scrapy.py | |
parent | 00d0be457177da30ee5c83456e0ffac58aed497a (diff) | |
download | Comm-on-net-7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7.tar.bz2 Comm-on-net-7a66c4def3f86d87f3d7bd0aa763fe4b124a48b7.zip |
Crawl timeout is now a db parameter
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 16 |
1 file changed, 10 insertions, 6 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index b729a1c..0967650 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -36,7 +36,6 @@
 DATE_MONTH_DELAY = 1
 DATE_MONTH_FUTUR_DELAY = 2
 MAX_LINKS = None  # if None no max
-TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
 ONLY_FIRST_PAGE = True
 
@@ -51,6 +50,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    timeout = None
     redirect = None
 
     def start_requests(self):
@@ -120,7 +120,7 @@ class DefaultSpider:
             append_to_results(result, content_type, url)
             return True
 
-    def timeout(self):
+    def is_timeout(self):
         if not self.crawl_result:
             q = {
                 "crawl_id": self.crawl_id,
@@ -130,7 +130,7 @@
                 return
             self.crawl_result = models.CrawlResult.objects.get(**q)
         duration = timezone.now() - self.crawl_result.started
-        if duration < TIMEOUT:
+        if not self.timeout or duration < self.timeout:
             return
         with transaction.atomic():
             result = models.CrawlResult.objects.select_for_update().get(
@@ -144,7 +144,7 @@
         result = {
             "url": response.url,
         }
-        self.timeout()
+        self.is_timeout()
         for domain in self.excluded_domains:
             if domain in response.url:
                 result["is_online"] = False
@@ -190,7 +190,7 @@
                 self.links_reached.add(url)
                 is_file = self._parse_internal_files(url, result)
                 if not is_file and \
-                        not self.timeout() and (
+                        not self.is_timeout() and (
                             not MAX_LINKS or
                             len(self.links_reached) < MAX_LINKS):
                     yield response.follow(link.url, self.parse)
@@ -301,12 +301,16 @@
 def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
         excluded_domains = []
+    timeout = datetime.timedelta(minutes=crawl.time_out) \
+        if crawl.time_out else None
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk,
          "links_reached": set(),
-         "excluded_domains": excluded_domains, "redirect": redirect}
+         "excluded_domains": excluded_domains,
+         "timeout": timeout,
+         "redirect": redirect}
     )
```
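In short, the module-level `TIMEOUT` built from `settings.CRAWL_TIMEOUT` is replaced by a per-spider `timeout` attribute that `create_spider()` derives from the crawl's `time_out` database field, and the renamed `is_timeout()` check treats an empty value as "no limit". A minimal sketch of that behaviour, using hypothetical helper names and sample values rather than the project's actual code:

```python
import datetime


def build_timeout(time_out_minutes):
    # Mirrors create_spider(): an empty time_out stored in the DB means "no limit".
    return (datetime.timedelta(minutes=time_out_minutes)
            if time_out_minutes else None)


def crawl_expired(started, timeout, now=None):
    # Mirrors is_timeout(): with no timeout the crawl never expires; otherwise it
    # expires once the elapsed time since the crawl started reaches the limit.
    now = now or datetime.datetime.now(datetime.timezone.utc)
    if not timeout:
        return False
    return (now - started) >= timeout


# A crawl started 45 minutes ago expires under a 30-minute DB timeout,
# but never expires when time_out is left empty.
started = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(minutes=45)
assert crawl_expired(started, build_timeout(30)) is True
assert crawl_expired(started, build_timeout(None)) is False
```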