diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-10 12:28:09 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-10 12:28:09 +0200 |
commit | 9fb538feb0989df7bcd3538ae178165cc10cc184 (patch) | |
tree | 78c4372bf149e7a086448c0a2ab752ab6e890d0e /commcrawler/scrapy.py | |
parent | 7436b1bac461a6bf71f4329b49d26ee5740ae3ee (diff) | |
download | Comm-on-net-9fb538feb0989df7bcd3538ae178165cc10cc184.tar.bz2 Comm-on-net-9fb538feb0989df7bcd3538ae178165cc10cc184.zip |
Better management of timeout in crawl...
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 8 |
1 file changed, 4 insertions, 4 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index bdd28c3..767827a 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -5,7 +5,7 @@ import requests import scrapy from scrapy.crawler import CrawlerProcess -from scrapy.exceptions import NotSupported +from scrapy.exceptions import NotSupported, CloseSpider from scrapy.linkextractors import LinkExtractor from django.conf import settings @@ -129,15 +129,15 @@ class DefaultSpider: pk=self.crawl_result.pk) result.status = "T" result.save() - self.is_timeout = True - return True + self.is_timeout = True + raise CloseSpider('timeout') def parse(self, response): result = { "url": response.url, } if self.is_timeout or self.timeout(): - return [] + raise CloseSpider('timeout') for domain in self.excluded_domains: if domain in response.url: result["is_online"] = False |