summary refs log tree commit diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-10 12:28:09 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-10 12:28:09 +0200
commit 9fb538feb0989df7bcd3538ae178165cc10cc184 (patch)
tree 78c4372bf149e7a086448c0a2ab752ab6e890d0e /commcrawler/scrapy.py
parent 7436b1bac461a6bf71f4329b49d26ee5740ae3ee (diff)
download Comm-on-net-9fb538feb0989df7bcd3538ae178165cc10cc184.tar.bz2
Comm-on-net-9fb538feb0989df7bcd3538ae178165cc10cc184.zip
Better management of timeout in crawl...
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py 8
1 files changed, 4 insertions, 4 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index bdd28c3..767827a 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -5,7 +5,7 @@ import requests
import scrapy
from scrapy.crawler import CrawlerProcess
-from scrapy.exceptions import NotSupported
+from scrapy.exceptions import NotSupported, CloseSpider
from scrapy.linkextractors import LinkExtractor
from django.conf import settings
@@ -129,15 +129,15 @@ class DefaultSpider:
pk=self.crawl_result.pk)
result.status = "T"
result.save()
- self.is_timeout = True
- return True
+ self.is_timeout = True
+ raise CloseSpider('timeout')
def parse(self, response):
result = {
"url": response.url,
}
if self.is_timeout or self.timeout():
- return []
+ raise CloseSpider('timeout')
for domain in self.excluded_domains:
if domain in response.url:
result["is_online"] = False