Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py  18
1 file changed, 9 insertions(+), 9 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
             if not is_internal:
                 current_domain = get_domain(url)
                 if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=20)
+                                        timeout=45)
             except requests.exceptions.SSLError:
                 if not verify_ssl: # new error on SSL
                     response = "Try..." # scrapy is more permissive
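
The sketch below is not part of the repository; it only illustrates the pattern this diff introduces, where the spider caches the first timeout detection in an `is_timeout` flag so later `parse()` calls can bail out cheaply and stop following links. The names `TimeoutAwareSpider`, `CRAWL_TIMEOUT`, `crawl_started`, and the value of `MAX_LINKS` are stand-ins, not taken from the code being patched.

```python
import time

MAX_LINKS = 100          # assumed link budget, mirroring the MAX_LINKS check in the diff
CRAWL_TIMEOUT = 60 * 10  # assumed per-site time budget in seconds


class TimeoutAwareSpider:
    def __init__(self):
        self.links_reached = set()
        self.is_timeout = False
        self.crawl_started = time.time()

    def timeout(self):
        """Relatively expensive check (the real spider also updates a result row)."""
        if time.time() - self.crawl_started > CRAWL_TIMEOUT:
            self.is_timeout = True  # remember the verdict for later calls
            return True
        return False

    def parse(self, links):
        # Cheap flag check first, like `if self.is_timeout or self.timeout()`.
        if self.is_timeout or self.timeout():
            return []
        followed = []
        for url in links:
            self.links_reached.add(url)
            # Follow a link only while neither the time budget nor the
            # link budget is exhausted, mirroring the rewritten condition.
            if not self.is_timeout and not self.timeout() and (
                    not MAX_LINKS or len(self.links_reached) < MAX_LINKS):
                followed.append(url)
        return followed
```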