author    | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 19:39:46 +0200
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 19:39:46 +0200
commit    | 507e0ce240f739a1d4580405ce2d189390c9f68b (patch)
tree      | a5a10e8876395dd545433787fc2b036560207c3e /commcrawler/scrapy.py
parent    | f7fe0a457eb92611731ba93959f3fca0ceb16528 (diff)
download  | Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.tar.bz2, Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.zip
Scrap: add a condition on timeout
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 18
1 file changed, 9 insertions(+), 9 deletions(-)
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
                 if not is_internal:
                     current_domain = get_domain(url)
                     if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=20)
+                                        timeout=45)
             except requests.exceptions.SSLError:
                 if not verify_ssl:  # new error on SSL
                     response = "Try..."  # scrapy is more permissive
```
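In short, the commit does two things: the spider caches its timeout state in a new `is_timeout` flag, so `parse()` can return early and stop scheduling `response.follow()` requests once the crawl budget is spent, and the pre-crawl probe in `launch_crawl()` gets a more generous per-request timeout (45 s instead of 20 s). Below is a minimal, self-contained sketch of the flag pattern in a generic Scrapy spider; the names `TimeoutAwareSpider` and `CRAWL_TIMEOUT` are illustrative assumptions, not identifiers from this repository:

```python
# Minimal sketch of the pattern this commit introduces: once the crawl
# deadline is hit, remember it in a flag so every later parse() call can
# bail out cheaply instead of re-checking (and re-saving state) each time.
import time

import scrapy

CRAWL_TIMEOUT = 600  # assumed per-crawl budget, in seconds
MAX_LINKS = 1000     # assumed cap, mirroring the repository's MAX_LINKS


class TimeoutAwareSpider(scrapy.Spider):
    name = "timeout_aware"
    start_urls = ["https://example.org/"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.started = time.time()
        self.is_timeout = False
        self.links_reached = set()

    def timeout(self):
        # Latch the timeout state the first time the deadline is crossed.
        if time.time() - self.started > CRAWL_TIMEOUT:
            self.is_timeout = True
            return True
        return False

    def parse(self, response):
        if self.is_timeout or self.timeout():
            return  # stop yielding new requests once the budget is spent
        for href in response.css("a::attr(href)").getall():
            url = response.urljoin(href)
            if url in self.links_reached:
                continue
            self.links_reached.add(url)
            # Same guard shape as the patched condition: no new requests
            # after timeout, and respect the link cap if one is set.
            if not self.is_timeout and not self.timeout() and (
                    not MAX_LINKS or len(self.links_reached) < MAX_LINKS):
                yield response.follow(url, self.parse)
```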
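The `timeout=45` hunk touches the reachability probe that `launch_crawl()` runs with plain `requests` before handing a target to Scrapy. A hedged sketch of that retry loop as the hunk suggests it works (the `check_reachable` helper and the `RequestException` fallback are assumptions; in the repository this logic sits inline in `launch_crawl`):

```python
import requests


def check_reachable(url):
    """Hypothetical helper mirroring launch_crawl's pre-crawl probe."""
    verify_ssl = True
    response = None
    while response is None:
        try:
            # 45 s instead of 20 s: slow-but-alive sites should still be
            # crawled rather than written off as unreachable.
            response = requests.get(url, verify=verify_ssl, timeout=45)
        except requests.exceptions.SSLError:
            if not verify_ssl:
                # Failed even without verification; mark it as worth
                # attempting anyway, since scrapy is more permissive.
                response = "Try..."
            verify_ssl = False  # retry once, ignoring the certificate
        except requests.exceptions.RequestException:
            return None  # assumed fallback for non-SSL network errors
    return response
```

Raising the probe's timeout rather than dropping the probe keeps genuinely dead hosts out of the crawl queue while tolerating slow servers.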