author    | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 19:39:46 +0200
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 19:39:46 +0200
commit    | 507e0ce240f739a1d4580405ce2d189390c9f68b (patch)
tree      | a5a10e8876395dd545433787fc2b036560207c3e /commcrawler/scrapy.py
parent    | f7fe0a457eb92611731ba93959f3fca0ceb16528 (diff)
download  | Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.tar.bz2, Comm-on-net-507e0ce240f739a1d4580405ce2d189390c9f68b.zip
Scrap: add a condition on timeout
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 18
1 file changed, 9 insertions(+), 9 deletions(-)
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7147949..bdd28c3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -43,6 +43,7 @@ class DefaultSpider:
     crawl_result = None
     links_reached = set()
     redirect = None
+    is_timeout = False
 
     def start_requests(self):
         q = {
@@ -128,13 +129,14 @@ class DefaultSpider:
                 pk=self.crawl_result.pk)
             result.status = "T"
             result.save()
+            self.is_timeout = True
             return True
 
     def parse(self, response):
         result = {
             "url": response.url,
         }
-        if self.timeout():
+        if self.is_timeout or self.timeout():
             return []
         for domain in self.excluded_domains:
             if domain in response.url:
@@ -156,14 +158,12 @@ class DefaultSpider:
                     is_internal = True
                     self.links_reached.add(url)
                     is_file = self._parse_internal_files(url, result)
-                    if is_file:
-                        pass
-                    elif not MAX_LINKS or \
-                            len(self.links_reached) < MAX_LINKS:
+                    if not is_file and \
+                            not self.is_timeout and \
+                            not self.timeout() and (
+                                not MAX_LINKS or
+                                len(self.links_reached) < MAX_LINKS):
                         yield response.follow(link.url, self.parse)
-                    else:
-                        print("MAX", self.allowed_domains,
-                              self.links_reached)
                 if not is_internal:
                     current_domain = get_domain(url)
                     if current_domain in FACEBOOK_DOMAINS:
@@ -339,7 +339,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl,
-                                        timeout=20)
+                                        timeout=45)
             except requests.exceptions.SSLError:
                 if not verify_ssl:  # new error on SSL
                     response = "Try..."  # scrapy is more permissive
```
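In short, the commit does two things: the spider caches its timeout state in a new `is_timeout` flag, so `parse()` can return early and stop scheduling `response.follow()` requests once the crawl budget is spent, and the pre-crawl probe in `launch_crawl()` gets a more generous per-request timeout (45 s instead of 20 s). Below is a minimal, self-contained sketch of the flag pattern in a generic Scrapy spider; the names `TimeoutAwareSpider` and `CRAWL_TIMEOUT` are illustrative assumptions, not identifiers from this repository:

```python
# Minimal sketch of the pattern this commit introduces: once the crawl
# deadline is hit, remember it in a flag so every later parse() call can
# bail out cheaply instead of re-checking (and re-saving state) each time.
import time

import scrapy

CRAWL_TIMEOUT = 600  # assumed per-crawl budget, in seconds
MAX_LINKS = 1000     # assumed cap, mirroring the repository's MAX_LINKS


class TimeoutAwareSpider(scrapy.Spider):
    name = "timeout_aware"
    start_urls = ["https://example.org/"]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.started = time.time()
        self.is_timeout = False
        self.links_reached = set()

    def timeout(self):
        # Latch the timeout state the first time the deadline is crossed.
        if time.time() - self.started > CRAWL_TIMEOUT:
            self.is_timeout = True
            return True
        return False

    def parse(self, response):
        if self.is_timeout or self.timeout():
            return  # stop yielding new requests once the budget is spent
        for href in response.css("a::attr(href)").getall():
            url = response.urljoin(href)
            if url in self.links_reached:
                continue
            self.links_reached.add(url)
            # Same guard shape as the patched condition: no new requests
            # after timeout, and respect the link cap if one is set.
            if not self.is_timeout and not self.timeout() and (
                    not MAX_LINKS or len(self.links_reached) < MAX_LINKS):
                yield response.follow(url, self.parse)
```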
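The `timeout=45` hunk touches the reachability probe that `launch_crawl()` runs with plain `requests` before handing a target to Scrapy. A hedged sketch of that retry loop as the hunk suggests it works (the `check_reachable` helper and the `RequestException` fallback are assumptions; in the repository this logic sits inline in `launch_crawl`):

```python
import requests


def check_reachable(url):
    """Hypothetical helper mirroring launch_crawl's pre-crawl probe."""
    verify_ssl = True
    response = None
    while response is None:
        try:
            # 45 s instead of 20 s: slow-but-alive sites should still be
            # crawled rather than written off as unreachable.
            response = requests.get(url, verify=verify_ssl, timeout=45)
        except requests.exceptions.SSLError:
            if not verify_ssl:
                # Failed even without verification; mark it as worth
                # attempting anyway, since scrapy is more permissive.
                response = "Try..."
            verify_ssl = False  # retry once, ignoring the certificate
        except requests.exceptions.RequestException:
            return None  # assumed fallback for non-SSL network errors
    return response
```

Raising the probe's timeout rather than dropping the probe keeps genuinely dead hosts out of the crawl queue while tolerating slow servers.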