summary | refs | log | tree | commit | diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-09 16:48:14 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-09 16:48:14 +0200
commit: e8cbb1cd58853e28ea756ed2ccd77363c54e04d2 (patch)
tree: d59e2912c1da89429228fb9bc9d0284e2964537b /commcrawler/scrapy.py
parent: 0d17f3696f66b0cb66e560d06ae40d7b87900a8a (diff)
download: Comm-on-net-e8cbb1cd58853e28ea756ed2ccd77363c54e04d2.tar.bz2
download: Comm-on-net-e8cbb1cd58853e28ea756ed2ccd77363c54e04d2.zip
Let scrapy try domain with bad SSL
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py 18
1 file changed, 12 insertions, 6 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 39e3a3e..e3782d6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -337,27 +337,33 @@ def launch_crawl(crawl_item, excluded_domains=None):
"crawl_id": crawl_item.pk,
"target_id": target.pk,
}
- response, verify_ssl, retry = None, True, 5
+ response, verify_ssl = None, True
while response is None:
try:
response = requests.get(target.url, verify=verify_ssl)
except requests.exceptions.SSLError:
- update_db_result(result_dct, {"bad_ssl": True})
- verify_ssl = False
+ if not verify_ssl: # new error on SSL
+ response = "Try..." # scrapy is more permissive
+ else:
+ update_db_result(result_dct, {"bad_ssl": True})
+ verify_ssl = False
except requests.exceptions.RequestException:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
response = False
if response is False:
continue
- if response.status_code == 404:
+ if response == "Try...":
+ pass
+ elif response.status_code == 404:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
continue
+ else:
+ url = target.url
redirect = None
- url = target.url
- if response.history:
+ if getattr(response, 'history', None):
url = response.url
redirect = url
domain = get_domain(url)