summary | refs | log | tree | commit | diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py 18
1 files changed, 12 insertions, 6 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 39e3a3e..e3782d6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -337,27 +337,33 @@ def launch_crawl(crawl_item, excluded_domains=None):
"crawl_id": crawl_item.pk,
"target_id": target.pk,
}
- response, verify_ssl, retry = None, True, 5
+ response, verify_ssl = None, True
while response is None:
try:
response = requests.get(target.url, verify=verify_ssl)
except requests.exceptions.SSLError:
- update_db_result(result_dct, {"bad_ssl": True})
- verify_ssl = False
+ if not verify_ssl: # new error on SSL
+ response = "Try..." # scrapy is more permissive
+ else:
+ update_db_result(result_dct, {"bad_ssl": True})
+ verify_ssl = False
except requests.exceptions.RequestException:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
response = False
if response is False:
continue
- if response.status_code == 404:
+ if response == "Try...":
+ pass
+ elif response.status_code == 404:
update_db_result(result_dct, {"is_online": False,
"status": "F"})
continue
+ else:
+ url = target.url
redirect = None
- url = target.url
- if response.history:
+ if getattr(response, 'history', None):
url = response.url
redirect = url
domain = get_domain(url)