From 125f96e865d0ed6504ad90e4800389c1c56a4aba Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Fri, 9 Aug 2019 15:48:34 +0200 Subject: Manage other requests exceptions --- commcrawler/management/commands/launch_crawl.py | 1 - commcrawler/scrapy.py | 25 ++++++++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py index 883c035..ea49d81 100644 --- a/commcrawler/management/commands/launch_crawl.py +++ b/commcrawler/management/commands/launch_crawl.py @@ -1,4 +1,3 @@ -import csv import sys from django.core.management.base import BaseCommand diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 7e076d6..30c1fd3 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -324,13 +324,24 @@ def launch_crawl(crawl_item, excluded_domains=None): "crawl_id": crawl_item.pk, "target_id": target.pk, } - try: - response = requests.get(target.url) - except requests.exceptions.SSLError: - result, __ = models.CrawlResult.objects.get_or_create( - **result_dct) - result.bad_ssl = True - result.save() + response, verify_ssl = None, True + while not response: + try: + response = requests.get(target.url, verify=verify_ssl) + except requests.exceptions.SSLError: + verify_ssl = False + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) + result.bad_ssl = True + result.save() + except requests.exceptions.RequestException: + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) + result.is_online = False + result.save() + break + if not response: + continue redirect = None url = target.url -- cgit v1.2.3