diff options
-rw-r--r-- | commcrawler/management/commands/launch_crawl.py | 1 | ||||
-rw-r--r-- | commcrawler/scrapy.py | 25 |
2 files changed, 18 insertions, 8 deletions
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py index 883c035..ea49d81 100644 --- a/commcrawler/management/commands/launch_crawl.py +++ b/commcrawler/management/commands/launch_crawl.py @@ -1,4 +1,3 @@ -import csv import sys from django.core.management.base import BaseCommand diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 7e076d6..30c1fd3 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -324,13 +324,24 @@ def launch_crawl(crawl_item, excluded_domains=None): "crawl_id": crawl_item.pk, "target_id": target.pk, } - try: - response = requests.get(target.url) - except requests.exceptions.SSLError: - result, __ = models.CrawlResult.objects.get_or_create( - **result_dct) - result.bad_ssl = True - result.save() + response, verify_ssl = None, True + while not response: + try: + response = requests.get(target.url, verify=verify_ssl) + except requests.exceptions.SSLError: + verify_ssl = False + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) + result.bad_ssl = True + result.save() + except requests.exceptions.RequestException: + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) + result.is_online = False + result.save() + break + if not response: + continue redirect = None url = target.url |