From e8cbb1cd58853e28ea756ed2ccd77363c54e04d2 Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Fri, 9 Aug 2019 16:48:14 +0200 Subject: Let scrapy try domain with bad SSL --- commcrawler/lookups.py | 2 +- commcrawler/scrapy.py | 18 ++++++++++++------ install.sh | 4 ++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/commcrawler/lookups.py b/commcrawler/lookups.py index 106a69a..396490b 100644 --- a/commcrawler/lookups.py +++ b/commcrawler/lookups.py @@ -14,7 +14,7 @@ class TargetLookup(LookupChannel): def get_query(self, q, request): query = Q() for term in q.strip().split(' '): - subquery = Q(name__icontains=term) + subquery = Q(name__icontains=term) | Q(url__icontains=term) query &= subquery return self.model.objects.filter(query).order_by('name')[:20] diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 39e3a3e..e3782d6 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -337,27 +337,33 @@ def launch_crawl(crawl_item, excluded_domains=None): "crawl_id": crawl_item.pk, "target_id": target.pk, } - response, verify_ssl, retry = None, True, 5 + response, verify_ssl = None, True while response is None: try: response = requests.get(target.url, verify=verify_ssl) except requests.exceptions.SSLError: - update_db_result(result_dct, {"bad_ssl": True}) - verify_ssl = False + if not verify_ssl: # new error on SSL + response = "Try..." # scrapy is more permissive + else: + update_db_result(result_dct, {"bad_ssl": True}) + verify_ssl = False except requests.exceptions.RequestException: update_db_result(result_dct, {"is_online": False, "status": "F"}) response = False if response is False: continue - if response.status_code == 404: + if response == "Try...": + pass + elif response.status_code == 404: update_db_result(result_dct, {"is_online": False, "status": "F"}) continue + else: + url = target.url redirect = None - url = target.url - if response.history: + if getattr(response, 'history', None): url = response.url redirect = url domain = get_domain(url) diff --git a/install.sh b/install.sh index 414bc1f..0005cd2 100755 --- a/install.sh +++ b/install.sh @@ -17,10 +17,10 @@ APT_OPTIONS=" -y -q " apt-get install $APT_OPTIONS git nginx uwsgi uwsgi-plugin-python3 postgresql apg sed gettext > /dev/null apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments python3-service-identity > /dev/null -apt-get install $APT_OPTIONS -t stretch-backports python3-django python3-requests > /dev/null +apt-get install $APT_OPTIONS -t stretch-backports python3-django > /dev/null # buster/bulleyes: apt install python3-django python3-requests -pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 service_identity==18.1 > /dev/null +pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 service_identity==18.1 requests[security] > /dev/null # buster: apt install python3-tldextract django-ajax-selects # buster: apt install -t buster-backports python3-service-identity # bullseye: apt install python3-scrapy python3-tldextract django-ajax-selects python3-service-identity -- cgit v1.2.3