diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:48:14 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:48:14 +0200 |
commit | e8cbb1cd58853e28ea756ed2ccd77363c54e04d2 (patch) | |
tree | d59e2912c1da89429228fb9bc9d0284e2964537b /commcrawler | |
parent | 0d17f3696f66b0cb66e560d06ae40d7b87900a8a (diff) | |
download | Comm-on-net-e8cbb1cd58853e28ea756ed2ccd77363c54e04d2.tar.bz2 Comm-on-net-e8cbb1cd58853e28ea756ed2ccd77363c54e04d2.zip |
Let scrapy try domain with bad SSL
Diffstat (limited to 'commcrawler')
-rw-r--r-- | commcrawler/lookups.py | 2 | ||||
-rw-r--r-- | commcrawler/scrapy.py | 18 |
2 files changed, 13 insertions, 7 deletions
```diff
diff --git a/commcrawler/lookups.py b/commcrawler/lookups.py
index 106a69a..396490b 100644
--- a/commcrawler/lookups.py
+++ b/commcrawler/lookups.py
@@ -14,7 +14,7 @@ class TargetLookup(LookupChannel):
     def get_query(self, q, request):
         query = Q()
         for term in q.strip().split(' '):
-            subquery = Q(name__icontains=term)
+            subquery = Q(name__icontains=term) | Q(url__icontains=term)
             query &= subquery
         return self.model.objects.filter(query).order_by('name')[:20]
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 39e3a3e..e3782d6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -337,27 +337,33 @@ def launch_crawl(crawl_item, excluded_domains=None):
             "crawl_id": crawl_item.pk,
             "target_id": target.pk,
         }
-        response, verify_ssl, retry = None, True, 5
+        response, verify_ssl = None, True
         while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl)
             except requests.exceptions.SSLError:
-                update_db_result(result_dct, {"bad_ssl": True})
-                verify_ssl = False
+                if not verify_ssl:  # new error on SSL
+                    response = "Try..."  # scrapy is more permissive
+                else:
+                    update_db_result(result_dct, {"bad_ssl": True})
+                    verify_ssl = False
             except requests.exceptions.RequestException:
                 update_db_result(result_dct, {"is_online": False, "status": "F"})
                 response = False
         if response is False:
             continue
-        if response.status_code == 404:
+        if response == "Try...":
+            pass
+        elif response.status_code == 404:
             update_db_result(result_dct, {"is_online": False, "status": "F"})
             continue
+        else:
+            url = target.url
         redirect = None
-        url = target.url
-        if response.history:
+        if getattr(response, 'history', None):
             url = response.url
             redirect = url
         domain = get_domain(url)
```