| author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:03:17 +0200 |
|---|---|---|
| committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:03:17 +0200 |
| commit | 0f26c668bcc86d1a4cfc91f1b8154055409e8aab (patch) | |
| tree | 70f45ec1bcebf8752c9d44ee964834c6070fdc23 | |
| parent | d14de1420b8b875efb8d594628e524d7b857a64a (diff) | |
| download | Comm-on-net-0f26c668bcc86d1a4cfc91f1b8154055409e8aab.tar.bz2, Comm-on-net-0f26c668bcc86d1a4cfc91f1b8154055409e8aab.zip | |
Early management of 404
| -rw-r--r-- | commcrawler/scrapy.py | 41 |

1 file changed, 22 insertions, 19 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5fc9fce..a430f0e 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -312,6 +312,14 @@ def launch_match(crawl_item):
     crawl_item.save()
 
 
+def update_db_result(result_dct, values):
+    result, __ = models.CrawlResult.objects.get_or_create(
+        **result_dct)
+    for k in values.keys():
+        setattr(result, k, values[k])
+    result.save()
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     process = CrawlerProcess(settings=scrap_settings)
@@ -324,24 +332,22 @@ def launch_crawl(crawl_item, excluded_domains=None):
             "crawl_id": crawl_item.pk,
             "target_id": target.pk,
         }
-        response, verify_ssl = None, True
-        while not response:
+        response, verify_ssl, retry = None, True, 5
+        while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl)
             except requests.exceptions.SSLError:
+                update_db_result(result_dct, {"bad_ssl": True})
                 verify_ssl = False
-                result, __ = models.CrawlResult.objects.get_or_create(
-                    **result_dct)
-                result.bad_ssl = True
-                result.save()
             except requests.exceptions.RequestException:
-                result, __ = models.CrawlResult.objects.get_or_create(
-                    **result_dct)
-                result.is_online = False
-                result.status = "F"
-                result.save()
-                break
-        if not response:
+                update_db_result(result_dct, {"is_online": False,
+                                              "status": "F"})
+                response = False
+        if response is False:
+            continue
+        if response.status_code == 404:
+            update_db_result(result_dct, {"is_online": False,
+                                          "status": "F"})
             continue
 
         redirect = None
@@ -351,12 +357,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
             redirect = url
             domain = get_domain(url)
             if domain in excluded_domains:
-                result, __ = models.CrawlResult.objects.get_or_create(
-                    **result_dct)
-                result.redirection = redirect
-                result.is_online = False
-                result.status = "F"
-                result.save()
+                update_db_result(
+                    result_dct, {"is_online": False, "status": "F",
+                                 "redirection": redirect})
                 continue
         process.crawl(
             create_spider(
```
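For readers who want the end state rather than the hunk-by-hunk view, below is a condensed sketch of the fetch logic as it stands after this commit. The function name `fetch_with_fallback` and the in-memory stand-in for `update_db_result` are illustrative only, not names from the repository; in `commcrawler/scrapy.py` this logic lives inline in `launch_crawl` and persists results through Django's `models.CrawlResult`. The sketch also replaces the commit's `response = False` sentinel plus `continue` with early returns, which is a simplification, not the committed control flow.

```python
# Sketch of the post-commit fetch flow (assumptions noted above).
import requests


def fetch_with_fallback(url, result_dct, update_db_result):
    """Fetch url, falling back to verify=False on SSL errors.

    Returns the Response, or None when the target is unreachable or
    answers 404, so the caller can skip crawling that target.
    """
    # A requests.Response is falsy whenever response.ok is False, so the
    # old `while not response:` would re-request a 404 page forever; the
    # commit switches to an explicit `is None` test and handles 404 once,
    # after the loop.
    response, verify_ssl = None, True
    while response is None:
        try:
            response = requests.get(url, verify=verify_ssl)
        except requests.exceptions.SSLError:
            # Record the bad certificate, then retry without verification.
            update_db_result(result_dct, {"bad_ssl": True})
            verify_ssl = False
        except requests.exceptions.RequestException:
            # Network-level failure: mark the target offline and give up.
            update_db_result(result_dct, {"is_online": False, "status": "F"})
            return None
    if response.status_code == 404:
        # The "early management of 404" this commit adds.
        update_db_result(result_dct, {"is_online": False, "status": "F"})
        return None
    return response


if __name__ == "__main__":
    results = {}  # dict standing in for the CrawlResult table

    def update_db_result(result_dct, values):
        # The real helper does get_or_create + setattr + save on a
        # Django model; an upsert into a dict stands in for it here.
        key = tuple(sorted(result_dct.items()))
        results.setdefault(key, dict(result_dct)).update(values)

    resp = fetch_with_fallback("https://example.org/",
                               {"crawl_id": 1, "target_id": 1},
                               update_db_result)
    print(resp, results)
```

Note that the second hunk also seeds a `retry` counter (`response, verify_ssl, retry = None, True, 5`) that is assigned but never read anywhere in this diff, presumably groundwork for bounding the retry loop in a later change.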