-rw-r--r-- | commcrawler/scrapy.py | 41
1 file changed, 22 insertions(+), 19 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5fc9fce..a430f0e 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -312,6 +312,14 @@ def launch_match(crawl_item):
     crawl_item.save()
 
 
+def update_db_result(result_dct, values):
+    result, __ = models.CrawlResult.objects.get_or_create(
+        **result_dct)
+    for k in values.keys():
+        setattr(result, k, values[k])
+    result.save()
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     process = CrawlerProcess(settings=scrap_settings)
@@ -324,24 +332,22 @@ def launch_crawl(crawl_item, excluded_domains=None):
             "crawl_id": crawl_item.pk,
             "target_id": target.pk,
         }
-        response, verify_ssl = None, True
-        while not response:
+        response, verify_ssl, retry = None, True, 5
+        while response is None:
             try:
                 response = requests.get(target.url, verify=verify_ssl)
             except requests.exceptions.SSLError:
+                update_db_result(result_dct, {"bad_ssl": True})
                 verify_ssl = False
-                result, __ = models.CrawlResult.objects.get_or_create(
-                    **result_dct)
-                result.bad_ssl = True
-                result.save()
             except requests.exceptions.RequestException:
-                result, __ = models.CrawlResult.objects.get_or_create(
-                    **result_dct)
-                result.is_online = False
-                result.status = "F"
-                result.save()
-                break
-        if not response:
+                update_db_result(result_dct, {"is_online": False,
+                                              "status": "F"})
+                response = False
+        if response is False:
+            continue
+        if response.status_code == 404:
+            update_db_result(result_dct, {"is_online": False,
+                                          "status": "F"})
             continue
 
         redirect = None
@@ -351,12 +357,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
             redirect = url
         domain = get_domain(url)
         if domain in excluded_domains:
-            result, __ = models.CrawlResult.objects.get_or_create(
-                **result_dct)
-            result.redirection = redirect
-            result.is_online = False
-            result.status = "F"
-            result.save()
+            update_db_result(
+                result_dct, {"is_online": False, "status": "F",
+                             "redirection": redirect})
             continue
         process.crawl(
             create_spider(
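One detail worth flagging in the new fetch loop: retry is initialised to 5 but is never decremented or tested in the hunks shown, so the counter has no effect yet (it may be consumed elsewhere in the file). The sketch below is a minimal, hypothetical illustration of how the counter could bound the loop; it assumes the update_db_result helper and the result_dct shape from this patch, and fetch_with_retry is an invented name, not part of the commit.

import requests

from commcrawler.scrapy import update_db_result  # helper added by this patch


def fetch_with_retry(url, result_dct, retry=5):
    # Hypothetical sketch: wire the otherwise-unused retry counter into
    # the loop so a target that keeps failing cannot spin forever.
    response, verify_ssl = None, True
    while response is None and retry > 0:
        retry -= 1
        try:
            response = requests.get(url, verify=verify_ssl)
        except requests.exceptions.SSLError:
            # Record the certificate problem, then retry without
            # verification, mirroring the patched behaviour above.
            update_db_result(result_dct, {"bad_ssl": True})
            verify_ssl = False
        except requests.exceptions.RequestException:
            # Any other transport error marks the target offline.
            update_db_result(result_dct, {"is_online": False, "status": "F"})
            return None
    if response is None:
        # Retries exhausted without an answer; treat it as a failure too.
        update_db_result(result_dct, {"is_online": False, "status": "F"})
    return response

Collapsing the repeated get_or_create/save blocks into update_db_result also makes the failure bookkeeping idempotent: get_or_create reuses the existing CrawlResult row for a given crawl/target pair, so recording bad_ssl on every retry does not create duplicate rows.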