diff options
-rw-r--r-- | commcrawler/admin.py | 4 | ||||
-rw-r--r-- | commcrawler/migrations/0002_crawlresult_bad_ssl.py | 20 | ||||
-rw-r--r-- | commcrawler/models.py | 2 | ||||
-rw-r--r-- | commcrawler/scrapy.py | 20 |
4 files changed, 38 insertions, 8 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py index 9f5003b..46c5aab 100644 --- a/commcrawler/admin.py +++ b/commcrawler/admin.py @@ -30,12 +30,12 @@ class CrawlResultAdmin(admin.ModelAdmin): model = models.CrawlResult list_display = ( "target", "crawl", "started", "duration", "status", "is_online", - "nb_external_link", "nb_internal_link", + "bad_ssl", "nb_external_link", "nb_internal_link", "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube", "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf", "nb_external_pdf", "nb_internal_office", "nb_external_office" ) - list_filter = ("status", "crawl", "is_online") + list_filter = ("status", "crawl", "is_online", "bad_ssl") search_fields = ("target__name",) readonly_fields = ( diff --git a/commcrawler/migrations/0002_crawlresult_bad_ssl.py b/commcrawler/migrations/0002_crawlresult_bad_ssl.py new file mode 100644 index 0000000..7597876 --- /dev/null +++ b/commcrawler/migrations/0002_crawlresult_bad_ssl.py @@ -0,0 +1,20 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11 on 2019-08-09 13:38 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('commcrawler', '0001_initial'), + ] + + operations = [ + migrations.AddField( + model_name='crawlresult', + name='bad_ssl', + field=models.BooleanField(default=False, verbose_name='Bad SSL certificate'), + ), + ] diff --git a/commcrawler/models.py b/commcrawler/models.py index cdadd43..da578bb 100644 --- a/commcrawler/models.py +++ b/commcrawler/models.py @@ -125,6 +125,8 @@ class CrawlResult(models.Model): verbose_name=_("External office documents"), default=0) is_online = models.BooleanField( verbose_name=_("Website is online"), default=False) + bad_ssl = models.BooleanField( + verbose_name=_("Bad SSL certificate"), default=False) redirection = models.URLField( verbose_name=_("Redirection"), blank=True, null=True) diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 67c9ee3..7e076d6 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -320,7 +320,18 @@ def launch_crawl(crawl_item, excluded_domains=None): crawl_item.status = "P" crawl_item.save() for target in crawl_item.targets.all(): - response = requests.get(target.url) + result_dct = { + "crawl_id": crawl_item.pk, + "target_id": target.pk, + } + try: + response = requests.get(target.url) + except requests.exceptions.SSLError: + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) + result.bad_ssl = True + result.save() + redirect = None url = target.url if response.history: @@ -328,11 +339,8 @@ def launch_crawl(crawl_item, excluded_domains=None): redirect = url domain = get_domain(url) if domain in excluded_domains: - dct = { - "crawl_id": crawl_item.pk, - "target_id": target.pk, - } - result, __ = models.CrawlResult.objects.get_or_create(**dct) + result, __ = models.CrawlResult.objects.get_or_create( + **result_dct) result.redirection = redirect result.is_online = False result.status = "F" |