summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- commcrawler/admin.py 4
-rw-r--r-- commcrawler/migrations/0002_crawlresult_bad_ssl.py 20
-rw-r--r-- commcrawler/models.py 2
-rw-r--r-- commcrawler/scrapy.py 20
4 files changed, 38 insertions, 8 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 9f5003b..46c5aab 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -30,12 +30,12 @@ class CrawlResultAdmin(admin.ModelAdmin):
model = models.CrawlResult
list_display = (
"target", "crawl", "started", "duration", "status", "is_online",
- "nb_external_link", "nb_internal_link",
+ "bad_ssl", "nb_external_link", "nb_internal_link",
"nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
"nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
"nb_external_pdf", "nb_internal_office", "nb_external_office"
)
- list_filter = ("status", "crawl", "is_online")
+ list_filter = ("status", "crawl", "is_online", "bad_ssl")
search_fields = ("target__name",)
readonly_fields = (
diff --git a/commcrawler/migrations/0002_crawlresult_bad_ssl.py b/commcrawler/migrations/0002_crawlresult_bad_ssl.py
new file mode 100644
index 0000000..7597876
--- /dev/null
+++ b/commcrawler/migrations/0002_crawlresult_bad_ssl.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-09 13:38
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('commcrawler', '0001_initial'),
+ ]
+
+ operations = [
+ migrations.AddField(
+ model_name='crawlresult',
+ name='bad_ssl',
+ field=models.BooleanField(default=False, verbose_name='Bad SSL certificate'),
+ ),
+ ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index cdadd43..da578bb 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -125,6 +125,8 @@ class CrawlResult(models.Model):
verbose_name=_("External office documents"), default=0)
is_online = models.BooleanField(
verbose_name=_("Website is online"), default=False)
+ bad_ssl = models.BooleanField(
+ verbose_name=_("Bad SSL certificate"), default=False)
redirection = models.URLField(
verbose_name=_("Redirection"), blank=True, null=True)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 67c9ee3..7e076d6 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -320,7 +320,18 @@ def launch_crawl(crawl_item, excluded_domains=None):
crawl_item.status = "P"
crawl_item.save()
for target in crawl_item.targets.all():
- response = requests.get(target.url)
+ result_dct = {
+ "crawl_id": crawl_item.pk,
+ "target_id": target.pk,
+ }
+ try:
+ response = requests.get(target.url)
+ except requests.exceptions.SSLError:
+ result, __ = models.CrawlResult.objects.get_or_create(
+ **result_dct)
+ result.bad_ssl = True
+ result.save()
+
redirect = None
url = target.url
if response.history:
@@ -328,11 +339,8 @@ def launch_crawl(crawl_item, excluded_domains=None):
redirect = url
domain = get_domain(url)
if domain in excluded_domains:
- dct = {
- "crawl_id": crawl_item.pk,
- "target_id": target.pk,
- }
- result, __ = models.CrawlResult.objects.get_or_create(**dct)
+ result, __ = models.CrawlResult.objects.get_or_create(
+ **result_dct)
result.redirection = redirect
result.is_online = False
result.status = "F"