author    Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-05 12:52:31 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-05 12:52:31 +0200
commit    72dfec0c3532941a46f77b3c0a6a49e16e6a2864 (patch)
tree      8fcb33e87b357c796ca8f2e3325298272900745d /commcrawler
parent    c6b3188e49049cf689658654a1458a3276304782 (diff)
Manage excluded domains
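
Summary of the change, as read from the diff below: adds an ExludedDomains model (URL field, admin registration, regenerated initial migration) and threads the exclusion list from the launch_crawl command through create_spider into DefaultSpider.parse, which flags responses from excluded domains and now caps followed links per spider at MAX_LINKS = 500.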
Diffstat (limited to 'commcrawler')
 commcrawler/admin.py                            |  8
 commcrawler/locale/fr/LC_MESSAGES/django.po     | 96
 commcrawler/management/commands/launch_crawl.py |  6
 commcrawler/migrations/0001_initial.py          | 27
 commcrawler/models.py                           | 11
 commcrawler/scrapy.py                           | 50
6 files changed, 136 insertions, 62 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index ea4ca14..457d2b0 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -44,3 +44,11 @@ class CrawlRelationAdmin(admin.ModelAdmin):
admin_site.register(models.CrawlRelation, CrawlRelationAdmin)
+
+
+class ExcludedDomainAdmin(admin.ModelAdmin):
+ list_display = ('url',)
+ search_fields = ('url',)
+
+
+admin_site.register(models.ExludedDomains, ExcludedDomainAdmin)
diff --git a/commcrawler/locale/fr/LC_MESSAGES/django.po b/commcrawler/locale/fr/LC_MESSAGES/django.po
index 021b9a6..2b2d3f9 100644
--- a/commcrawler/locale/fr/LC_MESSAGES/django.po
+++ b/commcrawler/locale/fr/LC_MESSAGES/django.po
@@ -2,7 +2,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2019-08-01 19:52+0200\n"
+"POT-Creation-Date: 2019-08-05 12:09+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@@ -16,159 +16,171 @@ msgstr ""
msgid "Crawler"
msgstr "Robot d'indexation"
-#: models.py:11
+#: models.py:10
+msgid "URL"
+msgstr "URL"
+
+#: models.py:13
+msgid "Excluded domain"
+msgstr "Domaine exclu"
+
+#: models.py:14
+msgid "Excluded domains"
+msgstr "Domaines exclus"
+
+#: models.py:22
msgid "Created"
msgstr "Créé"
-#: models.py:11
+#: models.py:22
msgid "In progress"
msgstr "En cours"
-#: models.py:12
+#: models.py:23
msgid "Finished"
msgstr "Fini"
-#: models.py:14
+#: models.py:25
msgid "Name"
msgstr "Nom"
-#: models.py:16
+#: models.py:27
msgid "Creation date"
msgstr "Date de création"
-#: models.py:18
+#: models.py:29
msgid "Start date"
msgstr "Date de début"
-#: models.py:20
+#: models.py:31
msgid "End date"
msgstr "Date de fin"
-#: models.py:22
+#: models.py:33
msgid "Status"
msgstr "État"
-#: models.py:27 models.py:40 models.py:98
+#: models.py:38 models.py:51 models.py:109
msgid "Crawl"
msgstr "Session d'indexation"
-#: models.py:28
+#: models.py:39
msgid "Crawls"
msgstr "Session d'indexation"
-#: models.py:41
+#: models.py:52
msgid "Target"
msgstr "Cible"
-#: models.py:43
+#: models.py:54
msgid "Number of external links"
msgstr "Nombre de liens externes"
-#: models.py:45
+#: models.py:56
msgid "Number of internal links"
msgstr "Nombre de liens internes"
-#: models.py:47
+#: models.py:58
msgid "Number of images"
msgstr "Nombre d'images"
-#: models.py:49
+#: models.py:60
msgid "Number of Facebook links"
msgstr "Nombre de liens Facebook"
-#: models.py:51
+#: models.py:62
msgid "Number of Twitter links"
msgstr "Nombre de liens Twitter"
-#: models.py:53
+#: models.py:64
msgid "Number of Instagram links"
msgstr "Nombre de liens Instagram"
-#: models.py:55
+#: models.py:66
msgid "Number of Youtube links"
msgstr "Nombre de liens Youtube"
-#: models.py:57
+#: models.py:68
msgid "Number of Dailymotion links"
msgstr "Nombre de liens Dailymotion"
-#: models.py:59
+#: models.py:70
msgid "Number of Vimeo links"
msgstr "Nombre de liens Vimeo"
-#: models.py:61
+#: models.py:72
msgid "Number of videos"
msgstr "Nombre de vidéos"
-#: models.py:63
+#: models.py:74
msgid "Number of audios"
msgstr "Nombre de fichiers audio"
-#: models.py:65
+#: models.py:76
msgid "Number of internal PDF"
msgstr "Nombre de PDF internes"
-#: models.py:67
+#: models.py:78
msgid "Number of external PDF"
msgstr "Nombre de PDF externes"
-#: models.py:69
+#: models.py:80
msgid "Number of internal office documents"
msgstr "Nombre de liens document office internes"
-#: models.py:71
+#: models.py:82
msgid "Number of external office documents"
msgstr "Nombre de liens document office externes"
-#: models.py:73
+#: models.py:84
msgid "Website is online"
msgstr "Site en ligne"
-#: models.py:75
+#: models.py:86
msgid "Redirection"
msgstr "Redirection"
-#: models.py:78
+#: models.py:89
msgid "Crawl result"
msgstr "Résultat d'indexation"
-#: models.py:79
+#: models.py:90
msgid "Crawl results"
msgstr "Résultats d'indexation"
-#: models.py:86
+#: models.py:97
msgid "Result"
msgstr "Résultat"
-#: models.py:87
+#: models.py:98
msgid "Link"
msgstr "Lien"
-#: models.py:90
+#: models.py:101
msgid "Crawl link"
-msgstr "Indexation - lien"
+msgstr "Indexation - Lien"
-#: models.py:91
+#: models.py:102
msgid "Crawl links"
-msgstr "Indexations - liens"
+msgstr "Indexations - Liens"
-#: models.py:99
+#: models.py:110
msgid "Source"
msgstr "Source"
-#: models.py:101
+#: models.py:112
msgid "Destination"
msgstr "Destination"
-#: models.py:103
+#: models.py:114
msgid "Number"
msgstr "Nombre"
-#: models.py:106
+#: models.py:117
msgid "Crawl relation"
msgstr "Indexation - Relation"
-#: models.py:107
+#: models.py:118
msgid "Crawl relations"
msgstr "Indexations - Relations"
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index 92c3081..1248eeb 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -3,7 +3,7 @@ import sys
from django.core.management.base import BaseCommand
-from commcrawler.models import Crawl
+from commcrawler.models import Crawl, ExludedDomains
from commcrawler.scrapy import launch_crawl
@@ -57,5 +57,7 @@ class Command(BaseCommand):
except ValueError:
c_id = None
current_crawl = crawls[c_id]
- launch_crawl(current_crawl)
+ excluded = [domain.split("://")[1] for domain in
+ ExludedDomains.objects.all()]
+ launch_crawl(current_crawl, excluded_domains=excluded)
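
As committed, the list comprehension iterates over ExludedDomains model instances, so domain.split("://") would raise AttributeError at runtime (and the [1] index would raise IndexError for scheme-less URLs). A minimal sketch of the presumable intent, reading the url field and tolerating a missing scheme:

    excluded = [domain.url.split("://")[-1] for domain in
                ExludedDomains.objects.all()]
    launch_crawl(current_crawl, excluded_domains=excluded)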
diff --git a/commcrawler/migrations/0001_initial.py b/commcrawler/migrations/0001_initial.py
index 0b4e561..c2e261b 100644
--- a/commcrawler/migrations/0001_initial.py
+++ b/commcrawler/migrations/0001_initial.py
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
-# Generated by Django 1.11 on 2019-07-31 08:21
+# Generated by Django 1.11 on 2019-08-05 10:01
from __future__ import unicode_literals
import datetime
@@ -24,13 +24,13 @@ class Migration(migrations.Migration):
('created', models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation date')),
('started', models.DateTimeField(blank=True, null=True, verbose_name='Start date')),
('ended', models.DateTimeField(blank=True, null=True, verbose_name='End date')),
- ('status', models.CharField(choices=[('C', 'Created'), ('P', 'In progress'), ('F', 'Finished')], default='C', max_length=1)),
+ ('status', models.CharField(choices=[('C', 'Created'), ('P', 'In progress'), ('F', 'Finished')], default='C', max_length=1, verbose_name='Status')),
('targets', models.ManyToManyField(blank=True, to='commorganization.Target')),
],
options={
'ordering': ('created', 'name'),
- 'verbose_name_plural': 'Crawls',
'verbose_name': 'Crawl',
+ 'verbose_name_plural': 'Crawls',
},
),
migrations.CreateModel(
@@ -40,8 +40,8 @@ class Migration(migrations.Migration):
('link', models.URLField(verbose_name='Link')),
],
options={
- 'verbose_name_plural': 'Crawl links',
'verbose_name': 'Crawl link',
+ 'verbose_name_plural': 'Crawl links',
},
),
migrations.CreateModel(
@@ -54,8 +54,8 @@ class Migration(migrations.Migration):
('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='relation_source', to='commorganization.Target', verbose_name='Source')),
],
options={
- 'verbose_name_plural': 'Crawl relations',
'verbose_name': 'Crawl relation',
+ 'verbose_name_plural': 'Crawl relations',
},
),
migrations.CreateModel(
@@ -75,16 +75,27 @@ class Migration(migrations.Migration):
('nb_audio', models.IntegerField(default=0, verbose_name='Number of audios')),
('nb_internal_pdf', models.IntegerField(default=0, verbose_name='Number of internal PDF')),
('nb_external_pdf', models.IntegerField(default=0, verbose_name='Number of external PDF')),
- ('nb_internal_office', models.IntegerField(default=0, verbose_name='Number of internal PDF')),
- ('nb_external_office', models.IntegerField(default=0, verbose_name='Number of external PDF')),
+ ('nb_internal_office', models.IntegerField(default=0, verbose_name='Number of internal office documents')),
+ ('nb_external_office', models.IntegerField(default=0, verbose_name='Number of external office documents')),
('is_online', models.BooleanField(default=False, verbose_name='Website is online')),
('redirection', models.URLField(blank=True, null=True, verbose_name='Redirection')),
('crawl', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commcrawler.Crawl', verbose_name='Crawl')),
('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commorganization.Target', verbose_name='Target')),
],
options={
- 'verbose_name_plural': 'Crawl results',
'verbose_name': 'Crawl result',
+ 'verbose_name_plural': 'Crawl results',
+ },
+ ),
+ migrations.CreateModel(
+ name='ExludedDomains',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('url', models.URLField(unique=True, verbose_name='URL')),
+ ],
+ options={
+ 'verbose_name': 'Excluded domain',
+ 'verbose_name_plural': 'Excluded domains',
},
),
migrations.AddField(
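
Note that 0001_initial.py is regenerated in place rather than adding a 0002 migration, which only works if no existing database has applied the old 0001. Presumably it was rebuilt with something like:

    python manage.py makemigrations commcrawler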
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 0050ad8..f3e3246 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -6,6 +6,17 @@ from django.utils.translation import ugettext_lazy as _
from commorganization.models import Target
+class ExludedDomains(models.Model):
+ url = models.URLField(verbose_name=_("URL"), unique=True)
+
+ class Meta:
+ verbose_name = _("Excluded domain")
+ verbose_name_plural = _("Excluded domains")
+
+ def __str__(self):
+ return self.url
+
+
class Crawl(models.Model):
STATUS = (
('C', _("Created")), ('P', _("In progress")),
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e821a31..d218648 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,13 +1,21 @@
import scrapy
from scrapy.crawler import CrawlerProcess
+from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
import tldextract
from django.conf import settings
+MAX_LINKS = 500
+
class DefaultSpider:
+ name = None
+ start_urls = None
+ allowed_domains = []
+ excluded_domains = []
target_id = None
+ links_reached = set()
def _parse_image(self, response, result):
for __ in response.css('img'):
@@ -20,10 +28,27 @@ class DefaultSpider:
"url": response.url,
"target_id": self.target_id
}
- self._parse_image(response, result)
- for link in LinkExtractor().extract_links(response):
- if link.url is not None:
- yield response.follow(link.url, self.parse)
+ for domain in self.excluded_domains:
+ if domain in response.url:
+ result["offline"] = True
+ yield result
+ try:
+ self._parse_image(response, result)
+ for link in LinkExtractor().extract_links(response):
+ url = link.url
+ if url is None or url in self.links_reached:
+ continue
+ for domain in self.allowed_domains:
+ if domain in url:
+ self.links_reached.add(link.url)
+ if len(self.links_reached) < MAX_LINKS:
+ yield response.follow(link.url, self.parse)
+ else:
+ print("MAX", self.allowed_domains,
+ self.links_reached)
+ except NotSupported:
+ print("No response", response.url)
+ yield result
def get_domain(url):
@@ -31,16 +56,19 @@ def get_domain(url):
return '{}.{}'.format(ext.domain, ext.suffix)
-def create_spider(name, urls, target=None):
+def create_spider(name, urls, target, excluded_domains=None):
+ if not excluded_domains:
+ excluded_domains = []
return type(
name, (DefaultSpider, scrapy.Spider),
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
- "target_id": target.pk}
+ "target_id": target.pk, "links_reached": set(),
+ "excluded_domains": excluded_domains}
)
-def launch_crawl(crawl_item):
+def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
scrap_settings.update({
'FEED_FORMAT': 'json',
@@ -49,8 +77,10 @@ def launch_crawl(crawl_item):
process = CrawlerProcess(settings=scrap_settings)
for target in crawl_item.targets.all():
process.crawl(
- create_spider("Target{}".format(target.pk),
- [target.url],
- target)
+ create_spider(
+ "Target{}".format(target.pk),
+ [target.url], target,
+ excluded_domains
+ )
)
process.start()
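
A hedged sketch of the extended entry point; the crawl name and domain values are illustrative:

    from commcrawler.models import Crawl
    from commcrawler.scrapy import launch_crawl

    crawl_item = Crawl.objects.get(name="weekly")  # "weekly" is illustrative
    launch_crawl(crawl_item, excluded_domains=["facebook.com", "twitter.com"])

One caution: the excluded-domain branch in parse() yields the flagged result but does not return, so link extraction still proceeds for excluded pages and the result is yielded a second time; whether that is intended is not clear from the diff.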