author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-05 12:52:31 +0200
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-05 12:52:31 +0200
commit | 72dfec0c3532941a46f77b3c0a6a49e16e6a2864 (patch)
tree | 8fcb33e87b357c796ca8f2e3325298272900745d /commcrawler
parent | c6b3188e49049cf689658654a1458a3276304782 (diff)
download | Comm-on-net-72dfec0c3532941a46f77b3c0a6a49e16e6a2864.tar.bz2, Comm-on-net-72dfec0c3532941a46f77b3c0a6a49e16e6a2864.zip
Manage excluded domains
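In outline, the change adds an ExludedDomains model (one URL per row, exposed in the admin), has the launch_crawl management command strip the scheme from each stored URL, and passes the resulting list to every spider class built by create_spider so that parse() can flag matching responses. A minimal sketch of the command-side flow, runnable without Django (the sample URLs are invented; the committed code iterates over ExludedDomains.objects.all() instead of a plain list):

```python
# Hypothetical rows of the ExludedDomains table (the real command reads them
# with ExludedDomains.objects.all()).
excluded_urls = [
    "https://www.facebook.com",
    "https://twitter.com",
]

# Keep only the part after the scheme, as the command does with
# split("://")[1], then hand the list over to launch_crawl().
excluded_domains = [url.split("://")[1] for url in excluded_urls]
print(excluded_domains)  # ['www.facebook.com', 'twitter.com']

# launch_crawl(current_crawl, excluded_domains=excluded_domains)
```

Each spider then does a plain substring test of these values against response URLs, as shown in the diff below.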
Diffstat (limited to 'commcrawler')
-rw-r--r-- | commcrawler/admin.py | 8
-rw-r--r-- | commcrawler/locale/fr/LC_MESSAGES/django.po | 96
-rw-r--r-- | commcrawler/management/commands/launch_crawl.py | 6
-rw-r--r-- | commcrawler/migrations/0001_initial.py | 27
-rw-r--r-- | commcrawler/models.py | 11
-rw-r--r-- | commcrawler/scrapy.py | 50
6 files changed, 136 insertions, 62 deletions
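Two small helpers in commcrawler/scrapy.py carry most of the filtering logic shown in the diff below: get_domain() reduces a URL to its registered domain with tldextract, and the spider's parse() uses plain substring tests of those values against response URLs. A short illustration (the example URLs are invented):

```python
import tldextract


def get_domain(url):
    # Same helper as in commcrawler/scrapy.py: keep only "domain.suffix".
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)


print(get_domain("https://www.example.org/some/page"))  # example.org

# parse() decides whether a response belongs to an excluded (or allowed)
# domain with substring membership, roughly:
excluded_domains = ["www.facebook.com"]
url = "https://www.facebook.com/somepage"
print(any(domain in url for domain in excluded_domains))  # True
```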
```diff
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index ea4ca14..457d2b0 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -44,3 +44,11 @@ class CrawlRelationAdmin(admin.ModelAdmin):
 
 
 admin_site.register(models.CrawlRelation, CrawlRelationAdmin)
+
+
+class ExcludedDomainAdmin(admin.ModelAdmin):
+    list_display = ('url',)
+    search_fields = ('url',)
+
+
+admin_site.register(models.ExludedDomains, ExcludedDomainAdmin)
diff --git a/commcrawler/locale/fr/LC_MESSAGES/django.po b/commcrawler/locale/fr/LC_MESSAGES/django.po
index 021b9a6..2b2d3f9 100644
--- a/commcrawler/locale/fr/LC_MESSAGES/django.po
+++ b/commcrawler/locale/fr/LC_MESSAGES/django.po
@@ -2,7 +2,7 @@ msgid ""
 msgstr ""
 "Project-Id-Version: PACKAGE VERSION\n"
 "Report-Msgid-Bugs-To: \n"
-"POT-Creation-Date: 2019-08-01 19:52+0200\n"
+"POT-Creation-Date: 2019-08-05 12:09+0200\n"
 "PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
 "Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
 "Language-Team: LANGUAGE <LL@li.org>\n"
@@ -16,159 +16,171 @@ msgstr ""
 msgid "Crawler"
 msgstr "Robot d'indexation"
 
-#: models.py:11
+#: models.py:10
+msgid "URL"
+msgstr "URL"
+
+#: models.py:13
+msgid "Excluded domain"
+msgstr "Domaine exclu"
+
+#: models.py:14
+msgid "Excluded domains"
+msgstr "Domaines exclus"
+
+#: models.py:22
 msgid "Created"
 msgstr "Créé"
 
-#: models.py:11
+#: models.py:22
 msgid "In progress"
 msgstr "En cours"
 
-#: models.py:12
+#: models.py:23
 msgid "Finished"
 msgstr "Fini"
 
-#: models.py:14
+#: models.py:25
 msgid "Name"
 msgstr "Nom"
 
-#: models.py:16
+#: models.py:27
 msgid "Creation date"
 msgstr "Date de création"
 
-#: models.py:18
+#: models.py:29
 msgid "Start date"
 msgstr "Date de début"
 
-#: models.py:20
+#: models.py:31
 msgid "End date"
 msgstr "Date de fin"
 
-#: models.py:22
+#: models.py:33
 msgid "Status"
 msgstr "État"
 
-#: models.py:27 models.py:40 models.py:98
+#: models.py:38 models.py:51 models.py:109
 msgid "Crawl"
 msgstr "Session d'indexation"
 
-#: models.py:28
+#: models.py:39
 msgid "Crawls"
 msgstr "Session d'indexation"
 
-#: models.py:41
+#: models.py:52
 msgid "Target"
 msgstr "Cible"
 
-#: models.py:43
+#: models.py:54
 msgid "Number of external links"
 msgstr "Nombre de liens externes"
 
-#: models.py:45
+#: models.py:56
 msgid "Number of internal links"
 msgstr "Nombre de liens internes"
 
-#: models.py:47
+#: models.py:58
 msgid "Number of images"
 msgstr "Nombre d'images"
 
-#: models.py:49
+#: models.py:60
 msgid "Number of Facebook links"
 msgstr "Nombre de liens Facebook"
 
-#: models.py:51
+#: models.py:62
 msgid "Number of Twitter links"
 msgstr "Nombre de liens Twitter"
 
-#: models.py:53
+#: models.py:64
 msgid "Number of Instagram links"
 msgstr "Nombre de liens Instagram"
 
-#: models.py:55
+#: models.py:66
 msgid "Number of Youtube links"
 msgstr "Nombre de liens Youtube"
 
-#: models.py:57
+#: models.py:68
 msgid "Number of Dailymotion links"
 msgstr "Nombre de liens Dailymotion"
 
-#: models.py:59
+#: models.py:70
 msgid "Number of Vimeo links"
 msgstr "Nombre de liens Vimeo"
 
-#: models.py:61
+#: models.py:72
 msgid "Number of videos"
 msgstr "Nombre de vidéos"
 
-#: models.py:63
+#: models.py:74
 msgid "Number of audios"
 msgstr "Nombre de fichiers audio"
 
-#: models.py:65
+#: models.py:76
 msgid "Number of internal PDF"
 msgstr "Nombre de PDF internes"
 
-#: models.py:67
+#: models.py:78
 msgid "Number of external PDF"
 msgstr "Nombre de PDF externes"
 
-#: models.py:69
+#: models.py:80
 msgid "Number of internal office documents"
 msgstr "Nombre de liens document office internes"
 
-#: models.py:71
+#: models.py:82
 msgid "Number of external office documents"
 msgstr "Nombre de liens document office externes"
 
-#: models.py:73
+#: models.py:84
 msgid "Website is online"
 msgstr "Site en ligne"
 
-#: models.py:75
+#: models.py:86
 msgid "Redirection"
 msgstr "Redirection"
 
-#: models.py:78
+#: models.py:89
 msgid "Crawl result"
 msgstr "Résultat d'indexation"
 
-#: models.py:79
+#: models.py:90
 msgid "Crawl results"
 msgstr "Résultats d'indexation"
 
-#: models.py:86
+#: models.py:97
 msgid "Result"
 msgstr "Résultat"
 
-#: models.py:87
+#: models.py:98
 msgid "Link"
 msgstr "Lien"
 
-#: models.py:90
+#: models.py:101
 msgid "Crawl link"
-msgstr "Indexation - lien"
+msgstr "Indexation - Lien"
 
-#: models.py:91
+#: models.py:102
 msgid "Crawl links"
-msgstr "Indexations - liens"
+msgstr "Indexations - Liens"
 
-#: models.py:99
+#: models.py:110
 msgid "Source"
 msgstr "Source"
 
-#: models.py:101
+#: models.py:112
 msgid "Destination"
 msgstr "Destination"
 
-#: models.py:103
+#: models.py:114
 msgid "Number"
 msgstr "Nombre"
 
-#: models.py:106
+#: models.py:117
 msgid "Crawl relation"
 msgstr "Indexation - Relation"
 
-#: models.py:107
+#: models.py:118
 msgid "Crawl relations"
 msgstr "Indexations - Relations"
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index 92c3081..1248eeb 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -3,7 +3,7 @@ import sys
 
 from django.core.management.base import BaseCommand
 
-from commcrawler.models import Crawl
+from commcrawler.models import Crawl, ExludedDomains
 from commcrawler.scrapy import launch_crawl
 
 
@@ -57,5 +57,7 @@ class Command(BaseCommand):
             except ValueError:
                 c_id = None
         current_crawl = crawls[c_id]
-        launch_crawl(current_crawl)
+        excluded = [domain.split("://")[1] for domain in
+                    ExludedDomains.objects.all()]
+        launch_crawl(current_crawl, excluded_domains=excluded)
 
diff --git a/commcrawler/migrations/0001_initial.py b/commcrawler/migrations/0001_initial.py
index 0b4e561..c2e261b 100644
--- a/commcrawler/migrations/0001_initial.py
+++ b/commcrawler/migrations/0001_initial.py
@@ -1,5 +1,5 @@
 # -*- coding: utf-8 -*-
-# Generated by Django 1.11 on 2019-07-31 08:21
+# Generated by Django 1.11 on 2019-08-05 10:01
 from __future__ import unicode_literals
 
 import datetime
@@ -24,13 +24,13 @@ class Migration(migrations.Migration):
                 ('created', models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation date')),
                 ('started', models.DateTimeField(blank=True, null=True, verbose_name='Start date')),
                 ('ended', models.DateTimeField(blank=True, null=True, verbose_name='End date')),
-                ('status', models.CharField(choices=[('C', 'Created'), ('P', 'In progress'), ('F', 'Finished')], default='C', max_length=1)),
+                ('status', models.CharField(choices=[('C', 'Created'), ('P', 'In progress'), ('F', 'Finished')], default='C', max_length=1, verbose_name='Status')),
                 ('targets', models.ManyToManyField(blank=True, to='commorganization.Target')),
             ],
             options={
                 'ordering': ('created', 'name'),
-                'verbose_name_plural': 'Crawls',
                 'verbose_name': 'Crawl',
+                'verbose_name_plural': 'Crawls',
             },
         ),
         migrations.CreateModel(
@@ -40,8 +40,8 @@ class Migration(migrations.Migration):
                 ('link', models.URLField(verbose_name='Link')),
             ],
             options={
-                'verbose_name_plural': 'Crawl links',
                 'verbose_name': 'Crawl link',
+                'verbose_name_plural': 'Crawl links',
             },
         ),
         migrations.CreateModel(
@@ -54,8 +54,8 @@ class Migration(migrations.Migration):
                 ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='relation_source', to='commorganization.Target', verbose_name='Source')),
             ],
             options={
-                'verbose_name_plural': 'Crawl relations',
                 'verbose_name': 'Crawl relation',
+                'verbose_name_plural': 'Crawl relations',
             },
         ),
         migrations.CreateModel(
@@ -75,16 +75,27 @@ class Migration(migrations.Migration):
                 ('nb_audio', models.IntegerField(default=0, verbose_name='Number of audios')),
                 ('nb_internal_pdf', models.IntegerField(default=0, verbose_name='Number of internal PDF')),
                 ('nb_external_pdf', models.IntegerField(default=0, verbose_name='Number of external PDF')),
-                ('nb_internal_office', models.IntegerField(default=0, verbose_name='Number of internal PDF')),
-                ('nb_external_office', models.IntegerField(default=0, verbose_name='Number of external PDF')),
+                ('nb_internal_office', models.IntegerField(default=0, verbose_name='Number of internal office documents')),
+                ('nb_external_office', models.IntegerField(default=0, verbose_name='Number of external office documents')),
                 ('is_online', models.BooleanField(default=False, verbose_name='Website is online')),
                 ('redirection', models.URLField(blank=True, null=True, verbose_name='Redirection')),
                 ('crawl', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commcrawler.Crawl', verbose_name='Crawl')),
                 ('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commorganization.Target', verbose_name='Target')),
             ],
             options={
-                'verbose_name_plural': 'Crawl results',
                 'verbose_name': 'Crawl result',
+                'verbose_name_plural': 'Crawl results',
+            },
+        ),
+        migrations.CreateModel(
+            name='ExludedDomains',
+            fields=[
+                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('url', models.URLField(unique=True, verbose_name='URL')),
+            ],
+            options={
+                'verbose_name': 'Excluded domain',
+                'verbose_name_plural': 'Excluded domains',
             },
         ),
         migrations.AddField(
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 0050ad8..f3e3246 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -6,6 +6,17 @@ from django.utils.translation import ugettext_lazy as _
 from commorganization.models import Target
 
 
+class ExludedDomains(models.Model):
+    url = models.URLField(verbose_name=_("URL"), unique=True)
+
+    class Meta:
+        verbose_name = _("Excluded domain")
+        verbose_name_plural = _("Excluded domains")
+
+    def __str__(self):
+        return self.url
+
+
 class Crawl(models.Model):
     STATUS = (
         ('C', _("Created")), ('P', _("In progress")),
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index e821a31..d218648 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,13 +1,21 @@
 import scrapy
 from scrapy.crawler import CrawlerProcess
+from scrapy.exceptions import NotSupported
 from scrapy.linkextractors import LinkExtractor
 import tldextract
 
 from django.conf import settings
 
+MAX_LINKS = 500
+
 
 class DefaultSpider:
+    name = None
+    start_urls = None
+    allowed_domains = []
+    excluded_domains = []
     target_id = None
+    links_reached = set()
 
     def _parse_image(self, response, result):
         for __ in response.css('img'):
@@ -20,10 +28,27 @@ class DefaultSpider:
             "url": response.url,
             "target_id": self.target_id
         }
-        self._parse_image(response, result)
-        for link in LinkExtractor().extract_links(response):
-            if link.url is not None:
-                yield response.follow(link.url, self.parse)
+        for domain in self.excluded_domains:
+            if domain in response.url:
+                result["offline"] = True
+                yield result
+        try:
+            self._parse_image(response, result)
+            for link in LinkExtractor().extract_links(response):
+                url = link.url
+                if url is None or url in self.links_reached:
+                    continue
+                for domain in self.allowed_domains:
+                    if domain in url:
+                        self.links_reached.add(link.url)
+                        if len(self.links_reached) < MAX_LINKS:
+                            yield response.follow(link.url, self.parse)
+                        else:
+                            print("MAX", self.allowed_domains,
+                                  self.links_reached)
+        except NotSupported:
+            print("No response", response.url)
+        yield result
 
 
 def get_domain(url):
@@ -31,16 +56,19 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, target=None):
+def create_spider(name, urls, target, excluded_domains=None):
+    if not excluded_domains:
+        excluded_domains = []
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
-         "target_id": target.pk}
+         "target_id": target.pk, "links_reached": set(),
+         "excluded_domains": excluded_domains}
     )
 
 
-def launch_crawl(crawl_item):
+def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     scrap_settings.update({
         'FEED_FORMAT': 'json',
@@ -49,8 +77,10 @@
     process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
-            create_spider("Target{}".format(target.pk),
-                          [target.url],
-                          target)
+            create_spider(
+                "Target{}".format(target.pk),
+                [target.url], target,
+                excluded_domains
+            )
         )
     process.start()
```
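For context, a condensed sketch of how the pieces fit together after this patch: launch_crawl() builds one spider class per target with create_spider(), and every spider carries the shared excluded_domains list, its own links_reached set and the MAX_LINKS cap. The snippet below only mimics that wiring with a stub spider so it runs without Django or a Crawl instance; the target id, URLs and feed settings are placeholders, and the real parse() logic lives in commcrawler/scrapy.py.

```python
import scrapy
from scrapy.crawler import CrawlerProcess

MAX_LINKS = 500  # same cap as in commcrawler/scrapy.py


class DefaultSpider:
    # Stand-in for commcrawler's DefaultSpider: only the class attributes
    # that create_spider() overrides are kept here.
    name = None
    start_urls = None
    allowed_domains = []
    excluded_domains = []
    target_id = None
    links_reached = set()

    def parse(self, response):
        # Placeholder: the committed spider also follows internal links,
        # counts images and flags excluded domains here.
        yield {"url": response.url, "target_id": self.target_id}


def create_spider(name, urls, target_id, excluded_domains=None):
    # Same idea as the committed factory, except a plain target_id is passed
    # instead of a Target model instance: build one spider class per target
    # with type(), mixing DefaultSpider with scrapy.Spider.
    return type(
        name, (DefaultSpider, scrapy.Spider),
        {"name": name, "start_urls": urls,
         "target_id": target_id, "links_reached": set(),
         "excluded_domains": excluded_domains or []})


if __name__ == "__main__":
    process = CrawlerProcess(settings={"FEED_FORMAT": "json",
                                       "FEED_URI": "result.json"})
    spider_cls = create_spider("Target1", ["https://www.example.org"],
                               target_id=1,
                               excluded_domains=["www.facebook.com"])
    process.crawl(spider_cls)
    process.start()
```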