author     Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-09 13:06:03 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>   2019-08-09 13:06:03 +0200
commit     e7dd52e03d8e770d8fcee1503fa8109dc3778d29 (patch)
tree       fbd89b70b52caa9cf0db21aa2efc24ed678fc2d3 /commcrawler
parent     347a0822484ad16b9a29eef1ea30082b4a841ac6 (diff)
download   Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.tar.bz2
           Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.zip
Manage links between targets
Diffstat (limited to 'commcrawler')
-rw-r--r--  commcrawler/admin.py                               |   2
-rw-r--r--  commcrawler/management/commands/launch_crawl.py    |   2
-rw-r--r--  commcrawler/migrations/0002_auto_20190809_1231.py  | 122
-rw-r--r--  commcrawler/migrations/0003_crawl_progression.py   |  20
-rw-r--r--  commcrawler/models.py                              |  25
-rw-r--r--  commcrawler/scrapy.py                              |  68
-rw-r--r--  commcrawler/utils.py                               |  18
7 files changed, 227 insertions, 30 deletions
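
This commit splits the crawl lifecycle into a crawl phase (status "P") and a new link-matching phase (status "M"), and Crawl.progress (see the models.py hunk below) now reports a percentage for both. A minimal sketch of that percentage formula, not part of the commit, with invented numbers:

    # Sketch of the reworked progress computation: "done" is either the
    # count of finished targets (phase "P") or the `progression` counter
    # (phase "M"); "todo" is the number of targets. Values are invented.
    def progress(done, todo):
        if todo == 0:
            return "-"
        return "{} % ({}/{})".format(int(done / todo * 100), done, todo)

    print(progress(42, 120))  # -> 35 % (42/120)
    print(progress(0, 0))     # -> -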
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index fcd1a1b..64714d2 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -18,7 +18,7 @@ class CrawlAdmin(admin.ModelAdmin):
     list_display = ("name", "status", "target_nb", "created", "started",
                     "ended", "progress")
     list_filter = ("status",)
-    readonly_fields = ("created", "started", "ended")
+    readonly_fields = ("created", "started", "crawl_ended", "ended")
 
     form = make_ajax_form(model, {'targets': 'target'})
 
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index bf76caf..883c035 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -30,7 +30,7 @@ class Command(BaseCommand):
                 'incompatible. Exit.\n')
             return
 
-        q = Crawl.objects.filter(status="C")
+        q = Crawl.objects.filter(status="A")
         if not q.count():
             sys.stdout.write('No crawl waiting. Exit.\n')
             return
diff --git a/commcrawler/migrations/0002_auto_20190809_1231.py b/commcrawler/migrations/0002_auto_20190809_1231.py
new file mode 100644
index 0000000..7db1bba
--- /dev/null
+++ b/commcrawler/migrations/0002_auto_20190809_1231.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-09 10:31
+from __future__ import unicode_literals
+
+import datetime
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='crawl_ended',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Crawl end'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='created',
+            field=models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='ended',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Ended'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='started',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Started'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='status',
+            field=models.CharField(choices=[('C', 'Created'), ('A', 'Planned'), ('P', 'Crawl in progress'), ('M', 'Match link in progress'), ('F', 'Finished')], default='C', max_length=1, verbose_name='Status'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='crawl',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='results', to='commcrawler.Crawl', verbose_name='Crawl'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_audio',
+            field=models.IntegerField(default=0, verbose_name='Internal audios'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_dailymotion',
+            field=models.IntegerField(default=0, verbose_name='Dailymotion links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_link',
+            field=models.IntegerField(default=0, verbose_name='External links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_office',
+            field=models.IntegerField(default=0, verbose_name='External office documents'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_pdf',
+            field=models.IntegerField(default=0, verbose_name='External PDF'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_facebook',
+            field=models.IntegerField(default=0, verbose_name='Facebook links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_images',
+            field=models.IntegerField(default=0, verbose_name='Images'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_instagram',
+            field=models.IntegerField(default=0, verbose_name='Instagram links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_link',
+            field=models.IntegerField(default=0, verbose_name='Internal links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_office',
+            field=models.IntegerField(default=0, verbose_name='Internal office documents'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_pdf',
+            field=models.IntegerField(default=0, verbose_name='Internal PDF'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_twitter',
+            field=models.IntegerField(default=0, verbose_name='Twitter links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_video',
+            field=models.IntegerField(default=0, verbose_name='Internal videos'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_vimeo',
+            field=models.IntegerField(default=0, verbose_name='Vimeo links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_youtube',
+            field=models.IntegerField(default=0, verbose_name='Youtube links'),
+        ),
+    ]
diff --git a/commcrawler/migrations/0003_crawl_progression.py b/commcrawler/migrations/0003_crawl_progression.py
new file mode 100644
index 0000000..dfe9de2
--- /dev/null
+++ b/commcrawler/migrations/0003_crawl_progression.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-09 10:45
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0002_auto_20190809_1231'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='progression',
+            field=models.IntegerField(blank=True, null=True, verbose_name='Progression'),
+        ),
+    ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index f526fb5..b520b09 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -23,20 +23,27 @@ class ExludedDomains(models.Model):
 
 class Crawl(models.Model):
     STATUS = (
-        ('C', _("Created")), ('P', _("In progress")),
+        ('C', _("Created")),
+        ('A', _("Planned")),
+        ('P', _("Crawl in progress")),
+        ('M', _("Match link in progress")),
         ('F', _("Finished"))
     )
     name = models.CharField(verbose_name=_("Name"), max_length=200,
                             unique=True)
     created = models.DateTimeField(
-        verbose_name=_("Creation date"), default=datetime.datetime.now)
+        verbose_name=_("Creation"), default=datetime.datetime.now)
     started = models.DateTimeField(
-        verbose_name=_("Start date"), blank=True, null=True)
+        verbose_name=_("Started"), blank=True, null=True)
+    crawl_ended = models.DateTimeField(
+        verbose_name=_("Crawl end"), blank=True, null=True)
     ended = models.DateTimeField(
-        verbose_name=_("End date"), blank=True, null=True)
+        verbose_name=_("Ended"), blank=True, null=True)
     status = models.CharField(
         verbose_name=_("Status"), max_length=1, choices=STATUS, default='C')
     targets = models.ManyToManyField(Target, blank=True)
+    progression = models.IntegerField(
+        verbose_name=_("Progression"), blank=True, null=True)
 
     class Meta:
         verbose_name = _("Crawl")
@@ -52,13 +59,17 @@ class Crawl(models.Model):
 
     @property
     def progress(self):
+        todo = self.target_nb
+        if todo == 0:
+            return "-"
         if self.status == "P":
-            todo = self.target_nb
-            if todo == 0:
-                return "-"
             done = self.results.filter(status__in=("T", "F")).count()
             percent = int(done / todo * 100)
             return "{} % ({}/{})".format(percent, done, todo)
+        if self.status == "M":
+            done = self.progression or 0
+            percent = int(done / todo * 100)
+            return "{} % ({}/{})".format(percent, done, todo)
         return "-"
 
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5f59127..67c9ee3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,7 +1,5 @@
 import datetime
-import tldextract
 import requests
-from urllib.parse import urldefrag
 
 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,6 +11,7 @@ from django.db import transaction, IntegrityError
 from django.utils import timezone
 
 from . import models
+from .utils import clean_url, append_to_results, get_domain
 
 """
 CrawlLink
@@ -32,17 +31,6 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
 OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods",
                ".xls", ".xlsx")
 
 
-def clean_url(url):
-    url, __ = urldefrag(url)  # remove anchors
-    return url
-
-
-def append_to_results(results, key, value):
-    if key not in results:
-        results[key] = []
-    results[key].append(value)
-
-
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -281,11 +269,6 @@ class DbPipeline:
         result.save()
 
 
-def get_domain(url):
-    ext = tldextract.extract(url)
-    return '{}.{}'.format(ext.domain, ext.suffix)
-
-
 def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
@@ -299,6 +282,36 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
     )
 
 
+def launch_match(crawl_item):
+    # reset
+    models.CrawlRelation.objects.filter(crawl_id=crawl_item.pk).delete()
+    for result in crawl_item.results.values(
+            "pk", "crawl_result", "target_id", "target__url").all():
+        if not result["crawl_result"] or \
+                "external_link" not in result["crawl_result"][0]:
+            continue
+        domains = [
+            get_domain(link)
+            for link in result["crawl_result"][0]["external_link"]
+        ]
+        for subresult in crawl_item.results.values(
+                "pk", "target_id", "target__url").all():
+            if subresult["pk"] == result["pk"]:
+                continue
+            if get_domain(subresult["target__url"]) in domains:
+                rel, created = models.CrawlRelation.objects.get_or_create(
+                    crawl_id=crawl_item.pk, source_id=result["target_id"],
+                    destination_id=subresult["target_id"])
+                if not created:  # multiple links
+                    rel.number += 1
+                    rel.save()
+        crawl_item.progression = (crawl_item.progression or 0) + 1
+        crawl_item.save()
+    crawl_item.ended = timezone.now()
+    crawl_item.status = "F"
+    crawl_item.save()
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     process = CrawlerProcess(settings=scrap_settings)
@@ -311,8 +324,20 @@ def launch_crawl(crawl_item, excluded_domains=None):
         redirect = None
         url = target.url
         if response.history:
-            redirect = url
             url = response.url
+            redirect = url
+        domain = get_domain(url)
+        if domain in excluded_domains:
+            dct = {
+                "crawl_id": crawl_item.pk,
+                "target_id": target.pk,
+            }
+            result, __ = models.CrawlResult.objects.get_or_create(**dct)
+            result.redirection = redirect
+            result.is_online = False
+            result.status = "F"
+            result.save()
+            continue
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
@@ -323,6 +348,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
             )
         )
     process.start()
-    crawl_item.ended = timezone.now()
-    crawl_item.status = "F"
+    crawl_item.crawl_ended = timezone.now()
+    crawl_item.status = "M"
     crawl_item.save()
+    launch_match(crawl_item)
diff --git a/commcrawler/utils.py b/commcrawler/utils.py
new file mode 100644
index 0000000..6a49669
--- /dev/null
+++ b/commcrawler/utils.py
@@ -0,0 +1,18 @@
+from urllib.parse import urldefrag
+import tldextract
+
+
+def append_to_results(results, key, value):
+    if key not in results:
+        results[key] = []
+    results[key].append(value)
+
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # remove anchors
+    return url
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
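
For context on the new launch_match step: it builds CrawlRelation rows by comparing the domain of every external link recorded for a target against the domains of all other targets in the crawl. A standalone sketch of that matching idea, using plain dicts and invented URLs instead of the Django models:

    import tldextract  # same third-party library the commit uses

    def get_domain(url):
        # mirrors get_domain() in commcrawler/utils.py
        ext = tldextract.extract(url)
        return '{}.{}'.format(ext.domain, ext.suffix)

    # Invented sample data: target URL -> external links found while crawling it.
    external_links = {
        "https://www.ville-alpha.fr": ["https://www.ville-beta.fr/agenda",
                                       "https://twitter.com/ville_beta"],
        "https://www.ville-beta.fr": [],
    }

    # (source, destination) -> link count, as CrawlRelation's `number` stores it.
    relations = {}
    for source, links in external_links.items():
        domains = {get_domain(link) for link in links}
        for destination in external_links:
            if destination != source and get_domain(destination) in domains:
                key = (source, destination)
                relations[key] = relations.get(key, 0) + 1

    print(relations)
    # {('https://www.ville-alpha.fr', 'https://www.ville-beta.fr'): 1}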