author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 13:06:03 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 13:06:03 +0200
commit     e7dd52e03d8e770d8fcee1503fa8109dc3778d29 (patch)
tree       fbd89b70b52caa9cf0db21aa2efc24ed678fc2d3
parent     347a0822484ad16b9a29eef1ea30082b4a841ac6 (diff)
download   Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.tar.bz2
           Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.zip
Manage links between targets
-rw-r--r--  commcrawler/admin.py                                 2
-rw-r--r--  commcrawler/management/commands/launch_crawl.py      2
-rw-r--r--  commcrawler/migrations/0002_auto_20190809_1231.py  122
-rw-r--r--  commcrawler/migrations/0003_crawl_progression.py    20
-rw-r--r--  commcrawler/models.py                               25
-rw-r--r--  commcrawler/scrapy.py                               68
-rw-r--r--  commcrawler/utils.py                                18
7 files changed, 227 insertions(+), 30 deletions(-)
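
This commit splits a crawl run into two phases: the crawl itself, then a link-matching pass that records relations between targets. A sketch of the resulting status lifecycle, assembled from the STATUS choices in commcrawler/models.py below (the list itself is illustrative, not code from the commit):

    # Crawl status lifecycle after this commit (codes from models.Crawl.STATUS)
    CRAWL_LIFECYCLE = [
        ("C", "Created"),                 # defined, not yet queued
        ("A", "Planned"),                 # waiting for launch_crawl to pick it up
        ("P", "Crawl in progress"),       # Scrapy spiders running
        ("M", "Match link in progress"),  # launch_match building relations
        ("F", "Finished"),                # `ended` timestamp set
    ]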
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index fcd1a1b..64714d2 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -18,7 +18,7 @@ class CrawlAdmin(admin.ModelAdmin):
     list_display = ("name", "status", "target_nb", "created", "started",
                     "ended", "progress")
     list_filter = ("status",)
-    readonly_fields = ("created", "started", "ended")
+    readonly_fields = ("created", "started", "crawl_ended", "ended")
     form = make_ajax_form(model, {'targets': 'target'})
 
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index bf76caf..883c035 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -30,7 +30,7 @@ class Command(BaseCommand):
                             'incompatible. Exit.\n')
            return
 
-        q = Crawl.objects.filter(status="C")
+        q = Crawl.objects.filter(status="A")
        if not q.count():
            sys.stdout.write('No crawl waiting. Exit.\n')
            return
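
The command now selects crawls with the new "A" (Planned) status rather than freshly created ones. A hedged usage sketch (the crawl name is illustrative):

    # A crawl must be moved from "C" (Created) to "A" (Planned) before
    # `python manage.py launch_crawl` will pick it up:
    crawl = Crawl.objects.create(name="august-crawl")  # status defaults to "C"
    crawl.status = "A"
    crawl.save()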
diff --git a/commcrawler/migrations/0002_auto_20190809_1231.py b/commcrawler/migrations/0002_auto_20190809_1231.py
new file mode 100644
index 0000000..7db1bba
--- /dev/null
+++ b/commcrawler/migrations/0002_auto_20190809_1231.py
@@ -0,0 +1,122 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-09 10:31
+from __future__ import unicode_literals
+
+import datetime
+from django.db import migrations, models
+import django.db.models.deletion
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='crawl_ended',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Crawl end'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='created',
+            field=models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='ended',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Ended'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='started',
+            field=models.DateTimeField(blank=True, null=True, verbose_name='Started'),
+        ),
+        migrations.AlterField(
+            model_name='crawl',
+            name='status',
+            field=models.CharField(choices=[('C', 'Created'), ('A', 'Planned'), ('P', 'Crawl in progress'), ('M', 'Match link in progress'), ('F', 'Finished')], default='C', max_length=1, verbose_name='Status'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='crawl',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='results', to='commcrawler.Crawl', verbose_name='Crawl'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_audio',
+            field=models.IntegerField(default=0, verbose_name='Internal audios'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_dailymotion',
+            field=models.IntegerField(default=0, verbose_name='Dailymotion links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_link',
+            field=models.IntegerField(default=0, verbose_name='External links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_office',
+            field=models.IntegerField(default=0, verbose_name='External office documents'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_external_pdf',
+            field=models.IntegerField(default=0, verbose_name='External PDF'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_facebook',
+            field=models.IntegerField(default=0, verbose_name='Facebook links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_images',
+            field=models.IntegerField(default=0, verbose_name='Images'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_instagram',
+            field=models.IntegerField(default=0, verbose_name='Instagram links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_link',
+            field=models.IntegerField(default=0, verbose_name='Internal links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_office',
+            field=models.IntegerField(default=0, verbose_name='Internal office documents'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_internal_pdf',
+            field=models.IntegerField(default=0, verbose_name='Internal PDF'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_twitter',
+            field=models.IntegerField(default=0, verbose_name='Twitter links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_video',
+            field=models.IntegerField(default=0, verbose_name='Internal videos'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_vimeo',
+            field=models.IntegerField(default=0, verbose_name='Vimeo links'),
+        ),
+        migrations.AlterField(
+            model_name='crawlresult',
+            name='nb_youtube',
+            field=models.IntegerField(default=0, verbose_name='Youtube links'),
+        ),
+    ]
diff --git a/commcrawler/migrations/0003_crawl_progression.py b/commcrawler/migrations/0003_crawl_progression.py
new file mode 100644
index 0000000..dfe9de2
--- /dev/null
+++ b/commcrawler/migrations/0003_crawl_progression.py
@@ -0,0 +1,20 @@
+# -*- coding: utf-8 -*-
+# Generated by Django 1.11 on 2019-08-09 10:45
+from __future__ import unicode_literals
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('commcrawler', '0002_auto_20190809_1231'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='crawl',
+            name='progression',
+            field=models.IntegerField(blank=True, null=True, verbose_name='Progression'),
+        ),
+    ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index f526fb5..b520b09 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -23,20 +23,27 @@ class ExludedDomains(models.Model):
 
 class Crawl(models.Model):
     STATUS = (
-        ('C', _("Created")), ('P', _("In progress")),
+        ('C', _("Created")),
+        ('A', _("Planned")),
+        ('P', _("Crawl in progress")),
+        ('M', _("Match link in progress")),
         ('F', _("Finished"))
     )
     name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
     created = models.DateTimeField(
-        verbose_name=_("Creation date"), default=datetime.datetime.now)
+        verbose_name=_("Creation"), default=datetime.datetime.now)
     started = models.DateTimeField(
-        verbose_name=_("Start date"), blank=True, null=True)
+        verbose_name=_("Started"), blank=True, null=True)
+    crawl_ended = models.DateTimeField(
+        verbose_name=_("Crawl end"), blank=True, null=True)
     ended = models.DateTimeField(
-        verbose_name=_("End date"), blank=True, null=True)
+        verbose_name=_("Ended"), blank=True, null=True)
     status = models.CharField(
         verbose_name=_("Status"),
         max_length=1, choices=STATUS, default='C')
     targets = models.ManyToManyField(Target, blank=True)
+    progression = models.IntegerField(
+        verbose_name=_("Progression"), blank=True, null=True)
 
     class Meta:
         verbose_name = _("Crawl")
@@ -52,13 +59,17 @@ class Crawl(models.Model):
 
     @property
     def progress(self):
+        todo = self.target_nb
+        if todo == 0:
+            return "-"
         if self.status == "P":
-            todo = self.target_nb
-            if todo == 0:
-                return "-"
             done = self.results.filter(status__in=("T", "F")).count()
             percent = int(done / todo * 100)
             return "{} % ({}/{})".format(percent, done, todo)
+        if self.status == "M":
+            done = self.progression or 0
+            percent = int(done / todo * 100)
+            return "{} % ({}/{})".format(percent, done, todo)
         return "-"
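
An illustrative reading of the reworked progress property (counts are made up; target_nb and results come from the model):

    crawl.status = "P"
    # 25 of 50 targets have a result in status "T" or "F":
    crawl.progress          # -> "50 % (25/50)"

    crawl.status = "M"
    crawl.progression = 10  # incremented once per result by launch_match
    crawl.progress          # -> "20 % (10/50)"

    # with target_nb == 0, or in any other status, progress returns "-"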
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5f59127..67c9ee3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,7 +1,5 @@
 import datetime
-import tldextract
 import requests
-from urllib.parse import urldefrag
 
 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,6 +11,7 @@ from django.db import transaction, IntegrityError
 from django.utils import timezone
 
 from . import models
+from .utils import clean_url, append_to_results, get_domain
 
 """
 CrawlLink
@@ -32,17 +31,6 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
 OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
 
 
-def clean_url(url):
-    url, __ = urldefrag(url)  # remove anchors
-    return url
-
-
-def append_to_results(results, key, value):
-    if key not in results:
-        results[key] = []
-    results[key].append(value)
-
-
 MAX_LINKS = None  # if None no max
 
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -281,11 +269,6 @@ class DbPipeline:
         result.save()
 
 
-def get_domain(url):
-    ext = tldextract.extract(url)
-    return '{}.{}'.format(ext.domain, ext.suffix)
-
-
 def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
@@ -299,6 +282,36 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
     )
 
 
+def launch_match(crawl_item):
+    # reset
+    models.CrawlRelation.objects.filter(crawl_id=crawl_item.pk).delete()
+    for result in crawl_item.results.values(
+            "pk", "crawl_result", "target_id", "target__url").all():
+        if not result["crawl_result"] or \
+                "external_link" not in result["crawl_result"][0]:
+            continue
+        domains = [
+            get_domain(link)
+            for link in result["crawl_result"][0]["external_link"]
+        ]
+        for subresult in crawl_item.results.values(
+                "pk", "target_id", "target__url").all():
+            if subresult["pk"] == result["pk"]:
+                continue
+            if get_domain(subresult["target__url"]) in domains:
+                rel, created = models.CrawlRelation.objects.get_or_create(
+                    crawl_id=crawl_item.pk, source_id=result["target_id"],
+                    destination_id=subresult["target_id"])
+                if not created:  # multiple links
+                    rel.number += 1
+                    rel.save()
+        crawl_item.progression = (crawl_item.progression or 0) + 1
+        crawl_item.save()
+    crawl_item.ended = timezone.now()
+    crawl_item.status = "F"
+    crawl_item.save()
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     process = CrawlerProcess(settings=scrap_settings)
@@ -311,8 +324,20 @@ def launch_crawl(crawl_item, excluded_domains=None):
         redirect = None
         url = target.url
         if response.history:
-            redirect = url
             url = response.url
+            redirect = url
+        domain = get_domain(url)
+        if domain in excluded_domains:
+            dct = {
+                "crawl_id": crawl_item.pk,
+                "target_id": target.pk,
+            }
+            result, __ = models.CrawlResult.objects.get_or_create(**dct)
+            result.redirection = redirect
+            result.is_online = False
+            result.status = "F"
+            result.save()
+            continue
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
@@ -323,6 +348,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
             )
         )
     process.start()
-    crawl_item.ended = timezone.now()
-    crawl_item.status = "F"
+    crawl_item.crawl_ended = timezone.now()
+    crawl_item.status = "M"
     crawl_item.save()
+    launch_match(crawl_item)
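
launch_match treats two targets as related when an external link recorded for one shares its registered domain with the other target's URL. A minimal sketch of that rule, using get_domain from commcrawler/utils.py (URLs illustrative):

    external_links = ["https://www.example.org/page#top",
                      "https://blog.example.org/post"]
    domains = [get_domain(link) for link in external_links]
    # -> ["example.org", "example.org"]
    get_domain("http://example.org/") in domains  # True -> CrawlRelation created
    # get_or_create with the `created` flag means a repeated match increments
    # CrawlRelation.number instead of inserting a duplicate row.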
diff --git a/commcrawler/utils.py b/commcrawler/utils.py
new file mode 100644
index 0000000..6a49669
--- /dev/null
+++ b/commcrawler/utils.py
@@ -0,0 +1,18 @@
+from urllib.parse import urldefrag
+import tldextract
+
+
+def append_to_results(results, key, value):
+    if key not in results:
+        results[key] = []
+    results[key].append(value)
+
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # remove anchors
+    return url
+
+
+def get_domain(url):
+    ext = tldextract.extract(url)
+    return '{}.{}'.format(ext.domain, ext.suffix)
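
Hedged usage examples for the relocated helpers (values illustrative):

    clean_url("https://example.org/page#section")
    # -> "https://example.org/page"

    get_domain("https://sub.example.co.uk/path")
    # -> "example.co.uk" (tldextract handles multi-label public suffixes)

    results = {}
    append_to_results(results, "external_link", "https://example.org")
    # results == {"external_link": ["https://example.org"]}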