author     Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-07 03:53:32 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-07 03:55:49 +0200
commit     d737e04553f464966f54739ba37f9f06dab44586 (patch)
tree       2b68891ff1629b55b820312fdd3a17ce91ac5722
parent     9fbd94f70d4b819b45eef720425242c0d69b032d (diff)
Save crawling results in the database
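
Crawl results are no longer dumped to result.json: a Scrapy item pipeline
(commcrawler.scrapy.DbPipeline) now persists them into the new CrawlResult
model, keyed by (crawl, target). As a rough illustration of the intended
usage (not part of this patch; "example" is a placeholder crawl name), a
crawl could be launched from the Django shell like this:

    from commcrawler.models import Crawl, ExludedDomains
    from commcrawler.scrapy import launch_crawl

    crawl = Crawl.objects.get(name="example")  # placeholder crawl name
    # excluded domains are now plain domain names, no longer full URLs
    excluded = [d.domain for d in ExludedDomains.objects.all()]
    launch_crawl(crawl, excluded_domains=excluded)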
-rw-r--r--  .gitignore                                        |   1
-rw-r--r--  Makefile                                          |  13
-rw-r--r--  commcrawler/admin.py                              |  29
-rw-r--r--  commcrawler/management/commands/launch_crawl.py   |   3
-rw-r--r--  commcrawler/migrations/0001_initial.py            |  32
-rw-r--r--  commcrawler/models.py                             |  22
-rw-r--r--  commcrawler/scrapy.py                             | 200
-rw-r--r--  commonnet/local_settings.py.sample                |  12
-rw-r--r--  commonnet/scrapy_setting.py                       |   3
-rw-r--r--  commonnet/settings.py                             |  13
-rw-r--r--  commorganization/admin.py                         |   1
-rw-r--r--  requirements.txt                                  |   2
12 files changed, 279 insertions, 52 deletions
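
For reference, the aggregated JSON stored on each result can be read back once
a target is finished; a minimal sketch, assuming the model fields introduced
below (crawl_result holds a one-element list wrapping the aggregated dict):

    from commcrawler.models import Crawl, CrawlResult

    crawl = Crawl.objects.get(name="example")  # placeholder crawl name
    for res in CrawlResult.objects.filter(crawl=crawl, status="F"):
        data = res.crawl_result[0] if res.crawl_result else {}
        print(res.target, res.duration, len(data.get("urls", [])))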
diff --git a/.gitignore b/.gitignore
index e78f46c..2f96206 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@
*.sqlite3
local_settings.py
result.json
+data_src
diff --git a/Makefile b/Makefile
index 376d277..20e1e24 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,19 @@ compilemessages: ## compile messages for translation
$(PYTHON) ../manage.py compilemessages ; \
done
+migrations: ## make DB migrations
+ $(PYTHON) manage.py makemigrations
+
+
+migrate: ## apply DB migrations
+ $(PYTHON) manage.py migrate
+
+regenerate_all: migrate ## regenerate all the database
+ $(PYTHON) manage.py createsuperuser
+ $(PYTHON) manage.py import_csv_communes data_src/communes.csv
+ $(PYTHON) manage.py import_csv_autres data_src/autres.csv
+
+
run: ## run test server
$(PYTHON) manage.py runserver 0.0.0.0:8000
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 457d2b0..857962c 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -1,5 +1,12 @@
+import json
+
+from pygments import highlight
+from pygments.lexers.data import JsonLexer
+from pygments.formatters.html import HtmlFormatter
+
from ajax_select import make_ajax_form
from django.contrib import admin
+from django.utils.safestring import mark_safe
from commonnet.admin_site import admin_site
from commcrawler import models
@@ -11,7 +18,7 @@ class CrawlAdmin(admin.ModelAdmin):
"ended")
list_filter = ("status",)
readonly_fields = ("status", "created", "started", "ended")
- exclude = ("targets", )
+ form = make_ajax_form(model, {'targets': 'target'})
admin_site.register(models.Crawl, CrawlAdmin)
@@ -19,10 +26,22 @@ admin_site.register(models.Crawl, CrawlAdmin)
class CrawlResultAdmin(admin.ModelAdmin):
model = models.CrawlResult
- list_display = ("target", "crawl", "is_online")
- list_filter = ("crawl",)
+ list_display = ("target", "crawl", "started", "duration", "status",
+ "is_online")
+ list_filter = ("status", "crawl")
+ search_fields = ("target__name",)
+ readonly_fields = ("started", "duration", "status",
+ "crawl_result_prettified")
+ exclude = ("crawl_result",)
form = make_ajax_form(model, {'target': 'target'})
+ def crawl_result_prettified(self, instance):
+ response = json.dumps(instance.crawl_result, sort_keys=True, indent=2)
+ formatter = HtmlFormatter(style='colorful')
+ response = highlight(response, JsonLexer(), formatter)
+ style = "<style>" + formatter.get_style_defs() + "</style><br>"
+ return mark_safe(style + response)
+
admin_site.register(models.CrawlResult, CrawlResultAdmin)
@@ -47,8 +66,8 @@ admin_site.register(models.CrawlRelation, CrawlRelationAdmin)
class ExcludedDomainAdmin(admin.ModelAdmin):
- list_display = ('url',)
- search_fields = ('url',)
+ list_display = ('domain',)
+ search_fields = ('domain',)
admin_site.register(models.ExludedDomains, ExcludedDomainAdmin)
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index 1248eeb..bf76caf 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -57,7 +57,6 @@ class Command(BaseCommand):
except ValueError:
c_id = None
current_crawl = crawls[c_id]
- excluded = [domain.split("://")[1] for domain in
- ExludedDomains.objects.all()]
+ excluded = [domain.domain for domain in ExludedDomains.objects.all()]
launch_crawl(current_crawl, excluded_domains=excluded)
diff --git a/commcrawler/migrations/0001_initial.py b/commcrawler/migrations/0001_initial.py
index c2e261b..26cdd2d 100644
--- a/commcrawler/migrations/0001_initial.py
+++ b/commcrawler/migrations/0001_initial.py
@@ -1,8 +1,10 @@
# -*- coding: utf-8 -*-
-# Generated by Django 1.11 on 2019-08-05 10:01
+# Generated by Django 1.11 on 2019-08-07 01:17
from __future__ import unicode_literals
import datetime
+import django.contrib.postgres.fields.jsonb
+import django.contrib.sites.models
from django.db import migrations, models
import django.db.models.deletion
@@ -20,7 +22,7 @@ class Migration(migrations.Migration):
name='Crawl',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
- ('name', models.CharField(max_length=200, verbose_name='Name')),
+ ('name', models.CharField(max_length=200, unique=True, verbose_name='Name')),
('created', models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation date')),
('started', models.DateTimeField(blank=True, null=True, verbose_name='Start date')),
('ended', models.DateTimeField(blank=True, null=True, verbose_name='End date')),
@@ -28,9 +30,9 @@ class Migration(migrations.Migration):
('targets', models.ManyToManyField(blank=True, to='commorganization.Target')),
],
options={
- 'ordering': ('created', 'name'),
- 'verbose_name': 'Crawl',
'verbose_name_plural': 'Crawls',
+ 'verbose_name': 'Crawl',
+ 'ordering': ('created', 'name'),
},
),
migrations.CreateModel(
@@ -40,8 +42,8 @@ class Migration(migrations.Migration):
('link', models.URLField(verbose_name='Link')),
],
options={
- 'verbose_name': 'Crawl link',
'verbose_name_plural': 'Crawl links',
+ 'verbose_name': 'Crawl link',
},
),
migrations.CreateModel(
@@ -54,14 +56,18 @@ class Migration(migrations.Migration):
('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='relation_source', to='commorganization.Target', verbose_name='Source')),
],
options={
- 'verbose_name': 'Crawl relation',
'verbose_name_plural': 'Crawl relations',
+ 'verbose_name': 'Crawl relation',
},
),
migrations.CreateModel(
name='CrawlResult',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('started', models.DateTimeField(default=datetime.datetime.now, verbose_name='Start date')),
+ ('duration', models.DurationField(blank=True, null=True, verbose_name='Duration')),
+ ('status', models.CharField(choices=[('P', 'In progress'), ('F', 'Finished')], default='P', max_length=1, verbose_name='Status')),
+ ('crawl_result', django.contrib.postgres.fields.jsonb.JSONField(default=list, verbose_name='Crawl result')),
('nb_external_link', models.IntegerField(default=0, verbose_name='Number of external links')),
('nb_internal_link', models.IntegerField(default=0, verbose_name='Number of internal links')),
('nb_images', models.IntegerField(default=0, verbose_name='Number of images')),
@@ -83,19 +89,19 @@ class Migration(migrations.Migration):
('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commorganization.Target', verbose_name='Target')),
],
options={
- 'verbose_name': 'Crawl result',
'verbose_name_plural': 'Crawl results',
+ 'verbose_name': 'Crawl result',
},
),
migrations.CreateModel(
name='ExludedDomains',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
- ('url', models.URLField(unique=True, verbose_name='URL')),
+ ('domain', models.CharField(max_length=100, unique=True, validators=[django.contrib.sites.models._simple_domain_name_validator], verbose_name='Domain name')),
],
options={
- 'verbose_name': 'Excluded domain',
'verbose_name_plural': 'Excluded domains',
+ 'verbose_name': 'Excluded domain',
},
),
migrations.AddField(
@@ -103,4 +109,12 @@ class Migration(migrations.Migration):
name='result',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commcrawler.CrawlResult', verbose_name='Result'),
),
+ migrations.AlterUniqueTogether(
+ name='crawlresult',
+ unique_together=set([('crawl', 'target')]),
+ ),
+ migrations.AlterUniqueTogether(
+ name='crawlrelation',
+ unique_together=set([('crawl', 'source', 'destination')]),
+ ),
]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index f3e3246..9a98b89 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -1,5 +1,7 @@
import datetime
+from django.contrib.postgres.fields import JSONField
+from django.contrib.sites.models import _simple_domain_name_validator
from django.db import models
from django.utils.translation import ugettext_lazy as _
@@ -7,7 +9,9 @@ from commorganization.models import Target
class ExludedDomains(models.Model):
- url = models.URLField(verbose_name=_("URL"), unique=True)
+ domain = models.CharField(
+ _("Domain name"), max_length=100,
+ validators=[_simple_domain_name_validator], unique=True)
class Meta:
verbose_name = _("Excluded domain")
@@ -22,7 +26,7 @@ class Crawl(models.Model):
('C', _("Created")), ('P', _("In progress")),
('F', _("Finished"))
)
- name = models.CharField(verbose_name=_("Name"), max_length=200)
+ name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
created = models.DateTimeField(
verbose_name=_("Creation date"), default=datetime.datetime.now)
started = models.DateTimeField(
@@ -48,8 +52,20 @@ class Crawl(models.Model):
class CrawlResult(models.Model):
+ STATUS = (
+ ('P', _("In progress")),
+ ('F', _("Finished"))
+ )
crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"))
target = models.ForeignKey(Target, verbose_name=_("Target"))
+ started = models.DateTimeField(
+ verbose_name=_("Start date"), default=datetime.datetime.now)
+ duration = models.DurationField(
+ verbose_name=_("Duration"), blank=True, null=True)
+ status = models.CharField(
+ verbose_name=_("Status"),
+ max_length=1, choices=STATUS, default='P')
+ crawl_result = JSONField(verbose_name=_("Crawl result"), default=list)
nb_external_link = models.IntegerField(
verbose_name=_("Number of external links"), default=0)
nb_internal_link = models.IntegerField(
@@ -88,6 +104,7 @@ class CrawlResult(models.Model):
class Meta:
verbose_name = _("Crawl result")
verbose_name_plural = _("Crawl results")
+ unique_together = ("crawl", "target")
def __str__(self):
return "{} - {}".format(self.crawl, self.target)
@@ -116,6 +133,7 @@ class CrawlRelation(models.Model):
class Meta:
verbose_name = _("Crawl relation")
verbose_name_plural = _("Crawl relations")
+ unique_together = ("crawl", "source", "destination")
def __str__(self):
return "{} - {}".format(self.crawl, self.source, self.destination)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index d218648..b0c4fe4 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,10 +1,36 @@
+import tldextract
+
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.exceptions import NotSupported
from scrapy.linkextractors import LinkExtractor
-import tldextract
from django.conf import settings
+from django.db import transaction, IntegrityError
+from django.utils import timezone
+
+from . import models
+
+"""
+nb_external_link
+nb_internal_link
+nb_images
+nb_facebook
+nb_twitter
+nb_instagram
+nb_youtube
+nb_dailymotion
+nb_vimeo
+nb_video
+nb_audio
+nb_internal_pdf
+nb_external_pdf
+nb_internal_office
+nb_external_office
+redirection
+
+CrawlLink
+"""
MAX_LINKS = 500
@@ -14,41 +40,154 @@ class DefaultSpider:
start_urls = None
allowed_domains = []
excluded_domains = []
+ crawl_id = None
target_id = None
links_reached = set()
+ def start_requests(self):
+ q = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ "status": "F"
+ }
+ if models.CrawlResult.objects.filter(**q).count():
+ return []
+ q.pop("status")
+ if models.CrawlResult.objects.filter(**q).count():
+ # delete a previous interrupted attempt
+ res = models.CrawlResult.objects.get(**q)
+ res.delete()
+
+ for url in self.start_urls:
+ yield scrapy.Request(url, self.parse)
+
def _parse_image(self, response, result):
- for __ in response.css('img'):
- if 'nb_images' not in result:
- result["nb_images"] = 0
- result["nb_images"] += 1
+ if "images" not in result:
+ result["images"] = []
+ for img in response.css('img'):
+ attributes = img.attrib
+ if "src" not in attributes:
+ continue
+ src = attributes["src"]
+ is_a_real_src = src.startswith("http") or src.startswith("/")
+ if not src or not is_a_real_src or src in result["images"]:
+ continue
+ result["images"].append(src)
def parse(self, response):
result = {
"url": response.url,
- "target_id": self.target_id
}
for domain in self.excluded_domains:
if domain in response.url:
- result["offline"] = True
- yield result
+ result["is_online"] = False
+ if result.get("is_online", None) is False:
+ yield result
+ else:
+ result["is_online"] = True
+ try:
+ self._parse_image(response, result)
+ for link in LinkExtractor().extract_links(response):
+ url = link.url
+ if url is None or url in self.links_reached:
+ continue
+ for domain in self.allowed_domains:
+ if domain in url:
+ self.links_reached.add(link.url)
+ if len(self.links_reached) < MAX_LINKS:
+ yield response.follow(link.url, self.parse)
+ else:
+ print("MAX", self.allowed_domains,
+ self.links_reached)
+ except NotSupported:
+ print("No response", response.url)
+ yield result
+
+ def closed(self, reason):
+ result = {
+ "crawl_id": self.crawl_id,
+ "target_id": self.target_id,
+ }
+ DbPipeline().close(self)
+
+
+class DbPipeline:
+ BASE_KEYS = ["url", "crawl_id", "target_id"]
+ NB_KEYS = ["external_link", "internal_link", "images",
+ "facebook", "twitter", "instagram", "youtube",
+ "dailymotion", "vimeo", "video", "audio",
+ "internal_pdf", "external_pdf", "internal_office",
+ "external_office", ]
+
+ def _get_result_pk(self, spider):
+ """
+ Atomic creation
+ :param spider: current spider
+ :return: result_pk, created
+ """
+ pks = {
+ "crawl_id": spider.crawl_id,
+ "target_id": spider.target_id,
+ }
+ created = False
try:
- self._parse_image(response, result)
- for link in LinkExtractor().extract_links(response):
- url = link.url
- if url is None or url in self.links_reached:
- continue
- for domain in self.allowed_domains:
- if domain in url:
- self.links_reached.add(link.url)
- if len(self.links_reached) < MAX_LINKS:
- yield response.follow(link.url, self.parse)
- else:
- print("MAX", self.allowed_domains,
- self.links_reached)
- except NotSupported:
- print("No response", response.url)
- yield result
+ result = models.CrawlResult.objects.get(**pks)
+ except models.CrawlResult.DoesNotExist:
+ try:
+ with transaction.atomic():
+ result = models.CrawlResult.objects.create(**pks)
+ created = True
+ except IntegrityError:
+ result = models.CrawlResult.objects.get(**pks)
+ return result.pk, created
+
+ def _update(self, result_pk, item, result_created):
+ """
+ Atomic update
+ """
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=result_pk)
+ crawl_result = result.crawl_result
+ if crawl_result:
+ crawl_result = crawl_result[0]
+ else:
+ crawl_result = {}
+ if "urls" not in crawl_result:
+ crawl_result["urls"] = []
+ url = item.pop("url")
+ if url in crawl_result["urls"]:
+ return
+ crawl_result["urls"].append(url)
+ for k, value in item.items():
+ if k == "is_online":
+ if result_created: # only update on the first link
+ result.is_online = value
+ elif k in self.NB_KEYS:
+ if k not in crawl_result:
+ crawl_result[k] = []
+ for subvalue in value:
+ if subvalue in crawl_result[k]:
+ continue
+ crawl_result[k].append(subvalue)
+ setattr(result, "nb_" + k, len(crawl_result[k]))
+ result.crawl_result = [crawl_result]
+ result.save()
+ return True
+
+ def process_item(self, item, spider):
+ result_pk, created = self._get_result_pk(spider)
+ self._update(result_pk, item, created)
+ return item
+
+ def close(self, spider):
+ result_pk, created = self._get_result_pk(spider)
+ with transaction.atomic():
+ result = models.CrawlResult.objects.select_for_update().get(
+ pk=result_pk)
+ result.status = "F"
+ result.duration = timezone.now() - result.started
+ result.save()
def get_domain(url):
@@ -56,30 +195,27 @@ def get_domain(url):
return '{}.{}'.format(ext.domain, ext.suffix)
-def create_spider(name, urls, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None):
if not excluded_domains:
excluded_domains = []
return type(
name, (DefaultSpider, scrapy.Spider),
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
- "target_id": target.pk, "links_reached": set(),
+ "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
"excluded_domains": excluded_domains}
)
def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
- scrap_settings.update({
- 'FEED_FORMAT': 'json',
- 'FEED_URI': 'result.json'
- })
process = CrawlerProcess(settings=scrap_settings)
for target in crawl_item.targets.all():
process.crawl(
create_spider(
- "Target{}".format(target.pk),
- [target.url], target,
+ "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+ [target.url],
+ crawl_item, target,
excluded_domains
)
)
diff --git a/commonnet/local_settings.py.sample b/commonnet/local_settings.py.sample
index 0cba3f5..8c59516 100644
--- a/commonnet/local_settings.py.sample
+++ b/commonnet/local_settings.py.sample
@@ -4,3 +4,15 @@ RESPONSIBLE_EMAIL = None
DEV = True
EXTRA_APPS = []
+
+# you probably have at least to set a password for db connection
+DATABASES = {
+ 'default': {
+ 'ENGINE': 'django.contrib.gis.db.backends.postgis',
+ 'NAME': 'commonnet',
+ 'USER': 'commonnet',
+ 'PASSWORD': '',
+ 'HOST': '127.0.0.1',
+ 'PORT': '5432',
+ }
+}
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
index 2d2b7b7..0d0f15b 100644
--- a/commonnet/scrapy_setting.py
+++ b/commonnet/scrapy_setting.py
@@ -85,5 +85,8 @@ SCRAPPY_SETTINGS = {
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
"COOKIES_ENABLED": False,
+ "ITEM_PIPELINES": {
+ 'commcrawler.scrapy.DbPipeline': 300,
+ }
}
diff --git a/commonnet/settings.py b/commonnet/settings.py
index c72756c..40e3335 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -26,10 +26,13 @@ INSTALLED_APPS = [
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
+ 'django.contrib.sites',
'django.contrib.messages',
'django.contrib.staticfiles',
]
+SITE_ID = 1
+
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
@@ -64,14 +67,20 @@ WSGI_APPLICATION = 'commonnet.wsgi.application'
# Database
# https://docs.djangoproject.com/en/1.11/ref/settings/#databases
+
DATABASES = {
'default': {
- 'ENGINE': 'django.db.backends.sqlite3',
- 'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+ 'ENGINE': 'django.db.backends.postgresql',
+ 'NAME': 'commonnet',
+ 'USER': 'commonnet',
+ 'PASSWORD': '',
+ 'HOST': '127.0.0.1',
+ 'PORT': '5432',
}
}
+
# Password validation
# https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators
diff --git a/commorganization/admin.py b/commorganization/admin.py
index b26ad7e..278c470 100644
--- a/commorganization/admin.py
+++ b/commorganization/admin.py
@@ -71,6 +71,7 @@ class TargetAdmin(admin.ModelAdmin):
list_filter = ('organization__organization_type',)
form = make_ajax_form(model, {'organization': 'organization'})
actions = ['add_to_crawl']
+ search_fields = ["name", "organization__name"]
def add_to_crawl(self, request, queryset):
if 'apply' in request.POST:
diff --git a/requirements.txt b/requirements.txt
index bf39a7d..a685f50 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ scrapy==1.7
tldextract
django==1.11
django-ajax-selects==1.6.0
+psycopg2
+pygments
# https://splash.readthedocs.io/
\ No newline at end of file