-rw-r--r--  .gitignore                                       |   1
-rw-r--r--  Makefile                                         |  13
-rw-r--r--  commcrawler/admin.py                             |  29
-rw-r--r--  commcrawler/management/commands/launch_crawl.py  |   3
-rw-r--r--  commcrawler/migrations/0001_initial.py           |  32
-rw-r--r--  commcrawler/models.py                            |  22
-rw-r--r--  commcrawler/scrapy.py                            | 200
-rw-r--r--  commonnet/local_settings.py.sample               |  12
-rw-r--r--  commonnet/scrapy_setting.py                      |   3
-rw-r--r--  commonnet/settings.py                            |  13
-rw-r--r--  commorganization/admin.py                        |   1
-rw-r--r--  requirements.txt                                 |   2
12 files changed, 279 insertions(+), 52 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@
 *.sqlite3
 local_settings.py
 result.json
+data_src
diff --git a/Makefile b/Makefile
@@ -17,6 +17,19 @@ compilemessages: ## compile messages for translation
 		$(PYTHON) ../manage.py compilemessages ; \
 	done
 
+migrations: ## make DB migrations
+	$(PYTHON) manage.py makemigrations
+
+
+migrate: ## apply DB migrations
+	$(PYTHON) manage.py migrate
+
+regenerate_all: migrate ## regenerate all the database
+	$(PYTHON) manage.py createsuperuser
+	$(PYTHON) manage.py import_csv_communes data_src/communes.csv
+	$(PYTHON) manage.py import_csv_autres data_src/autres.csv
+
+
 run: ## run test server
 	$(PYTHON) manage.py runserver 0.0.0.0:8000
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 457d2b0..857962c 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -1,5 +1,12 @@
+import json
+
+from pygments import highlight
+from pygments.lexers.data import JsonLexer
+from pygments.formatters.html import HtmlFormatter
+
 from ajax_select import make_ajax_form
 from django.contrib import admin
+from django.utils.safestring import mark_safe
 
 from commonnet.admin_site import admin_site
 from commcrawler import models
@@ -11,7 +18,7 @@ class CrawlAdmin(admin.ModelAdmin):
                     "ended")
     list_filter = ("status",)
     readonly_fields = ("status", "created", "started", "ended")
-    exclude = ("targets", )
+    form = make_ajax_form(model, {'targets': 'target'})
 
 
 admin_site.register(models.Crawl, CrawlAdmin)
@@ -19,10 +26,22 @@ admin_site.register(models.Crawl, CrawlAdmin)
 
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
-    list_display = ("target", "crawl", "is_online")
-    list_filter = ("crawl",)
+    list_display = ("target", "crawl", "started", "duration", "status",
+                    "is_online")
+    list_filter = ("status", "crawl")
+    search_fields = ("target__name",)
+    readonly_fields = ("started", "duration", "status",
+                       "crawl_result_prettified")
+    exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
 
+    def crawl_result_prettified(self, instance):
+        response = json.dumps(instance.crawl_result, sort_keys=True, indent=2)
+        formatter = HtmlFormatter(style='colorful')
+        response = highlight(response, JsonLexer(), formatter)
+        style = "<style>" + formatter.get_style_defs() + "</style><br>"
+        return mark_safe(style + response)
+
 
 admin_site.register(models.CrawlResult, CrawlResultAdmin)
@@ -47,8 +66,8 @@ admin_site.register(models.CrawlRelation, CrawlRelationAdmin)
 
 
 class ExcludedDomainAdmin(admin.ModelAdmin):
-    list_display = ('url',)
-    search_fields = ('url',)
+    list_display = ('domain',)
+    search_fields = ('domain',)
 
 
 admin_site.register(models.ExludedDomains, ExcludedDomainAdmin)
diff --git a/commcrawler/management/commands/launch_crawl.py b/commcrawler/management/commands/launch_crawl.py
index 1248eeb..bf76caf 100644
--- a/commcrawler/management/commands/launch_crawl.py
+++ b/commcrawler/management/commands/launch_crawl.py
@@ -57,7 +57,6 @@ class Command(BaseCommand):
         except ValueError:
             c_id = None
         current_crawl = crawls[c_id]
-        excluded = [domain.split("://")[1] for domain in
-                    ExludedDomains.objects.all()]
+        excluded = [domain.domain for domain in ExludedDomains.objects.all()]
         launch_crawl(current_crawl, excluded_domains=excluded)
diff --git a/commcrawler/migrations/0001_initial.py b/commcrawler/migrations/0001_initial.py
index c2e261b..26cdd2d 100644
--- a/commcrawler/migrations/0001_initial.py
+++ b/commcrawler/migrations/0001_initial.py
@@ -1,8 +1,10 @@
 # -*- coding: utf-8 -*-
-# Generated by Django 1.11 on 2019-08-05 10:01
+# Generated by Django 1.11 on 2019-08-07 01:17
 from __future__ import unicode_literals
 
 import datetime
+import django.contrib.postgres.fields.jsonb
+import django.contrib.sites.models
 from django.db import migrations, models
 import django.db.models.deletion
@@ -20,7 +22,7 @@ class Migration(migrations.Migration):
             name='Crawl',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('name', models.CharField(max_length=200, verbose_name='Name')),
+                ('name', models.CharField(max_length=200, unique=True, verbose_name='Name')),
                 ('created', models.DateTimeField(default=datetime.datetime.now, verbose_name='Creation date')),
                 ('started', models.DateTimeField(blank=True, null=True, verbose_name='Start date')),
                 ('ended', models.DateTimeField(blank=True, null=True, verbose_name='End date')),
@@ -28,9 +30,9 @@ class Migration(migrations.Migration):
                 ('targets', models.ManyToManyField(blank=True, to='commorganization.Target')),
             ],
             options={
-                'ordering': ('created', 'name'),
-                'verbose_name': 'Crawl',
                 'verbose_name_plural': 'Crawls',
+                'verbose_name': 'Crawl',
+                'ordering': ('created', 'name'),
             },
         ),
         migrations.CreateModel(
@@ -40,8 +42,8 @@ class Migration(migrations.Migration):
                 ('link', models.URLField(verbose_name='Link')),
             ],
             options={
-                'verbose_name': 'Crawl link',
                 'verbose_name_plural': 'Crawl links',
+                'verbose_name': 'Crawl link',
             },
         ),
         migrations.CreateModel(
@@ -54,14 +56,18 @@ class Migration(migrations.Migration):
                 ('source', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='relation_source', to='commorganization.Target', verbose_name='Source')),
             ],
             options={
-                'verbose_name': 'Crawl relation',
                 'verbose_name_plural': 'Crawl relations',
+                'verbose_name': 'Crawl relation',
             },
         ),
         migrations.CreateModel(
             name='CrawlResult',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('started', models.DateTimeField(default=datetime.datetime.now, verbose_name='Start date')),
+                ('duration', models.DurationField(blank=True, null=True, verbose_name='Duration')),
+                ('status', models.CharField(choices=[('P', 'In progress'), ('F', 'Finished')], default='P', max_length=1, verbose_name='Status')),
+                ('crawl_result', django.contrib.postgres.fields.jsonb.JSONField(default=list, verbose_name='Crawl result')),
                 ('nb_external_link', models.IntegerField(default=0, verbose_name='Number of external links')),
                 ('nb_internal_link', models.IntegerField(default=0, verbose_name='Number of internal links')),
                 ('nb_images', models.IntegerField(default=0, verbose_name='Number of images')),
@@ -83,19 +89,19 @@
                 ('target', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commorganization.Target', verbose_name='Target')),
             ],
             options={
-                'verbose_name': 'Crawl result',
                 'verbose_name_plural': 'Crawl results',
+                'verbose_name': 'Crawl result',
             },
         ),
         migrations.CreateModel(
             name='ExludedDomains',
             fields=[
                 ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
-                ('url', models.URLField(unique=True, verbose_name='URL')),
+                ('domain', models.CharField(max_length=100, unique=True, validators=[django.contrib.sites.models._simple_domain_name_validator], verbose_name='Domain name')),
             ],
             options={
-                'verbose_name': 'Excluded domain',
                 'verbose_name_plural': 'Excluded domains',
+                'verbose_name': 'Excluded domain',
             },
         ),
         migrations.AddField(
@@ -103,4 +109,12 @@ class Migration(migrations.Migration):
             name='result',
             field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='commcrawler.CrawlResult', verbose_name='Result'),
         ),
+        migrations.AlterUniqueTogether(
+            name='crawlresult',
+            unique_together=set([('crawl', 'target')]),
+        ),
+        migrations.AlterUniqueTogether(
+            name='crawlrelation',
+            unique_together=set([('crawl', 'source', 'destination')]),
+        ),
     ]
diff --git a/commcrawler/models.py b/commcrawler/models.py
index f3e3246..9a98b89 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -1,5 +1,7 @@
 import datetime
 
+from django.contrib.postgres.fields import JSONField
+from django.contrib.sites.models import _simple_domain_name_validator
 from django.db import models
 from django.utils.translation import ugettext_lazy as _
@@ -7,7 +9,9 @@ from commorganization.models import Target
 
 
 class ExludedDomains(models.Model):
-    url = models.URLField(verbose_name=_("URL"), unique=True)
+    domain = models.CharField(
+        _("Domain name"), max_length=100,
+        validators=[_simple_domain_name_validator], unique=True)
 
     class Meta:
         verbose_name = _("Excluded domain")
@@ -22,7 +26,7 @@ class Crawl(models.Model):
         ('C', _("Created")), ('P', _("In progress")), ('F', _("Finished"))
     )
-    name = models.CharField(verbose_name=_("Name"), max_length=200)
+    name = models.CharField(verbose_name=_("Name"), max_length=200, unique=True)
     created = models.DateTimeField(
         verbose_name=_("Creation date"), default=datetime.datetime.now)
     started = models.DateTimeField(
@@ -48,8 +52,20 @@ class Crawl(models.Model):
 
 
 class CrawlResult(models.Model):
+    STATUS = (
+        ('P', _("In progress")),
+        ('F', _("Finished"))
+    )
     crawl = models.ForeignKey(Crawl, verbose_name=_("Crawl"))
     target = models.ForeignKey(Target, verbose_name=_("Target"))
+    started = models.DateTimeField(
+        verbose_name=_("Start date"), default=datetime.datetime.now)
+    duration = models.DurationField(
+        verbose_name=_("Duration"), blank=True, null=True)
+    status = models.CharField(
+        verbose_name=_("Status"),
+        max_length=1, choices=STATUS, default='P')
+    crawl_result = JSONField(verbose_name=_("Crawl result"), default=list)
     nb_external_link = models.IntegerField(
         verbose_name=_("Number of external links"), default=0)
     nb_internal_link = models.IntegerField(
@@ -88,6 +104,7 @@ class CrawlResult(models.Model):
     class Meta:
         verbose_name = _("Crawl result")
         verbose_name_plural = _("Crawl results")
+        unique_together = ("crawl", "target")
 
     def __str__(self):
         return "{} - {}".format(self.crawl, self.target)
@@ -116,6 +133,7 @@ class CrawlRelation(models.Model):
     class Meta:
         verbose_name = _("Crawl relation")
         verbose_name_plural = _("Crawl relations")
+        unique_together = ("crawl", "source", "destination")
 
     def __str__(self):
         return "{} - {}".format(self.crawl, self.source, self.destination)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index d218648..b0c4fe4 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,10 +1,36 @@
+import tldextract
+
 import scrapy
 from scrapy.crawler import CrawlerProcess
 from scrapy.exceptions import NotSupported
 from scrapy.linkextractors import LinkExtractor
-import tldextract
 
 from django.conf import settings
+from django.db import transaction, IntegrityError
+from django.utils import timezone
+
+from . import models
+
+"""
+nb_external_link
+nb_internal_link
+nb_images
+nb_facebook
+nb_twitter
+nb_instagram
+nb_youtube
+nb_dailymotion
+nb_vimeo
+nb_video
+nb_audio
+nb_internal_pdf
+nb_external_pdf
+nb_internal_office
+nb_external_office
+redirection
+
+CrawlLink
+"""
 
 MAX_LINKS = 500
@@ -14,41 +40,154 @@ class DefaultSpider:
     start_urls = None
     allowed_domains = []
     excluded_domains = []
+    crawl_id = None
     target_id = None
     links_reached = set()
 
+    def start_requests(self):
+        q = {
+            "crawl_id": self.crawl_id,
+            "target_id": self.target_id,
+            "status": "F"
+        }
+        if models.CrawlResult.objects.filter(**q).count():
+            return []
+        q.pop("status")
+        if models.CrawlResult.objects.filter(**q).count():
+            # delete a previous interrupted attempt
+            res = models.CrawlResult.objects.get(**q)
+            res.delete()
+
+        for url in self.start_urls:
+            yield scrapy.Request(url, self.parse)
+
     def _parse_image(self, response, result):
-        for __ in response.css('img'):
-            if 'nb_images' not in result:
-                result["nb_images"] = 0
-            result["nb_images"] += 1
+        if "images" not in result:
+            result["images"] = []
+        for img in response.css('img'):
+            attributes = img.attrib
+            if "src" not in attributes:
+                continue
+            src = attributes["src"]
+            is_a_real_src = src.startswith("http") or src.startswith("/")
+            if not src or not is_a_real_src or src in result["images"]:
+                continue
+            result["images"].append(src)
 
     def parse(self, response):
         result = {
             "url": response.url,
-            "target_id": self.target_id
         }
         for domain in self.excluded_domains:
             if domain in response.url:
-                result["offline"] = True
-                yield result
+                result["is_online"] = False
+        if result.get("is_online", None) is False:
+            yield result
+        else:
+            result["is_online"] = True
+            try:
+                self._parse_image(response, result)
+                for link in LinkExtractor().extract_links(response):
+                    url = link.url
+                    if url is None or url in self.links_reached:
+                        continue
+                    for domain in self.allowed_domains:
+                        if domain in url:
+                            self.links_reached.add(link.url)
+                            if len(self.links_reached) < MAX_LINKS:
+                                yield response.follow(link.url, self.parse)
+                            else:
+                                print("MAX", self.allowed_domains,
+                                      self.links_reached)
+            except NotSupported:
+                print("No response", response.url)
+            yield result
+
+    def closed(self, reason):
+        result = {
+            "crawl_id": self.crawl_id,
+            "target_id": self.target_id,
+        }
+        DbPipeline().close(self)
+
+
+class DbPipeline:
+    BASE_KEYS = ["url", "crawl_id", "target_id"]
+    NB_KEYS = ["external_link", "internal_link", "images",
+               "facebook", "twitter", "instagram", "youtube",
+               "dailymotion", "vimeo", "video", "audio",
+               "internal_pdf", "external_pdf", "internal_office",
+               "external_office", ]
+
+    def _get_result_pk(self, spider):
+        """
+        Atomic creation
+        :param spider: current spider
+        :return: result_pk, created
+        """
+        pks = {
+            "crawl_id": spider.crawl_id,
+            "target_id": spider.target_id,
+        }
+        created = False
         try:
-            self._parse_image(response, result)
-            for link in LinkExtractor().extract_links(response):
-                url = link.url
-                if url is None or url in self.links_reached:
-                    continue
-                for domain in self.allowed_domains:
-                    if domain in url:
-                        self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
-                            yield response.follow(link.url, self.parse)
-                        else:
-                            print("MAX", self.allowed_domains,
-                                  self.links_reached)
-        except NotSupported:
-            print("No response", response.url)
-        yield result
+            result = models.CrawlResult.objects.get(**pks)
+        except models.CrawlResult.DoesNotExist:
+            try:
+                with transaction.atomic():
+                    result = models.CrawlResult.objects.create(**pks)
+                    created = True
+            except IntegrityError:
+                result = models.CrawlResult.objects.get(**pks)
+        return result.pk, created
+
+    def _update(self, result_pk, item, result_created):
+        """
+        Atomic update
+        """
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=result_pk)
+            crawl_result = result.crawl_result
+            if crawl_result:
+                crawl_result = crawl_result[0]
+            else:
+                crawl_result = {}
+            if "urls" not in crawl_result:
+                crawl_result["urls"] = []
+            url = item.pop("url")
+            if url in crawl_result["urls"]:
+                return
+            crawl_result["urls"].append(url)
+            for k, value in item.items():
+                if k == "is_online":
+                    if result_created:  # only update on the first link
+                        result.is_online = value
+                elif k in self.NB_KEYS:
+                    if k not in crawl_result:
+                        crawl_result[k] = []
+                    for subvalue in value:
+                        if subvalue in crawl_result[k]:
+                            continue
+                        crawl_result[k].append(subvalue)
+                    setattr(result, "nb_" + k, len(crawl_result[k]))
+            result.crawl_result = [crawl_result]
+            result.save()
+        return True
+
+    def process_item(self, item, spider):
+        result_pk, created = self._get_result_pk(spider)
+        self._update(result_pk, item, created)
+        return item
+
+    def close(self, spider):
+        result_pk, created = self._get_result_pk(spider)
+        with transaction.atomic():
+            result = models.CrawlResult.objects.select_for_update().get(
+                pk=result_pk)
+            result.status = "F"
+            result.duration = timezone.now() - result.started
+            result.save()
 
 
 def get_domain(url):
@@ -56,30 +195,27 @@
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None):
     if not excluded_domains:
         excluded_domains = []
     return type(
         name, (DefaultSpider, scrapy.Spider),
         {"name": name, "start_urls": urls,
         "allowed_domains": [get_domain(url) for url in urls],
-         "target_id": target.pk, "links_reached": set(),
+         "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
         "excluded_domains": excluded_domains}
     )
 
 
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
-    scrap_settings.update({
-        'FEED_FORMAT': 'json',
-        'FEED_URI': 'result.json'
-    })
     process = CrawlerProcess(settings=scrap_settings)
     for target in crawl_item.targets.all():
         process.crawl(
             create_spider(
-                "Target{}".format(target.pk),
-                [target.url], target,
+                "Crawl{}Target{}".format(crawl_item.pk, target.pk),
+                [target.url],
+                crawl_item, target,
                 excluded_domains
             )
         )
diff --git a/commonnet/local_settings.py.sample b/commonnet/local_settings.py.sample
index 0cba3f5..8c59516 100644
--- a/commonnet/local_settings.py.sample
+++ b/commonnet/local_settings.py.sample
@@ -4,3 +4,15 @@
 RESPONSIBLE_EMAIL = None
 DEV = True
 EXTRA_APPS = []
+
+# you probably have at least to set a password for db connection
+DATABASES = {
+    'default': {
+        'ENGINE': 'django.contrib.gis.db.backends.postgis',
+        'NAME': 'commonnet',
+        'USER': 'commonnet',
+        'PASSWORD': '',
+        'HOST': '127.0.0.1',
+        'PORT': '5432',
+    }
+}
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
index 2d2b7b7..0d0f15b 100644
--- a/commonnet/scrapy_setting.py
+++ b/commonnet/scrapy_setting.py
@@ -85,5 +85,8 @@ SCRAPPY_SETTINGS = {
     #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
 
     "COOKIES_ENABLED": False,
+    "ITEM_PIPELINES": {
+        'commcrawler.scrapy.DbPipeline': 300,
+    }
 }
 
diff --git a/commonnet/settings.py b/commonnet/settings.py
index c72756c..40e3335 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -26,10 +26,13 @@ INSTALLED_APPS = [
     'django.contrib.auth',
     'django.contrib.contenttypes',
     'django.contrib.sessions',
+    'django.contrib.sites',
    'django.contrib.messages',
     'django.contrib.staticfiles',
 ]
 
+SITE_ID = 1
+
 MIDDLEWARE = [
     'django.middleware.security.SecurityMiddleware',
     'django.contrib.sessions.middleware.SessionMiddleware',
@@ -64,14 +67,20 @@ WSGI_APPLICATION = 'commonnet.wsgi.application'
 
 # Database
 # https://docs.djangoproject.com/en/1.11/ref/settings/#databases
 
+
 DATABASES = {
     'default': {
-        'ENGINE': 'django.db.backends.sqlite3',
-        'NAME': os.path.join(BASE_DIR, 'db.sqlite3'),
+        'ENGINE': 'django.db.backends.postgresql',
+        'NAME': 'commonnet',
+        'USER': 'commonnet',
+        'PASSWORD': '',
+        'HOST': '127.0.0.1',
+        'PORT': '5432',
     }
 }
 
+
 # Password validation
 # https://docs.djangoproject.com/en/1.11/ref/settings/#auth-password-validators
diff --git a/commorganization/admin.py b/commorganization/admin.py
index b26ad7e..278c470 100644
--- a/commorganization/admin.py
+++ b/commorganization/admin.py
@@ -71,6 +71,7 @@ class TargetAdmin(admin.ModelAdmin):
     list_filter = ('organization__organization_type',)
     form = make_ajax_form(model, {'organization': 'organization'})
     actions = ['add_to_crawl']
+    search_fields = ["name", "organization__name"]
 
     def add_to_crawl(self, request, queryset):
         if 'apply' in request.POST:
diff --git a/requirements.txt b/requirements.txt
index bf39a7d..a685f50 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,4 +2,6 @@ scrapy==1.7
 tldextract
 django==1.11
 django-ajax-selects==1.6.0
+psycopg2
+pygments
 # https://splash.readthedocs.io/
\ No newline at end of file
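
Editor's note: a minimal usage sketch of the flow this commit sets up, not part of the commit itself. The crawl name and the final query are illustrative assumptions; everything else mirrors the launch_crawl command and the DbPipeline added above.

    # Hypothetical usage sketch, assuming a Crawl named "example" already exists
    # with its targets attached.
    from commcrawler import models
    from commcrawler.scrapy import launch_crawl

    crawl = models.Crawl.objects.get(name="example")  # assumed crawl name
    excluded = [domain.domain for domain in models.ExludedDomains.objects.all()]
    launch_crawl(crawl, excluded_domains=excluded)

    # DbPipeline persists one CrawlResult per (crawl, target); crawl_result is a
    # one-element list wrapping a dict that holds the visited "urls" and the
    # collected "images" sources.
    for result in models.CrawlResult.objects.filter(crawl=crawl, status="F"):
        data = result.crawl_result[0] if result.crawl_result else {}
        print(result.target, result.duration,
              len(data.get("urls", [])), result.nb_images)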