diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:09:58 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 16:09:58 +0200 |
commit | e8068395d642fa36d7f6c53fe8088beabe7c2a31 (patch) | |
tree | 337c1e1f89653741a3b9266503a3aae8dc9f127b | |
parent | 0f26c668bcc86d1a4cfc91f1b8154055409e8aab (diff) | |
download | Comm-on-net-e8068395d642fa36d7f6c53fe8088beabe7c2a31.tar.bz2 Comm-on-net-e8068395d642fa36d7f6c53fe8088beabe7c2a31.zip |
Display pre-crawl progression
-rw-r--r-- | commcrawler/admin.py | 5 |
-rw-r--r-- | commcrawler/migrations/0003_auto_20190809_1607.py | 25 |
-rw-r--r-- | commcrawler/models.py | 5 |
-rw-r--r-- | commcrawler/scrapy.py | 10 |
4 files changed, 41 insertions, 4 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py index 46c5aab..6b06228 100644 --- a/commcrawler/admin.py +++ b/commcrawler/admin.py @@ -16,9 +16,10 @@ from commcrawler import models class CrawlAdmin(admin.ModelAdmin): model = models.Crawl list_display = ("name", "status", "target_nb", "created", "started", - "crawl_ended", "ended", "progress") + "pre_crawl_ended", "crawl_ended", "ended", "progress") list_filter = ("status",) - exclude = ("progression","created", "started", "crawl_ended", "ended") + exclude = ("progression", "created", "started", "pre_crawl_ended", + "crawl_ended", "ended") readonly_fields = () form = make_ajax_form(model, {'targets': 'target'}) diff --git a/commcrawler/migrations/0003_auto_20190809_1607.py b/commcrawler/migrations/0003_auto_20190809_1607.py new file mode 100644 index 0000000..3018fc2 --- /dev/null +++ b/commcrawler/migrations/0003_auto_20190809_1607.py @@ -0,0 +1,25 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.11 on 2019-08-09 14:07 +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('commcrawler', '0002_crawlresult_bad_ssl'), + ] + + operations = [ + migrations.AddField( + model_name='crawl', + name='pre_crawl_ended', + field=models.DateTimeField(blank=True, null=True, verbose_name='Pre-crawl end'), + ), + migrations.AlterField( + model_name='crawl', + name='status', + field=models.CharField(choices=[('C', 'Created'), ('A', 'Planned'), ('W', 'Pre-crawl in progress'), ('P', 'Crawl in progress'), ('M', 'Match link in progress'), ('F', 'Finished')], default='C', max_length=1, verbose_name='Status'), + ), + ] diff --git a/commcrawler/models.py b/commcrawler/models.py index da578bb..ce3deb0 100644 --- a/commcrawler/models.py +++ b/commcrawler/models.py @@ -28,6 +28,7 @@ class Crawl(models.Model): STATUS = ( ('C', _("Created")), ('A', _("Planned")), + ('W', _("Pre-crawl in progress")), ('P', _("Crawl in progress")), 
('M', _("Match link in progress")), ('F', _("Finished")) @@ -37,6 +38,8 @@ class Crawl(models.Model): verbose_name=_("Creation"), default=datetime.datetime.now) started = models.DateTimeField( verbose_name=_("Started"), blank=True, null=True) + pre_crawl_ended = models.DateTimeField( + verbose_name=_("Pre-crawl end"), blank=True, null=True) crawl_ended = models.DateTimeField( verbose_name=_("Crawl end"), blank=True, null=True) ended = models.DateTimeField( @@ -69,7 +72,7 @@ class Crawl(models.Model): done = self.results.filter(status__in=("T", "F")).count() percent = int(done / todo * 100) return "{} % ({}/{})".format(percent, done, todo) - if self.status == "M": + if self.status in ("W", "M"): done = self.progression or 0 percent = int(done / todo * 100) return "{} % ({}/{})".format(percent, done, todo) diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index a430f0e..39e3a3e 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -324,10 +324,15 @@ def launch_crawl(crawl_item, excluded_domains=None): scrap_settings = settings.SCRAPPY_SETTINGS.copy() process = CrawlerProcess(settings=scrap_settings) crawl_item.started = timezone.now() + crawl_item.pre_crawl_ended = None + crawl_item.crawl_ended = None crawl_item.ended = None - crawl_item.status = "P" + crawl_item.progression = 0 + crawl_item.status = "W" crawl_item.save() for target in crawl_item.targets.all(): + crawl_item.progression += 1 + crawl_item.save() result_dct = { "crawl_id": crawl_item.pk, "target_id": target.pk, @@ -370,6 +375,9 @@ def launch_crawl(crawl_item, excluded_domains=None): redirect ) ) + crawl_item.pre_crawl_ended = timezone.now() + crawl_item.status = "P" + crawl_item.save() process.start() crawl_item.crawl_ended = timezone.now() crawl_item.status = "M" |