diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-20 11:52:02 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-20 11:52:02 +0200 |
commit | a465ac1fa4271e26bdc659aff32bb9ebeaae7922 (patch) | |
tree | ef091e8bae6c6c69677e0b869e907c552824c40d | |
parent | 01d72fa7c94359049e2a7beb068167cb7f047805 (diff) | |
download | Comm-on-net-a465ac1fa4271e26bdc659aff32bb9ebeaae7922.tar.bz2 Comm-on-net-a465ac1fa4271e26bdc659aff32bb9ebeaae7922.zip |
Slice crawls - add a crontab example
-rw-r--r-- | commcrawler/models.py | 2 | ||||
-rw-r--r-- | commcrawler/scrapy.py | 5 | ||||
-rw-r--r-- | commonnet/settings.py | 2 | ||||
-rw-r--r-- | conf/crontab | 1 |
4 files changed, 8 insertions, 2 deletions
```diff
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 22e5602..ef99c4f 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -73,7 +73,7 @@ class Crawl(models.Model):
         todo = self.target_nb
         if todo == 0:
             return "-"
-        if self.status == "P":
+        if self.status in ("P", "A"):
             done = self.results.filter(status__in=("T", "F")).count()
             percent = int(done / todo * 100)
             return "{} % ({}/{})".format(percent, done, todo)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 490142c..6af1c0b 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -435,6 +435,11 @@ def launch_crawl(crawl_item, excluded_domains=None):
         process.start(stop_after_crawl=ONLY_FIRST_PAGE)
         page += 1
     crawl_item.crawl_ended = timezone.now()
+    if ONLY_FIRST_PAGE and page <= page_number:
+        crawl_item.status = "A"
+        crawl_item.save()
+        return
+
     crawl_item.status = "M"
     crawl_item.save()
     launch_match(crawl_item)
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 03e4710..6143b0f 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -105,7 +105,7 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
 STATIC_URL = '/static/'
 STATIC_ROOT = os.path.join(BASE_DIR, "collected_static")
 
-NUMBER_PER_SESSION = 5000
+NUMBER_PER_SESSION = 50
 
 try:
     from .local_settings import *
diff --git a/conf/crontab b/conf/crontab
new file mode 100644
index 0000000..5e78530
--- /dev/null
+++ b/conf/crontab
@@ -0,0 +1 @@
+*/15 * * * * cd /srv/comm-on-net/ && /usr/bin/make crawl
\ No newline at end of file
```