summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-20 11:52:02 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-20 11:52:02 +0200
commit a465ac1fa4271e26bdc659aff32bb9ebeaae7922 (patch)
tree ef091e8bae6c6c69677e0b869e907c552824c40d
parent 01d72fa7c94359049e2a7beb068167cb7f047805 (diff)
download Comm-on-net-a465ac1fa4271e26bdc659aff32bb9ebeaae7922.tar.bz2
Comm-on-net-a465ac1fa4271e26bdc659aff32bb9ebeaae7922.zip
Slice crawls - add a crontab example
-rw-r--r-- commcrawler/models.py 2
-rw-r--r-- commcrawler/scrapy.py 5
-rw-r--r-- commonnet/settings.py 2
-rw-r--r-- conf/crontab 1
4 files changed, 8 insertions, 2 deletions
diff --git a/commcrawler/models.py b/commcrawler/models.py
index 22e5602..ef99c4f 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -73,7 +73,7 @@ class Crawl(models.Model):
todo = self.target_nb
if todo == 0:
return "-"
- if self.status == "P":
+ if self.status in ("P", "A"):
done = self.results.filter(status__in=("T", "F")).count()
percent = int(done / todo * 100)
return "{} % ({}/{})".format(percent, done, todo)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 490142c..6af1c0b 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -435,6 +435,11 @@ def launch_crawl(crawl_item, excluded_domains=None):
process.start(stop_after_crawl=ONLY_FIRST_PAGE)
page += 1
crawl_item.crawl_ended = timezone.now()
+ if ONLY_FIRST_PAGE and page <= page_number:
+ crawl_item.status = "A"
+ crawl_item.save()
+ return
+
crawl_item.status = "M"
crawl_item.save()
launch_match(crawl_item)
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 03e4710..6143b0f 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -105,7 +105,7 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
STATIC_URL = '/static/'
STATIC_ROOT = os.path.join(BASE_DIR, "collected_static")
-NUMBER_PER_SESSION = 5000
+NUMBER_PER_SESSION = 50
try:
from .local_settings import *
diff --git a/conf/crontab b/conf/crontab
new file mode 100644
index 0000000..5e78530
--- /dev/null
+++ b/conf/crontab
@@ -0,0 +1 @@
+*/15 * * * * cd /srv/comm-on-net/ && /usr/bin/make crawl
\ No newline at end of file