summary | refs | log | tree | commit | diff
path: root/commcrawler
diff options
context:
space:
mode:
author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:05:59 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-12 17:05:59 +0200
commit: f1b5889f9cf6d654175d548a67a5e415b490b182 (patch)
tree: a4b18a8265f6b8dd8ceecb37f44c682865ebb069 /commcrawler
parent: fdb3fdae3e75951467b36ee693313ffbf659c200 (diff)
download: Comm-on-net-f1b5889f9cf6d654175d548a67a5e415b490b182.tar.bz2
download: Comm-on-net-f1b5889f9cf6d654175d548a67a5e415b490b182.zip
Only first page
Diffstat (limited to 'commcrawler')
-rw-r--r-- commcrawler/scrapy.py 7
1 files changed, 5 insertions, 2 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index bc17225..f4be032 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -315,6 +315,10 @@ def update_db_result(result_dct, values):
result.save()
+NUMBER_PER_PAGE = 250
+ONLY_FIRST_PAGE = True
+
+
def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
crawl_item.started = timezone.now()
@@ -328,10 +332,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
# slice
total = q.count()
targets = q.values("id")
- NUMBER_PER_PAGE = 250
page = 0
page_number = total // NUMBER_PER_PAGE
- while page <= page_number:
+ while page <= page_number and not (ONLY_FIRST_PAGE and page):
process = CrawlerProcess(settings=scrap_settings)
idx = 0
current_idx = page * NUMBER_PER_PAGE