diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 17:05:59 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-12 17:05:59 +0200 |
commit | f1b5889f9cf6d654175d548a67a5e415b490b182 (patch) | |
tree | a4b18a8265f6b8dd8ceecb37f44c682865ebb069 /commcrawler | |
parent | fdb3fdae3e75951467b36ee693313ffbf659c200 (diff) | |
download | Comm-on-net-f1b5889f9cf6d654175d548a67a5e415b490b182.tar.bz2 Comm-on-net-f1b5889f9cf6d654175d548a67a5e415b490b182.zip |
Only first page
Diffstat (limited to 'commcrawler')
-rw-r--r-- | commcrawler/scrapy.py | 7 |
1 files changed, 5 insertions, 2 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index bc17225..f4be032 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -315,6 +315,10 @@ def update_db_result(result_dct, values):
     result.save()
 
 
+NUMBER_PER_PAGE = 250
+ONLY_FIRST_PAGE = True
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     crawl_item.started = timezone.now()
@@ -328,10 +332,9 @@ def launch_crawl(crawl_item, excluded_domains=None):
     # slice
     total = q.count()
     targets = q.values("id")
-    NUMBER_PER_PAGE = 250
     page = 0
     page_number = total // NUMBER_PER_PAGE
-    while page <= page_number:
+    while page <= page_number and not (ONLY_FIRST_PAGE and page):
         process = CrawlerProcess(settings=scrap_settings)
         idx = 0
         current_idx = page * NUMBER_PER_PAGE
```