From 391cb54e98dc2c1661a4e0ae13739ed710297d02 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Mon, 12 Aug 2019 17:40:48 +0200
Subject: Settings

---
 commcrawler/scrapy.py | 7 +++----
 commonnet/settings.py | 1 +
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5fbeb43..8c78e61 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -33,6 +33,9 @@ CALENDAR_KEYS = ["agenda", "calendar"]
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
+NUMBER_PER_PAGE = settings.NUMBER_PER_SESSION
+ONLY_FIRST_PAGE = True
+
 
 class DefaultSpider:
     name = None
@@ -317,10 +320,6 @@ def update_db_result(result_dct, values):
     result.save()
 
 
-NUMBER_PER_PAGE = 250
-ONLY_FIRST_PAGE = True
-
-
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     crawl_item.started = timezone.now()
diff --git a/commonnet/settings.py b/commonnet/settings.py
index d50aff9..626055a 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -106,6 +106,7 @@ STATIC_URL = '/static/'
 STATIC_ROOT = os.path.join(BASE_DIR, "collected_static")
 
 CRAWL_TIMEOUT = 30  # timeout for each website crawl in minutes
+NUMBER_PER_SESSION = 5000
 
 try:
     from .local_settings import *
-- 
cgit v1.2.3
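
With this patch the crawl batch size is no longer hardcoded in commcrawler/scrapy.py; it is read from the new NUMBER_PER_SESSION setting, which follows the same per-deployment override pattern as CRAWL_TIMEOUT: any value defined in an optional commonnet/local_settings.py shadows the default, because settings.py ends with `from .local_settings import *`. A minimal sketch of such an override file follows; the file name comes from the import in the patch, but the values shown are illustrative, not project defaults:

    # commonnet/local_settings.py -- optional per-deployment overrides.
    # Names defined here shadow the defaults in commonnet/settings.py,
    # since the trailing `from .local_settings import *` runs last.

    CRAWL_TIMEOUT = 10        # illustrative: shorter per-website crawl timeout, in minutes
    NUMBER_PER_SESSION = 500  # illustrative: smaller crawl batch than the 5000 default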