summaryrefslogtreecommitdiff
path: root/commonnet
diff options
context:
space:
mode:
authorÉtienne Loks <etienne.loks@iggdrasil.net>2019-07-31 16:33:11 +0200
committerÉtienne Loks <etienne@peacefrogs.net>2019-07-31 16:33:25 +0200
commitdd2dd640aa649c715a843fa431621fd955ca6767 (patch)
tree11a16e5c6c3920ebec8b2c40a426381406da6e35 /commonnet
parent6c6b1417111233b52fc55c792e9353964a60b536 (diff)
downloadComm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.tar.bz2
Comm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.zip
Basic scrapy configuration
Diffstat (limited to 'commonnet')
-rw-r--r--commonnet/local_settings.py.sample4
-rw-r--r--commonnet/scrapy_setting.py88
-rw-r--r--commonnet/settings.py25
-rw-r--r--commonnet/version.py1
4 files changed, 118 insertions, 0 deletions
diff --git a/commonnet/local_settings.py.sample b/commonnet/local_settings.py.sample
new file mode 100644
index 0000000..c280f86
--- /dev/null
+++ b/commonnet/local_settings.py.sample
@@ -0,0 +1,4 @@
+RESPONSIBLE_EMAIL = None
+
+# Cache for HTTP requests - set to False in production
+DEV = True
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
new file mode 100644
index 0000000..6330705
--- /dev/null
+++ b/commonnet/scrapy_setting.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+SCRAPPY_SETTINGS = {
+ # Scrapy settings for scraper project
+ #
+ # For simplicity, this file contains only settings considered important or
+ # commonly used. You can find more settings consulting the documentation:
+ #
+ # https://doc.scrapy.org/en/latest/topics/settings.html
+ # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+ "BOT_NAME": 'commonnet-scraper',
+
+ # Obey robots.txt rules
+ "ROBOTSTXT_OBEY": True,
+
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
+ "CONCURRENT_REQUESTS": 16,
+
+ # Configure a delay for requests for the same website (default: 0)
+ # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+ # See also autothrottle settings and docs
+ "DOWNLOAD_DELAY": 3,
+
+ # increase or decrease it with CPU usage
+ "CONCURRENT_REQUESTS_PER_DOMAIN": 100,
+
+ # Disable Telnet Console (enabled by default)
+ "TELNETCONSOLE_ENABLED": False,
+
+ # Override the default request headers:
+ #DEFAULT_REQUEST_HEADERS = {
+ # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+ # 'Accept-Language': 'en',
+ #}
+
+ # Enable or disable spider middlewares
+ # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+ #SPIDER_MIDDLEWARES = {
+ # 'scraper.middlewares.ScraperSpiderMiddleware': 543,
+ #}
+
+ # Enable or disable downloader middlewares
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+ #DOWNLOADER_MIDDLEWARES = {
+ # 'scraper.middlewares.ScraperDownloaderMiddleware': 543,
+ #}
+
+ # Enable or disable extensions
+ # See https://doc.scrapy.org/en/latest/topics/extensions.html
+ #EXTENSIONS = {
+ # 'scrapy.extensions.telnet.TelnetConsole': None,
+ #}
+
+ # Configure item pipelines
+ # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+ #ITEM_PIPELINES = {
+ # 'scraper.pipelines.ScraperPipeline': 300,
+ #}
+
+ # Enable and configure the AutoThrottle extension (disabled by default)
+ # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+ "AUTOTHROTTLE_ENABLED": True,
+
+ # The initial download delay
+ #AUTOTHROTTLE_START_DELAY = 5
+ # The maximum download delay to be set in case of high latencies
+ #AUTOTHROTTLE_MAX_DELAY = 60
+ # The average number of requests Scrapy should be sending in parallel to
+ # each remote server
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+ # Enable showing throttling stats for every response received:
+ #AUTOTHROTTLE_DEBUG = False
+
+ # better queue for crawling many different domains in parallel
+ "SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',
+
+ # Enable and configure HTTP caching (disabled by default)
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+ #HTTPCACHE_ENABLED = True
+ #HTTPCACHE_EXPIRATION_SECS = 0
+ #HTTPCACHE_DIR = 'httpcache'
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
+ #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+
+ "COOKIES_ENABLED": False,
+}
+
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 26c5c66..b51b83e 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -1,5 +1,8 @@
import os
+from .scrapy_setting import SCRAPPY_SETTINGS
+from .version import VERSION
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SECURITY WARNING: keep the secret key used in production secret!
@@ -7,6 +10,8 @@ SECRET_KEY = '!lh+r$hzd(_-aj8a2&@)34bat=w&=!k+9w%$_+&^gjhf#n6z42'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
+# Cache for HTTP requests
+DEV = True
ALLOWED_HOSTS = []
@@ -106,3 +111,23 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
# https://docs.djangoproject.com/en/1.11/howto/static-files/
STATIC_URL = '/static/'
+
+RESPONSIBLE_EMAIL = None
+
+try:
+ from .local_settings import *
+except ImportError:
+ print("ERROR: a local_settings.py must be defined")
+ exit(1)
+
+if not RESPONSIBLE_EMAIL:
+ print("ERROR: a RESPONSIBLE_EMAIL must be defined in your "
+ "local_settings.py")
+ exit(1)
+
+SCRAPPY_SETTINGS["USER_AGENT"] = "commonnet-scraper v{} ({})".format(
+ VERSION, RESPONSIBLE_EMAIL
+)
+
+if DEV:
+ SCRAPPY_SETTINGS["HTTPCACHE_ENABLED"] = True
diff --git a/commonnet/version.py b/commonnet/version.py
new file mode 100644
index 0000000..87deb8f
--- /dev/null
+++ b/commonnet/version.py
@@ -0,0 +1 @@
+VERSION = "1.0.beta1"