summaryrefslogtreecommitdiff
path: root/commonnet/scrapy_setting.py
diff options
context:
space:
mode:
Diffstat (limited to 'commonnet/scrapy_setting.py')
-rw-r--r--  commonnet/scrapy_setting.py  88
1 file changed, 88 insertions, 0 deletions
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
new file mode 100644
index 0000000..6330705
--- /dev/null
+++ b/commonnet/scrapy_setting.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
# Scrapy settings for the commonnet scraper project, kept as a plain dict
# (e.g. for Settings.setdict / CrawlerProcess) rather than a settings module.
#
# Only the settings this project overrides are listed; everything else falls
# back to Scrapy's built-in defaults. Full reference:
#   https://doc.scrapy.org/en/latest/topics/settings.html
SCRAPPY_SETTINGS = {
    # Bot name, used in logging and the default User-Agent.
    "BOT_NAME": 'commonnet-scraper',

    # Honour robots.txt exclusion rules on every target site.
    "ROBOTSTXT_OBEY": True,

    # Global cap on simultaneous requests (same as the Scrapy default of 16).
    "CONCURRENT_REQUESTS": 16,

    # Minimum delay in seconds between requests to the same website
    # (Scrapy default: 0); also acts as the floor for AutoThrottle below.
    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
    "DOWNLOAD_DELAY": 3,

    # Per-domain concurrency ceiling.
    # NOTE(review): this exceeds the global CONCURRENT_REQUESTS cap (16), so
    # the global cap dominates in practice — confirm the intended value.
    "CONCURRENT_REQUESTS_PER_DOMAIN": 100,

    # The Telnet console is a debugging aid; keep it disabled for normal runs.
    "TELNETCONSOLE_ENABLED": False,

    # Let AutoThrottle adapt the request rate to observed server latencies.
    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
    "AUTOTHROTTLE_ENABLED": True,

    # Priority queue that spreads requests across downloader slots — better
    # suited to crawling many different domains in parallel than the default.
    "SCHEDULER_PRIORITY_QUEUE": 'scrapy.pqueues.DownloaderAwarePriorityQueue',

    # Stateless crawling: do not track or resend cookies between requests.
    "COOKIES_ENABLED": False,
}
+