author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-07-31 16:33:11 +0200
committer  Étienne Loks <etienne@peacefrogs.net>      2019-07-31 16:33:25 +0200
commit     dd2dd640aa649c715a843fa431621fd955ca6767 (patch)
tree       11a16e5c6c3920ebec8b2c40a426381406da6e35
parent     6c6b1417111233b52fc55c792e9353964a60b536 (diff)
download   Comm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.tar.bz2
           Comm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.zip
Basic scrapy configuration
-rw-r--r--  .gitignore                           1
-rw-r--r--  commcrawler/scrapy.py               26
-rw-r--r--  commonnet/local_settings.py.sample   4
-rw-r--r--  commonnet/scrapy_setting.py         88
-rw-r--r--  commonnet/settings.py               25
-rw-r--r--  commonnet/version.py                 1
-rw-r--r--  requirements.txt                     3
7 files changed, 147 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 9ea83a1..8ff3fee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
*~
.idea
*.sqlite3
+local_settings.py
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
new file mode 100644
index 0000000..77dafe9
--- /dev/null
+++ b/commcrawler/scrapy.py
@@ -0,0 +1,26 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+from django.conf import settings
+
+
+class DefaultSpider:
+    pass
+
+
+def create_spider(name, urls, target=None):
+    return type(
+        name, (scrapy.Spider, DefaultSpider),
+        {"name": name, "start_urls": urls, "target": target}
+    )
+
+
+def crawl(crawl_item):
+    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+    for target in crawl_item.targets.all():
+        process.crawl(
+            create_spider("Target{}".format(target.pk),
+                          [target.url],
+                          target)
+        )
+    process.start()
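The crawl() helper above builds one spider class per target and queues them all in a single CrawlerProcess. A minimal sketch of how it might be driven, e.g. from the body of a Django management command; the Crawl model and its "targets" relation are assumptions, since this commit defines no models:

    # Hypothetical driver code; commcrawler.models.Crawl is assumed,
    # it is not part of this commit.
    from commcrawler.models import Crawl
    from commcrawler.scrapy import crawl

    crawl(Crawl.objects.get(pk=1))  # blocks until every spider has finished

Note that process.start() runs the Twisted reactor, which cannot be restarted within the same process, so crawl() can be called at most once per Python process.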
diff --git a/commonnet/local_settings.py.sample b/commonnet/local_settings.py.sample
new file mode 100644
index 0000000..c280f86
--- /dev/null
+++ b/commonnet/local_settings.py.sample
@@ -0,0 +1,4 @@
+RESPONSIBLE_EMAIL = None
+
+# Cache HTTP requests - set to False in production
+DEV = True
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
new file mode 100644
index 0000000..6330705
--- /dev/null
+++ b/commonnet/scrapy_setting.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+SCRAPPY_SETTINGS = {
+    # Scrapy settings for the scraper project
+    #
+    # For simplicity, this file contains only settings considered important
+    # or commonly used. You can find more settings by consulting the docs:
+    #
+    # https://doc.scrapy.org/en/latest/topics/settings.html
+    # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+    "BOT_NAME": "commonnet-scraper",
+
+    # Obey robots.txt rules
+    "ROBOTSTXT_OBEY": True,
+
+    # Configure maximum concurrent requests performed by Scrapy (default: 16)
+    "CONCURRENT_REQUESTS": 16,
+
+    # Configure a delay for requests to the same website (default: 0)
+    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+    # See also autothrottle settings and docs
+    "DOWNLOAD_DELAY": 3,
+
+    # Increase or decrease this according to available CPU
+    "CONCURRENT_REQUESTS_PER_DOMAIN": 100,
+
+    # Disable Telnet Console (enabled by default)
+    "TELNETCONSOLE_ENABLED": False,
+
+    # Override the default request headers:
+    # "DEFAULT_REQUEST_HEADERS": {
+    #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    #     "Accept-Language": "en",
+    # },
+
+    # Enable or disable spider middlewares
+    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+    # "SPIDER_MIDDLEWARES": {
+    #     "scraper.middlewares.ScraperSpiderMiddleware": 543,
+    # },
+
+    # Enable or disable downloader middlewares
+    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+    # "DOWNLOADER_MIDDLEWARES": {
+    #     "scraper.middlewares.ScraperDownloaderMiddleware": 543,
+    # },
+
+    # Enable or disable extensions
+    # See https://doc.scrapy.org/en/latest/topics/extensions.html
+    # "EXTENSIONS": {
+    #     "scrapy.extensions.telnet.TelnetConsole": None,
+    # },
+
+    # Configure item pipelines
+    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+    # "ITEM_PIPELINES": {
+    #     "scraper.pipelines.ScraperPipeline": 300,
+    # },
+
+    # Enable and configure the AutoThrottle extension (disabled by default)
+    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+    "AUTOTHROTTLE_ENABLED": True,
+
+    # The initial download delay
+    # "AUTOTHROTTLE_START_DELAY": 5,
+    # The maximum download delay to be set in case of high latencies
+    # "AUTOTHROTTLE_MAX_DELAY": 60,
+    # The average number of requests Scrapy should be sending in parallel to
+    # each remote server
+    # "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
+    # Enable showing throttling stats for every response received:
+    # "AUTOTHROTTLE_DEBUG": False,
+
+    # Better queue for crawling many different domains in parallel
+    "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
+
+    # Enable and configure HTTP caching (disabled by default)
+    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+    # "HTTPCACHE_ENABLED": True,
+    # "HTTPCACHE_EXPIRATION_SECS": 0,
+    # "HTTPCACHE_DIR": "httpcache",
+    # "HTTPCACHE_IGNORE_HTTP_CODES": [],
+    # "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage",
+
+    "COOKIES_ENABLED": False,
+}
+
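Keeping the Scrapy configuration in a plain dict rather than a Scrapy settings module means it can be copied and adjusted per run before being handed to CrawlerProcess, which accepts such a dict directly. A short sketch; the DOWNLOAD_DELAY override is illustrative only, not part of this commit:

    # Hypothetical per-run override of the shared settings dict.
    from copy import deepcopy

    from django.conf import settings
    from scrapy.crawler import CrawlerProcess

    run_settings = deepcopy(settings.SCRAPPY_SETTINGS)
    run_settings["DOWNLOAD_DELAY"] = 1  # shorter than the 3 s configured above
    process = CrawlerProcess(settings=run_settings)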
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 26c5c66..b51b83e 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -1,5 +1,8 @@
import os
+from .scrapy_setting import SCRAPPY_SETTINGS
+from .version import VERSION
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SECURITY WARNING: keep the secret key used in production secret!
@@ -7,6 +10,8 @@ SECRET_KEY = '!lh+r$hzd(_-aj8a2&@)34bat=w&=!k+9w%$_+&^gjhf#n6z42'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
+# Cache HTTP requests
+DEV = True
ALLOWED_HOSTS = []
@@ -106,3 +111,23 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
# https://docs.djangoproject.com/en/1.11/howto/static-files/
STATIC_URL = '/static/'
+
+RESPONSIBLE_EMAIL = None
+
+try:
+    from .local_settings import *
+except ImportError:
+    print("ERROR: a local_settings.py must be defined")
+    exit(1)
+
+if not RESPONSIBLE_EMAIL:
+    print("ERROR: a RESPONSIBLE_EMAIL must be defined in your "
+          "local_settings.py")
+    exit(1)
+
+SCRAPPY_SETTINGS["USER_AGENT"] = "commonnet-scraper v{} ({})".format(
+    VERSION, RESPONSIBLE_EMAIL
+)
+
+if DEV:
+    SCRAPPY_SETTINGS["HTTPCACHE_ENABLED"] = True
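With these checks in place, a working local_settings.py might look like the following; the address is a placeholder, not a real value from this commit:

    # commonnet/local_settings.py -- example values only
    RESPONSIBLE_EMAIL = "webmaster@example.org"  # placeholder address

    # Cache HTTP requests - set to False in production
    DEV = False

Given the VERSION defined in commonnet/version.py below, the resulting user agent would read "commonnet-scraper v1.0.beta1 (webmaster@example.org)", giving site operators a way to contact whoever runs the crawler.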
diff --git a/commonnet/version.py b/commonnet/version.py
new file mode 100644
index 0000000..87deb8f
--- /dev/null
+++ b/commonnet/version.py
@@ -0,0 +1 @@
+VERSION = "1.0.beta1"
diff --git a/requirements.txt b/requirements.txt
index 7028b7e..43068f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
django==1.11
scrapy==1.5
-django-ajax-selects==1.6.0
\ No newline at end of file
+django-ajax-selects==1.6.0
+# https://splash.readthedocs.io/
\ No newline at end of file