author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-07-31 16:33:11 +0200
committer  Étienne Loks <etienne@peacefrogs.net>      2019-07-31 16:33:25 +0200
commit     dd2dd640aa649c715a843fa431621fd955ca6767 (patch)
tree       11a16e5c6c3920ebec8b2c40a426381406da6e35
parent     6c6b1417111233b52fc55c792e9353964a60b536 (diff)
download   Comm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.tar.bz2
           Comm-on-net-dd2dd640aa649c715a843fa431621fd955ca6767.zip
Basic scrapy configuration
-rw-r--r--  .gitignore                           1
-rw-r--r--  commcrawler/scrapy.py               26
-rw-r--r--  commonnet/local_settings.py.sample   4
-rw-r--r--  commonnet/scrapy_setting.py         88
-rw-r--r--  commonnet/settings.py               25
-rw-r--r--  commonnet/version.py                 1
-rw-r--r--  requirements.txt                     3
7 files changed, 147 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 9ea83a1..8ff3fee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,3 +5,4 @@
*~
.idea
*.sqlite3
+local_settings.py
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
new file mode 100644
index 0000000..77dafe9
--- /dev/null
+++ b/commcrawler/scrapy.py
@@ -0,0 +1,26 @@
+import scrapy
+from scrapy.crawler import CrawlerProcess
+
+from django.conf import settings
+
+
+class DefaultSpider:
+    pass
+
+
+def create_spider(name, urls, target=None):
+    return type(
+        name, (scrapy.Spider, DefaultSpider),
+        {"name": name, "start_urls": urls, "target": target}
+    )
+
+
+def crawl(crawl_item):
+    process = CrawlerProcess(settings=settings.SCRAPPY_SETTINGS)
+    for target in crawl_item.targets.all():
+        process.crawl(
+            create_spider("Target{}".format(target.pk),
+                          [target.url],
+                          target)
+        )
+    process.start()
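The crawl() helper above builds one spider class per target and queues them all in a single CrawlerProcess. A minimal sketch of how it might be driven, e.g. from the body of a Django management command; the Crawl model and its "targets" relation are assumptions, since this commit defines no models:

    # Hypothetical driver code; commcrawler.models.Crawl is assumed,
    # it is not part of this commit.
    from commcrawler.models import Crawl
    from commcrawler.scrapy import crawl

    crawl(Crawl.objects.get(pk=1))  # blocks until every spider has finished

Note that process.start() runs the Twisted reactor, which cannot be restarted within the same process, so crawl() can be called at most once per Python process.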
diff --git a/commonnet/local_settings.py.sample b/commonnet/local_settings.py.sample
new file mode 100644
index 0000000..c280f86
--- /dev/null
+++ b/commonnet/local_settings.py.sample
@@ -0,0 +1,4 @@
+RESPONSIBLE_EMAIL = None
+
+# Cache HTTP requests - set to False in production
+DEV = True
diff --git a/commonnet/scrapy_setting.py b/commonnet/scrapy_setting.py
new file mode 100644
index 0000000..6330705
--- /dev/null
+++ b/commonnet/scrapy_setting.py
@@ -0,0 +1,88 @@
+# -*- coding: utf-8 -*-
+
+SCRAPPY_SETTINGS = {
+    # Scrapy settings for the scraper project
+    #
+    # For simplicity, this file contains only settings considered important
+    # or commonly used. You can find more settings by consulting the docs:
+    #
+    # https://doc.scrapy.org/en/latest/topics/settings.html
+    # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+    # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+    "BOT_NAME": "commonnet-scraper",
+
+    # Obey robots.txt rules
+    "ROBOTSTXT_OBEY": True,
+
+    # Configure maximum concurrent requests performed by Scrapy (default: 16)
+    "CONCURRENT_REQUESTS": 16,
+
+    # Configure a delay for requests to the same website (default: 0)
+    # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
+    # See also autothrottle settings and docs
+    "DOWNLOAD_DELAY": 3,
+
+    # Increase or decrease this according to available CPU
+    "CONCURRENT_REQUESTS_PER_DOMAIN": 100,
+
+    # Disable Telnet Console (enabled by default)
+    "TELNETCONSOLE_ENABLED": False,
+
+    # Override the default request headers:
+    # "DEFAULT_REQUEST_HEADERS": {
+    #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+    #     "Accept-Language": "en",
+    # },
+
+    # Enable or disable spider middlewares
+    # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+    # "SPIDER_MIDDLEWARES": {
+    #     "scraper.middlewares.ScraperSpiderMiddleware": 543,
+    # },
+
+    # Enable or disable downloader middlewares
+    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
+    # "DOWNLOADER_MIDDLEWARES": {
+    #     "scraper.middlewares.ScraperDownloaderMiddleware": 543,
+    # },
+
+    # Enable or disable extensions
+    # See https://doc.scrapy.org/en/latest/topics/extensions.html
+    # "EXTENSIONS": {
+    #     "scrapy.extensions.telnet.TelnetConsole": None,
+    # },
+
+    # Configure item pipelines
+    # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+    # "ITEM_PIPELINES": {
+    #     "scraper.pipelines.ScraperPipeline": 300,
+    # },
+
+    # Enable and configure the AutoThrottle extension (disabled by default)
+    # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
+    "AUTOTHROTTLE_ENABLED": True,
+
+    # The initial download delay
+    # "AUTOTHROTTLE_START_DELAY": 5,
+    # The maximum download delay to be set in case of high latencies
+    # "AUTOTHROTTLE_MAX_DELAY": 60,
+    # The average number of requests Scrapy should be sending in parallel to
+    # each remote server
+    # "AUTOTHROTTLE_TARGET_CONCURRENCY": 1.0,
+    # Enable showing throttling stats for every response received:
+    # "AUTOTHROTTLE_DEBUG": False,
+
+    # Better queue for crawling many different domains in parallel
+    "SCHEDULER_PRIORITY_QUEUE": "scrapy.pqueues.DownloaderAwarePriorityQueue",
+
+    # Enable and configure HTTP caching (disabled by default)
+    # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+    # "HTTPCACHE_ENABLED": True,
+    # "HTTPCACHE_EXPIRATION_SECS": 0,
+    # "HTTPCACHE_DIR": "httpcache",
+    # "HTTPCACHE_IGNORE_HTTP_CODES": [],
+    # "HTTPCACHE_STORAGE": "scrapy.extensions.httpcache.FilesystemCacheStorage",
+
+    "COOKIES_ENABLED": False,
+}
+
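Keeping the Scrapy configuration in a plain dict rather than a Scrapy settings module means it can be copied and adjusted per run before being handed to CrawlerProcess, which accepts such a dict directly. A short sketch; the DOWNLOAD_DELAY override is illustrative only, not part of this commit:

    # Hypothetical per-run override of the shared settings dict.
    from copy import deepcopy

    from django.conf import settings
    from scrapy.crawler import CrawlerProcess

    run_settings = deepcopy(settings.SCRAPPY_SETTINGS)
    run_settings["DOWNLOAD_DELAY"] = 1  # shorter than the 3 s configured above
    process = CrawlerProcess(settings=run_settings)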
diff --git a/commonnet/settings.py b/commonnet/settings.py
index 26c5c66..b51b83e 100644
--- a/commonnet/settings.py
+++ b/commonnet/settings.py
@@ -1,5 +1,8 @@
import os
+from .scrapy_setting import SCRAPPY_SETTINGS
+from .version import VERSION
+
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# SECURITY WARNING: keep the secret key used in production secret!
@@ -7,6 +10,8 @@ SECRET_KEY = '!lh+r$hzd(_-aj8a2&@)34bat=w&=!k+9w%$_+&^gjhf#n6z42'
# SECURITY WARNING: don't run with debug turned on in production!
DEBUG = True
+# Cache HTTP requests
+DEV = True
ALLOWED_HOSTS = []
@@ -106,3 +111,23 @@ DATA_UPLOAD_MAX_NUMBER_FIELDS = 5000
# https://docs.djangoproject.com/en/1.11/howto/static-files/
STATIC_URL = '/static/'
+
+RESPONSIBLE_EMAIL = None
+
+try:
+    from .local_settings import *
+except ImportError:
+    print("ERROR: a local_settings.py must be defined")
+    exit(1)
+
+if not RESPONSIBLE_EMAIL:
+    print("ERROR: a RESPONSIBLE_EMAIL must be defined in your "
+          "local_settings.py")
+    exit(1)
+
+SCRAPPY_SETTINGS["USER_AGENT"] = "commonnet-scraper v{} ({})".format(
+    VERSION, RESPONSIBLE_EMAIL
+)
+
+if DEV:
+    SCRAPPY_SETTINGS["HTTPCACHE_ENABLED"] = True
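With these checks in place, a working local_settings.py might look like the following; the address is a placeholder, not a real value from this commit:

    # commonnet/local_settings.py -- example values only
    RESPONSIBLE_EMAIL = "webmaster@example.org"  # placeholder address

    # Cache HTTP requests - set to False in production
    DEV = False

Given the VERSION defined in commonnet/version.py below, the resulting user agent would read "commonnet-scraper v1.0.beta1 (webmaster@example.org)", giving site operators a way to contact whoever runs the crawler.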
diff --git a/commonnet/version.py b/commonnet/version.py
new file mode 100644
index 0000000..87deb8f
--- /dev/null
+++ b/commonnet/version.py
@@ -0,0 +1 @@
+VERSION = "1.0.beta1"
diff --git a/requirements.txt b/requirements.txt
index 7028b7e..43068f9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
django==1.11
scrapy==1.5
-django-ajax-selects==1.6.0
\ No newline at end of file
+django-ajax-selects==1.6.0
+# https://splash.readthedocs.io/
\ No newline at end of file