author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 00:52:56 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 00:52:56 +0200 |
commit | 347a0822484ad16b9a29eef1ea30082b4a841ac6 (patch) | |
tree | 2d4fa79c31b6ee3a74328c70219a6fcad8f603d8 | |
parent | bcacb3dcae815230c106cd773130f7b0ea5f720d (diff) | |
download | Comm-on-net-347a0822484ad16b9a29eef1ea30082b4a841ac6.tar.bz2 | Comm-on-net-347a0822484ad16b9a29eef1ea30082b4a841ac6.zip |
Manage redirections
-rw-r--r-- | commcrawler/scrapy.py | 21
-rwxr-xr-x | install.sh | 6
-rw-r--r-- | requirements.txt | 1
3 files changed, 20 insertions, 8 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 1fabafe..5f59127 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+import requests
 from urllib.parse import urldefrag
 
 import scrapy
@@ -14,7 +15,6 @@ from django.utils import timezone
 from . import models
 
 """
-redirection
 CrawlLink
 """
@@ -56,6 +56,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    redirect = None
 
     def start_requests(self):
         q = {
@@ -221,6 +222,8 @@ class DbPipeline:
         except models.CrawlResult.DoesNotExist:
             try:
                 with transaction.atomic():
+                    if spider.redirect:
+                        pks["redirection"] = spider.redirect
                     result = models.CrawlResult.objects.create(**pks)
                     created = True
             except IntegrityError:
@@ -283,7 +286,8 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, crawl, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None,
+                  redirect=None):
     if not excluded_domains:
         excluded_domains = []
     return type(
@@ -291,7 +295,7 @@
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
-         "excluded_domains": excluded_domains}
+         "excluded_domains": excluded_domains, "redirect": redirect}
     )
@@ -303,12 +307,19 @@ def launch_crawl(crawl_item, excluded_domains=None):
     crawl_item.status = "P"
     crawl_item.save()
     for target in crawl_item.targets.all():
+        response = requests.get(target.url)
+        redirect = None
+        url = target.url
+        if response.history:
+            redirect = url
+            url = response.url
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
-                [target.url],
+                [url],
                 crawl_item,
                 target,
-                excluded_domains
+                excluded_domains,
+                redirect
             )
         )
     process.start()
diff --git a/install.sh b/install.sh
--- a/install.sh
+++ b/install.sh
@@ -16,9 +16,9 @@
 APT_OPTIONS=" -y -q "
 [ "$DEBUG" == 'true' ] && APT_OPTIONS=""
 
 apt-get install $APT_OPTIONS git nginx uwsgi uwsgi-plugin-python3 postgresql apg sed gettext > /dev/null
-apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments > /dev/null
-apt-get install $APT_OPTIONS -t stretch-backports python3-django > /dev/null
-# buster: apt install python3-django
+apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments > /dev/null
+apt-get install $APT_OPTIONS -t stretch-backports python3-django python3-requests > /dev/null
+# buster/bullseye: apt install python3-django python3-requests
 pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 > /dev/null
 # buster: apt install python3-tldextract django-ajax-selects
diff --git a/requirements.txt b/requirements.txt
index a6df1be..f303ec6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ django==1.11
 django-ajax-selects==1.6.0
 psycopg2
 pygments==2.2
+requests==2.21
 # https://splash.readthedocs.io/
\ No newline at end of file
```
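
For context, the redirect handling added to `launch_crawl()` leans on a default behaviour of the `requests` library: `requests.get()` follows HTTP redirects automatically, and `response.history` holds the list of intermediate responses, so a non-empty history means the original URL redirected somewhere else. A minimal standalone sketch of that detection step, under the same assumptions as the diff above (the helper name `resolve_redirect` and the `timeout` argument are illustrative additions, not part of the commit):

```python
import requests

def resolve_redirect(url, timeout=10):
    """Return a (crawl_url, redirect) pair as used by launch_crawl() above.

    requests.get() follows redirects by default; response.history lists the
    intermediate responses, so a non-empty history means `url` redirected.
    """
    # NOTE: the timeout is an illustrative safeguard; the commit itself
    # calls requests.get(target.url) with no timeout.
    response = requests.get(url, timeout=timeout)
    if response.history:
        # Crawl the final URL; remember the original as the redirection source.
        return response.url, url
    return url, None
```

With this in place, `create_spider()` receives the final URL as the spider's start URL while the original URL is stored on the spider class as `redirect`, which `DbPipeline` then writes into the `redirection` field when creating the `CrawlResult` record.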