From 347a0822484ad16b9a29eef1ea30082b4a841ac6 Mon Sep 17 00:00:00 2001
From: Étienne Loks
Date: Fri, 9 Aug 2019 00:52:56 +0200
Subject: Manage redirections

---
 commcrawler/scrapy.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

(limited to 'commcrawler')

diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 1fabafe..5f59127 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+import requests
 from urllib.parse import urldefrag
 
 import scrapy
@@ -14,7 +15,6 @@ from django.utils import timezone
 from . import models
 
 """
-redirection
 CrawlLink
 """
 
@@ -56,6 +56,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    redirect = None
 
     def start_requests(self):
         q = {
@@ -221,6 +222,8 @@ class DbPipeline:
         except models.CrawlResult.DoesNotExist:
             try:
                 with transaction.atomic():
+                    if spider.redirect:
+                        pks["redirection"] = spider.redirect
                     result = models.CrawlResult.objects.create(**pks)
                     created = True
             except IntegrityError:
@@ -283,7 +286,8 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, crawl, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None,
+                  redirect=None):
     if not excluded_domains:
         excluded_domains = []
     return type(
@@ -291,7 +295,7 @@
         {"name": name, "start_urls": urls,
          "allowed_domains": [get_domain(url) for url in urls],
          "crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
-         "excluded_domains": excluded_domains}
+         "excluded_domains": excluded_domains, "redirect": redirect}
     )
 
 
@@ -303,12 +307,19 @@ def launch_crawl(crawl_item, excluded_domains=None):
     crawl_item.status = "P"
     crawl_item.save()
     for target in crawl_item.targets.all():
+        response = requests.get(target.url)
+        redirect = None
+        url = target.url
+        if response.history:
+            redirect = url
+            url = response.url
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
-                [target.url],
+                [url],
                 crawl_item, target,
-                excluded_domains
+                excluded_domains,
+                redirect
             )
         )
     process.start()
--
cgit v1.2.3
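
A note on the pre-crawl check added to launch_crawl(): requests.get() is
called with no timeout and no error handling, so one slow or unreachable
target can stall or abort the whole loop before any spider is scheduled,
and the full response body is downloaded only to read response.url. Below
is a minimal hardened sketch of the same check; the helper name
resolve_redirect, the HEAD-first strategy, and the 10-second timeout are
illustrative assumptions, not part of this commit.

    import requests

    def resolve_redirect(url, timeout=10):
        """Return (start_url, original_url or None) for a crawl target.

        Sketch only: follows redirects with a HEAD request so no response
        body is transferred, and falls back to the original URL on network
        errors instead of letting the exception abort launch_crawl().
        """
        try:
            # requests does not follow redirects for HEAD unless asked to.
            response = requests.head(url, allow_redirects=True,
                                     timeout=timeout)
            if response.status_code == 405:
                # Some servers reject HEAD; a streamed GET avoids pulling
                # the body down just to learn the final URL.
                response = requests.get(url, stream=True, timeout=timeout)
        except requests.RequestException:
            return url, None
        if response.history:  # non-empty when at least one redirect happened
            return response.url, url
        return url, None

With such a helper the loop body would start with
"url, redirect = resolve_redirect(target.url)" and the process.crawl()
call would stay exactly as in the patch.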
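
A related note on the DbPipeline hunk: the new lines mutate pks in place
before create(), inside a hand-rolled get/create/IntegrityError retry.
Django's get_or_create() implements the same race-safe pattern while
keeping the row-identifying keys apart from the extra value. A sketch,
under the assumption that pks holds only the fields identifying the
CrawlResult row:

    # Sketch only: "redirection" goes in defaults so it is set when the
    # row is created but never becomes part of the lookup that finds an
    # existing row.
    defaults = {"redirection": spider.redirect} if spider.redirect else {}
    result, created = models.CrawlResult.objects.get_or_create(
        defaults=defaults, **pks)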