path: root/commcrawler/scrapy.py
author    Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-09 00:52:56 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-09 00:52:56 +0200
commit    347a0822484ad16b9a29eef1ea30082b4a841ac6 (patch)
tree      2d4fa79c31b6ee3a74328c70219a6fcad8f603d8 /commcrawler/scrapy.py
parent    bcacb3dcae815230c106cd773130f7b0ea5f720d (diff)
Manage redirections
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py | 21 ++++++++++++++++-----
1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 1fabafe..5f59127 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+import requests
 from urllib.parse import urldefrag
 
 import scrapy
@@ -14,7 +15,6 @@ from django.utils import timezone
 
 from . import models
 
 """
-redirection
 CrawlLink
 """
@@ -56,6 +56,7 @@ class DefaultSpider:
     target_id = None
     crawl_result = None
     links_reached = set()
+    redirect = None
 
     def start_requests(self):
         q = {
@@ -221,6 +222,8 @@ class DbPipeline:
         except models.CrawlResult.DoesNotExist:
             try:
                 with transaction.atomic():
+                    if spider.redirect:
+                        pks["redirection"] = spider.redirect
                     result = models.CrawlResult.objects.create(**pks)
                     created = True
             except IntegrityError:
@@ -283,7 +286,8 @@ def get_domain(url):
     return '{}.{}'.format(ext.domain, ext.suffix)
 
 
-def create_spider(name, urls, crawl, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None,
+                  redirect=None):
     if not excluded_domains:
         excluded_domains = []
     return type(
@@ -291,7 +295,7 @@ def create_spider(name, urls, crawl, target, excluded_domains=None):
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
"crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
- "excluded_domains": excluded_domains}
+ "excluded_domains": excluded_domains, "redirect": redirect}
)
@@ -303,12 +307,19 @@ def launch_crawl(crawl_item, excluded_domains=None):
     crawl_item.status = "P"
     crawl_item.save()
     for target in crawl_item.targets.all():
+        response = requests.get(target.url)
+        redirect = None
+        url = target.url
+        if response.history:
+            redirect = url
+            url = response.url
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
-                [target.url],
+                [url],
                 crawl_item, target,
-                excluded_domains,
+                excluded_domains,
+                redirect
             )
         )
     process.start()
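
The heart of the change is the pre-flight request in launch_crawl: each target URL is fetched once with requests before its spider is created, and when the response carries a redirect history the spider is started on the final URL while the original one is kept as redirect, which DbPipeline then stores in the "redirection" field of CrawlResult. A minimal sketch of the detection step, using a hypothetical URL (it relies only on requests' documented response.history and response.url attributes):

import requests

# Hypothetical target; any URL answering with a 301/302 behaves the same.
url = "http://example.com/old-page"
response = requests.get(url)

redirect = None
if response.history:
    # response.history lists the intermediate 3xx responses;
    # response.url is the final URL after requests followed them.
    redirect = url        # original URL, destined for the "redirection" field
    url = response.url    # the spider should crawl the destination instead

One caveat: requests.get here downloads the full body of every target just to learn its final URL, and it runs without a timeout, so a slow or unreachable target stalls the loop before crawling even starts. requests.head(url, allow_redirects=True) with a timeout= argument would be a lighter probe, assuming the targets answer HEAD requests properly.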