author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 00:52:56 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-09 00:52:56 +0200
commit     347a0822484ad16b9a29eef1ea30082b4a841ac6 (patch)
tree       2d4fa79c31b6ee3a74328c70219a6fcad8f603d8
parent     bcacb3dcae815230c106cd773130f7b0ea5f720d (diff)
download   Comm-on-net-347a0822484ad16b9a29eef1ea30082b4a841ac6.tar.bz2
           Comm-on-net-347a0822484ad16b9a29eef1ea30082b4a841ac6.zip
Manage redirections
-rw-r--r--  commcrawler/scrapy.py  21
-rwxr-xr-x  install.sh              6
-rw-r--r--  requirements.txt        1
3 files changed, 20 insertions, 8 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 1fabafe..5f59127 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
import datetime
import tldextract
+import requests
from urllib.parse import urldefrag
import scrapy
@@ -14,7 +15,6 @@ from django.utils import timezone
from . import models
"""
-redirection
CrawlLink
"""
@@ -56,6 +56,7 @@ class DefaultSpider:
target_id = None
crawl_result = None
links_reached = set()
+ redirect = None
def start_requests(self):
q = {
@@ -221,6 +222,8 @@ class DbPipeline:
except models.CrawlResult.DoesNotExist:
try:
with transaction.atomic():
+ if spider.redirect:
+ pks["redirection"] = spider.redirect
result = models.CrawlResult.objects.create(**pks)
created = True
except IntegrityError:
@@ -283,7 +286,8 @@ def get_domain(url):
return '{}.{}'.format(ext.domain, ext.suffix)
-def create_spider(name, urls, crawl, target, excluded_domains=None):
+def create_spider(name, urls, crawl, target, excluded_domains=None,
+ redirect=None):
if not excluded_domains:
excluded_domains = []
return type(
@@ -291,7 +295,7 @@ def create_spider(name, urls, crawl, target, excluded_domains=None):
{"name": name, "start_urls": urls,
"allowed_domains": [get_domain(url) for url in urls],
"crawl_id": crawl.pk, "target_id": target.pk, "links_reached": set(),
- "excluded_domains": excluded_domains}
+ "excluded_domains": excluded_domains, "redirect": redirect}
)
@@ -303,12 +307,19 @@ def launch_crawl(crawl_item, excluded_domains=None):
crawl_item.status = "P"
crawl_item.save()
for target in crawl_item.targets.all():
+ response = requests.get(target.url)
+ redirect = None
+ url = target.url
+ if response.history:
+ redirect = url
+ url = response.url
process.crawl(
create_spider(
"Crawl{}Target{}".format(crawl_item.pk, target.pk),
- [target.url],
+ [url],
crawl_item, target,
- excluded_domains
+ excluded_domains,
+ redirect
)
)
process.start()
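
For reference, the redirect handling above relies on requests' built-in redirect tracking: requests.get() follows redirects by default, records each intermediate response in response.history, and exposes the final URL as response.url. A minimal standalone sketch of the same check (the URL and the timeout are illustrative additions, not taken from the project):

    import requests

    def resolve_redirect(url):
        # Follow redirects (requests does so by default) and report the
        # final URL, plus the original URL if a redirect occurred.
        response = requests.get(url, timeout=10)  # timeout is an illustrative hardening choice
        if response.history:  # non-empty list of intermediate responses => redirected
            return response.url, url
        return url, None

    final_url, redirect = resolve_redirect("http://example.com/")

When no redirect occurs, response.history is an empty list, so the spider is created with redirect=None, matching the class attribute default added to DefaultSpider above.
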
diff --git a/install.sh b/install.sh
index 556ba31..6e97df1 100755
--- a/install.sh
+++ b/install.sh
@@ -16,9 +16,9 @@ APT_OPTIONS=" -y -q "
[ "$DEBUG" == 'true' ] && APT_OPTIONS=""
apt-get install $APT_OPTIONS git nginx uwsgi uwsgi-plugin-python3 postgresql apg sed gettext > /dev/null
-apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments > /dev/null
-apt-get install $APT_OPTIONS -t stretch-backports python3-django > /dev/null
-# buster: apt install python3-django
+apt-get install $APT_OPTIONS python3 python3-pip python3-psycopg2 python3-pygments > /dev/null
+apt-get install $APT_OPTIONS -t stretch-backports python3-django python3-requests > /dev/null
+# buster/bullseye: apt install python3-django python3-requests
pip3 install scrapy==1.7 tldextract==2.2 django-ajax-selects==1.6.0 > /dev/null
# buster: apt install python3-tldextract django-ajax-selects
diff --git a/requirements.txt b/requirements.txt
index a6df1be..f303ec6 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,4 +4,5 @@ django==1.11
django-ajax-selects==1.6.0
psycopg2
pygments==2.2
+requests==2.21
 # https://splash.readthedocs.io/
\ No newline at end of file
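
The DbPipeline hunk above passes spider.redirect under the "redirection" key to models.CrawlResult.objects.create(**pks), which presupposes a redirection field on CrawlResult. A hedged sketch of what such a field could look like (the field name comes from the pks key; the field type and options are assumptions, not the project's actual model definition):

    from django.db import models

    class CrawlResult(models.Model):
        # ... other fields omitted ...
        # Assumed field: holds the original URL when the crawl target
        # redirected, left NULL when it did not.
        redirection = models.URLField(null=True, blank=True)
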