author    Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-09 13:06:03 +0200
committer Étienne Loks <etienne.loks@iggdrasil.net>    2019-08-09 13:06:03 +0200
commit    e7dd52e03d8e770d8fcee1503fa8109dc3778d29 (patch)
tree      fbd89b70b52caa9cf0db21aa2efc24ed678fc2d3 /commcrawler/scrapy.py
parent    347a0822484ad16b9a29eef1ea30082b4a841ac6 (diff)
Manage links between targets
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py | 68
1 file changed, 47 insertions(+), 21 deletions(-)
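
The three helpers dropped from scrapy.py below (clean_url, append_to_results and get_domain) are now imported from .utils instead. A minimal sketch of what commcrawler/utils.py presumably contains, assuming the functions were moved over verbatim from the hunks removed in this diff:

import tldextract
from urllib.parse import urldefrag


def clean_url(url):
    # drop the "#fragment" part so the same page is not crawled twice
    url, __ = urldefrag(url)
    return url


def append_to_results(results, key, value):
    # collect values per key, creating the list on first use
    if key not in results:
        results[key] = []
    results[key].append(value)


def get_domain(url):
    # reduce a URL to its registered domain,
    # e.g. "https://www.example.co.uk/page" -> "example.co.uk"
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)
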
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5f59127..67c9ee3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,7 +1,5 @@
import datetime
-import tldextract
import requests
-from urllib.parse import urldefrag
import scrapy
from scrapy.crawler import CrawlerProcess
@@ -13,6 +11,7 @@ from django.db import transaction, IntegrityError
from django.utils import timezone
from . import models
+from .utils import clean_url, append_to_results, get_domain
"""
CrawlLink
@@ -32,17 +31,6 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
-def clean_url(url):
- url, __ = urldefrag(url) # remove anchors
- return url
-
-
-def append_to_results(results, key, value):
- if key not in results:
- results[key] = []
- results[key].append(value)
-
-
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -281,11 +269,6 @@ class DbPipeline:
result.save()
-def get_domain(url):
- ext = tldextract.extract(url)
- return '{}.{}'.format(ext.domain, ext.suffix)
-
-
def create_spider(name, urls, crawl, target, excluded_domains=None,
redirect=None):
if not excluded_domains:
@@ -299,6 +282,36 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
)
+def launch_match(crawl_item):
+ # reset
+ models.CrawlRelation.objects.filter(crawl_id=crawl_item.pk).delete()
+ for result in crawl_item.results.values(
+ "pk", "crawl_result", "target_id", "target__url").all():
+ if not result["crawl_result"] or \
+ "external_link" not in result["crawl_result"][0]:
+ continue
+ domains = [
+ get_domain(link)
+ for link in result["crawl_result"][0]["external_link"]
+ ]
+ for subresult in crawl_item.results.values(
+ "pk", "target_id", "target__url").all():
+ if subresult["pk"] == result["pk"]:
+ continue
+ if get_domain(subresult["target__url"]) in domains:
+ rel, created = models.CrawlRelation.objects.get_or_create(
+ crawl_id=crawl_item.pk, source_id=result["target_id"],
+ destination_id=subresult["target_id"])
+ if not created: # multiple links
+ rel.number += 1
+ rel.save()
+ crawl_item.progression = (crawl_item.progression or 0) + 1
+ crawl_item.save()
+ crawl_item.ended = timezone.now()
+ crawl_item.status = "F"
+ crawl_item.save()
+
+
def launch_crawl(crawl_item, excluded_domains=None):
scrap_settings = settings.SCRAPPY_SETTINGS.copy()
process = CrawlerProcess(settings=scrap_settings)
@@ -311,8 +324,20 @@ def launch_crawl(crawl_item, excluded_domains=None):
redirect = None
url = target.url
if response.history:
- redirect = url
url = response.url
+ redirect = url
+ domain = get_domain(url)
+ if domain in excluded_domains:
+ dct = {
+ "crawl_id": crawl_item.pk,
+ "target_id": target.pk,
+ }
+ result, __ = models.CrawlResult.objects.get_or_create(**dct)
+ result.redirection = redirect
+ result.is_online = False
+ result.status = "F"
+ result.save()
+ continue
process.crawl(
create_spider(
"Crawl{}Target{}".format(crawl_item.pk, target.pk),
@@ -323,6 +348,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
)
)
process.start()
- crawl_item.ended = timezone.now()
- crawl_item.status = "F"
+ crawl_item.crawl_ended = timezone.now()
+ crawl_item.status = "M"
crawl_item.save()
+ launch_match(crawl_item)
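
With this change launch_crawl no longer finishes the crawl itself: it stamps crawl_ended, switches the status to "M" and hands over to launch_match, which counts links between targets into CrawlRelation rows (source, destination, number) and only then marks the crawl "F". A short usage sketch of the resulting flow; the Crawl model name, its primary key and the excluded domains below are assumptions for illustration:

from commcrawler import models
from commcrawler.scrapy import launch_crawl

# hypothetical setup: an existing crawl with its targets already attached
crawl = models.Crawl.objects.get(pk=1)  # assumed model name and pk

# runs the Scrapy process for every non-excluded target, then launch_match()
launch_crawl(crawl, excluded_domains=["facebook.com", "twitter.com"])

# once the status is back to "F", the target-to-target links are available
for rel in models.CrawlRelation.objects.filter(crawl_id=crawl.pk):
    print(rel.source_id, "->", rel.destination_id, "links:", rel.number)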