Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 68
1 file changed, 47 insertions, 21 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 5f59127..67c9ee3 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,7 +1,5 @@
 import datetime
-import tldextract
 import requests
-from urllib.parse import urldefrag

 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,6 +11,7 @@ from django.db import transaction, IntegrityError
 from django.utils import timezone

 from . import models
+from .utils import clean_url, append_to_results, get_domain

 """
 CrawlLink
@@ -32,17 +31,6 @@ AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
 OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls",
                ".xlsx")


-def clean_url(url):
-    url, __ = urldefrag(url)  # remove anchors
-    return url
-
-
-def append_to_results(results, key, value):
-    if key not in results:
-        results[key] = []
-    results[key].append(value)
-
-
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -281,11 +269,6 @@ class DbPipeline:
         result.save()


-def get_domain(url):
-    ext = tldextract.extract(url)
-    return '{}.{}'.format(ext.domain, ext.suffix)
-
-
 def create_spider(name, urls, crawl, target, excluded_domains=None,
                   redirect=None):
     if not excluded_domains:
@@ -299,6 +282,36 @@ def create_spider(name, urls, crawl, target, excluded_domains=None,
     )


+def launch_match(crawl_item):
+    # reset
+    models.CrawlRelation.objects.filter(crawl_id=crawl_item.pk).delete()
+    for result in crawl_item.results.values(
+            "pk", "crawl_result", "target_id", "target__url").all():
+        if not result["crawl_result"] or \
+                "external_link" not in result["crawl_result"][0]:
+            continue
+        domains = [
+            get_domain(link)
+            for link in result["crawl_result"][0]["external_link"]
+        ]
+        for subresult in crawl_item.results.values(
+                "pk", "target_id", "target__url").all():
+            if subresult["pk"] == result["pk"]:
+                continue
+            if get_domain(subresult["target__url"]) in domains:
+                rel, created = models.CrawlRelation.objects.get_or_create(
+                    crawl_id=crawl_item.pk, source_id=result["target_id"],
+                    destination_id=subresult["target_id"])
+                if not created:  # multiple links
+                    rel.number += 1
+                    rel.save()
+        crawl_item.progression = (crawl_item.progression or 0) + 1
+        crawl_item.save()
+    crawl_item.ended = timezone.now()
+    crawl_item.status = "F"
+    crawl_item.save()
+
+
 def launch_crawl(crawl_item, excluded_domains=None):
     scrap_settings = settings.SCRAPPY_SETTINGS.copy()
     process = CrawlerProcess(settings=scrap_settings)
@@ -311,8 +324,20 @@ def launch_crawl(crawl_item, excluded_domains=None):
         redirect = None
         url = target.url
         if response.history:
-            redirect = url
             url = response.url
+            redirect = url
+        domain = get_domain(url)
+        if domain in excluded_domains:
+            dct = {
+                "crawl_id": crawl_item.pk,
+                "target_id": target.pk,
+            }
+            result, __ = models.CrawlResult.objects.get_or_create(**dct)
+            result.redirection = redirect
+            result.is_online = False
+            result.status = "F"
+            result.save()
+            continue
         process.crawl(
             create_spider(
                 "Crawl{}Target{}".format(crawl_item.pk, target.pk),
@@ -323,6 +348,7 @@ def launch_crawl(crawl_item, excluded_domains=None):
             )
         )
     process.start()
-    crawl_item.ended = timezone.now()
-    crawl_item.status = "F"
+    crawl_item.crawl_ended = timezone.now()
+    crawl_item.status = "M"
     crawl_item.save()
+    launch_match(crawl_item)
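Note: the three helpers removed above (clean_url, append_to_results, get_domain) are now imported from commcrawler/utils.py, which is not part of this diff. A minimal sketch of what that module presumably contains, assuming the functions and the tldextract/urldefrag imports they need were moved over verbatim from the removed lines:

# commcrawler/utils.py -- sketch only, reconstructed from the lines removed above
import tldextract
from urllib.parse import urldefrag


def clean_url(url):
    # strip the #fragment so equivalent URLs compare equal
    url, __ = urldefrag(url)
    return url


def append_to_results(results, key, value):
    # results is a plain dict of lists; create the list on first use
    if key not in results:
        results[key] = []
    results[key].append(value)


def get_domain(url):
    # reduce a URL to "domain.suffix" (e.g. "example.co.uk") via tldextract
    ext = tldextract.extract(url)
    return '{}.{}'.format(ext.domain, ext.suffix)

get_domain is what the new launch_match uses to compare a result's external links against the other targets' URLs, and what launch_crawl now uses to skip excluded domains before scheduling a spider.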