author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 23:15:17 +0200
---|---|---
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 23:15:45 +0200
commit | 838fd71728067d34a490f4a6fcaa3a09a460ef0a (patch) |
tree | 1d45ace92fce2be464fd85ffeb8181fac8db1de7 |
parent | 37036fde028b0c1c5f9db06b71bbd4bc9a287e51 (diff) |
download | Comm-on-net-838fd71728067d34a490f4a6fcaa3a09a460ef0a.tar.bz2, Comm-on-net-838fd71728067d34a490f4a6fcaa3a09a460ef0a.zip |
Count external links
-rw-r--r-- | Makefile | 5
-rw-r--r-- | commcrawler/admin.py | 5
-rw-r--r-- | commcrawler/scrapy.py | 26

3 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,9 @@ regenerate_all: migrate ## regenerate all the database
 	$(PYTHON) manage.py import_csv_autres data_src/autres.csv
 
+crawl: ## launch a crawl on the first scheduled crawler
+	$(PYTHON) manage.py launch_crawl --first-available
+
 run: ## run test server
 	$(PYTHON) manage.py runserver 0.0.0.0:8000
 
@@ -37,4 +40,4 @@ generate_graphs: ## generate graph model for documentation
 	$(PYTHON) manage.py graph_models \
 		-S --arrow-shape normal -g -n -L fr-fr $(APPS) > /tmp/$(PROJECT).dot
 	dot -Tpng /tmp/$(PROJECT).dot > docs/images/models.png
-	rm /tmp/$(PROJECT).dot
\ No newline at end of file
+	rm /tmp/$(PROJECT).dot
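The new `crawl` target delegates to the project's `launch_crawl` management command, which is not part of this diff. For readers unfamiliar with Django management commands, here is a minimal sketch of what such a command could look like; everything beyond the command name and the `--first-available` flag is an assumption, not the project's actual code:

```python
# commcrawler/management/commands/launch_crawl.py
# Hypothetical sketch only: the real command ships with the project
# and is not shown in this commit.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Launch a crawl (sketch; the real options may differ)"

    def add_arguments(self, parser):
        # The Makefile target passes this flag.
        parser.add_argument("--first-available", action="store_true",
                            help="run the first crawl waiting to be launched")

    def handle(self, *args, **options):
        # argparse maps "--first-available" to options["first_available"]
        if options["first_available"]:
            self.stdout.write("launching the first available crawl")
```

With the target in place, `make crawl` becomes shorthand for `python manage.py launch_crawl --first-available`.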
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index a89c871..015457e 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -6,6 +6,7 @@ from pygments.formatters.html import HtmlFormatter
 from ajax_select import make_ajax_form
 
 from django.contrib import admin
+from django.utils.translation import ugettext_lazy as _
 from django.utils.safestring import mark_safe
 
 from commonnet.admin_site import admin_site
@@ -36,7 +37,7 @@ class CrawlResultAdmin(admin.ModelAdmin):
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram",
         "nb_youtube", "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio",
         "nb_internal_pdf", "nb_external_pdf", "nb_internal_office", "nb_external_office",
-        "is_online", "redirection",
+        "is_online", "redirection", "crawl_result_prettified"
     )
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
@@ -48,6 +49,8 @@ class CrawlResultAdmin(admin.ModelAdmin):
         style = "<style>" + formatter.get_style_defs() + "</style><br>"
         return mark_safe(style + response)
 
+    crawl_result_prettified.short_description = _("Crawl result")
+
 
 admin_site.register(models.CrawlResult, CrawlResultAdmin)
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 47a0521..7f7bea5 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+from urllib.parse import urldefrag
 
 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,9 +14,6 @@ from django.utils import timezone
 from . import models
 
 """
-nb_external_link
-nb_internal_link
-nb_images
 nb_facebook
 nb_twitter
 nb_instagram
@@ -33,7 +31,13 @@ redirection
 CrawlLink
 """
 
-MAX_LINKS = 500
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # remove anchors
+    return url
+
+
+MAX_LINKS = None  # if None no max
 
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
@@ -112,17 +116,24 @@
         try:
             self._parse_image(response, result)
             for link in LinkExtractor().extract_links(response):
-                url = link.url
+                url = clean_url(link.url)
                 if url is None or url in self.links_reached:
                     continue
+                is_internal = False
                 for domain in self.allowed_domains:
                     if domain in url:
+                        is_internal = True
                         self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
+                        if not MAX_LINKS or \
+                                len(self.links_reached) < MAX_LINKS:
                             yield response.follow(link.url, self.parse)
                         else:
                             print("MAX", self.allowed_domains,
                                   self.links_reached)
+                if not is_internal:
+                    if "external_link" not in result:
+                        result["external_link"] = []
+                    result["external_link"].append(url)
         except NotSupported:
             print("No response", response.url)
         yield result
@@ -137,7 +148,7 @@ class DbPipeline:
         "facebook", "twitter", "instagram", "youtube",
         "dailymotion", "vimeo", "video", "audio",
         "internal_pdf", "external_pdf", "internal_office",
-        "external_office", ]
+        "external_office"]
 
     def _get_result_pk(self, spider):
         """
@@ -179,6 +190,7 @@ class DbPipeline:
         if url in crawl_result["urls"]:
             return
         crawl_result["urls"].append(url)
+        result.nb_internal_links = len(crawl_result["urls"]) - 1
         for k, value in item.items():
             if k == "is_online":
                 if result_created:  # only update on the first link
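Taken together, the commcrawler/scrapy.py changes strip the fragment from every extracted URL, treat a URL as internal when any allowed domain occurs in it as a substring, and accumulate everything else under `result["external_link"]`. A self-contained sketch of that classification logic follows; the sample URLs and the `allowed_domains` value are illustrative, and it is simplified in one respect: the spider stores the raw `link.url` in `links_reached`, while this sketch stores the cleaned URL.

```python
from urllib.parse import urldefrag


def clean_url(url):
    """Strip the '#fragment' part of a URL, as in the commit."""
    url, __ = urldefrag(url)
    return url


def classify(urls, allowed_domains):
    """Mirror the parse() loop: split links into internal and external."""
    links_reached = set()
    external = []
    for raw in urls:
        url = clean_url(raw)
        if url in links_reached:
            continue
        # Same substring test as the spider's `if domain in url`.
        is_internal = any(domain in url for domain in allowed_domains)
        if is_internal:
            links_reached.add(url)
        else:
            external.append(url)
    return links_reached, external


internal, external = classify(
    ["https://example.org/a#top", "https://example.org/a", "https://other.net/b"],
    allowed_domains=["example.org"],
)
print(internal)  # {'https://example.org/a'} -- the fragment variant is deduplicated
print(external)  # ['https://other.net/b']
```

Two details of the commit worth noting: the substring test means a URL such as `https://other.net/?ref=example.org` would be counted as internal, a property of the original code rather than of this sketch; and `MAX_LINKS = None` now disables the per-site cap entirely via `if not MAX_LINKS or len(self.links_reached) < MAX_LINKS`. In the pipeline, `nb_internal_links` is set to `len(crawl_result["urls"]) - 1`, presumably to exclude the start URL from the count.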