author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 23:15:17 +0200
---|---|---
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-07 23:15:45 +0200
commit | 838fd71728067d34a490f4a6fcaa3a09a460ef0a (patch) |
tree | 1d45ace92fce2be464fd85ffeb8181fac8db1de7 |
parent | 37036fde028b0c1c5f9db06b71bbd4bc9a287e51 (diff) |
download | Comm-on-net-838fd71728067d34a490f4a6fcaa3a09a460ef0a.tar.bz2, Comm-on-net-838fd71728067d34a490f4a6fcaa3a09a460ef0a.zip |
Count external links
-rw-r--r-- | Makefile | 5
-rw-r--r-- | commcrawler/admin.py | 5
-rw-r--r-- | commcrawler/scrapy.py | 26

3 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,9 @@ regenerate_all: migrate ## regenerate all the database
 	$(PYTHON) manage.py import_csv_autres data_src/autres.csv
 
+crawl: ## launch a crawl on the first scheduled crawler
+	$(PYTHON) manage.py launch_crawl --first-available
+
 run: ## run test server
 	$(PYTHON) manage.py runserver 0.0.0.0:8000
 
@@ -37,4 +40,4 @@ generate_graphs: ## generate graph model for documentation
 	$(PYTHON) manage.py graph_models \
 		-S --arrow-shape normal -g -n -L fr-fr $(APPS) > /tmp/$(PROJECT).dot
 	dot -Tpng /tmp/$(PROJECT).dot > docs/images/models.png
-	rm /tmp/$(PROJECT).dot
\ No newline at end of file
+	rm /tmp/$(PROJECT).dot
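The new `crawl` target delegates to the project's `launch_crawl` management command, which is not part of this diff. For readers unfamiliar with Django management commands, here is a minimal sketch of what such a command could look like; everything beyond the command name and the `--first-available` flag is an assumption, not the project's actual code:

```python
# commcrawler/management/commands/launch_crawl.py
# Hypothetical sketch only: the real command ships with the project
# and is not shown in this commit.
from django.core.management.base import BaseCommand


class Command(BaseCommand):
    help = "Launch a crawl (sketch; the real options may differ)"

    def add_arguments(self, parser):
        # The Makefile target passes this flag.
        parser.add_argument("--first-available", action="store_true",
                            help="run the first crawl waiting to be launched")

    def handle(self, *args, **options):
        # argparse maps "--first-available" to options["first_available"]
        if options["first_available"]:
            self.stdout.write("launching the first available crawl")
```

With the target in place, `make crawl` becomes shorthand for `python manage.py launch_crawl --first-available`.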
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index a89c871..015457e 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -6,6 +6,7 @@ from pygments.formatters.html import HtmlFormatter
 from ajax_select import make_ajax_form
 
 from django.contrib import admin
+from django.utils.translation import ugettext_lazy as _
 from django.utils.safestring import mark_safe
 
 from commonnet.admin_site import admin_site
@@ -36,7 +37,7 @@ class CrawlResultAdmin(admin.ModelAdmin):
         "nb_images", "nb_facebook", "nb_twitter", "nb_instagram",
         "nb_youtube", "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio",
         "nb_internal_pdf", "nb_external_pdf", "nb_internal_office", "nb_external_office",
-        "is_online", "redirection",
+        "is_online", "redirection", "crawl_result_prettified"
     )
     exclude = ("crawl_result",)
     form = make_ajax_form(model, {'target': 'target'})
@@ -48,6 +49,8 @@ class CrawlResultAdmin(admin.ModelAdmin):
         style = "<style>" + formatter.get_style_defs() + "</style><br>"
         return mark_safe(style + response)
 
+    crawl_result_prettified.short_description = _("Crawl result")
+
 
 admin_site.register(models.CrawlResult, CrawlResultAdmin)
 
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 47a0521..7f7bea5 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
 import datetime
 import tldextract
+from urllib.parse import urldefrag
 
 import scrapy
 from scrapy.crawler import CrawlerProcess
@@ -13,9 +14,6 @@ from django.utils import timezone
 from . import models
 
 """
-nb_external_link
-nb_internal_link
-nb_images
 nb_facebook
 nb_twitter
 nb_instagram
@@ -33,7 +31,13 @@ redirection
 CrawlLink
 """
 
-MAX_LINKS = 500
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # remove anchors
+    return url
+
+
+MAX_LINKS = None  # if None no max
 
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
 
@@ -112,17 +116,24 @@
         try:
             self._parse_image(response, result)
             for link in LinkExtractor().extract_links(response):
-                url = link.url
+                url = clean_url(link.url)
                 if url is None or url in self.links_reached:
                     continue
+                is_internal = False
                 for domain in self.allowed_domains:
                     if domain in url:
+                        is_internal = True
                         self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
+                        if not MAX_LINKS or \
+                                len(self.links_reached) < MAX_LINKS:
                             yield response.follow(link.url, self.parse)
                         else:
                             print("MAX", self.allowed_domains,
                                   self.links_reached)
+                if not is_internal:
+                    if "external_link" not in result:
+                        result["external_link"] = []
+                    result["external_link"].append(url)
         except NotSupported:
             print("No response", response.url)
         yield result
@@ -137,7 +148,7 @@ class DbPipeline:
         "facebook", "twitter", "instagram", "youtube",
         "dailymotion", "vimeo", "video", "audio",
         "internal_pdf", "external_pdf", "internal_office",
-        "external_office", ]
+        "external_office"]
 
     def _get_result_pk(self, spider):
         """
@@ -179,6 +190,7 @@ class DbPipeline:
         if url in crawl_result["urls"]:
             return
         crawl_result["urls"].append(url)
+        result.nb_internal_links = len(crawl_result["urls"]) - 1
         for k, value in item.items():
             if k == "is_online":
                 if result_created:  # only update on the first link
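Taken together, the commcrawler/scrapy.py changes strip the fragment from every extracted URL, treat a URL as internal when any allowed domain occurs in it as a substring, and accumulate everything else under `result["external_link"]`. A self-contained sketch of that classification logic follows; the sample URLs and the `allowed_domains` value are illustrative, and it is simplified in one respect: the spider stores the raw `link.url` in `links_reached`, while this sketch stores the cleaned URL.

```python
from urllib.parse import urldefrag


def clean_url(url):
    """Strip the '#fragment' part of a URL, as in the commit."""
    url, __ = urldefrag(url)
    return url


def classify(urls, allowed_domains):
    """Mirror the parse() loop: split links into internal and external."""
    links_reached = set()
    external = []
    for raw in urls:
        url = clean_url(raw)
        if url in links_reached:
            continue
        # Same substring test as the spider's `if domain in url`.
        is_internal = any(domain in url for domain in allowed_domains)
        if is_internal:
            links_reached.add(url)
        else:
            external.append(url)
    return links_reached, external


internal, external = classify(
    ["https://example.org/a#top", "https://example.org/a", "https://other.net/b"],
    allowed_domains=["example.org"],
)
print(internal)  # {'https://example.org/a'} -- the fragment variant is deduplicated
print(external)  # ['https://other.net/b']
```

Two details of the commit worth noting: the substring test means a URL such as `https://other.net/?ref=example.org` would be counted as internal, a property of the original code rather than of this sketch; and `MAX_LINKS = None` now disables the per-site cap entirely via `if not MAX_LINKS or len(self.links_reached) < MAX_LINKS`. In the pipeline, `nb_internal_links` is set to `len(crawl_result["urls"]) - 1`, presumably to exclude the start URL from the count.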