 Makefile              |  5 ++++-
 commcrawler/admin.py  |  5 ++++-
 commcrawler/scrapy.py | 26 +++++++++++++++++++-------
 3 files changed, 27 insertions(+), 9 deletions(-)
diff --git a/Makefile b/Makefile
index 20e1e24..9ce49e9 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,9 @@ regenerate_all: migrate ## regenerate all the database
 	$(PYTHON) manage.py import_csv_autres data_src/autres.csv
 
+crawl: ## launch the first scheduled crawl
+	$(PYTHON) manage.py launch_crawl --first-available
+
 run: ## run test server
 	$(PYTHON) manage.py runserver 0.0.0.0:8000
@@ -37,4 +40,4 @@ generate_graphs: ## generate graph model for documentation
 	$(PYTHON) manage.py graph_models \
 		-S --arrow-shape normal -g -n -L fr-fr $(APPS) > /tmp/$(PROJECT).dot
 	dot -Tpng /tmp/$(PROJECT).dot > docs/images/models.png
-	rm /tmp/$(PROJECT).dot
\ No newline at end of file
+	rm /tmp/$(PROJECT).dot
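
For context, the new target just wraps a Django management command. A minimal
sketch of what launch_crawl might look like follows; only the command name and
the --first-available flag appear in this diff, so the module path, the Crawl
model, and its "status" field are assumptions made for illustration:

    # commcrawler/management/commands/launch_crawl.py (hypothetical sketch)
    from django.core.management.base import BaseCommand

    from commcrawler import models


    class Command(BaseCommand):
        help = "Launch a crawl"

        def add_arguments(self, parser):
            parser.add_argument(
                "--first-available", action="store_true",
                help="run the first crawl still waiting to be launched")

        def handle(self, *args, **options):
            if options["first_available"]:
                # "status" and "planned" are assumed names, not from the diff
                crawl = models.Crawl.objects.filter(status="planned").first()
                if crawl is None:
                    self.stderr.write("No crawl to launch")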
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index a89c871..015457e 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -6,6 +6,7 @@ from pygments.formatters.html import HtmlFormatter
from ajax_select import make_ajax_form
from django.contrib import admin
+from django.utils.translation import ugettext_lazy as _
from django.utils.safestring import mark_safe
from commonnet.admin_site import admin_site
@@ -36,7 +37,7 @@ class CrawlResultAdmin(admin.ModelAdmin):
"nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
"nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
"nb_external_pdf", "nb_internal_office", "nb_external_office",
- "is_online", "redirection",
+ "is_online", "redirection", "crawl_result_prettified"
)
exclude = ("crawl_result",)
form = make_ajax_form(model, {'target': 'target'})
@@ -48,6 +49,8 @@ class CrawlResultAdmin(admin.ModelAdmin):
         style = "<style>" + formatter.get_style_defs() + "</style><br>"
         return mark_safe(style + response)
 
+    crawl_result_prettified.short_description = _("Crawl result")
+
 
 admin_site.register(models.CrawlResult, CrawlResultAdmin)
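
The short_description attribute is standard Django admin usage: when a computed
method appears in list_display, the admin uses this attribute as the column
header instead of the method name. A self-contained illustration (the model and
method names here are invented for the example):

    from django.contrib import admin
    from django.utils.translation import ugettext_lazy as _

    class PageAdmin(admin.ModelAdmin):
        list_display = ("title", "body_prettified")

        def body_prettified(self, obj):
            # value rendered in the change-list column
            return obj.body.upper()
        # translated label used as the column header
        body_prettified.short_description = _("Body")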
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 47a0521..7f7bea5 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -1,5 +1,6 @@
import datetime
import tldextract
+from urllib.parse import urldefrag
import scrapy
from scrapy.crawler import CrawlerProcess
@@ -13,9 +14,6 @@ from django.utils import timezone
from . import models
"""
-nb_external_link
-nb_internal_link
-nb_images
nb_facebook
nb_twitter
nb_instagram
@@ -33,7 +31,13 @@ redirection
CrawlLink
"""
-MAX_LINKS = 500
+
+def clean_url(url):
+    url, __ = urldefrag(url)  # strip the #fragment so anchor variants match
+    return url
+
+
+MAX_LINKS = None  # None means no limit
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
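
For reference, urldefrag splits a URL into its base and fragment, so clean_url
maps every anchor-only variant of a page onto a single entry in links_reached:

    >>> from urllib.parse import urldefrag
    >>> urldefrag("https://example.org/page#section-2")
    DefragResult(url='https://example.org/page', fragment='section-2')
    >>> urldefrag("https://example.org/page")
    DefragResult(url='https://example.org/page', fragment='')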
@@ -112,17 +116,24 @@ class DefaultSpider:
         try:
             self._parse_image(response, result)
             for link in LinkExtractor().extract_links(response):
-                url = link.url
+                url = clean_url(link.url)
                 if url is None or url in self.links_reached:
                     continue
+                is_internal = False
                 for domain in self.allowed_domains:
                     if domain in url:
+                        is_internal = True
                         self.links_reached.add(link.url)
-                        if len(self.links_reached) < MAX_LINKS:
+                        if not MAX_LINKS or \
+                                len(self.links_reached) < MAX_LINKS:
                             yield response.follow(link.url, self.parse)
                         else:
                             print("MAX", self.allowed_domains,
                                   self.links_reached)
+                if not is_internal:
+                    if "external_link" not in result:
+                        result["external_link"] = []
+                    result["external_link"].append(url)
         except NotSupported:
             print("No response", response.url)
         yield result
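
Condensed, the loop above classifies each extracted link by substring match
against allowed_domains, follows only internal ones, and accumulates external
URLs on the item. A stripped-down sketch of that classification (names follow
the diff; the function wrapper is for illustration only):

    def is_internal_link(url, allowed_domains, links_reached):
        """Return True if url matches an allowed domain (internal link)."""
        for domain in allowed_domains:
            if domain in url:  # substring match, as in the diff
                links_reached.add(url)
                return True
        return False

As a design note, assuming result is a plain dict, the three-line accumulation
could also be written result.setdefault("external_link", []).append(url).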
@@ -137,7 +148,7 @@ class DbPipeline:
"facebook", "twitter", "instagram", "youtube",
"dailymotion", "vimeo", "video", "audio",
"internal_pdf", "external_pdf", "internal_office",
- "external_office", ]
+ "external_office"]
def _get_result_pk(self, spider):
"""
@@ -179,6 +190,7 @@ class DbPipeline:
         if url in crawl_result["urls"]:
             return
         crawl_result["urls"].append(url)
+        result.nb_internal_links = len(crawl_result["urls"]) - 1
         for k, value in item.items():
             if k == "is_online":
                 if result_created:  # only update on the first link
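
The - 1 in nb_internal_links appears to discount the first entry of
crawl_result["urls"], i.e. the page's own URL recorded when crawling starts;
that reading is an inference, not stated in the diff. The bookkeeping reduces
to:

    crawl_result = {"urls": []}
    for url in ("https://example.org/",    # the target page itself
                "https://example.org/a",
                "https://example.org/b"):
        if url not in crawl_result["urls"]:
            crawl_result["urls"].append(url)
    nb_internal_links = len(crawl_result["urls"]) - 1  # 2 links besides the page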