-rw-r--r--  commcrawler/admin.py    11
-rw-r--r--  commcrawler/models.py   30
-rw-r--r--  commcrawler/scrapy.py   95
3 files changed, 98 insertions, 38 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 015457e..71a44a5 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -27,9 +27,14 @@ admin_site.register(models.Crawl, CrawlAdmin)
class CrawlResultAdmin(admin.ModelAdmin):
    model = models.CrawlResult
-    list_display = ("target", "crawl", "started", "duration", "status",
-                    "is_online")
-    list_filter = ("status", "crawl")
+    list_display = (
+        "target", "crawl", "started", "duration", "status", "is_online",
+        "nb_external_link", "nb_internal_link",
+        "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
+        "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
+        "nb_external_pdf", "nb_internal_office", "nb_external_office"
+    )
+    list_filter = ("status", "crawl", "is_online")
    search_fields = ("target__name",)
    readonly_fields = (
diff --git a/commcrawler/models.py b/commcrawler/models.py
index be371ca..e715408 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -68,35 +68,35 @@ class CrawlResult(models.Model):
        max_length=1, choices=STATUS, default='P')
    crawl_result = JSONField(verbose_name=_("Crawl result"), default=list)
    nb_external_link = models.IntegerField(
-        verbose_name=_("Number of external links"), default=0)
+        verbose_name=_("External links"), default=0)
    nb_internal_link = models.IntegerField(
-        verbose_name=_("Number of internal links"), default=0)
+        verbose_name=_("Internal links"), default=0)
    nb_images = models.IntegerField(
-        verbose_name=_("Number of images"), default=0)
+        verbose_name=_("Images"), default=0)
    nb_facebook = models.IntegerField(
-        verbose_name=_("Number of Facebook links"), default=0)
+        verbose_name=_("Facebook links"), default=0)
    nb_twitter = models.IntegerField(
-        verbose_name=_("Number of Twitter links"), default=0)
+        verbose_name=_("Twitter links"), default=0)
    nb_instagram = models.IntegerField(
-        verbose_name=_("Number of Instagram links"), default=0)
+        verbose_name=_("Instagram links"), default=0)
    nb_youtube = models.IntegerField(
-        verbose_name=_("Number of Youtube links"), default=0)
+        verbose_name=_("Youtube links"), default=0)
    nb_dailymotion = models.IntegerField(
-        verbose_name=_("Number of Dailymotion links"), default=0)
+        verbose_name=_("Dailymotion links"), default=0)
    nb_vimeo = models.IntegerField(
-        verbose_name=_("Number of Vimeo links"), default=0)
+        verbose_name=_("Vimeo links"), default=0)
    nb_video = models.IntegerField(
-        verbose_name=_("Number of videos"), default=0)
+        verbose_name=_("Internal videos"), default=0)
    nb_audio = models.IntegerField(
-        verbose_name=_("Number of audios"), default=0)
+        verbose_name=_("Internal audios"), default=0)
    nb_internal_pdf = models.IntegerField(
-        verbose_name=_("Number of internal PDF"), default=0)
+        verbose_name=_("Internal PDF"), default=0)
    nb_external_pdf = models.IntegerField(
-        verbose_name=_("Number of external PDF"), default=0)
+        verbose_name=_("External PDF"), default=0)
    nb_internal_office = models.IntegerField(
-        verbose_name=_("Number of internal office documents"), default=0)
+        verbose_name=_("Internal office documents"), default=0)
    nb_external_office = models.IntegerField(
-        verbose_name=_("Number of external office documents"), default=0)
+        verbose_name=_("External office documents"), default=0)
    is_online = models.BooleanField(
        verbose_name=_("Website is online"), default=False)
    redirection = models.URLField(
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7f7bea5..d24c3c2 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -14,29 +14,35 @@ from django.utils import timezone
from . import models
"""
-nb_facebook
-nb_twitter
-nb_instagram
-nb_youtube
-nb_dailymotion
-nb_vimeo
-nb_video
-nb_audio
-nb_internal_pdf
-nb_external_pdf
-nb_internal_office
-nb_external_office
redirection
-
CrawlLink
"""
+FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
+TWITTER_DOMAINS = ("twitter.com", "twimg.com", "twttr.net", "twttr.com",
+                   "abs.twimg.com")
+INSTAGRAM_DOMAINS = ("instagram.com", "cdninstagram.com")
+YOUTUBE_DOMAINS = ("youtu.be", "youtube.com")
+DAILYMOTION_DOMAINS = ("dailymotion.com",)
+VIMEO_DOMAINS = ("vimeo.com",)
+VIDEO_EXTS = (".webm", ".mkv", ".flv", ".ogv", ".mov", ".wmv", ".avi", ".mpg",
+              ".mp4", ".m4v", ".mp2", ".mpeg")
+AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
+              ".wma", ".webm")
+OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
+
def clean_url(url):
    url, __ = urldefrag(url)  # remove anchors
    return url
+def append_to_results(results, key, value):
+    if key not in results:
+        results[key] = []
+    results[key].append(value)
+
+
MAX_LINKS = None  # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
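The new *_DOMAINS tuples are matched against the value returned by get_domain(), which is used below but not defined in this diff. A minimal sketch of the intended lookup, assuming get_domain() reduces a URL to its registered domain (the real helper in commcrawler/scrapy.py may differ):

from urllib.parse import urlparse

FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
YOUTUBE_DOMAINS = ("youtu.be", "youtube.com")


def get_domain(url):
    # Hypothetical stand-in for the project's get_domain() helper: keep the
    # last two labels of the host, so "www.youtube.com" becomes "youtube.com".
    host = urlparse(url).netloc.lower().split(":")[0]
    return ".".join(host.split(".")[-2:])


def append_to_results(results, key, value):
    # Equivalent of the append_to_results() helper added in this diff:
    # group matching URLs under one key of the result dict.
    results.setdefault(key, []).append(value)


result = {}
for url in ("https://www.youtube.com/embed/abc123",
            "https://www.facebook.com/some.page"):
    domain = get_domain(url)
    if domain in YOUTUBE_DOMAINS:
        append_to_results(result, "youtube", url)
    elif domain in FACEBOOK_DOMAINS:
        append_to_results(result, "facebook", url)

print(result)  # {'youtube': ['https://www.youtube.com/embed/abc123'], 'facebook': [...]}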
@@ -81,6 +87,43 @@ class DefaultSpider:
                continue
            result["images"].append(src)
+    def _parse_iframe(self, response, result):
+        for img in response.css('iframe'):
+            attributes = img.attrib
+            if "src" not in attributes:
+                continue
+            src = attributes["src"]
+            is_a_real_src = src.startswith("http") or src.startswith("/")
+            if not src or not is_a_real_src:
+                continue
+            current_domain = get_domain(src)
+            if current_domain in YOUTUBE_DOMAINS:
+                append_to_results(result, "youtube", src)
+            elif current_domain in DAILYMOTION_DOMAINS:
+                append_to_results(result, "dailymotion", src)
+            elif current_domain in VIMEO_DOMAINS:
+                append_to_results(result, "vimeo", src)
+
+    def _parse_internal_files(self, url, result):
+        types = (("video", VIDEO_EXTS), ("audio", AUDIO_EXTS),
+                 ("internal_pdf", ("pdf",)), ("internal_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_external_files(self, url, result):
+        types = (("external_pdf", ("pdf",)), ("external_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_files(self, url, result, types):
+        """
+        Parse url for file
+        :return: True if is a file
+        """
+        url = url.lower()
+        for content_type, extensions in types:
+            if [1 for ext in extensions if url.endswith(ext)]:
+                append_to_results(result, content_type, url)
+                return True
+
    def timeout(self):
        if not self.crawl_result:
            q = {
@@ -115,6 +158,7 @@ class DefaultSpider:
        result["is_online"] = True
        try:
            self._parse_image(response, result)
+            self._parse_iframe(response, result)
            for link in LinkExtractor().extract_links(response):
                url = clean_url(link.url)
                if url is None or url in self.links_reached:
@@ -123,17 +167,28 @@
                for domain in self.allowed_domains:
                    if domain in url:
                        is_internal = True
-                        self.links_reached.add(link.url)
-                        if not MAX_LINKS or \
+                        self.links_reached.add(url)
+                        is_file = self._parse_internal_files(url, result)
+                        if is_file:
+                            pass
+                        elif not MAX_LINKS or \
                                len(self.links_reached) < MAX_LINKS:
                            yield response.follow(link.url, self.parse)
                        else:
                            print("MAX", self.allowed_domains,
                                  self.links_reached)
                if not is_internal:
-                    if "external_link" not in result:
-                        result["external_link"] = []
-                    result["external_link"].append(url)
+                    current_domain = get_domain(url)
+                    if current_domain in FACEBOOK_DOMAINS:
+                        append_to_results(result, "facebook", url)
+                    elif current_domain in TWITTER_DOMAINS:
+                        append_to_results(result, "twitter", url)
+                    elif current_domain in INSTAGRAM_DOMAINS:
+                        append_to_results(result, "instagram", url)
+                    else:
+                        is_file = self._parse_external_files(url, result)
+                        if not is_file:
+                            append_to_results(result, "external_link", url)
        except NotSupported:
            print("No response", response.url)
        yield result
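For reference, the extension matching performed by the new _parse_files() helper boils down to the sketch below. The extension tuples are the ones added in this diff, while classify_file() and the example.org URLs are illustrative only, and PDF detection is written here with a leading dot:

VIDEO_EXTS = (".webm", ".mkv", ".flv", ".ogv", ".mov", ".wmv", ".avi", ".mpg",
              ".mp4", ".m4v", ".mp2", ".mpeg")
AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
              ".wma", ".webm")
OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")


def classify_file(url):
    # Return the result bucket for a file URL, or None for a plain page.
    url = url.lower()
    for bucket, extensions in (("video", VIDEO_EXTS),
                               ("audio", AUDIO_EXTS),
                               ("internal_pdf", (".pdf",)),
                               ("internal_office", OFFICE_EXTS)):
        if any(url.endswith(ext) for ext in extensions):
            return bucket
    return None


assert classify_file("https://example.org/media/clip.MP4") == "video"
assert classify_file("https://example.org/doc/report.pdf") == "internal_pdf"
assert classify_file("https://example.org/about.html") is None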
@@ -144,7 +199,7 @@ class DefaultSpider:
class DbPipeline:
    BASE_KEYS = ["url", "crawl_id", "target_id"]
-    NB_KEYS = ["external_link", "internal_link", "images",
+    NB_KEYS = ["external_link", "images",
               "facebook", "twitter", "instagram", "youtube",
               "dailymotion", "vimeo", "video", "audio",
               "internal_pdf", "external_pdf", "internal_office",
@@ -190,7 +245,6 @@ class DbPipeline:
        if url in crawl_result["urls"]:
            return
        crawl_result["urls"].append(url)
-        result.nb_internal_links = len(crawl_result["urls"]) - 1
        for k, value in item.items():
            if k == "is_online":
                if result_created:  # only update on the first link
@@ -203,6 +257,7 @@ class DbPipeline:
                    continue
                crawl_result[k].append(subvalue)
            setattr(result, "nb_" + k, len(crawl_result[k]))
+        result.nb_internal_link = len(crawl_result["urls"]) - 1
        result.crawl_result = [crawl_result]
        result.save()
        return True
return True