path: root/commcrawler/scrapy.py
author     Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-08 18:49:35 +0200
committer  Étienne Loks <etienne.loks@iggdrasil.net>  2019-08-08 18:49:35 +0200
commit     969ad20715b00f35d845f84b748c87f0c0a5a54e (patch)
tree       a984146e4476857e393eb8d1855120bc862400db /commcrawler/scrapy.py
parent     ed9f173ac5a5e5af7670dfd7f783942e80bdf770 (diff)
download   Comm-on-net-969ad20715b00f35d845f84b748c87f0c0a5a54e.tar.bz2
           Comm-on-net-969ad20715b00f35d845f84b748c87f0c0a5a54e.zip
Count video, audio, etc.
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r--  commcrawler/scrapy.py  95
1 file changed, 75 insertions, 20 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7f7bea5..d24c3c2 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -14,29 +14,35 @@ from django.utils import timezone
from . import models
"""
-nb_facebook
-nb_twitter
-nb_instagram
-nb_youtube
-nb_dailymotion
-nb_vimeo
-nb_video
-nb_audio
-nb_internal_pdf
-nb_external_pdf
-nb_internal_office
-nb_external_office
redirection
-
CrawlLink
"""
+FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
+TWITTER_DOMAINS = ("twitter.com", "twimg.com", "twttr.net", "twttr.com",
+                   "abs.twimg.com")
+INSTAGRAM_DOMAINS = ("instagram.com", "cdninstagram.com")
+YOUTUBE_DOMAINS = ("youtu.be", "youtube.com")
+DAILYMOTION_DOMAINS = ("dailymotion.com",)
+VIMEO_DOMAINS = ("vimeo.com",)
+VIDEO_EXTS = (".webm", ".mkv", ".flv", ".ogv", ".mov", ".wmv", ".avi", ".mpg",
+              ".mp4", ".m4v", ".mp2", ".mpeg")
+AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
+              ".wma", ".webm")
+OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
+
def clean_url(url):
    url, __ = urldefrag(url)  # remove anchors
    return url
+def append_to_results(results, key, value):
+ if key not in results:
+ results[key] = []
+ results[key].append(value)
+
+
MAX_LINKS = None # if None no max
TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
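
Taken on their own, the two module-level helpers introduced above are easy to exercise; the stand-alone sketch below simply re-declares them, together with the urllib.parse import they rely on, for illustration only:

from urllib.parse import urldefrag

def clean_url(url):
    url, __ = urldefrag(url)  # strip the #fragment part
    return url

def append_to_results(results, key, value):
    # lazily create the per-key list so untouched keys never appear
    if key not in results:
        results[key] = []
    results[key].append(value)

result = {}
append_to_results(result, "youtube", clean_url("https://youtu.be/abc#t=42"))
append_to_results(result, "youtube", clean_url("https://youtu.be/def"))
print(result)  # {'youtube': ['https://youtu.be/abc', 'https://youtu.be/def']}

append_to_results keeps the per-page result dictionary sparse: a key such as "youtube" only shows up once the first matching URL has been seen.
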
@@ -81,6 +87,43 @@ class DefaultSpider:
                continue
            result["images"].append(src)
+    def _parse_iframe(self, response, result):
+        for img in response.css('iframe'):
+            attributes = img.attrib
+            if "src" not in attributes:
+                continue
+            src = attributes["src"]
+            is_a_real_src = src.startswith("http") or src.startswith("/")
+            if not src or not is_a_real_src:
+                continue
+            current_domain = get_domain(src)
+            if current_domain in YOUTUBE_DOMAINS:
+                append_to_results(result, "youtube", src)
+            elif current_domain in DAILYMOTION_DOMAINS:
+                append_to_results(result, "dailymotion", src)
+            elif current_domain in VIMEO_DOMAINS:
+                append_to_results(result, "vimeo", src)
+
+    def _parse_internal_files(self, url, result):
+        types = (("video", VIDEO_EXTS), ("audio", AUDIO_EXTS),
+                 ("internal_pdf", ("pdf",)), ("internal_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_external_files(self, url, result):
+        types = (("external_pdf", ("pdf",)), ("external_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_files(self, url, result, types):
+        """
+        Parse url for file
+        :return: True if is a file
+        """
+        url = url.lower()
+        for content_type, extensions in types:
+            if [1 for ext in extensions if url.endswith(ext)]:
+                append_to_results(result, content_type, url)
+                return True
+
    def timeout(self):
        if not self.crawl_result:
            q = {
@@ -115,6 +158,7 @@ class DefaultSpider:
        result["is_online"] = True
        try:
            self._parse_image(response, result)
+            self._parse_iframe(response, result)
            for link in LinkExtractor().extract_links(response):
                url = clean_url(link.url)
                if url is None or url in self.links_reached:
@@ -123,17 +167,28 @@ class DefaultSpider:
                for domain in self.allowed_domains:
                    if domain in url:
                        is_internal = True
-                        self.links_reached.add(link.url)
-                        if not MAX_LINKS or \
+                        self.links_reached.add(url)
+                        is_file = self._parse_internal_files(url, result)
+                        if is_file:
+                            pass
+                        elif not MAX_LINKS or \
                                len(self.links_reached) < MAX_LINKS:
                            yield response.follow(link.url, self.parse)
                        else:
                            print("MAX", self.allowed_domains,
                                  self.links_reached)
                if not is_internal:
-                    if "external_link" not in result:
-                        result["external_link"] = []
-                    result["external_link"].append(url)
+                    current_domain = get_domain(url)
+                    if current_domain in FACEBOOK_DOMAINS:
+                        append_to_results(result, "facebook", url)
+                    elif current_domain in TWITTER_DOMAINS:
+                        append_to_results(result, "twitter", url)
+                    elif current_domain in INSTAGRAM_DOMAINS:
+                        append_to_results(result, "instagram", url)
+                    else:
+                        is_file = self._parse_external_files(url, result)
+                        if not is_file:
+                            append_to_results(result, "external_link", url)
        except NotSupported:
            print("No response", response.url)
        yield result
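
The external-link branch of parse() now reduces to a routing decision on the link's domain. The sketch below mirrors that decision as a pure function; get_domain here is a hypothetical stand-in (the real helper is imported elsewhere in the module and is not part of this diff), and the external-file check that runs in the final else branch is left out:

from urllib.parse import urlparse

FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
TWITTER_DOMAINS = ("twitter.com", "twimg.com", "twttr.net", "twttr.com",
                   "abs.twimg.com")
INSTAGRAM_DOMAINS = ("instagram.com", "cdninstagram.com")

def get_domain(url):
    # assumed stand-in for the module's real get_domain helper:
    # keep only the last two labels of the host name
    host = urlparse(url).netloc.split(":")[0]
    return ".".join(host.split(".")[-2:])

def classify_external(url):
    domain = get_domain(url)
    if domain in FACEBOOK_DOMAINS:
        return "facebook"
    if domain in TWITTER_DOMAINS:
        return "twitter"
    if domain in INSTAGRAM_DOMAINS:
        return "instagram"
    return "external_link"

print(classify_external("https://www.facebook.com/commune"))   # facebook
print(classify_external("https://pbs.twimg.com/media/x.jpg"))  # twitter
print(classify_external("https://example.org/page.html"))      # external_link

In the spider itself the chosen bucket is filled through append_to_results, so each crawled page ends up with at most one list per network.
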
@@ -144,7 +199,7 @@ class DefaultSpider:
class DbPipeline:
    BASE_KEYS = ["url", "crawl_id", "target_id"]
-    NB_KEYS = ["external_link", "internal_link", "images",
+    NB_KEYS = ["external_link", "images",
               "facebook", "twitter", "instagram", "youtube",
               "dailymotion", "vimeo", "video", "audio",
               "internal_pdf", "external_pdf", "internal_office",
@@ -190,7 +245,6 @@ class DbPipeline:
        if url in crawl_result["urls"]:
            return
        crawl_result["urls"].append(url)
-        result.nb_internal_links = len(crawl_result["urls"]) - 1
        for k, value in item.items():
            if k == "is_online":
                if result_created: # only update on the first link
@@ -203,6 +257,7 @@ class DbPipeline:
                    continue
                crawl_result[k].append(subvalue)
            setattr(result, "nb_" + k, len(crawl_result[k]))
+        result.nb_internal_link = len(crawl_result["urls"]) - 1
        result.crawl_result = [crawl_result]
        result.save()
        return True
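
On the pipeline side the pattern is: every de-duplicated list in crawl_result feeds a matching nb_<key> attribute on the result object, and nb_internal_link is now derived once per item from the urls list instead of every time a URL is added. A rough stand-alone sketch of that pattern, with a plain dict and a dummy object standing in for the Django model and the is_online handling skipped:

class DummyResult:
    """Hypothetical stand-in for the Django result model used by DbPipeline."""
    pass

def update_counts(result, crawl_result, item):
    for k, value in item.items():
        if k == "is_online":
            continue  # handled separately in the real pipeline
        crawl_result.setdefault(k, [])
        for subvalue in value:
            if subvalue in crawl_result[k]:
                continue  # count each value only once
            crawl_result[k].append(subvalue)
        setattr(result, "nb_" + k, len(crawl_result[k]))
    # one internal link per crawled URL, minus the start page itself
    result.nb_internal_link = len(crawl_result["urls"]) - 1

result, crawl_result = DummyResult(), {"urls": ["a", "b", "c"]}
update_counts(result, crawl_result, {"facebook": ["fb1", "fb1", "fb2"]})
print(result.nb_facebook, result.nb_internal_link)  # 2 2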