-rw-r--r--   commcrawler/admin.py  | 11
-rw-r--r--   commcrawler/models.py | 30
-rw-r--r--   commcrawler/scrapy.py | 95
3 files changed, 98 insertions, 38 deletions
diff --git a/commcrawler/admin.py b/commcrawler/admin.py
index 015457e..71a44a5 100644
--- a/commcrawler/admin.py
+++ b/commcrawler/admin.py
@@ -27,9 +27,14 @@ admin_site.register(models.Crawl, CrawlAdmin)
 
 
 class CrawlResultAdmin(admin.ModelAdmin):
     model = models.CrawlResult
-    list_display = ("target", "crawl", "started", "duration", "status",
-                    "is_online")
-    list_filter = ("status", "crawl")
+    list_display = (
+        "target", "crawl", "started", "duration", "status", "is_online",
+        "nb_external_link", "nb_internal_link",
+        "nb_images", "nb_facebook", "nb_twitter", "nb_instagram", "nb_youtube",
+        "nb_dailymotion", "nb_vimeo", "nb_video", "nb_audio", "nb_internal_pdf",
+        "nb_external_pdf", "nb_internal_office", "nb_external_office"
+    )
+    list_filter = ("status", "crawl", "is_online")
     search_fields = ("target__name",)
     readonly_fields = (
diff --git a/commcrawler/models.py b/commcrawler/models.py
index be371ca..e715408 100644
--- a/commcrawler/models.py
+++ b/commcrawler/models.py
@@ -68,35 +68,35 @@ class CrawlResult(models.Model):
         max_length=1, choices=STATUS, default='P')
     crawl_result = JSONField(verbose_name=_("Crawl result"), default=list)
     nb_external_link = models.IntegerField(
-        verbose_name=_("Number of external links"), default=0)
+        verbose_name=_("External links"), default=0)
     nb_internal_link = models.IntegerField(
-        verbose_name=_("Number of internal links"), default=0)
+        verbose_name=_("Internal links"), default=0)
     nb_images = models.IntegerField(
-        verbose_name=_("Number of images"), default=0)
+        verbose_name=_("Images"), default=0)
     nb_facebook = models.IntegerField(
-        verbose_name=_("Number of Facebook links"), default=0)
+        verbose_name=_("Facebook links"), default=0)
     nb_twitter = models.IntegerField(
-        verbose_name=_("Number of Twitter links"), default=0)
+        verbose_name=_("Twitter links"), default=0)
     nb_instagram = models.IntegerField(
-        verbose_name=_("Number of Instagram links"), default=0)
+        verbose_name=_("Instagram links"), default=0)
     nb_youtube = models.IntegerField(
-        verbose_name=_("Number of Youtube links"), default=0)
+        verbose_name=_("Youtube links"), default=0)
     nb_dailymotion = models.IntegerField(
-        verbose_name=_("Number of Dailymotion links"), default=0)
+        verbose_name=_("Dailymotion links"), default=0)
     nb_vimeo = models.IntegerField(
-        verbose_name=_("Number of Vimeo links"), default=0)
+        verbose_name=_("Vimeo links"), default=0)
     nb_video = models.IntegerField(
-        verbose_name=_("Number of videos"), default=0)
+        verbose_name=_("Internal videos"), default=0)
     nb_audio = models.IntegerField(
-        verbose_name=_("Number of audios"), default=0)
+        verbose_name=_("Internal audios"), default=0)
     nb_internal_pdf = models.IntegerField(
-        verbose_name=_("Number of internal PDF"), default=0)
+        verbose_name=_("Internal PDF"), default=0)
     nb_external_pdf = models.IntegerField(
-        verbose_name=_("Number of external PDF"), default=0)
+        verbose_name=_("External PDF"), default=0)
     nb_internal_office = models.IntegerField(
-        verbose_name=_("Number of internal office documents"), default=0)
+        verbose_name=_("Internal office documents"), default=0)
     nb_external_office = models.IntegerField(
-        verbose_name=_("Number of external office documents"), default=0)
+        verbose_name=_("External office documents"), default=0)
     is_online = models.BooleanField(
         verbose_name=_("Website is online"), default=False)
     redirection = models.URLField(
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 7f7bea5..d24c3c2 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -14,29 +14,35 @@ from django.utils import timezone
 
 from . import models
 
 """
-nb_facebook
-nb_twitter
-nb_instagram
-nb_youtube
-nb_dailymotion
-nb_vimeo
-nb_video
-nb_audio
-nb_internal_pdf
-nb_external_pdf
-nb_internal_office
-nb_external_office
 redirection
-
 CrawlLink
 """
 
+FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
+TWITTER_DOMAINS = ("twitter.com", "twimg.com", "twttr.net", "twttr.com",
+                   "abs.twimg.com")
+INSTAGRAM_DOMAINS = ("instagram.com", "cdninstagram.com")
+YOUTUBE_DOMAINS = ("youtu.be", "youtube.com")
+DAILYMOTION_DOMAINS = ("dailymotion.com",)
+VIMEO_DOMAINS = ("vimeo.com",)
+VIDEO_EXTS = (".webm", ".mkv", ".flv", ".ogv", ".mov", ".wmv", ".avi", ".mpg",
+              ".mp4", ".m4v", ".mp2", ".mpeg")
+AUDIO_EXTS = (".aac", ".flac", ".m4a", ".mp3", ".ogg", ".oga", ".opus",
+              ".wma", ".webm")
+OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")
+
 
 def clean_url(url):
     url, __ = urldefrag(url)  # remove anchors
     return url
 
 
+def append_to_results(results, key, value):
+    if key not in results:
+        results[key] = []
+    results[key].append(value)
+
+
 MAX_LINKS = None  # if None no max
 TIMEOUT = datetime.timedelta(minutes=settings.CRAWL_TIMEOUT)
@@ -81,6 +87,43 @@ class DefaultSpider:
                 continue
             result["images"].append(src)
 
+    def _parse_iframe(self, response, result):
+        for img in response.css('iframe'):
+            attributes = img.attrib
+            if "src" not in attributes:
+                continue
+            src = attributes["src"]
+            is_a_real_src = src.startswith("http") or src.startswith("/")
+            if not src or not is_a_real_src:
+                continue
+            current_domain = get_domain(src)
+            if current_domain in YOUTUBE_DOMAINS:
+                append_to_results(result, "youtube", src)
+            elif current_domain in DAILYMOTION_DOMAINS:
+                append_to_results(result, "dailymotion", src)
+            elif current_domain in VIMEO_DOMAINS:
+                append_to_results(result, "vimeo", src)
+
+    def _parse_internal_files(self, url, result):
+        types = (("video", VIDEO_EXTS), ("audio", AUDIO_EXTS),
+                 ("internal_pdf", ("pdf",)), ("internal_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_external_files(self, url, result):
+        types = (("external_pdf", ("pdf",)), ("external_office", OFFICE_EXTS))
+        return self._parse_files(url, result, types)
+
+    def _parse_files(self, url, result, types):
+        """
+        Parse url for file
+        :return: True if is a file
+        """
+        url = url.lower()
+        for content_type, extensions in types:
+            if [1 for ext in extensions if url.endswith(ext)]:
+                append_to_results(result, content_type, url)
+                return True
+
     def timeout(self):
         if not self.crawl_result:
             q = {
@@ -115,6 +158,7 @@ class DefaultSpider:
         result["is_online"] = True
         try:
             self._parse_image(response, result)
+            self._parse_iframe(response, result)
             for link in LinkExtractor().extract_links(response):
                 url = clean_url(link.url)
                 if url is None or url in self.links_reached:
@@ -123,17 +167,28 @@ class DefaultSpider:
                 for domain in self.allowed_domains:
                     if domain in url:
                         is_internal = True
-                        self.links_reached.add(link.url)
-                        if not MAX_LINKS or \
+                        self.links_reached.add(url)
+                        is_file = self._parse_internal_files(url, result)
+                        if is_file:
+                            pass
+                        elif not MAX_LINKS or \
                                 len(self.links_reached) < MAX_LINKS:
                             yield response.follow(link.url, self.parse)
                         else:
                             print("MAX", self.allowed_domains,
                                   self.links_reached)
                 if not is_internal:
-                    if "external_link" not in result:
-                        result["external_link"] = []
-                    result["external_link"].append(url)
+                    current_domain = get_domain(url)
+                    if current_domain in FACEBOOK_DOMAINS:
+                        append_to_results(result, "facebook", url)
+                    elif current_domain in TWITTER_DOMAINS:
+                        append_to_results(result, "twitter", url)
+                    elif current_domain in INSTAGRAM_DOMAINS:
+                        append_to_results(result, "instagram", url)
+                    else:
+                        is_file = self._parse_external_files(url, result)
+                        if not is_file:
+                            append_to_results(result, "external_link", url)
         except NotSupported:
             print("No response", response.url)
         yield result
@@ -144,7 +199,7 @@ class DefaultSpider:
 
 class DbPipeline:
     BASE_KEYS = ["url", "crawl_id", "target_id"]
-    NB_KEYS = ["external_link", "internal_link", "images",
+    NB_KEYS = ["external_link", "images",
               "facebook", "twitter", "instagram", "youtube",
               "dailymotion", "vimeo", "video", "audio",
               "internal_pdf", "external_pdf", "internal_office",
@@ -190,7 +245,6 @@ class DbPipeline:
         if url in crawl_result["urls"]:
             return
         crawl_result["urls"].append(url)
-        result.nb_internal_links = len(crawl_result["urls"]) - 1
         for k, value in item.items():
             if k == "is_online":
                 if result_created:  # only update on the first link
@@ -203,6 +257,7 @@ class DbPipeline:
                     continue
                 crawl_result[k].append(subvalue)
             setattr(result, "nb_" + k, len(crawl_result[k]))
+        result.nb_internal_link = len(crawl_result["urls"]) - 1
         result.crawl_result = [crawl_result]
         result.save()
         return True
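
Below is a minimal standalone sketch (not part of the commit) of the link-classification flow this patch adds to DefaultSpider.parse(). The constants and append_to_results() mirror the ones introduced in commcrawler/scrapy.py; get_domain() here is a simplified stand-in for the helper the spider already imports, so treat it as an assumption.

# Illustrative sketch only -- reproduces the patch's external-link
# classification outside Scrapy.  get_domain() is a simplified
# stand-in (assumption); the spider imports its own helper.
from urllib.parse import urlparse

FACEBOOK_DOMAINS = ("facebook.com", "facebook.net", "fbcdn.net")
TWITTER_DOMAINS = ("twitter.com", "twimg.com", "twttr.net", "twttr.com",
                   "abs.twimg.com")
INSTAGRAM_DOMAINS = ("instagram.com", "cdninstagram.com")
OFFICE_EXTS = (".csv", ".doc", ".docx", ".odt", ".rtf", ".ods", ".xls", ".xlsx")


def get_domain(url):
    # Simplified: hostname without a leading "www." prefix.
    host = urlparse(url).netloc.lower()
    return host[4:] if host.startswith("www.") else host


def append_to_results(results, key, value):
    # Same behaviour as the helper added by the patch: create the list
    # for a key on first use, then append.
    if key not in results:
        results[key] = []
    results[key].append(value)


def classify_external(url, result):
    # Mirrors the "if not is_internal" branch of DefaultSpider.parse():
    # social networks first, then file types, then plain external links.
    domain = get_domain(url)
    lowered = url.lower()
    if domain in FACEBOOK_DOMAINS:
        append_to_results(result, "facebook", url)
    elif domain in TWITTER_DOMAINS:
        append_to_results(result, "twitter", url)
    elif domain in INSTAGRAM_DOMAINS:
        append_to_results(result, "instagram", url)
    elif lowered.endswith(".pdf"):
        append_to_results(result, "external_pdf", url)
    elif lowered.endswith(OFFICE_EXTS):
        append_to_results(result, "external_office", url)
    else:
        append_to_results(result, "external_link", url)


if __name__ == "__main__":
    result = {}
    for link in ("https://www.facebook.com/some-page",
                 "https://example.org/report.pdf",
                 "https://example.org/contact"):
        classify_external(link, result)
    print(result)  # keys: facebook, external_pdf, external_link

In the patch itself, DbPipeline then stores the length of each of these per-key lists into the matching nb_* field on CrawlResult.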