From 01d72fa7c94359049e2a7beb068167cb7f047805 Mon Sep 17 00:00:00 2001 From: Étienne Loks Date: Tue, 13 Aug 2019 17:21:51 +0200 Subject: Fix domain check --- commcrawler/scrapy.py | 7 +++---- commcrawler/utils.py | 5 +++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py index 0967650..490142c 100644 --- a/commcrawler/scrapy.py +++ b/commcrawler/scrapy.py @@ -89,9 +89,6 @@ class DefaultSpider: if "src" not in attributes: continue src = attributes["src"] - is_a_real_src = src.startswith("http") or src.startswith("/") - if not src or not is_a_real_src: - continue current_domain = get_domain(src) if current_domain in YOUTUBE_DOMAINS: append_to_results(result, "youtube", src) @@ -196,7 +193,9 @@ class DefaultSpider: yield response.follow(link.url, self.parse) if not is_internal: current_domain = get_domain(url) - if current_domain in FACEBOOK_DOMAINS: + if not current_domain: + pass + elif current_domain in FACEBOOK_DOMAINS: append_to_results(result, "facebook", url) elif current_domain in TWITTER_DOMAINS: append_to_results(result, "twitter", url) diff --git a/commcrawler/utils.py b/commcrawler/utils.py index 6a49669..c1051dd 100644 --- a/commcrawler/utils.py +++ b/commcrawler/utils.py @@ -14,5 +14,10 @@ def clean_url(url): def get_domain(url): + if not url: + return + is_a_real_src = url.startswith("http") or url.startswith("/") + if not is_a_real_src: + return ext = tldextract.extract(url) return '{}.{}'.format(ext.domain, ext.suffix) -- cgit v1.2.3