diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-13 17:21:51 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-13 17:21:51 +0200 |
commit | 01d72fa7c94359049e2a7beb068167cb7f047805 (patch) | |
tree | 7e4eff796c75d8429c5a4d73f4628f7d42c47dc8 /commcrawler/scrapy.py | |
parent | eb7a9f8c1ed76858c0963a9a7fb4bb896c1a7857 (diff) | |
download | Comm-on-net-01d72fa7c94359049e2a7beb068167cb7f047805.tar.bz2 Comm-on-net-01d72fa7c94359049e2a7beb068167cb7f047805.zip |
Fix domain check
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- | commcrawler/scrapy.py | 7 |
1 file changed, 3 insertions, 4 deletions
```diff
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 0967650..490142c 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -89,9 +89,6 @@ class DefaultSpider:
             if "src" not in attributes:
                 continue
             src = attributes["src"]
-            is_a_real_src = src.startswith("http") or src.startswith("/")
-            if not src or not is_a_real_src:
-                continue
             current_domain = get_domain(src)
             if current_domain in YOUTUBE_DOMAINS:
                 append_to_results(result, "youtube", src)
@@ -196,7 +193,9 @@ class DefaultSpider:
                     yield response.follow(link.url, self.parse)
                 if not is_internal:
                     current_domain = get_domain(url)
-                    if current_domain in FACEBOOK_DOMAINS:
+                    if not current_domain:
+                        pass
+                    elif current_domain in FACEBOOK_DOMAINS:
                         append_to_results(result, "facebook", url)
                     elif current_domain in TWITTER_DOMAINS:
                         append_to_results(result, "twitter", url)
```