summary | refs | log | tree | commit | diff
path: root/commcrawler/scrapy.py
diff options
context:
space:
mode:
author: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-13 17:21:51 +0200
committer: Étienne Loks <etienne.loks@iggdrasil.net> 2019-08-13 17:21:51 +0200
commit: 01d72fa7c94359049e2a7beb068167cb7f047805 (patch)
tree: 7e4eff796c75d8429c5a4d73f4628f7d42c47dc8 /commcrawler/scrapy.py
parent: eb7a9f8c1ed76858c0963a9a7fb4bb896c1a7857 (diff)
download: Comm-on-net-01d72fa7c94359049e2a7beb068167cb7f047805.tar.bz2
Comm-on-net-01d72fa7c94359049e2a7beb068167cb7f047805.zip
Fix domain check
Diffstat (limited to 'commcrawler/scrapy.py')
-rw-r--r-- commcrawler/scrapy.py 7
1 files changed, 3 insertions, 4 deletions
diff --git a/commcrawler/scrapy.py b/commcrawler/scrapy.py
index 0967650..490142c 100644
--- a/commcrawler/scrapy.py
+++ b/commcrawler/scrapy.py
@@ -89,9 +89,6 @@ class DefaultSpider:
if "src" not in attributes:
continue
src = attributes["src"]
- is_a_real_src = src.startswith("http") or src.startswith("/")
- if not src or not is_a_real_src:
- continue
current_domain = get_domain(src)
if current_domain in YOUTUBE_DOMAINS:
append_to_results(result, "youtube", src)
@@ -196,7 +193,9 @@ class DefaultSpider:
yield response.follow(link.url, self.parse)
if not is_internal:
current_domain = get_domain(url)
- if current_domain in FACEBOOK_DOMAINS:
+ if not current_domain:
+ pass
+ elif current_domain in FACEBOOK_DOMAINS:
append_to_results(result, "facebook", url)
elif current_domain in TWITTER_DOMAINS:
append_to_results(result, "twitter", url)