diff options
author | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 13:06:03 +0200 |
---|---|---|
committer | Étienne Loks <etienne.loks@iggdrasil.net> | 2019-08-09 13:06:03 +0200 |
commit | e7dd52e03d8e770d8fcee1503fa8109dc3778d29 (patch) | |
tree | fbd89b70b52caa9cf0db21aa2efc24ed678fc2d3 /commcrawler/utils.py | |
parent | 347a0822484ad16b9a29eef1ea30082b4a841ac6 (diff) | |
download | Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.tar.bz2 Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.zip |
Manage links between targets
Diffstat (limited to 'commcrawler/utils.py')
-rw-r--r-- | commcrawler/utils.py | 18 |
1 files changed, 18 insertions, 0 deletions
"""Small URL and result-collection helpers for the commcrawler package."""
from urllib.parse import urldefrag


def append_to_results(results, key, value):
    """Append *value* to the list stored under *key* in *results*.

    The list is created the first time a key is seen, so callers can
    group values without initialising every key beforehand.  *results*
    is modified in place and nothing is returned.
    """
    results.setdefault(key, []).append(value)


def clean_url(url):
    """Return *url* with its fragment (the ``#anchor`` part) removed."""
    return urldefrag(url).url


def get_domain(url):
    """Return the ``domain.suffix`` part of *url* as split by tldextract.

    For example ``http://www.example.co.uk/page`` gives
    ``example.co.uk``.  When tldextract finds no public suffix (such as
    ``localhost`` or a bare IP address) only the domain part is
    returned, without a trailing dot.
    """
    # Imported here so the stdlib-only helpers above remain importable
    # when the third-party tldextract package is not installed.
    import tldextract

    ext = tldextract.extract(url)
    if not ext.suffix:
        # Formatting an empty suffix would produce e.g. 'localhost.'.
        return ext.domain
    return '{}.{}'.format(ext.domain, ext.suffix)