diff options
Diffstat (limited to 'commcrawler/utils.py')
-rw-r--r-- | commcrawler/utils.py | 18 |
1 files changed, 18 insertions, 0 deletions
from urllib.parse import urldefrag

import tldextract


def append_to_results(results, key, value):
    """Append ``value`` to the list stored under ``key`` in ``results``.

    The list is created the first time ``key`` is seen. ``results`` is
    modified in place and nothing is returned.
    """
    results.setdefault(key, []).append(value)


def clean_url(url):
    """Return ``url`` with its fragment (the ``#anchor`` part) removed."""
    defragged, _fragment = urldefrag(url)
    return defragged


def get_domain(url):
    """Return the domain and public suffix of ``url`` joined by a dot.

    The URL is split with ``tldextract``; any subdomain is dropped, so
    ``http://forums.example.co.uk/x`` gives ``example.co.uk``.

    When one of the two parts is empty only the other part is returned,
    without a dangling dot.
    """
    ext = tldextract.extract(url)
    # Hosts without a recognised public suffix (e.g. "localhost" or a bare
    # IP address) have an empty ``suffix``; formatting both parts
    # unconditionally would yield "localhost." with a trailing dot.
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)