summaryrefslogtreecommitdiff
path: root/commcrawler/utils.py
diff options
context:
space:
mode:
authorÉtienne Loks <etienne.loks@iggdrasil.net>2019-08-09 13:06:03 +0200
committerÉtienne Loks <etienne.loks@iggdrasil.net>2019-08-09 13:06:03 +0200
commite7dd52e03d8e770d8fcee1503fa8109dc3778d29 (patch)
treefbd89b70b52caa9cf0db21aa2efc24ed678fc2d3 /commcrawler/utils.py
parent347a0822484ad16b9a29eef1ea30082b4a841ac6 (diff)
downloadComm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.tar.bz2
Comm-on-net-e7dd52e03d8e770d8fcee1503fa8109dc3778d29.zip
Manage links between targets
Diffstat (limited to 'commcrawler/utils.py')
-rw-r--r--commcrawler/utils.py18
1 files changed, 18 insertions, 0 deletions
diff --git a/commcrawler/utils.py b/commcrawler/utils.py
new file mode 100644
index 0000000..6a49669
--- /dev/null
+++ b/commcrawler/utils.py
@@ -0,0 +1,18 @@
+from urllib.parse import urldefrag
+import tldextract
+
+
def append_to_results(results, key, value):
    """Append ``value`` to the list stored under ``key`` in ``results``.

    An empty list is created for ``key`` first when the key is not present
    yet. ``results`` is modified in place and nothing is returned.
    """
    # setdefault does the "create the list on first use" step in one lookup.
    results.setdefault(key, []).append(value)
+
+
def clean_url(url):
    """Return ``url`` with its fragment (the ``#anchor`` part) removed."""
    # urldefrag returns a (url, fragment) named tuple; only the URL is needed.
    return urldefrag(url).url
+
+
def get_domain(url):
    """Return the domain and suffix of ``url`` joined by a dot.

    Only the ``domain`` and ``suffix`` fields of the tldextract result are
    used; the subdomain is left out. When one of the two parts is empty, it
    is omitted so the result never carries a leading or trailing dot.
    """
    ext = tldextract.extract(url)
    # Join only non-empty parts: a plain '{}.{}' format would yield a
    # dangling dot (e.g. "localhost.") whenever the suffix is empty.
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)