# path: commcrawler/utils.py
# blob: 6a496694f35ff30fdf1e4f44a0070cb215e70a41
from urllib.parse import urldefrag
import tldextract


def append_to_results(results, key, value):
    """Append ``value`` to the list stored under ``key`` in ``results``.

    If ``key`` is not present yet, an empty list is created for it first.
    ``results`` is modified in place and nothing is returned.
    """
    results.setdefault(key, []).append(value)


def clean_url(url):
    """Return ``url`` with its fragment (the ``#anchor`` part) removed."""
    return urldefrag(url).url


def get_domain(url):
    """Return the domain and suffix of ``url`` joined as ``domain.suffix``.

    The URL is split with ``tldextract.extract``; only the ``domain`` and
    ``suffix`` fields of the result are used, so any subdomain is dropped.

    If one of the two fields is empty (tldextract documents an empty suffix
    for hosts such as ``localhost`` or bare IP addresses), only the
    non-empty field is returned, so the result never carries a leading or
    trailing dot. If both are empty, an empty string is returned.
    """
    ext = tldextract.extract(url)
    # Join only the non-empty parts: a plain '{}.{}' format would produce
    # values like 'localhost.' when the suffix is empty.
    return '.'.join(part for part in (ext.domain, ext.suffix) if part)