blob: c1051dd47a4a2bf4d13128a94f20a18cbb31fb14 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
from urllib.parse import urldefrag
import tldextract
def append_to_results(results, key, value):
    """Append *value* to the list stored under *key* in *results*.

    *results* is mutated in place: when *key* is not present yet, an empty
    list is created for it first. Nothing is returned.
    """
    # setdefault inserts the empty list only when the key is missing and
    # always hands back the stored list, so one lookup covers both cases.
    results.setdefault(key, []).append(value)
def clean_url(url):
    """Return *url* with its fragment (the ``#anchor`` part) removed."""
    # urldefrag returns a (url, fragment) named tuple; only the
    # fragment-free URL is of interest here.
    return urldefrag(url).url
def get_domain(url):
    """Return the ``domain.suffix`` part of *url*, or ``None``.

    ``None`` is returned when *url* is falsy (``None``, empty string) or
    when it starts with neither ``"http"`` nor ``"/"`` -- for example
    ``mailto:`` or ``javascript:`` values.

    Otherwise the URL is handed to ``tldextract.extract`` and its
    ``domain`` and ``suffix`` fields are joined with a dot.
    """
    if not url:
        return None

    # startswith accepts a tuple of prefixes, so one call covers both.
    if not url.startswith(("http", "/")):
        return None

    ext = tldextract.extract(url)
    # The two parts are joined unconditionally: if either one comes back
    # empty, the dot is still present in the result.
    # NOTE(review): confirm callers expect that for inputs without a
    # recognisable domain or suffix.
    return '{}.{}'.format(ext.domain, ext.suffix)
|