from __future__ import unicode_literals
import codecs
from six import PY3, text_type
from six.moves.urllib.parse import urlsplit, SplitResult
from six.moves.urllib.request import urlopen
from .conf import get_setting
from .exceptions import (
TldBadUrl,
TldDomainNotFound,
TldImproperlyConfigured,
TldIOError,
)
from .helpers import project_dir
# Module metadata.
__title__ = 'tld.utils'
__author__ = 'Artur Barseghyan'
__copyright__ = '2013-2019 Artur Barseghyan'
__license__ = 'MPL-1.1 OR GPL-2.0-only OR LGPL-2.0-or-later'
# Names exported by ``from <this module> import *``.
__all__ = (
'get_fld',
'get_tld',
'get_tld_names',
'get_tld_names_container',
'is_tld',
'parse_tld',
'process_url',
'reset_tld_names',
'Result',
'update_tld_names',
'update_tld_names_cli',
)
# Module-level cache of loaded TLD data. ``get_tld_names`` stores one ``Trie``
# per TLD names local path in it (key: the path, value: the trie) and
# ``reset_tld_names`` empties it or drops a single entry.
tld_names = {}
class Result(object):
    """Container for the parts of a processed host name.

    Holds the ``tld``, ``domain`` and ``subdomain`` strings together with the
    ``parsed_url`` they were extracted from, and exposes the first level
    domain through the read-only ``fld`` property. Converting an instance to
    a string (``str``/``repr``) yields its TLD.
    """

    __slots__ = ('subdomain', 'domain', 'tld', '__fld', 'parsed_url')

    def __init__(self, tld, domain, subdomain, parsed_url):
        self.parsed_url = parsed_url
        self.subdomain = subdomain
        self.tld = tld
        # An empty-string domain means the host name is the TLD itself, so
        # the TLD doubles as the domain.
        self.domain = tld if domain == '' else domain
        # The first level domain is "<domain>.<tld>" when a domain was
        # given, otherwise just the TLD.
        self.__fld = (
            "{0}.{1}".format(self.domain, self.tld) if domain else self.tld
        )

    @property
    def extension(self):
        """Alias of ``tld``.

        :return str:
        """
        return self.tld

    # ``suffix`` is a second alias of ``tld``.
    suffix = extension

    @property
    def fld(self):
        """First level domain.

        :return: The first level domain computed at construction time.
        :rtype: str
        """
        return self.__fld

    def __str__(self):
        # On Python 2 a byte string is preferred; fall back to the text
        # value if it cannot be encoded.
        if not PY3:
            try:
                return self.tld.encode('utf8')
            except UnicodeEncodeError:
                pass
        return self.tld

    __repr__ = __str__
    __unicode__ = __str__

    @property
    def __dict__(self):
        """Mimic __dict__ functionality.

        The class uses ``__slots__``, so this property builds the mapping
        by hand.

        :return: Mapping of ``tld``, ``domain``, ``subdomain``, ``fld`` and
            ``parsed_url`` to their current values.
        :rtype: dict
        """
        exposed = ('tld', 'domain', 'subdomain', 'fld', 'parsed_url')
        return {name: getattr(self, name) for name in exposed}
class TrieNode(object):
    """One node of the TLD trie.

    Every attribute starts out "empty"; ``Trie.add`` fills them in while
    inserting TLDs.
    """

    __slots__ = ('children', 'exception', 'leaf', 'private')

    def __init__(self):
        # ``children`` becomes a dict (label -> TrieNode) only once a child
        # is added; ``exception`` holds a label registered with a leading
        # ``!`` (stored without the ``!``).
        self.children = self.exception = None
        # ``leaf`` marks the end of a stored TLD; ``private`` marks a TLD
        # that was added with ``private=True``.
        self.leaf = self.private = False
class Trie(object):
    """An adhoc Trie data structure to store tlds in reverse notation order.

    ``len(trie)`` reports how many times ``add`` has been called.
    """

    def __init__(self):
        self.root = TrieNode()
        self.__nodes = 0

    def __len__(self):
        return self.__nodes

    def add(self, tld, private=False):
        """Insert ``tld`` into the trie.

        :param tld: Dot-separated TLD; a label starting with ``!`` is
            recorded as an exception on the node reached so far.
        :param private: If True, the final node is flagged as private.
        :type tld: str
        :type private: bool
        """
        current = self.root
        # Walk the labels right-to-left ("co.uk" -> "uk", then "co").
        for label in tld.split('.')[::-1]:
            if label.startswith('!'):
                # Exception rule: remember the label (without "!") on the
                # current node and stop descending.
                current.exception = label[1:]
                break
            # The children dict is created lazily to save memory.
            if current.children is None:
                current.children = {}
            if label not in current.children:
                current.children[label] = TrieNode()
            current = current.children[label]
        current.leaf = True
        if private:
            current.private = True
        self.__nodes += 1
def update_tld_names(fail_silently=False,
                     tld_names_source_url=None,
                     tld_names_local_path=None):
    """Update the local copy of TLDs file.

    The remote file is downloaded and decoded completely *before* the local
    file is opened for writing, so a failed download or decode does not
    truncate an existing local copy. Both file handles are always closed.

    :param fail_silently: If set to True, no exception is raised on
        failure but boolean False returned.
    :param tld_names_source_url: URL to download the TLD names from. Falls
        back to the ``NAMES_SOURCE_URL`` setting when empty.
    :param tld_names_local_path: Local path (resolved through
        ``project_dir``) to write to. Falls back to the ``NAMES_LOCAL_PATH``
        setting when empty.
    :type fail_silently: bool
    :type tld_names_source_url: str
    :type tld_names_local_path: str
    :return: True on success, False on failure.
    :rtype: bool
    :raises TldIOError: On any failure, unless ``fail_silently`` is True.
    """
    if not tld_names_source_url:
        tld_names_source_url = get_setting('NAMES_SOURCE_URL')
    if not tld_names_local_path:
        tld_names_local_path = get_setting('NAMES_LOCAL_PATH')

    try:
        remote_file = urlopen(tld_names_source_url)
        try:
            # Fetch everything first; the local file must not be touched
            # until we know we have valid content to put in it.
            content = remote_file.read().decode('utf8')
        finally:
            remote_file.close()

        with codecs.open(
            project_dir(tld_names_local_path),
            'wb',
            encoding='utf8'
        ) as local_file:
            local_file.write(content)
    except Exception as err:
        # Deliberately broad: any failure maps to the documented
        # False / TldIOError contract.
        if fail_silently:
            return False
        raise TldIOError(err)
    return True
def update_tld_names_cli():
    """CLI wrapper for update_tld_names.

    ``update_tld_names`` reports success as True, whereas a CLI reports
    success as exit status 0, so the result is inverted here.

    :return: 0 if the update succeeded, 1 otherwise.
    :rtype: int
    """
    return 0 if update_tld_names() else 1
def get_tld_names(fail_silently=False,
                  retry_count=0,
                  tld_names_local_path=None):
    """Build the TLD trie for the given local path if not loaded. Recursive.

    If the local file cannot be read (``IOError``), ``update_tld_names`` is
    called to fetch it and loading is retried.

    :param fail_silently: If set to True, no exceptions are raised and None
        is returned on failure.
    :param retry_count: If greater than 1, we raise an exception in order
        to avoid infinite loops.
    :param tld_names_local_path: Local path of the TLD names file. Falls
        back to the ``NAMES_LOCAL_PATH`` setting when empty.
    :type fail_silently: bool
    :type retry_count: int
    :type tld_names_local_path: str
    :return: The module-level ``tld_names`` cache, a dict mapping each
        loaded local path to its ``tld.utils.Trie``; None on failure when
        ``fail_silently`` is True.
    :rtype: dict
    :raises TldIOError: If the retry limit is exceeded and ``fail_silently``
        is False.
    """
    if not tld_names_local_path:
        tld_names_local_path = get_setting('NAMES_LOCAL_PATH')

    if retry_count > 1:
        if fail_silently:
            return None
        raise TldIOError

    # If already loaded, return. (The cache dict is only mutated, never
    # rebound, so no ``global`` declaration is needed.)
    if tld_names.get(tld_names_local_path) is not None:
        return tld_names

    try:
        # Load the TLD names file; the context manager closes it on every
        # exit path.
        with codecs.open(
            project_dir(tld_names_local_path),
            'r',
            encoding='utf8'
        ) as local_file:
            trie = Trie()
            private_section = False
            for line in local_file:
                # Everything after this marker is flagged as private.
                if '===BEGIN PRIVATE DOMAINS===' in line:
                    private_section = True

                # Puny code TLD names are taken from "// xn--..." comments
                if '// xn--' in line:
                    line = line.split()[1]

                # Strip *before* inspecting the line: ``codecs.open`` does
                # not translate newlines, so "\r\n" or whitespace-only
                # lines would otherwise be added as an empty TLD.
                line = line.strip()
                if not line or line.startswith('/'):
                    continue

                trie.add(u'{0}'.format(line), private=private_section)

        # Only publish the trie once the whole file was parsed.
        tld_names[tld_names_local_path] = trie
    except IOError:
        # Grab the file
        update_tld_names(
            fail_silently=fail_silently,
            tld_names_local_path=tld_names_local_path
        )
        # Run again with an incremented ``retry_count`` in order to avoid
        # infinite loops
        return get_tld_names(
            fail_silently,
            retry_count + 1,
            tld_names_local_path=tld_names_local_path
        )
    except Exception:
        if fail_silently:
            return None
        # Bare ``raise`` keeps the original traceback.
        raise

    return tld_names
def process_url(url,
                fail_silently=False,
                fix_protocol=False,
                search_public=True,
                search_private=True,
                tld_names_local_path=None):
    """Process URL.

    Splits the host name of ``url`` into labels and finds the longest TLD
    stored in the trie that the host name ends with.

    :param url: URL string (lower-cased before parsing) or an already
        parsed ``SplitResult`` (used as is).
    :param fail_silently: If set to True, failures are reported as
        ``(None, None, parsed_url)`` instead of raising.
    :param fix_protocol: If set to True, ``https://`` is prepended to URL
        strings that do not start with ``//``, ``http://`` or ``https://``.
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param tld_names_local_path: Local path of the TLD names file. Falls
        back to the ``NAMES_LOCAL_PATH`` setting when empty.
    :return: Tuple ``(domain_parts, non_zero_i, parsed_url)``, where
        ``domain_parts`` is the list of host name labels and ``non_zero_i``
        is the index in it at which the TLD starts (-1 if the whole host
        name is the TLD).
    :rtype: tuple
    :raises TldImproperlyConfigured: If both ``search_public`` and
        ``search_private`` are False.
    :raises TldBadUrl: If the URL has no host name (unless
        ``fail_silently``).
    :raises TldDomainNotFound: If no allowed TLD matches (unless
        ``fail_silently``).
    """
    if not (search_public or search_private):
        raise TldImproperlyConfigured(
            "Either `search_public` or `search_private` (or both) shall be "
            "set to True."
        )

    if not tld_names_local_path:
        tld_names_local_path = get_setting('NAMES_LOCAL_PATH')

    # Init
    names_container = get_tld_names(
        fail_silently=fail_silently,
        tld_names_local_path=tld_names_local_path
    )

    if not isinstance(url, SplitResult):
        url = url.lower()
        if (
            fix_protocol
            and not url.startswith(('//', 'http://', 'https://'))
        ):
            url = 'https://{}'.format(url)
        # Get parsed URL as we might need it later
        parsed_url = urlsplit(url)
    else:
        parsed_url = url

    # Get (sub) domain name
    domain_name = parsed_url.hostname
    if not domain_name:
        if fail_silently:
            return None, None, parsed_url
        raise TldBadUrl(url=url)

    # ``get_tld_names`` returns None when the TLD names could not be loaded
    # and ``fail_silently`` is True; report that as a silent failure rather
    # than crashing on the subscript below.
    if names_container is None:
        return None, None, parsed_url

    domain_parts = domain_name.split('.')

    # Now we query our Trie iterating on the domain parts in reverse order
    node = names_container[tld_names_local_path].root
    current_length = 0
    tld_length = 0
    match = None
    for part in reversed(domain_parts):
        # Cannot go deeper
        if node.children is None:
            break

        # Exception
        if part == node.exception:
            break

        child = node.children.get(part)
        # Wildcards
        if child is None:
            child = node.children.get('*')

        # If the current part is not in current node's children, we can stop
        if child is None:
            break

        # Else we move deeper and increment our tld offset
        current_length += 1
        node = child

        # Remember the deepest leaf seen so far: it is the longest TLD.
        if node.leaf:
            tld_length = current_length
            match = node

    # ``match`` is only ever set to a leaf node, so it is enough to check
    # that one was found and that it is of a kind we are allowed to use.
    if (
        (match is None) or
        (not search_public and not match.private) or
        (not search_private and match.private)
    ):
        if fail_silently:
            return None, None, parsed_url
        raise TldDomainNotFound(domain_name=domain_name)

    if len(domain_parts) == tld_length:
        non_zero_i = -1  # hostname = tld
    else:
        non_zero_i = max(1, len(domain_parts) - tld_length)

    return domain_parts, non_zero_i, parsed_url
def get_fld(url,
            fail_silently=False,
            fix_protocol=False,
            search_public=True,
            search_private=True,
            tld_names_local_path=None,
            **kwargs):
    """Extract the first level domain.

    The first level domain is the TLD found by ``process_url`` plus the one
    label preceding it. May throw ``TldBadUrl`` or ``TldDomainNotFound``
    exceptions if there's bad URL provided or no TLD match found
    respectively.

    :param url: URL to get the first level domain from.
    :param fail_silently: If set to True, no exceptions are raised and None
        is returned on failure.
    :param fix_protocol: If set to True, missing or wrong protocol is
        ignored (https is appended instead).
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param tld_names_local_path:
    :type url: str
    :type fail_silently: bool
    :type fix_protocol: bool
    :type search_public: bool
    :type search_private: bool
    :type tld_names_local_path: str
    :return: String with the first level domain; the host name itself if
        it is a TLD; None on failure.
    :rtype: str
    :raises TldImproperlyConfigured: If ``as_object`` is passed.
    """
    if 'as_object' in kwargs:
        raise TldImproperlyConfigured(
            "`as_object` argument is deprecated for `get_fld`. Use `get_tld` "
            "instead."
        )

    lookup_options = {
        'fail_silently': fail_silently,
        'fix_protocol': fix_protocol,
        'search_public': search_public,
        'search_private': search_private,
        'tld_names_local_path': tld_names_local_path,
    }
    labels, tld_start, parsed = process_url(url=url, **lookup_options)

    if labels is None:
        return None

    if tld_start < 0:
        # hostname = tld
        return text_type(parsed.hostname)

    # One label before the TLD, followed by the TLD labels.
    fld_labels = labels[tld_start - 1:]
    return text_type(".").join(fld_labels)
def get_tld(url,
            fail_silently=False,
            as_object=False,
            fix_protocol=False,
            search_public=True,
            search_private=True,
            tld_names_local_path=None,):
    """Extract the top level domain.

    Uses ``process_url`` to locate the TLD inside the host name of ``url``.
    May throw ``TldBadUrl`` or ``TldDomainNotFound`` exceptions if there's
    bad URL provided or no TLD match found respectively.

    :param url: URL to get top level domain from.
    :param fail_silently: If set to True, no exceptions are raised and None
        is returned on failure.
    :param as_object: If set to True, a ``tld.utils.Result`` object is
        returned instead of a string.
    :param fix_protocol: If set to True, missing or wrong protocol is
        ignored (https is appended instead).
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param tld_names_local_path:
    :type url: str
    :type fail_silently: bool
    :type as_object: bool
    :type fix_protocol: bool
    :type search_public: bool
    :type search_private: bool
    :type tld_names_local_path: str
    :return: String with top level domain (if ``as_object`` argument
        is set to False) or a ``tld.utils.Result`` object (if ``as_object``
        argument is set to True); returns None on failure.
    :rtype: str
    """
    labels, tld_start, parsed = process_url(
        url=url,
        fail_silently=fail_silently,
        fix_protocol=fix_protocol,
        search_public=search_public,
        search_private=search_private,
        tld_names_local_path=tld_names_local_path
    )

    if labels is None:
        return None

    # A negative index means the whole host name is the TLD.
    hostname_is_tld = tld_start < 0

    if hostname_is_tld:
        tld_text = text_type(parsed.hostname)
    else:
        tld_text = text_type(".").join(labels[tld_start:])

    if not as_object:
        return tld_text

    if hostname_is_tld:
        subdomain = text_type("")
        domain = text_type("")
    else:
        # The single label right before the TLD is the domain; everything
        # before that is the subdomain.
        domain = text_type(".").join(labels[tld_start - 1:tld_start])
        subdomain = text_type(".").join(labels[:tld_start - 1])

    return Result(
        tld=tld_text,
        domain=domain,
        subdomain=subdomain,
        parsed_url=parsed
    )
def parse_tld(url,
              fail_silently=False,
              fix_protocol=False,
              search_public=True,
              search_private=True,
              tld_names_local_path=None):
    """Parse TLD into parts.

    Never raises the ``tld`` exceptions: any failure, whether signalled by
    an exception or by ``get_tld`` returning None (``fail_silently=True``),
    yields ``(None, None, None)``.

    :param url: URL to parse.
    :param fail_silently: Passed through to ``get_tld``.
    :param fix_protocol: Passed through to ``get_tld``.
    :param search_public: Passed through to ``get_tld``.
    :param search_private: Passed through to ``get_tld``.
    :param tld_names_local_path: Passed through to ``get_tld``.
    :return: Tuple ``(tld, domain, subdomain)``; all three are None on
        failure.
    :rtype: tuple
    """
    try:
        obj = get_tld(
            url,
            fail_silently=fail_silently,
            as_object=True,
            fix_protocol=fix_protocol,
            search_public=search_public,
            search_private=search_private,
            tld_names_local_path=tld_names_local_path
        )
    except (
        TldBadUrl,
        TldDomainNotFound,
        TldImproperlyConfigured,
        TldIOError
    ):
        return None, None, None

    # With ``fail_silently=True`` ``get_tld`` signals failure by returning
    # None instead of raising; guard against attribute access on it.
    if obj is None:
        return None, None, None

    return obj.tld, obj.domain, obj.subdomain
def is_tld(value,
           search_public=True,
           search_private=True,
           tld_names_local_path=None):
    """Check if given URL is tld.

    The TLD of ``value`` is extracted with ``get_tld`` (failing silently and
    with protocol fixing enabled) and compared to ``value`` itself.

    :param value: URL to get top level domain from.
    :param search_public: If set to True, search in public domains.
    :param search_private: If set to True, search in private domains.
    :param tld_names_local_path:
    :type value: str
    :type search_public: bool
    :type search_private: bool
    :type tld_names_local_path: str
    :return: True if the extracted TLD equals ``value``.
    :rtype: bool
    """
    search_options = {
        'search_public': search_public,
        'search_private': search_private,
        'tld_names_local_path': tld_names_local_path,
    }
    extracted = get_tld(
        url=value,
        fail_silently=True,
        fix_protocol=True,
        **search_options
    )
    return value == extracted
def get_tld_names_container():
    """Get container of all tld names.

    :return: The module-level ``tld_names`` object itself (not a copy).
    :rtype dict:
    """
    # Reading a module-level name needs no ``global`` declaration.
    return tld_names
def reset_tld_names(tld_names_local_path=None):
    """Reset the ``tld_names`` to empty value.

    If ``tld_names_local_path`` is given, removes specified
    entry from ``tld_names`` instead.

    :param tld_names_local_path: Key of the single entry to drop; when
        empty, ``tld_names`` is rebound to a new empty dict.
    :type tld_names_local_path: str
    :return:
    """
    global tld_names

    if not tld_names_local_path:
        # Full reset: rebind the module-level name to a fresh dict.
        tld_names = {}
        return

    # Partial reset: a missing key is silently ignored.
    tld_names.pop(tld_names_local_path, None)