Source code for tld.utils

# Module metadata: dotted module name, release version, build number and author.
__title__ = 'tld.utils'
__version__ = '0.5'
__build__ = 0x000005
__author__ = 'Artur Barseghyan'
# Public names exported by ``from tld.utils import *``.
__all__ = ('update_tld_names', 'get_tld')

import os

#from urlparse import urlparse
#import urllib2
from six.moves.urllib.parse import urlparse
from six.moves.urllib.request import urlopen

from tld.conf import get_setting
from tld.exceptions import TldIOError, TldDomainNotFound, TldBadUrl

def PROJECT_DIR(base):
    """
    Builds an absolute path for ``base``, resolved relative to the directory that contains this module.

    Backslashes in the joined path are replaced with forward slashes before the path is made absolute.

    :param str base: Path relative to this module's directory.
    :return str: Absolute path.
    """
    return os.path.abspath(os.path.join(os.path.dirname(__file__), base).replace('\\', '/'))


def _(x):
    """
    Returns ``x`` unchanged.

    NOTE(review): presumably a placeholder for a translation function -- confirm before removing.
    """
    return x


# Module-level cache of TLD names. Starts empty and is filled by ``get_tld`` the first time it runs.
tld_names = []

def update_tld_names(fail_silently=False):
    """
    Updates the local copy of TLDs file.

    The remote file is read completely before the local file is opened for writing, so a failed download
    does not truncate an existing local copy. Both handles are closed even when an error occurs.

    :param bool fail_silently: If set to True, no exception is raised on failure but boolean False is returned.
    :return bool: True on success, False on failure.
    :raises TldIOError: If downloading or writing fails and ``fail_silently`` is False.
    """
    TLD_NAMES_SOURCE_URL = get_setting('NAMES_SOURCE_URL')
    TLD_NAMES_LOCAL_PATH = get_setting('NAMES_LOCAL_PATH')
    try:
        remote_file = urlopen(TLD_NAMES_SOURCE_URL)
        try:
            # Download first; the local file must not be touched until we actually have the new content.
            content = remote_file.read()
        finally:
            # try/finally rather than ``with``: the object returned by ``urlopen`` is not a context
            # manager on every Python version supported through ``six``.
            remote_file.close()

        with open(PROJECT_DIR(TLD_NAMES_LOCAL_PATH), 'wb') as local_file:
            local_file.write(content)
    except Exception as e:
        if fail_silently:
            return False
        raise TldIOError(e)

    return True

def get_tld(url, active_only=False, fail_silently=False):
    """
    Extracts the top level domain based on the mozilla's effective TLD names dat file. Returns a string made of
    the matched TLD pattern plus the label directly in front of it (for example ``google.co.uk`` for
    ``http://v3.api.google.co.uk/``). If the whole host name is itself a listed pattern, the host name is
    returned unchanged.

    May throw ``TldBadUrl`` or ``TldDomainNotFound`` exceptions if there's bad URL provided or no TLD match found
    respectively, and ``TldIOError`` if the TLD names file can neither be read nor downloaded.

    :param url: URL to get top level domain from. User info (``user:password@``) and port (``:8080``) are
        ignored.
    :param active_only: If set to True, only active patterns are matched.
    :param fail_silently: If set to True, no exceptions are raised and None is returned on failure.
    :return: String with top level domain or None on failure.
    """
    TLD_NAMES_LOCAL_PATH = get_setting('NAMES_LOCAL_PATH')

    def init(retry_count=0):
        """
        Build the ``tld_names`` list if empty. Recursive.

        :param retry_count: If greater than 1, we raise an exception in order to avoid infinite loops.
        :return: Returns the list of TLD names, or None on failure when ``fail_silently`` is set.
        """
        global tld_names

        if retry_count > 1:
            if fail_silently:
                return None
            raise TldIOError

        # If already loaded, return
        if tld_names:
            return tld_names

        try:
            # Load the TLD names file; ``with`` closes it on every path, including errors while reading.
            with open(PROJECT_DIR(TLD_NAMES_LOCAL_PATH)) as local_file:
                # Make a list of it all, skipping comment lines (starting with "/") and empty lines.
                tld_names = list(set([line.strip() for line in local_file if line[0] not in '/\n']))
        except IOError:
            # Grab the file. Honour ``fail_silently`` so that a failed download does not raise when the
            # caller asked for silence; there is no point in retrying if the download itself failed.
            if not update_tld_names(fail_silently=fail_silently):
                return None
            # Run again with an incremented ``retry_count`` in order to avoid infinite loops.
            return init(retry_count + 1)
        except Exception:
            if fail_silently:
                return None
            raise

        return tld_names

    # ``init`` only returns None when it failed and ``fail_silently`` is set.
    if init() is None:
        return None

    # Get (sub) domain name. ``netloc`` may still carry "user:password@" and ":port"; strip both so that they
    # neither prevent a match nor end up in the returned value.
    domain_name = urlparse(url).netloc.rpartition('@')[2]
    if not domain_name.startswith('['):
        # Bracketed IPv6 literals contain colons themselves, so only plain host names are cut at ":".
        domain_name = domain_name.partition(':')[0]

    if not domain_name:
        if fail_silently:
            return None
        raise TldBadUrl(url=url)

    domain_parts = domain_name.split('.')

    # Looping from much to less (for example if we have a domain named "v3.api.google.co.uk" we'll try
    # "v3.api.google.co.uk", then "api.google.co.uk", then "google.co.uk", then "co.uk" and finally "uk").
    # If the last one does not match any TLDs, we throw a <TldDomainNotFound> exception.
    for i in range(len(domain_parts)):
        sliced_domain_parts = domain_parts[i:]

        match = '.'.join(sliced_domain_parts)
        wildcard_match = '.'.join(['*'] + sliced_domain_parts[1:])
        inactive_match = "!%s" % match

        # Match tlds
        if (match in tld_names or wildcard_match in tld_names or
                (active_only is False and inactive_match in tld_names)):
            # Include the label in front of the matched pattern. When the match starts at the very first
            # label there is nothing in front of it; clamp to 0 so the index does not wrap around to -1
            # (which would return only the last label).
            return ".".join(domain_parts[max(i - 1, 0):])

    if fail_silently:
        return None
    raise TldDomainNotFound(domain_name=domain_name)