205 lines
8.0 KiB
Python
205 lines
8.0 KiB
Python
import collections
|
|
import collections.abc
|
|
import ipaddress
|
|
import numbers
|
|
import re
|
|
|
|
from .chars import SUB_DELIMS
|
|
from .encoding import uriencode
|
|
from .split import uriunsplit
|
|
|
|
# RFC 3986 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
|
|
_SCHEME_RE = re.compile(b'^[A-Za-z][A-Za-z0-9+.-]*$')
|
|
|
|
# RFC 3986 3.2: authority = [ userinfo "@" ] host [ ":" port ]
|
|
_AUTHORITY_RE_BYTES = re.compile(b'^(?:(.*)@)?(.*?)(?::([0-9]*))?$')
|
|
_AUTHORITY_RE_STR = re.compile(u'^(?:(.*)@)?(.*?)(?::([0-9]*))?$')
|
|
|
|
# safe component characters
|
|
_SAFE_USERINFO = SUB_DELIMS + ':'
|
|
_SAFE_HOST = SUB_DELIMS
|
|
_SAFE_PATH = SUB_DELIMS + ':@/'
|
|
_SAFE_QUERY = SUB_DELIMS + ':@/?'
|
|
_SAFE_FRAGMENT = SUB_DELIMS + ':@/?'
|
|
|
|
|
|
def _scheme(scheme):
|
|
if _SCHEME_RE.match(scheme):
|
|
return scheme.lower()
|
|
else:
|
|
raise ValueError('Invalid scheme component')
|
|
|
|
|
|
def _authority(userinfo, host, port, encoding):
|
|
authority = []
|
|
|
|
if userinfo is not None:
|
|
authority.append(uriencode(userinfo, _SAFE_USERINFO, encoding))
|
|
authority.append(b'@')
|
|
|
|
if isinstance(host, ipaddress.IPv6Address):
|
|
authority.append(b'[' + host.compressed.encode() + b']')
|
|
elif isinstance(host, ipaddress.IPv4Address):
|
|
authority.append(host.compressed.encode())
|
|
elif isinstance(host, bytes):
|
|
authority.append(_host(host))
|
|
elif host is not None:
|
|
authority.append(_host(host.encode('utf-8')))
|
|
|
|
if isinstance(port, numbers.Number):
|
|
authority.append(_port(str(port).encode()))
|
|
elif isinstance(port, bytes):
|
|
authority.append(_port(port))
|
|
elif port is not None:
|
|
authority.append(_port(port.encode()))
|
|
|
|
return b''.join(authority) if authority else None
|
|
|
|
|
|
def _ip_literal(address):
|
|
if address.startswith('v'):
|
|
raise ValueError('Address mechanism not supported')
|
|
else:
|
|
return b'[' + ipaddress.IPv6Address(address).compressed.encode() + b']'
|
|
|
|
|
|
def _host(host):
|
|
# RFC 3986 3.2.3: Although host is case-insensitive, producers and
|
|
# normalizers should use lowercase for registered names and
|
|
# hexadecimal addresses for the sake of uniformity, while only
|
|
# using uppercase letters for percent-encodings.
|
|
if host.startswith(b'[') and host.endswith(b']'):
|
|
return _ip_literal(host[1:-1].decode())
|
|
# check for IPv6 addresses as returned by SplitResult.gethost()
|
|
try:
|
|
return _ip_literal(host.decode('utf-8'))
|
|
except ValueError:
|
|
return uriencode(host, _SAFE_HOST, 'utf-8').lower()
|
|
|
|
|
|
def _port(port):
|
|
# RFC 3986 3.2.3: URI producers and normalizers should omit the
|
|
# port component and its ":" delimiter if port is empty or if its
|
|
# value would be the same as that of the scheme's default.
|
|
if port.lstrip(b'0123456789'):
|
|
raise ValueError('Invalid port subcomponent')
|
|
elif port:
|
|
return b':' + port
|
|
else:
|
|
return b''
|
|
|
|
|
|
def _querylist(items, sep, encoding):
|
|
terms = []
|
|
append = terms.append
|
|
safe = _SAFE_QUERY.replace(sep, '')
|
|
for key, value in items:
|
|
name = uriencode(key, safe, encoding)
|
|
if value is None:
|
|
append(name)
|
|
elif isinstance(value, (bytes, str)):
|
|
append(name + b'=' + uriencode(value, safe, encoding))
|
|
else:
|
|
append(name + b'=' + uriencode(str(value), safe, encoding))
|
|
return sep.encode('ascii').join(terms)
|
|
|
|
|
|
def _querydict(mapping, sep, encoding):
|
|
items = []
|
|
for key, value in mapping.items():
|
|
if isinstance(value, (bytes, str)):
|
|
items.append((key, value))
|
|
elif isinstance(value, collections.abc.Iterable):
|
|
items.extend([(key, v) for v in value])
|
|
else:
|
|
items.append((key, value))
|
|
return _querylist(items, sep, encoding)
|
|
|
|
|
|
def uricompose(scheme=None, authority=None, path='', query=None,
|
|
fragment=None, userinfo=None, host=None, port=None,
|
|
querysep='&', encoding='utf-8'):
|
|
"""Compose a URI reference string from its individual components."""
|
|
|
|
# RFC 3986 3.1: Scheme names consist of a sequence of characters
|
|
# beginning with a letter and followed by any combination of
|
|
# letters, digits, plus ("+"), period ("."), or hyphen ("-").
|
|
# Although schemes are case-insensitive, the canonical form is
|
|
# lowercase and documents that specify schemes must do so with
|
|
# lowercase letters. An implementation should accept uppercase
|
|
# letters as equivalent to lowercase in scheme names (e.g., allow
|
|
# "HTTP" as well as "http") for the sake of robustness but should
|
|
# only produce lowercase scheme names for consistency.
|
|
if isinstance(scheme, bytes):
|
|
scheme = _scheme(scheme)
|
|
elif scheme is not None:
|
|
scheme = _scheme(scheme.encode())
|
|
|
|
# authority must be string type or three-item iterable
|
|
if authority is None:
|
|
authority = (None, None, None)
|
|
elif isinstance(authority, bytes):
|
|
authority = _AUTHORITY_RE_BYTES.match(authority).groups()
|
|
elif isinstance(authority, str):
|
|
authority = _AUTHORITY_RE_STR.match(authority).groups()
|
|
elif not isinstance(authority, collections.abc.Iterable):
|
|
raise TypeError('Invalid authority type')
|
|
elif len(authority) != 3:
|
|
raise ValueError('Invalid authority length')
|
|
authority = _authority(
|
|
userinfo if userinfo is not None else authority[0],
|
|
host if host is not None else authority[1],
|
|
port if port is not None else authority[2],
|
|
encoding
|
|
)
|
|
|
|
# RFC 3986 3.3: If a URI contains an authority component, then the
|
|
# path component must either be empty or begin with a slash ("/")
|
|
# character. If a URI does not contain an authority component,
|
|
# then the path cannot begin with two slash characters ("//").
|
|
path = uriencode(path, _SAFE_PATH, encoding)
|
|
if authority is not None and path and not path.startswith(b'/'):
|
|
raise ValueError('Invalid path with authority component')
|
|
if authority is None and path.startswith(b'//'):
|
|
raise ValueError('Invalid path without authority component')
|
|
|
|
# RFC 3986 4.2: A path segment that contains a colon character
|
|
# (e.g., "this:that") cannot be used as the first segment of a
|
|
# relative-path reference, as it would be mistaken for a scheme
|
|
# name. Such a segment must be preceded by a dot-segment (e.g.,
|
|
# "./this:that") to make a relative-path reference.
|
|
if scheme is None and authority is None and not path.startswith(b'/'):
|
|
if b':' in path.partition(b'/')[0]:
|
|
path = b'./' + path
|
|
|
|
# RFC 3986 3.4: The characters slash ("/") and question mark ("?")
|
|
# may represent data within the query component. Beware that some
|
|
# older, erroneous implementations may not handle such data
|
|
# correctly when it is used as the base URI for relative
|
|
# references (Section 5.1), apparently because they fail to
|
|
# distinguish query data from path data when looking for
|
|
# hierarchical separators. However, as query components are often
|
|
# used to carry identifying information in the form of "key=value"
|
|
# pairs and one frequently used value is a reference to another
|
|
# URI, it is sometimes better for usability to avoid percent-
|
|
# encoding those characters.
|
|
if isinstance(query, (bytes, str)):
|
|
query = uriencode(query, _SAFE_QUERY, encoding)
|
|
elif isinstance(query, collections.abc.Mapping):
|
|
query = _querydict(query, querysep, encoding)
|
|
elif isinstance(query, collections.abc.Iterable):
|
|
query = _querylist(query, querysep, encoding)
|
|
elif query is not None:
|
|
raise TypeError('Invalid query type')
|
|
|
|
# RFC 3986 3.5: The characters slash ("/") and question mark ("?")
|
|
# are allowed to represent data within the fragment identifier.
|
|
# Beware that some older, erroneous implementations may not handle
|
|
# this data correctly when it is used as the base URI for relative
|
|
# references.
|
|
if fragment is not None:
|
|
fragment = uriencode(fragment, _SAFE_FRAGMENT, encoding)
|
|
|
|
# return URI reference as `str`
|
|
return uriunsplit((scheme, authority, path, query, fragment)).decode()
|