400 lines
13 KiB
Python
400 lines
13 KiB
Python
import collections
|
|
import collections.abc
|
|
import ipaddress
|
|
import re
|
|
|
|
from .encoding import uridecode
|
|
|
|
# Names of the five URI components, in the order in which they are used as
# the field names of the SplitResult named tuple defined in this module.
_URI_COMPONENTS = ('scheme', 'authority', 'path', 'query', 'fragment')
|
|
|
|
|
|
def _ip_literal(address):
    """Parse the contents of an IP-literal (the part between the brackets).

    `address` may be a :class:`str` or ASCII-encoded :class:`bytes`.
    Returns an :class:`ipaddress.IPv6Address`.

    Raises :class:`ValueError` with the message ``address mechanism not
    supported`` if the literal carries an IPvFuture version flag, and
    whatever :class:`ValueError` subclass :mod:`ipaddress` raises if the
    literal is not a valid IPv6 address.

    """
    # RFC 3986 3.2.2: In anticipation of future, as-yet-undefined IP
    # literal address formats, an implementation may use an optional
    # version flag to indicate such a format explicitly rather than
    # rely on heuristic determination.
    #
    #  IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
    #
    #  IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
    #
    # If a URI containing an IP-literal that starts with "v"
    # (case-insensitive), indicating that the version flag is present,
    # is dereferenced by an application that does not know the meaning
    # of that version flag, then the application should return an
    # appropriate error for "address mechanism not supported".
    if isinstance(address, bytes):
        address = address.decode('ascii')
    # The version flag is case-insensitive, so "V1.x" has to be rejected
    # exactly like "v1.x" instead of falling through to the IPv6 parser.
    if address.startswith((u'v', u'V')):
        raise ValueError('address mechanism not supported')
    return ipaddress.IPv6Address(address)
|
|
|
|
|
|
def _ipv4_address(address):
    """Return `address` as an :class:`ipaddress.IPv4Address`, or
    :const:`None` if it cannot be parsed as one.

    :class:`bytes` input is decoded as ASCII before parsing.

    """
    try:
        if isinstance(address, bytes):
            text = address.decode('ascii')
        else:
            text = address
        parsed = ipaddress.IPv4Address(text)
    except ValueError:
        # Covers both a failed ASCII decode and an invalid address; both
        # exception types derive from ValueError.
        return None
    return parsed
|
|
|
|
|
|
class SplitResult(collections.namedtuple('SplitResult', _URI_COMPONENTS)):
    """Base class to hold :func:`urisplit` results.

    The methods of this class refer to the attributes `RE`, `COLON`,
    `SLASH`, `QUEST`, `HASH`, `LBRACKET`, `RBRACKET`, `AT`, `DOT`,
    `DOTDOT`, `EMPTY`, `EQ` and `DIGITS`, which are not defined here;
    they are supplied by the :class:`bytes` and string subclasses.

    """

    __slots__ = ()  # prevent creation of instance dictionary

    @property
    def userinfo(self):
        """The raw userinfo subcomponent of the authority, or
        :const:`None` if there is no authority or it contains no ``@``.

        """
        authority = self.authority
        if authority is None:
            return None
        userinfo, present, _ = authority.rpartition(self.AT)
        if present:
            return userinfo
        else:
            return None

    @property
    def host(self):
        """The raw host subcomponent of the authority, or :const:`None`
        if there is no authority.

        """
        authority = self.authority
        if authority is None:
            return None
        _, _, hostinfo = authority.rpartition(self.AT)
        host, present, port = hostinfo.rpartition(self.COLON)
        # Strip a trailing ":port" only if a colon is really there and
        # what follows it consists of digits only.  Without the check
        # for the colon, an all-digit host with no port (e.g. "123")
        # would be mistaken for a port and an empty host returned.
        if present and not port.lstrip(self.DIGITS):
            return host
        else:
            return hostinfo

    @property
    def port(self):
        """The raw port subcomponent of the authority (possibly empty),
        or :const:`None` if there is no authority or the text after its
        last ``:`` does not consist of digits only.

        """
        authority = self.authority
        if authority is None:
            return None
        _, present, port = authority.rpartition(self.COLON)
        if present and not port.lstrip(self.DIGITS):
            return port
        else:
            return None

    def geturi(self):
        """Return the re-combined version of the original URI reference as a
        string.

        """
        scheme, authority, path, query, fragment = self

        # RFC 3986 5.3. Component Recomposition
        result = []
        if scheme is not None:
            result.extend([scheme, self.COLON])
        if authority is not None:
            result.extend([self.SLASH, self.SLASH, authority])
        result.append(path)
        if query is not None:
            result.extend([self.QUEST, query])
        if fragment is not None:
            result.extend([self.HASH, fragment])
        return self.EMPTY.join(result)

    def getscheme(self, default=None):
        """Return the URI scheme in canonical (lowercase) form, or `default`
        if the original URI reference did not contain a scheme component.

        A :class:`bytes` scheme is decoded as ASCII, so the result is
        always a string when a scheme is present.

        """
        scheme = self.scheme
        if scheme is None:
            return default
        elif isinstance(scheme, bytes):
            return scheme.decode('ascii').lower()
        else:
            return scheme.lower()

    def getauthority(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded userinfo, host and port subcomponents of the URI
        authority as a three-item tuple.

        `default`, if given, must be an iterable of length three whose
        items are used as the defaults for userinfo, host and port,
        respectively.  Raises :class:`TypeError` if `default` is not
        iterable and :class:`ValueError` if its length is not three.

        """
        # TBD: (userinfo, host, port) kwargs, default string?
        if default is None:
            default = (None, None, None)
        elif not isinstance(default, collections.abc.Iterable):
            raise TypeError('Invalid default type')
        elif len(default) != 3:
            raise ValueError('Invalid default length')
        # TODO: this could be much more efficient by using a dedicated regex
        return (
            self.getuserinfo(default[0], encoding, errors),
            self.gethost(default[1], errors),
            self.getport(default[2])
        )

    def getuserinfo(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded userinfo subcomponent of the URI authority, or
        `default` if the original URI reference did not contain a
        userinfo field.

        """
        userinfo = self.userinfo
        if userinfo is None:
            return default
        else:
            return uridecode(userinfo, encoding, errors)

    def gethost(self, default=None, errors='strict'):
        """Return the decoded host subcomponent of the URI authority as a
        string or an :mod:`ipaddress` address object, or `default` if
        the original URI reference did not contain a host.

        An empty host is also replaced by `default`, provided `default`
        is not :const:`None`.  Raises :class:`ValueError` for a host
        with unbalanced square brackets or an unsupported or invalid
        IP-literal.

        """
        host = self.host
        if host is None or (not host and default is not None):
            return default
        elif host.startswith(self.LBRACKET) and host.endswith(self.RBRACKET):
            return _ip_literal(host[1:-1])
        elif host.startswith(self.LBRACKET) or host.endswith(self.RBRACKET):
            raise ValueError('Invalid host %r' % host)
        # TODO: faster check for IPv4 address?
        return _ipv4_address(host) or uridecode(host, 'utf-8', errors).lower()

    def getport(self, default=None):
        """Return the port subcomponent of the URI authority as an
        :class:`int`, or `default` if the original URI reference did
        not contain a port or if the port was empty.

        """
        port = self.port
        if port:
            return int(port)
        else:
            return default

    def getpath(self, encoding='utf-8', errors='strict'):
        """Return the normalized decoded URI path.

        Dot segments are removed before the path is decoded.

        """
        path = self.__remove_dot_segments(self.path)
        return uridecode(path, encoding, errors)

    def getquery(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded query string, or `default` if the original URI
        reference did not contain a query component.

        """
        query = self.query
        if query is None:
            return default
        else:
            return uridecode(query, encoding, errors)

    def getquerydict(self, sep='&', encoding='utf-8', errors='strict'):
        """Split the query component into individual `name=value` pairs
        separated by `sep` and return a dictionary of query variables.
        The dictionary keys are the unique query variable names and
        the values are lists of values for each name.

        The returned object is a :class:`collections.defaultdict` with
        :class:`list` as its default factory.

        """
        # Deliberately not named `dict`, which would shadow the builtin.
        querydict = collections.defaultdict(list)
        for name, value in self.getquerylist(sep, encoding, errors):
            querydict[name].append(value)
        return querydict

    def getquerylist(self, sep='&', encoding='utf-8', errors='strict'):
        """Split the query component into individual `name=value` pairs
        separated by `sep`, and return a list of `(name, value)`
        tuples.

        Empty pairs are skipped.  A pair without ``=`` gets the value
        :const:`None`.  `sep` may be given as either string or bytes; it
        is converted via ASCII to the type of the query if necessary.

        """
        if not self.query:
            return []
        elif isinstance(sep, type(self.query)):
            qsl = self.query.split(sep)
        elif isinstance(sep, bytes):
            qsl = self.query.split(sep.decode('ascii'))
        else:
            qsl = self.query.split(sep.encode('ascii'))
        items = []
        for parts in [qs.partition(self.EQ) for qs in qsl if qs]:
            name = uridecode(parts[0], encoding, errors)
            if parts[1]:
                value = uridecode(parts[2], encoding, errors)
            else:
                value = None
            items.append((name, value))
        return items

    def getfragment(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded fragment identifier, or `default` if the
        original URI reference did not contain a fragment component.

        """
        fragment = self.fragment
        if fragment is None:
            return default
        else:
            return uridecode(fragment, encoding, errors)

    def isuri(self):
        """Return :const:`True` if this is a URI."""
        return self.scheme is not None

    def isabsuri(self):
        """Return :const:`True` if this is an absolute URI."""
        return self.scheme is not None and self.fragment is None

    def isnetpath(self):
        """Return :const:`True` if this is a network-path reference."""
        return self.scheme is None and self.authority is not None

    def isabspath(self):
        """Return :const:`True` if this is an absolute-path reference."""
        return (self.scheme is None and self.authority is None and
                self.path.startswith(self.SLASH))

    def isrelpath(self):
        """Return :const:`True` if this is a relative-path reference."""
        return (self.scheme is None and self.authority is None and
                not self.path.startswith(self.SLASH))

    def issamedoc(self):
        """Return :const:`True` if this is a same-document reference."""
        return (self.scheme is None and self.authority is None and
                not self.path and self.query is None)

    def transform(self, ref, strict=False):
        """Transform a URI reference relative to `self` into a
        :class:`SplitResult` representing its target URI.

        `ref` is matched against this class's `RE` pattern.  Unless
        `strict` is true, a scheme in `ref` that equals the scheme of
        `self` is treated as if it were absent.

        """
        scheme, authority, path, query, fragment = self.RE.match(ref).groups()

        # RFC 3986 5.2.2. Transform References
        if scheme is not None and (strict or scheme != self.scheme):
            path = self.__remove_dot_segments(path)
        elif authority is not None:
            scheme = self.scheme
            path = self.__remove_dot_segments(path)
        elif not path:
            scheme = self.scheme
            authority = self.authority
            path = self.path
            query = self.query if query is None else query
        elif path.startswith(self.SLASH):
            scheme = self.scheme
            authority = self.authority
            path = self.__remove_dot_segments(path)
        else:
            scheme = self.scheme
            authority = self.authority
            path = self.__remove_dot_segments(self.__merge(path))
        return type(self)(scheme, authority, path, query, fragment)

    def __merge(self, path):
        # RFC 3986 5.2.3. Merge Paths
        if self.authority is not None and not self.path:
            return self.SLASH + path
        else:
            # Keep everything up to and including the last slash of the
            # base path; if there is no slash, parts[0] and parts[1] are
            # both empty and `path` is returned unchanged.
            parts = self.path.rpartition(self.SLASH)
            return parts[1].join((parts[0], path))

    @classmethod
    def __remove_dot_segments(cls, path):
        # RFC 3986 5.2.4. Remove Dot Segments
        pseg = []
        for s in path.split(cls.SLASH):
            if s == cls.DOT:
                continue
            elif s != cls.DOTDOT:
                pseg.append(s)
            elif len(pseg) == 1 and not pseg[0]:
                # ".." directly below the root of an absolute path
                continue
            elif pseg and pseg[-1] != cls.DOTDOT:
                pseg.pop()
            else:
                # leading ".." of a relative path is retained
                pseg.append(s)
        # adjust for trailing '/.' or '/..'
        if path.rpartition(cls.SLASH)[2] in (cls.DOT, cls.DOTDOT):
            pseg.append(cls.EMPTY)
        if path and len(pseg) == 1 and pseg[0] == cls.EMPTY:
            pseg.insert(0, cls.DOT)
        return cls.SLASH.join(pseg)
|
|
|
|
|
|
class SplitResultBytes(SplitResult):
    """:class:`SplitResult` variant whose pattern and delimiter
    constants are :class:`bytes`.

    """

    __slots__ = ()  # prevent creation of instance dictionary

    # RFC 3986 Appendix B
    RE = re.compile(br"""
    (?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1)
    (?://([^/?#]*))? # authority
    ([^?#]*) # path
    (?:\?([^#]*))? # query
    (?:\#(.*))? # fragment
    """, flags=re.VERBOSE)

    # RFC 3986 2.2 gen-delims
    COLON = b':'
    SLASH = b'/'
    QUEST = b'?'
    HASH = b'#'
    LBRACKET = b'['
    RBRACKET = b']'
    AT = b'@'

    # RFC 3986 3.3 dot-segments
    DOT = b'.'
    DOTDOT = b'..'

    EMPTY = b''
    EQ = b'='

    DIGITS = b'0123456789'
|
|
|
|
|
|
class SplitResultString(SplitResult):
    """:class:`SplitResult` variant whose pattern and delimiter
    constants are (unicode) strings.

    """

    __slots__ = ()  # prevent creation of instance dictionary

    # RFC 3986 Appendix B
    RE = re.compile(r"""
    (?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1)
    (?://([^/?#]*))? # authority
    ([^?#]*) # path
    (?:\?([^#]*))? # query
    (?:\#(.*))? # fragment
    """, flags=re.VERBOSE)

    # RFC 3986 2.2 gen-delims
    COLON = u':'
    SLASH = u'/'
    QUEST = u'?'
    HASH = u'#'
    LBRACKET = u'['
    RBRACKET = u']'
    AT = u'@'

    # RFC 3986 3.3 dot-segments
    DOT = u'.'
    DOTDOT = u'..'

    EMPTY = u''
    EQ = u'='

    DIGITS = u'0123456789'
|
|
|
|
|
|
def urisplit(uristring):
    """Split a well-formed URI reference string into a tuple with five
    components corresponding to a URI's general structure::

      <scheme>://<authority>/<path>?<query>#<fragment>

    The result is a :class:`SplitResultBytes` if `uristring` is a
    :class:`bytes` object, and a :class:`SplitResultString` otherwise.

    """
    is_bytes = isinstance(uristring, bytes)
    result_type = SplitResultBytes if is_bytes else SplitResultString
    components = result_type.RE.match(uristring).groups()
    return result_type(*components)
|
|
|
|
|
|
def uriunsplit(parts):
    """Combine the elements of a five-item iterable into a URI reference's
    string representation.

    The type of the path item decides whether the components are
    joined as :class:`bytes` or as strings.

    """
    scheme, authority, path, query, fragment = parts
    # Only the path is inspected to select the bytes or string variant.
    is_bytes = isinstance(path, bytes)
    result_type = SplitResultBytes if is_bytes else SplitResultString
    split = result_type(scheme, authority, path, query, fragment)
    return split.geturi()
|