Python3 Migrate

This commit is contained in:
MariuszC
2020-01-18 20:01:00 +01:00
parent ea05af2d15
commit 6cd7e0fe44
691 changed files with 201846 additions and 598 deletions

View File

@@ -0,0 +1,40 @@
"""RFC 3986 compliant, scheme-agnostic replacement for `urllib.parse`.
This module defines RFC 3986 compliant replacements for the most
commonly used functions of the Python Standard Library
:mod:`urllib.parse` module.
"""
from .chars import GEN_DELIMS, RESERVED, SUB_DELIMS, UNRESERVED
from .classify import isabspath, isabsuri, isnetpath, isrelpath
from .classify import issamedoc, isuri
from .compose import uricompose
from .defrag import DefragResult, uridefrag
from .encoding import uridecode, uriencode
from .join import urijoin
from .split import SplitResult, urisplit, uriunsplit
# Public names exported by the package: character-class constants first,
# then the result classes, then the functions (alphabetical in each group).
__all__ = (
    'GEN_DELIMS', 'RESERVED', 'SUB_DELIMS', 'UNRESERVED',
    'DefragResult', 'SplitResult',
    'isabspath', 'isabsuri', 'isnetpath', 'isrelpath', 'issamedoc', 'isuri',
    'uricompose', 'uridecode', 'uridefrag', 'uriencode', 'urijoin',
    'urisplit', 'uriunsplit',
)

# Package version string.
__version__ = '3.0.0'

View File

@@ -0,0 +1,23 @@
# RFC 3986 2.2. Reserved Characters
#
#   reserved    = gen-delims / sub-delims
#
#   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
#
#   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
#               / "*" / "+" / "," / ";" / "="
#
GEN_DELIMS = ':/?#[]@'
SUB_DELIMS = "!$&'()*+,;="
# The reserved set is the general delimiters followed by the sub-delimiters.
RESERVED = ''.join((GEN_DELIMS, SUB_DELIMS))

# RFC 3986 2.3. Unreserved Characters
#
#   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
#
UNRESERVED = (
    'ABCDEFGHIJKLMNOPQRSTUVWXYZ'    # ALPHA (upper case)
    + 'abcdefghijklmnopqrstuvwxyz'  # ALPHA (lower case)
    + '0123456789'                  # DIGIT
    + '-._~'
)

View File

@@ -0,0 +1,33 @@
from .split import urisplit
# TODO: use specialized checks/regexes for performance
def isuri(uristring):
    """Tell whether `uristring` is a URI.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``isuri()`` method.
    """
    parts = urisplit(uristring)
    return parts.isuri()
def isabsuri(uristring):
    """Tell whether `uristring` is an absolute URI.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``isabsuri()`` method.
    """
    parts = urisplit(uristring)
    return parts.isabsuri()
def isnetpath(uristring):
    """Tell whether `uristring` is a network-path reference.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``isnetpath()`` method.
    """
    parts = urisplit(uristring)
    return parts.isnetpath()
def isabspath(uristring):
    """Tell whether `uristring` is an absolute-path reference.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``isabspath()`` method.
    """
    parts = urisplit(uristring)
    return parts.isabspath()
def isrelpath(uristring):
    """Tell whether `uristring` is a relative-path reference.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``isrelpath()`` method.
    """
    parts = urisplit(uristring)
    return parts.isrelpath()
def issamedoc(uristring):
    """Tell whether `uristring` is a same-document reference.

    The string is split with :func:`urisplit` and the answer is taken
    from the split result's ``issamedoc()`` method.
    """
    parts = urisplit(uristring)
    return parts.issamedoc()

View File

@@ -0,0 +1,204 @@
import collections
import collections.abc
import ipaddress
import numbers
import re
from .chars import SUB_DELIMS
from .encoding import uriencode
from .split import uriunsplit
# RFC 3986 3.1: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
_SCHEME_RE = re.compile(b'^[A-Za-z][A-Za-z0-9+.-]*$')

# RFC 3986 3.2: authority = [ userinfo "@" ] host [ ":" port ]
# The same pattern is compiled once for str and once for bytes input; its
# three groups are userinfo, host and port.
_AUTHORITY_RE_STR = re.compile('^(?:(.*)@)?(.*?)(?::([0-9]*))?$')
_AUTHORITY_RE_BYTES = re.compile(b'^(?:(.*)@)?(.*?)(?::([0-9]*))?$')

# Characters that are passed to uriencode() as `safe` for each component.
_SAFE_HOST = SUB_DELIMS
_SAFE_USERINFO = SUB_DELIMS + ':'
_SAFE_PATH = SUB_DELIMS + ':@/'
_SAFE_QUERY = _SAFE_PATH + '?'
_SAFE_FRAGMENT = _SAFE_QUERY
def _scheme(scheme):
    """Validate a scheme given as bytes and return it in lowercase.

    Raises :class:`ValueError` if `scheme` does not match the RFC 3986
    scheme syntax.
    """
    # fullmatch() instead of match(): the "$" in _SCHEME_RE also matches
    # right before a trailing newline, so match() would let a scheme such
    # as b'http\n' through and the newline would end up in the URI.
    if _SCHEME_RE.fullmatch(scheme) is None:
        raise ValueError('Invalid scheme component')
    return scheme.lower()
def _authority(userinfo, host, port, encoding):
    """Build the encoded authority component as bytes.

    `host` may be an :mod:`ipaddress` address object, bytes or str;
    `port` may be a number, bytes or str.  Subcomponents that are
    :const:`None` are left out.  Returns :const:`None` when nothing was
    added at all.
    """
    pieces = []

    if userinfo is not None:
        pieces += [uriencode(userinfo, _SAFE_USERINFO, encoding), b'@']

    if isinstance(host, ipaddress.IPv6Address):
        # IPv6 addresses are written as bracketed IP literals
        pieces.append(b''.join((b'[', host.compressed.encode(), b']')))
    elif isinstance(host, ipaddress.IPv4Address):
        pieces.append(host.compressed.encode())
    elif host is not None:
        raw_host = host if isinstance(host, bytes) else host.encode('utf-8')
        pieces.append(_host(raw_host))

    if port is not None:
        if isinstance(port, numbers.Number):
            raw_port = str(port).encode()
        elif isinstance(port, bytes):
            raw_port = port
        else:
            raw_port = port.encode()
        pieces.append(_port(raw_port))

    if not pieces:
        return None
    return b''.join(pieces)
def _ip_literal(address):
    """Return the bracketed, compressed IPv6 literal for `address` as bytes.

    `address` is the str content of an IP literal without the enclosing
    brackets.  Raises :class:`ValueError` if it carries an IPvFuture
    version flag or is not a valid IPv6 address.
    """
    # RFC 3986 3.2.2: the IPvFuture version flag "v" is case-insensitive,
    # so an upper case "V" must be rejected with the same error instead of
    # falling through to a generic IPv6 parse failure.
    if address.startswith(('v', 'V')):
        raise ValueError('Address mechanism not supported')
    return b'[' + ipaddress.IPv6Address(address).compressed.encode() + b']'
def _host(host):
    """Return the normalized, encoded host subcomponent for bytes `host`.

    A bracketed value is treated as an IP literal.  Otherwise the value
    is first tried as a bare IPv6 address and, failing that, handled as
    a registered name that is lowercased and percent-encoded.
    """
    # RFC 3986 3.2.3: Although host is case-insensitive, producers and
    # normalizers should use lowercase for registered names and
    # hexadecimal addresses for the sake of uniformity, while only
    # using uppercase letters for percent-encodings.
    if host.startswith(b'[') and host.endswith(b']'):
        return _ip_literal(host[1:-1].decode())
    # check for IPv6 addresses as returned by SplitResult.gethost()
    try:
        return _ip_literal(host.decode('utf-8'))
    except ValueError:
        # Lowercase *before* encoding: lowercasing the encoded result
        # would also turn the hex digits of percent-encodings (e.g. %C3)
        # into lower case, contrary to the rule quoted above.
        return uriencode(host.lower(), _SAFE_HOST, 'utf-8')
def _port(port):
    """Return ``b':' + port`` for a bytes port, or ``b''`` if it is empty.

    Raises :class:`ValueError` if `port` contains anything but ASCII
    digits.
    """
    # RFC 3986 3.2.3: URI producers and normalizers should omit the
    # port component and its ":" delimiter if port is empty or if its
    # value would be the same as that of the scheme's default.
    leftover = port.lstrip(b'0123456789')
    if leftover:
        raise ValueError('Invalid port subcomponent')
    if not port:
        return b''
    return b':' + port
def _querylist(items, sep, encoding):
    """Encode an iterable of `(key, value)` pairs as a query component.

    Each pair becomes ``key=value``, or just ``key`` when the value is
    :const:`None`; values that are neither bytes nor str are converted
    with :func:`str` first.  The terms are joined with `sep`, which is
    removed from the set of safe characters so that it gets
    percent-encoded inside keys and values.
    """
    safe = _SAFE_QUERY.replace(sep, '')
    terms = []
    for key, value in items:
        term = uriencode(key, safe, encoding)
        if value is not None:
            if not isinstance(value, (bytes, str)):
                value = str(value)
            term += b'=' + uriencode(value, safe, encoding)
        terms.append(term)
    return sep.encode('ascii').join(terms)
def _querydict(mapping, sep, encoding):
    """Encode a mapping as a query component.

    A value that is iterable, but neither bytes nor str, contributes
    one `(key, item)` pair per item; any other value contributes a
    single pair.  The flattened pairs are handed to :func:`_querylist`.
    """
    pairs = []
    for key, value in mapping.items():
        is_string = isinstance(value, (bytes, str))
        if not is_string and isinstance(value, collections.abc.Iterable):
            pairs.extend([(key, item) for item in value])
        else:
            pairs.append((key, value))
    return _querylist(pairs, sep, encoding)
def uricompose(scheme=None, authority=None, path='', query=None,
               fragment=None, userinfo=None, host=None, port=None,
               querysep='&', encoding='utf-8'):
    """Compose a URI reference string from its individual components.

    `scheme` may be bytes or str; it is validated and lowercased.
    `authority` may be a bytes or str authority component, which is
    split into userinfo, host and port, or any iterable yielding exactly
    these three items.  `userinfo`, `host` and `port`, when not
    :const:`None`, take precedence over the corresponding part of
    `authority`.  `query` may be a bytes or str query string, a mapping,
    or an iterable of `(name, value)` pairs; the latter two are joined
    with `querysep`.  `encoding` is passed on to the percent-encoding
    helpers.

    Returns the composed URI reference as :class:`str`.  Raises
    :class:`ValueError` for an invalid scheme, authority or path, and
    :class:`TypeError` for an unsupported `authority` or `query` type.
    """

    # RFC 3986 3.1: Scheme names consist of a sequence of characters
    # beginning with a letter and followed by any combination of
    # letters, digits, plus ("+"), period ("."), or hyphen ("-").
    # Although schemes are case-insensitive, the canonical form is
    # lowercase and documents that specify schemes must do so with
    # lowercase letters.  An implementation should accept uppercase
    # letters as equivalent to lowercase in scheme names (e.g., allow
    # "HTTP" as well as "http") for the sake of robustness but should
    # only produce lowercase scheme names for consistency.
    if isinstance(scheme, bytes):
        scheme = _scheme(scheme)
    elif scheme is not None:
        scheme = _scheme(scheme.encode())

    # authority must be string type or three-item iterable
    if authority is None:
        authority = (None, None, None)
    elif isinstance(authority, (bytes, str)):
        if isinstance(authority, bytes):
            match = _AUTHORITY_RE_BYTES.match(authority)
        else:
            match = _AUTHORITY_RE_STR.match(authority)
        # The pattern does not match e.g. strings with an embedded
        # newline; report that instead of failing on None.groups().
        if match is None:
            raise ValueError('Invalid authority component')
        authority = match.groups()
    elif not isinstance(authority, collections.abc.Iterable):
        raise TypeError('Invalid authority type')
    else:
        # Materialize first so that iterables without len() or indexing
        # (generators, iterators) are accepted as well.
        authority = tuple(authority)
        if len(authority) != 3:
            raise ValueError('Invalid authority length')
    authority = _authority(
        userinfo if userinfo is not None else authority[0],
        host if host is not None else authority[1],
        port if port is not None else authority[2],
        encoding
    )

    # RFC 3986 3.3: If a URI contains an authority component, then the
    # path component must either be empty or begin with a slash ("/")
    # character.  If a URI does not contain an authority component,
    # then the path cannot begin with two slash characters ("//").
    path = uriencode(path, _SAFE_PATH, encoding)
    if authority is not None and path and not path.startswith(b'/'):
        raise ValueError('Invalid path with authority component')
    if authority is None and path.startswith(b'//'):
        raise ValueError('Invalid path without authority component')

    # RFC 3986 4.2: A path segment that contains a colon character
    # (e.g., "this:that") cannot be used as the first segment of a
    # relative-path reference, as it would be mistaken for a scheme
    # name.  Such a segment must be preceded by a dot-segment (e.g.,
    # "./this:that") to make a relative-path reference.
    if scheme is None and authority is None and not path.startswith(b'/'):
        if b':' in path.partition(b'/')[0]:
            path = b'./' + path

    # RFC 3986 3.4: The characters slash ("/") and question mark ("?")
    # may represent data within the query component.  Beware that some
    # older, erroneous implementations may not handle such data
    # correctly when it is used as the base URI for relative
    # references (Section 5.1), apparently because they fail to
    # distinguish query data from path data when looking for
    # hierarchical separators.  However, as query components are often
    # used to carry identifying information in the form of "key=value"
    # pairs and one frequently used value is a reference to another
    # URI, it is sometimes better for usability to avoid percent-
    # encoding those characters.
    if isinstance(query, (bytes, str)):
        query = uriencode(query, _SAFE_QUERY, encoding)
    elif isinstance(query, collections.abc.Mapping):
        query = _querydict(query, querysep, encoding)
    elif isinstance(query, collections.abc.Iterable):
        query = _querylist(query, querysep, encoding)
    elif query is not None:
        raise TypeError('Invalid query type')

    # RFC 3986 3.5: The characters slash ("/") and question mark ("?")
    # are allowed to represent data within the fragment identifier.
    # Beware that some older, erroneous implementations may not handle
    # this data correctly when it is used as the base URI for relative
    # references.
    if fragment is not None:
        fragment = uriencode(fragment, _SAFE_FRAGMENT, encoding)

    # return URI reference as `str`
    return uriunsplit((scheme, authority, path, query, fragment)).decode()

View File

@@ -0,0 +1,41 @@
import collections
from .encoding import uridecode
class DefragResult(collections.namedtuple('DefragResult', 'uri fragment')):
    """Two-item result of :func:`uridefrag`: the URI and its fragment."""

    __slots__ = ()  # prevent creation of instance dictionary

    def geturi(self):
        """Return the URI with the fragment, if there is one, appended
        again after a ``#``.
        """
        uri, fragment = self
        if fragment is None:
            return uri
        # use a delimiter of the same string type as the fragment
        hash_sign = b'#' if isinstance(fragment, bytes) else '#'
        return uri + hash_sign + fragment

    def getfragment(self, default=None, encoding='utf-8', errors='strict'):
        """Return the fragment decoded with :func:`uridecode`, or
        `default` if the fragment is :const:`None`.
        """
        if self.fragment is None:
            return default
        return uridecode(self.fragment, encoding, errors)
def uridefrag(uristring):
    """Split a URI reference string at its first ``#``.

    Returns a :class:`DefragResult` holding the part before the ``#``
    and the part after it; the fragment is :const:`None` when the
    string contains no ``#`` at all.
    """
    hash_sign = b'#' if isinstance(uristring, bytes) else '#'
    uri, present, fragment = uristring.partition(hash_sign)
    return DefragResult(uri, fragment if present else None)

View File

@@ -0,0 +1,53 @@
from string import hexdigits as _hex
from .chars import UNRESERVED
# RFC 3986 2.1: For consistency, URI producers and normalizers should
# use uppercase hexadecimal digits for all percent-encodings.
def _pctenc(byte):
    """Return the percent-encoded form of an integer byte value as bytes."""
    triplet = '%%%02X' % byte
    return triplet.encode()
# Byte values of the unreserved characters, which are never encoded.
_unreserved = frozenset(UNRESERVED.encode())

# Encoding tables keyed by the bytes of `safe` characters.  Each table has
# 256 entries mapping a byte value to its encoded form; the entry for b''
# is the base table in which only unreserved characters stay as they are.
_encoded = {
    b'': [
        bytes([code]) if code in _unreserved else _pctenc(code)
        for code in range(256)
    ]
}

# Maps every two-digit hex sequence (as bytes, any case) to its byte.
_decoded = {
    (hi + lo).encode(): bytes.fromhex(hi + lo)
    for hi in _hex
    for lo in _hex
}
def uriencode(uristring, safe='', encoding='utf-8', errors='strict'):
    """Percent-encode a URI string or string component and return bytes.

    A str `uristring` is first encoded with `encoding` and `errors`.
    Unreserved characters and the characters in `safe` are kept; every
    other byte is replaced by its percent-encoding.
    """
    if not isinstance(uristring, bytes):
        uristring = uristring.encode(encoding, errors)
    if not isinstance(safe, bytes):
        safe = safe.encode('ascii')

    table = _encoded.get(safe)
    if table is None:
        # first use of this `safe` set: derive its table from the base
        # table and remember it for later calls
        table = list(_encoded[b''])
        for code in safe:
            table[code] = bytes([code])
        _encoded[safe] = table

    return b''.join([table[code] for code in uristring])
def uridecode(uristring, encoding='utf-8', errors='strict'):
    """Replace percent-encodings in a URI string or string component.

    A str `uristring` is first encoded with `encoding` (``'ascii'`` if
    `encoding` is :const:`None`).  A ``%`` that is not followed by two
    hex digits is left untouched.  The result is decoded with
    `encoding`, or returned as bytes if `encoding` is :const:`None`.
    """
    if not isinstance(uristring, bytes):
        uristring = uristring.encode(encoding or 'ascii', errors)

    head, *tail = uristring.split(b'%')
    chunks = [head]
    for chunk in tail:
        code = chunk[:2]
        # unknown sequences fall back to the literal "%" plus the text
        chunks.append(_decoded.get(code, b'%' + code))
        chunks.append(chunk[2:])
    raw = b''.join(chunks)

    if encoding is None:
        return raw
    return raw.decode(encoding, errors)

View File

@@ -0,0 +1,14 @@
from .split import urisplit
def urijoin(base, ref, strict=False):
    """Resolve the URI reference `ref` against the base URI `base` and
    return the target URI string.

    If `base` is not an instance of the type of `ref`, whichever of the
    two is bytes is decoded before the reference is transformed.
    """
    if isinstance(base, type(ref)):
        base_parts = urisplit(base)
        target = ref
    elif isinstance(base, bytes):
        base_parts = urisplit(base.decode())
        target = ref
    else:
        base_parts = urisplit(base)
        target = ref.decode()
    return base_parts.transform(target, strict).geturi()

View File

@@ -0,0 +1,399 @@
import collections
import collections.abc
import ipaddress
import re
from .encoding import uridecode
# Names of the five URI components, in the order in which they are used
# as the field names of the SplitResult namedtuple.
_URI_COMPONENTS = ('scheme', 'authority', 'path', 'query', 'fragment')
def _ip_literal(address):
    """Return the :class:`ipaddress.IPv6Address` for the content of an
    IP literal, given as bytes or str without the enclosing brackets.

    Raises :class:`ValueError` if `address` carries an IPvFuture version
    flag or is not a valid IPv6 address.
    """
    # RFC 3986 3.2.2: In anticipation of future, as-yet-undefined IP
    # literal address formats, an implementation may use an optional
    # version flag to indicate such a format explicitly rather than
    # rely on heuristic determination.
    #
    #  IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
    #
    #  IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
    #
    # If a URI containing an IP-literal that starts with "v"
    # (case-insensitive), indicating that the version flag is present,
    # is dereferenced by an application that does not know the meaning
    # of that version flag, then the application should return an
    # appropriate error for "address mechanism not supported".
    if isinstance(address, bytes):
        address = address.decode('ascii')
    # the version flag is case-insensitive, so check for "V" as well
    if address.startswith(('v', 'V')):
        raise ValueError('address mechanism not supported')
    return ipaddress.IPv6Address(address)
def _ipv4_address(address):
    """Return the :class:`ipaddress.IPv4Address` for `address`, given as
    bytes or str, or :const:`None` if it cannot be parsed as one.
    """
    try:
        text = address.decode('ascii') if isinstance(address, bytes) else address
        return ipaddress.IPv4Address(text)
    except ValueError:
        # covers invalid addresses as well as bytes that are not ASCII
        return None
class SplitResult(collections.namedtuple('SplitResult', _URI_COMPONENTS)):
    """Base class to hold :func:`urisplit` results.

    The methods rely on class attributes (``RE`` and delimiter constants
    such as ``COLON``, ``SLASH`` or ``AT``) that this base class does
    not define itself; they have to be supplied by a subclass, in the
    string type matching the stored components.
    """

    __slots__ = ()  # prevent creation of instance dictionary

    @property
    def userinfo(self):
        """The undecoded userinfo subcomponent, or :const:`None` if
        there is no authority or no ``@`` in it.
        """
        authority = self.authority
        if authority is None:
            return None
        userinfo, present, _ = authority.rpartition(self.AT)
        if present:
            return userinfo
        else:
            return None

    @property
    def host(self):
        """The undecoded host subcomponent, or :const:`None` if there
        is no authority.
        """
        authority = self.authority
        if authority is None:
            return None
        _, _, hostinfo = authority.rpartition(self.AT)
        host, present, port = hostinfo.rpartition(self.COLON)
        # Only cut off ":port" if there really is a colon followed by
        # nothing but digits.  Without the `present` check, rpartition()
        # puts a colon-less hostinfo into `port`, so an all-digit host
        # such as "8080" would be mistaken for a port and the host
        # reported as empty.
        if present and not port.lstrip(self.DIGITS):
            return host
        else:
            return hostinfo

    @property
    def port(self):
        """The undecoded port subcomponent, or :const:`None` if there is
        no authority or no ``:`` followed only by digits at its end.
        """
        authority = self.authority
        if authority is None:
            return None
        _, present, port = authority.rpartition(self.COLON)
        if present and not port.lstrip(self.DIGITS):
            return port
        else:
            return None

    def geturi(self):
        """Return the re-combined version of the original URI reference as a
        string.
        """
        scheme, authority, path, query, fragment = self

        # RFC 3986 5.3. Component Recomposition
        result = []
        if scheme is not None:
            result.extend([scheme, self.COLON])
        if authority is not None:
            result.extend([self.SLASH, self.SLASH, authority])
        result.append(path)
        if query is not None:
            result.extend([self.QUEST, query])
        if fragment is not None:
            result.extend([self.HASH, fragment])
        return self.EMPTY.join(result)

    def getscheme(self, default=None):
        """Return the URI scheme in canonical (lowercase) form, or `default`
        if the original URI reference did not contain a scheme component.
        """
        scheme = self.scheme
        if scheme is None:
            return default
        elif isinstance(scheme, bytes):
            return scheme.decode('ascii').lower()
        else:
            return scheme.lower()

    def getauthority(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded userinfo, host and port subcomponents of the URI
        authority as a three-item tuple.

        `default`, if given, must be a three-item iterable supplying the
        default for each subcomponent.
        """
        # TBD: (userinfo, host, port) kwargs, default string?
        if default is None:
            default = (None, None, None)
        elif not isinstance(default, collections.abc.Iterable):
            raise TypeError('Invalid default type')
        elif len(default) != 3:
            raise ValueError('Invalid default length')
        # TODO: this could be much more efficient by using a dedicated regex
        return (
            self.getuserinfo(default[0], encoding, errors),
            self.gethost(default[1], errors),
            self.getport(default[2])
        )

    def getuserinfo(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded userinfo subcomponent of the URI authority, or
        `default` if the original URI reference did not contain a
        userinfo field.
        """
        userinfo = self.userinfo
        if userinfo is None:
            return default
        else:
            return uridecode(userinfo, encoding, errors)

    def gethost(self, default=None, errors='strict'):
        """Return the decoded host subcomponent of the URI authority as a
        string or an :mod:`ipaddress` address object, or `default` if
        the original URI reference did not contain a host.

        Raises :class:`ValueError` for a host with unbalanced brackets
        or an invalid IP literal.
        """
        host = self.host
        if host is None or (not host and default is not None):
            return default
        elif host.startswith(self.LBRACKET) and host.endswith(self.RBRACKET):
            return _ip_literal(host[1:-1])
        elif host.startswith(self.LBRACKET) or host.endswith(self.RBRACKET):
            raise ValueError('Invalid host %r' % host)
        # TODO: faster check for IPv4 address?
        return _ipv4_address(host) or uridecode(host, 'utf-8', errors).lower()

    def getport(self, default=None):
        """Return the port subcomponent of the URI authority as an
        :class:`int`, or `default` if the original URI reference did
        not contain a port or if the port was empty.
        """
        port = self.port
        if port:
            return int(port)
        else:
            return default

    def getpath(self, encoding='utf-8', errors='strict'):
        """Return the normalized decoded URI path."""
        path = self.__remove_dot_segments(self.path)
        return uridecode(path, encoding, errors)

    def getquery(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded query string, or `default` if the original URI
        reference did not contain a query component.
        """
        query = self.query
        if query is None:
            return default
        else:
            return uridecode(query, encoding, errors)

    def getquerydict(self, sep='&', encoding='utf-8', errors='strict'):
        """Split the query component into individual `name=value` pairs
        separated by `sep` and return a dictionary of query variables.
        The dictionary keys are the unique query variable names and
        the values are lists of values for each name.
        """
        # named `result` rather than `dict` to avoid shadowing the builtin
        result = collections.defaultdict(list)
        for name, value in self.getquerylist(sep, encoding, errors):
            result[name].append(value)
        return result

    def getquerylist(self, sep='&', encoding='utf-8', errors='strict'):
        """Split the query component into individual `name=value` pairs
        separated by `sep`, and return a list of `(name, value)`
        tuples.  The value is :const:`None` for a term without ``=``.
        """
        if not self.query:
            return []
        elif isinstance(sep, type(self.query)):
            qsl = self.query.split(sep)
        elif isinstance(sep, bytes):
            qsl = self.query.split(sep.decode('ascii'))
        else:
            qsl = self.query.split(sep.encode('ascii'))
        items = []
        # empty terms (e.g. from "a&&b") are skipped
        for parts in [qs.partition(self.EQ) for qs in qsl if qs]:
            name = uridecode(parts[0], encoding, errors)
            if parts[1]:
                value = uridecode(parts[2], encoding, errors)
            else:
                value = None
            items.append((name, value))
        return items

    def getfragment(self, default=None, encoding='utf-8', errors='strict'):
        """Return the decoded fragment identifier, or `default` if the
        original URI reference did not contain a fragment component.
        """
        fragment = self.fragment
        if fragment is None:
            return default
        else:
            return uridecode(fragment, encoding, errors)

    def isuri(self):
        """Return :const:`True` if this is a URI."""
        return self.scheme is not None

    def isabsuri(self):
        """Return :const:`True` if this is an absolute URI."""
        return self.scheme is not None and self.fragment is None

    def isnetpath(self):
        """Return :const:`True` if this is a network-path reference."""
        return self.scheme is None and self.authority is not None

    def isabspath(self):
        """Return :const:`True` if this is an absolute-path reference."""
        return (self.scheme is None and self.authority is None and
                self.path.startswith(self.SLASH))

    def isrelpath(self):
        """Return :const:`True` if this is a relative-path reference."""
        return (self.scheme is None and self.authority is None and
                not self.path.startswith(self.SLASH))

    def issamedoc(self):
        """Return :const:`True` if this is a same-document reference."""
        return (self.scheme is None and self.authority is None and
                not self.path and self.query is None)

    def transform(self, ref, strict=False):
        """Transform a URI reference relative to `self` into a
        :class:`SplitResult` representing its target URI.

        Unless `strict` is true, a scheme in `ref` that equals the
        scheme of `self` is treated as if it were absent.
        """
        scheme, authority, path, query, fragment = self.RE.match(ref).groups()

        # RFC 3986 5.2.2. Transform References
        if scheme is not None and (strict or scheme != self.scheme):
            path = self.__remove_dot_segments(path)
        elif authority is not None:
            scheme = self.scheme
            path = self.__remove_dot_segments(path)
        elif not path:
            scheme = self.scheme
            authority = self.authority
            path = self.path
            query = self.query if query is None else query
        elif path.startswith(self.SLASH):
            scheme = self.scheme
            authority = self.authority
            path = self.__remove_dot_segments(path)
        else:
            scheme = self.scheme
            authority = self.authority
            path = self.__remove_dot_segments(self.__merge(path))
        return type(self)(scheme, authority, path, query, fragment)

    def __merge(self, path):
        # RFC 3986 5.2.3. Merge Paths
        if self.authority is not None and not self.path:
            return self.SLASH + path
        else:
            # replace everything after the last slash of the base path;
            # without a slash in the base path the result is just `path`
            parts = self.path.rpartition(self.SLASH)
            return parts[1].join((parts[0], path))

    @classmethod
    def __remove_dot_segments(cls, path):
        # RFC 3986 5.2.4. Remove Dot Segments
        pseg = []
        for s in path.split(cls.SLASH):
            if s == cls.DOT:
                continue
            elif s != cls.DOTDOT:
                pseg.append(s)
            elif len(pseg) == 1 and not pseg[0]:
                # ".." directly below the root of an absolute path
                continue
            elif pseg and pseg[-1] != cls.DOTDOT:
                pseg.pop()
            else:
                pseg.append(s)
        # adjust for trailing '/.' or '/..'
        if path.rpartition(cls.SLASH)[2] in (cls.DOT, cls.DOTDOT):
            pseg.append(cls.EMPTY)
        if path and len(pseg) == 1 and pseg[0] == cls.EMPTY:
            pseg.insert(0, cls.DOT)
        return cls.SLASH.join(pseg)
class SplitResultBytes(SplitResult):
    """:class:`SplitResult` flavour whose pattern and delimiter
    constants are all bytes.
    """

    __slots__ = ()  # prevent creation of instance dictionary

    # RFC 3986 Appendix B
    RE = re.compile(br"""
(?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1)
(?://([^/?#]*))? # authority
([^?#]*) # path
(?:\?([^#]*))? # query
(?:\#(.*))? # fragment
""", flags=re.VERBOSE)

    # RFC 3986 2.2 gen-delims
    COLON = b':'
    SLASH = b'/'
    QUEST = b'?'
    HASH = b'#'
    LBRACKET = b'['
    RBRACKET = b']'
    AT = b'@'

    # RFC 3986 3.3 dot-segments
    DOT = b'.'
    DOTDOT = b'..'

    EMPTY = b''
    EQ = b'='

    DIGITS = b'0123456789'
class SplitResultString(SplitResult):
    """:class:`SplitResult` flavour whose pattern and delimiter
    constants are all str.
    """

    __slots__ = ()  # prevent creation of instance dictionary

    # RFC 3986 Appendix B
    RE = re.compile(r"""
(?:([A-Za-z][A-Za-z0-9+.-]*):)? # scheme (RFC 3986 3.1)
(?://([^/?#]*))? # authority
([^?#]*) # path
(?:\?([^#]*))? # query
(?:\#(.*))? # fragment
""", flags=re.VERBOSE)

    # RFC 3986 2.2 gen-delims
    COLON = ':'
    SLASH = '/'
    QUEST = '?'
    HASH = '#'
    LBRACKET = '['
    RBRACKET = ']'
    AT = '@'

    # RFC 3986 3.3 dot-segments
    DOT = '.'
    DOTDOT = '..'

    EMPTY = ''
    EQ = '='

    DIGITS = '0123456789'
def urisplit(uristring):
    """Split a well-formed URI reference string into a tuple with five
    components corresponding to a URI's general structure::

      <scheme>://<authority>/<path>?<query>#<fragment>

    The result is a :class:`SplitResultBytes` for bytes input and a
    :class:`SplitResultString` otherwise.
    """
    if isinstance(uristring, bytes):
        result_type = SplitResultBytes
    else:
        result_type = SplitResultString
    components = result_type.RE.match(uristring).groups()
    return result_type(*components)
def uriunsplit(parts):
    """Combine the elements of a five-item iterable into a URI reference's
    string representation.

    The type of the path element (bytes or not) decides which kind of
    string the components are joined as.
    """
    scheme, authority, path, query, fragment = parts
    result_type = SplitResultBytes if isinstance(path, bytes) else SplitResultString
    return result_type(scheme, authority, path, query, fragment).geturi()