url.py 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290
  1. from __future__ import absolute_import
  2. import re
  3. from collections import namedtuple
  4. from ..exceptions import LocationParseError
  5. from ..packages import six, rfc3986
  6. from ..packages.rfc3986.exceptions import RFC3986Exception, ValidationError
  7. from ..packages.rfc3986.validators import Validator
  8. from ..packages.rfc3986 import abnf_regexp, normalizers, compat, misc
  9. url_attrs = ['scheme', 'auth', 'host', 'port', 'path', 'query', 'fragment']
  10. # We only want to normalize urls with an HTTP(S) scheme.
  11. # urllib3 infers URLs without a scheme (None) to be http.
  12. NORMALIZABLE_SCHEMES = ('http', 'https', None)
  13. # Regex for detecting URLs with schemes. RFC 3986 Section 3.1
  14. SCHEME_REGEX = re.compile(r"^(?:[a-zA-Z][a-zA-Z0-9+\-]*:|/)")
  15. PATH_CHARS = abnf_regexp.UNRESERVED_CHARS_SET | abnf_regexp.SUB_DELIMITERS_SET | {':', '@', '/'}
  16. QUERY_CHARS = FRAGMENT_CHARS = PATH_CHARS | {'?'}
  17. class Url(namedtuple('Url', url_attrs)):
  18. """
  19. Data structure for representing an HTTP URL. Used as a return value for
  20. :func:`parse_url`. Both the scheme and host are normalized as they are
  21. both case-insensitive according to RFC 3986.
  22. """
  23. __slots__ = ()
  24. def __new__(cls, scheme=None, auth=None, host=None, port=None, path=None,
  25. query=None, fragment=None):
  26. if path and not path.startswith('/'):
  27. path = '/' + path
  28. if scheme is not None:
  29. scheme = scheme.lower()
  30. return super(Url, cls).__new__(cls, scheme, auth, host, port, path,
  31. query, fragment)
  32. @property
  33. def hostname(self):
  34. """For backwards-compatibility with urlparse. We're nice like that."""
  35. return self.host
  36. @property
  37. def request_uri(self):
  38. """Absolute path including the query string."""
  39. uri = self.path or '/'
  40. if self.query is not None:
  41. uri += '?' + self.query
  42. return uri
  43. @property
  44. def netloc(self):
  45. """Network location including host and port"""
  46. if self.port:
  47. return '%s:%d' % (self.host, self.port)
  48. return self.host
  49. @property
  50. def url(self):
  51. """
  52. Convert self into a url
  53. This function should more or less round-trip with :func:`.parse_url`. The
  54. returned url may not be exactly the same as the url inputted to
  55. :func:`.parse_url`, but it should be equivalent by the RFC (e.g., urls
  56. with a blank port will have : removed).
  57. Example: ::
  58. >>> U = parse_url('http://google.com/mail/')
  59. >>> U.url
  60. 'http://google.com/mail/'
  61. >>> Url('http', 'username:password', 'host.com', 80,
  62. ... '/path', 'query', 'fragment').url
  63. 'http://username:password@host.com:80/path?query#fragment'
  64. """
  65. scheme, auth, host, port, path, query, fragment = self
  66. url = u''
  67. # We use "is not None" we want things to happen with empty strings (or 0 port)
  68. if scheme is not None:
  69. url += scheme + u'://'
  70. if auth is not None:
  71. url += auth + u'@'
  72. if host is not None:
  73. url += host
  74. if port is not None:
  75. url += u':' + str(port)
  76. if path is not None:
  77. url += path
  78. if query is not None:
  79. url += u'?' + query
  80. if fragment is not None:
  81. url += u'#' + fragment
  82. return url
  83. def __str__(self):
  84. return self.url
  85. def split_first(s, delims):
  86. """
  87. .. deprecated:: 1.25
  88. Given a string and an iterable of delimiters, split on the first found
  89. delimiter. Return two split parts and the matched delimiter.
  90. If not found, then the first part is the full input string.
  91. Example::
  92. >>> split_first('foo/bar?baz', '?/=')
  93. ('foo', 'bar?baz', '/')
  94. >>> split_first('foo/bar?baz', '123')
  95. ('foo/bar?baz', '', None)
  96. Scales linearly with number of delims. Not ideal for large number of delims.
  97. """
  98. min_idx = None
  99. min_delim = None
  100. for d in delims:
  101. idx = s.find(d)
  102. if idx < 0:
  103. continue
  104. if min_idx is None or idx < min_idx:
  105. min_idx = idx
  106. min_delim = d
  107. if min_idx is None or min_idx < 0:
  108. return s, '', None
  109. return s[:min_idx], s[min_idx + 1:], min_delim
  110. def _encode_invalid_chars(component, allowed_chars, encoding='utf-8'):
  111. """Percent-encodes a URI component without reapplying
  112. onto an already percent-encoded component. Based on
  113. rfc3986.normalizers.encode_component()
  114. """
  115. if component is None:
  116. return component
  117. # Try to see if the component we're encoding is already percent-encoded
  118. # so we can skip all '%' characters but still encode all others.
  119. percent_encodings = len(normalizers.PERCENT_MATCHER.findall(
  120. compat.to_str(component, encoding)))
  121. uri_bytes = component.encode('utf-8', 'surrogatepass')
  122. is_percent_encoded = percent_encodings == uri_bytes.count(b'%')
  123. encoded_component = bytearray()
  124. for i in range(0, len(uri_bytes)):
  125. # Will return a single character bytestring on both Python 2 & 3
  126. byte = uri_bytes[i:i+1]
  127. byte_ord = ord(byte)
  128. if ((is_percent_encoded and byte == b'%')
  129. or (byte_ord < 128 and byte.decode() in allowed_chars)):
  130. encoded_component.extend(byte)
  131. continue
  132. encoded_component.extend('%{0:02x}'.format(byte_ord).encode().upper())
  133. return encoded_component.decode(encoding)
  134. def parse_url(url):
  135. """
  136. Given a url, return a parsed :class:`.Url` namedtuple. Best-effort is
  137. performed to parse incomplete urls. Fields not provided will be None.
  138. This parser is RFC 3986 compliant.
  139. :param str url: URL to parse into a :class:`.Url` namedtuple.
  140. Partly backwards-compatible with :mod:`urlparse`.
  141. Example::
  142. >>> parse_url('http://google.com/mail/')
  143. Url(scheme='http', host='google.com', port=None, path='/mail/', ...)
  144. >>> parse_url('google.com:80')
  145. Url(scheme=None, host='google.com', port=80, path=None, ...)
  146. >>> parse_url('/foo?bar')
  147. Url(scheme=None, host=None, port=None, path='/foo', query='bar', ...)
  148. """
  149. if not url:
  150. # Empty
  151. return Url()
  152. is_string = not isinstance(url, six.binary_type)
  153. # RFC 3986 doesn't like URLs that have a host but don't start
  154. # with a scheme and we support URLs like that so we need to
  155. # detect that problem and add an empty scheme indication.
  156. # We don't get hurt on path-only URLs here as it's stripped
  157. # off and given an empty scheme anyways.
  158. if not SCHEME_REGEX.search(url):
  159. url = "//" + url
  160. def idna_encode(name):
  161. if name and any([ord(x) > 128 for x in name]):
  162. try:
  163. import idna
  164. except ImportError:
  165. raise LocationParseError("Unable to parse URL without the 'idna' module")
  166. try:
  167. return idna.encode(name.lower(), strict=True, std3_rules=True)
  168. except idna.IDNAError:
  169. raise LocationParseError(u"Name '%s' is not a valid IDNA label" % name)
  170. return name
  171. try:
  172. split_iri = misc.IRI_MATCHER.match(compat.to_str(url)).groupdict()
  173. iri_ref = rfc3986.IRIReference(
  174. split_iri['scheme'], split_iri['authority'],
  175. _encode_invalid_chars(split_iri['path'], PATH_CHARS),
  176. _encode_invalid_chars(split_iri['query'], QUERY_CHARS),
  177. _encode_invalid_chars(split_iri['fragment'], FRAGMENT_CHARS)
  178. )
  179. has_authority = iri_ref.authority is not None
  180. uri_ref = iri_ref.encode(idna_encoder=idna_encode)
  181. except (ValueError, RFC3986Exception):
  182. return six.raise_from(LocationParseError(url), None)
  183. # rfc3986 strips the authority if it's invalid
  184. if has_authority and uri_ref.authority is None:
  185. raise LocationParseError(url)
  186. # Only normalize schemes we understand to not break http+unix
  187. # or other schemes that don't follow RFC 3986.
  188. if uri_ref.scheme is None or uri_ref.scheme.lower() in NORMALIZABLE_SCHEMES:
  189. uri_ref = uri_ref.normalize()
  190. # Validate all URIReference components and ensure that all
  191. # components that were set before are still set after
  192. # normalization has completed.
  193. validator = Validator()
  194. try:
  195. validator.check_validity_of(
  196. *validator.COMPONENT_NAMES
  197. ).validate(uri_ref)
  198. except ValidationError:
  199. return six.raise_from(LocationParseError(url), None)
  200. # For the sake of backwards compatibility we put empty
  201. # string values for path if there are any defined values
  202. # beyond the path in the URL.
  203. # TODO: Remove this when we break backwards compatibility.
  204. path = uri_ref.path
  205. if not path:
  206. if (uri_ref.query is not None
  207. or uri_ref.fragment is not None):
  208. path = ""
  209. else:
  210. path = None
  211. # Ensure that each part of the URL is a `str` for
  212. # backwards compatibility.
  213. def to_input_type(x):
  214. if x is None:
  215. return None
  216. elif not is_string and not isinstance(x, six.binary_type):
  217. return x.encode('utf-8')
  218. return x
  219. return Url(
  220. scheme=to_input_type(uri_ref.scheme),
  221. auth=to_input_type(uri_ref.userinfo),
  222. host=to_input_type(uri_ref.host),
  223. port=int(uri_ref.port) if uri_ref.port is not None else None,
  224. path=to_input_type(path),
  225. query=to_input_type(uri_ref.query),
  226. fragment=to_input_type(uri_ref.fragment)
  227. )
  228. def get_host(url):
  229. """
  230. Deprecated. Use :func:`parse_url` instead.
  231. """
  232. p = parse_url(url)
  233. return p.scheme or 'http', p.hostname, p.port