-
-
Notifications
You must be signed in to change notification settings - Fork 32.2k
gh-76960: Fix urljoin() and urldefrag() for URIs with empty components #123273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -349,18 +349,19 @@ def _encode(t): | |
split = (scheme,) + split | ||
self.checkRoundtrips(url, parsed, split) | ||
|
||
def checkJoin(self, base, relurl, expected): | ||
def checkJoin(self, base, relurl, expected, *, relroundtrip=True): | ||
with self.subTest(base=base, relurl=relurl): | ||
self.assertEqual(urllib.parse.urljoin(base, relurl), expected) | ||
baseb = base.encode('ascii') | ||
relurlb = relurl.encode('ascii') | ||
expectedb = expected.encode('ascii') | ||
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) | ||
|
||
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) | ||
self.assertEqual(urllib.parse.urljoin(base, relurl), expected) | ||
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) | ||
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) | ||
if relroundtrip: | ||
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl)) | ||
self.assertEqual(urllib.parse.urljoin(base, relurl), expected) | ||
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb)) | ||
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb) | ||
|
||
def test_unparse_parse(self): | ||
str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',] | ||
|
@@ -526,8 +527,6 @@ def test_RFC3986(self): | |
|
||
def test_urljoins(self): | ||
self.checkJoin(SIMPLE_BASE, 'g:h','g:h') | ||
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') | ||
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d') | ||
self.checkJoin(SIMPLE_BASE, 'g','http://a/b/c/g') | ||
self.checkJoin(SIMPLE_BASE, './g','http://a/b/c/g') | ||
self.checkJoin(SIMPLE_BASE, 'g/','http://a/b/c/g/') | ||
|
@@ -548,8 +547,6 @@ def test_urljoins(self): | |
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h') | ||
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h') | ||
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g') | ||
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d') | ||
self.checkJoin(SIMPLE_BASE, 'http:?y','http://a/b/c/d?y') | ||
Comment on lines
-551
to
-552
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Duplicated in new tests below. |
||
self.checkJoin(SIMPLE_BASE, 'http:g?y','http://a/b/c/g?y') | ||
self.checkJoin(SIMPLE_BASE, 'http:g?y/./x','http://a/b/c/g?y/./x') | ||
self.checkJoin('http:///', '..','http:///') | ||
|
@@ -579,6 +576,53 @@ def test_urljoins(self): | |
# issue 23703: don't duplicate filename | ||
self.checkJoin('a', 'b', 'b') | ||
|
||
# Test with empty (but defined) components. | ||
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f') | ||
self.checkJoin(RFC1808_BASE, '#', 'http://a/b/c/d;p?q#', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, '#z', 'http://a/b/c/d;p?q#z') | ||
self.checkJoin(RFC1808_BASE, '?', 'http://a/b/c/d;p?', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, '?#z', 'http://a/b/c/d;p?#z', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, '?y', 'http://a/b/c/d;p?y') | ||
self.checkJoin(RFC1808_BASE, ';', 'http://a/b/c/;') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Making a note, using parameter There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. And this as per the rfc3986 requirement as given in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From RFC3986's point of view, Other example: >>> urljoin('http:/a/b/c/d;?#f', '#z')
'http:///a/b/c/d#z' Currently it removes |
||
self.checkJoin(RFC1808_BASE, ';?y', 'http://a/b/c/;?y') | ||
self.checkJoin(RFC1808_BASE, ';#z', 'http://a/b/c/;#z') | ||
self.checkJoin(RFC1808_BASE, ';x', 'http://a/b/c/;x') | ||
self.checkJoin(RFC1808_BASE, '/w', 'http://a/w') | ||
self.checkJoin(RFC1808_BASE, '//', 'http://a/b/c/d;p?q#f') | ||
self.checkJoin(RFC1808_BASE, '//#z', 'http://a/b/c/d;p?q#z') | ||
self.checkJoin(RFC1808_BASE, '//?y', 'http://a/b/c/d;p?y') | ||
self.checkJoin(RFC1808_BASE, '//;x', 'http://;x') | ||
self.checkJoin(RFC1808_BASE, '///w', 'http://a/w') | ||
self.checkJoin(RFC1808_BASE, '//v', 'http://v') | ||
# For backward compatibility with RFC1630, the scheme name is allowed | ||
# to be present in a relative reference if it is the same as the base | ||
# URI scheme. | ||
self.checkJoin(RFC1808_BASE, 'http:', 'http://a/b/c/d;p?q#f') | ||
self.checkJoin(RFC1808_BASE, 'http:#', 'http://a/b/c/d;p?q#', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'http:#z', 'http://a/b/c/d;p?q#z') | ||
self.checkJoin(RFC1808_BASE, 'http:?', 'http://a/b/c/d;p?', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'http:?#z', 'http://a/b/c/d;p?#z', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'http:?y', 'http://a/b/c/d;p?y') | ||
self.checkJoin(RFC1808_BASE, 'http:;', 'http://a/b/c/;') | ||
self.checkJoin(RFC1808_BASE, 'http:;?y', 'http://a/b/c/;?y') | ||
self.checkJoin(RFC1808_BASE, 'http:;#z', 'http://a/b/c/;#z') | ||
self.checkJoin(RFC1808_BASE, 'http:;x', 'http://a/b/c/;x') | ||
self.checkJoin(RFC1808_BASE, 'http:/w', 'http://a/w') | ||
self.checkJoin(RFC1808_BASE, 'http://', 'http://a/b/c/d;p?q#f') | ||
self.checkJoin(RFC1808_BASE, 'http://#z', 'http://a/b/c/d;p?q#z') | ||
self.checkJoin(RFC1808_BASE, 'http://?y', 'http://a/b/c/d;p?y') | ||
self.checkJoin(RFC1808_BASE, 'http://;x', 'http://;x') | ||
self.checkJoin(RFC1808_BASE, 'http:///w', 'http://a/w') | ||
self.checkJoin(RFC1808_BASE, 'http://v', 'http://v') | ||
# Different scheme is not ignored. | ||
self.checkJoin(RFC1808_BASE, 'https:', 'https:', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'https:#', 'https:#', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'https:#z', 'https:#z', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'https:?', 'https:?', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'https:?y', 'https:?y', relroundtrip=False) | ||
self.checkJoin(RFC1808_BASE, 'https:;', 'https:;') | ||
self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x') | ||
|
||
def test_RFC2732(self): | ||
str_cases = [ | ||
('http://Test.python.org:5432/foo/', 'test.python.org', 5432), | ||
|
@@ -641,16 +685,31 @@ def test_urldefrag(self): | |
('http://python.org/p?q', 'http://python.org/p?q', ''), | ||
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'), | ||
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''), | ||
('http://a/b/c;p?q#f', 'http://a/b/c;p?q', 'f'), | ||
('http://a/b/c;p?q#', 'http://a/b/c;p?q', ''), | ||
('http://a/b/c;p?q', 'http://a/b/c;p?q', ''), | ||
('http://a/b/c;p?#f', 'http://a/b/c;p?', 'f'), | ||
('http://a/b/c;p#f', 'http://a/b/c;p', 'f'), | ||
('http://a/b/c;?q#f', 'http://a/b/c;?q', 'f'), | ||
('http://a/b/c?q#f', 'http://a/b/c?q', 'f'), | ||
('http:///b/c;p?q#f', 'http:///b/c;p?q', 'f'), | ||
('http:b/c;p?q#f', 'http:b/c;p?q', 'f'), | ||
('http:;?q#f', 'http:;?q', 'f'), | ||
('http:?q#f', 'http:?q', 'f'), | ||
('//a/b/c;p?q#f', '//a/b/c;p?q', 'f'), | ||
('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'), | ||
] | ||
def _encode(t): | ||
return type(t)(x.encode('ascii') for x in t) | ||
bytes_cases = [_encode(x) for x in str_cases] | ||
for url, defrag, frag in str_cases + bytes_cases: | ||
result = urllib.parse.urldefrag(url) | ||
self.assertEqual(result.geturl(), url) | ||
self.assertEqual(result, (defrag, frag)) | ||
self.assertEqual(result.url, defrag) | ||
self.assertEqual(result.fragment, frag) | ||
with self.subTest(url): | ||
result = urllib.parse.urldefrag(url) | ||
hash = '#' if isinstance(url, str) else b'#' | ||
self.assertEqual(result.geturl(), url.rstrip(hash)) | ||
self.assertEqual(result, (defrag, frag)) | ||
self.assertEqual(result.url, defrag) | ||
self.assertEqual(result.fragment, frag) | ||
|
||
def test_urlsplit_scoped_IPv6(self): | ||
p = urllib.parse.urlsplit('http://[FE80::822a:a8ff:fe49:470c%tESt]:1234') | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True): | |
Note that % escapes are not expanded. | ||
""" | ||
url, scheme, _coerce_result = _coerce_args(url, scheme) | ||
splitresult = urlsplit(url, scheme, allow_fragments) | ||
scheme, netloc, url, query, fragment = splitresult | ||
if scheme in uses_params and ';' in url: | ||
url, params = _splitparams(url) | ||
else: | ||
params = '' | ||
result = ParseResult(scheme, netloc, url, params, query, fragment) | ||
scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments) | ||
result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '') | ||
return _coerce_result(result) | ||
|
||
def _splitparams(url): | ||
def _urlparse(url, scheme=None, allow_fragments=True): | ||
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) | ||
if (scheme or '') in uses_params and ';' in url: | ||
url, params = _splitparams(url, allow_none=True) | ||
else: | ||
params = None | ||
return (scheme, netloc, url, params, query, fragment) | ||
|
||
def _splitparams(url, allow_none=False): | ||
if '/' in url: | ||
i = url.find(';', url.rfind('/')) | ||
if i < 0: | ||
return url, '' | ||
return url, None if allow_none else '' | ||
else: | ||
i = url.find(';') | ||
return url[:i], url[i+1:] | ||
|
@@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True): | |
""" | ||
|
||
url, scheme, _coerce_result = _coerce_args(url, scheme) | ||
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments) | ||
v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '') | ||
return _coerce_result(v) | ||
|
||
def _urlsplit(url, scheme=None, allow_fragments=True): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
# Only lstrip url as some applications rely on preserving trailing space. | ||
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both) | ||
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE) | ||
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) | ||
|
||
for b in _UNSAFE_URL_BYTES_TO_REMOVE: | ||
url = url.replace(b, "") | ||
scheme = scheme.replace(b, "") | ||
if scheme is not None: | ||
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE) | ||
for b in _UNSAFE_URL_BYTES_TO_REMOVE: | ||
scheme = scheme.replace(b, "") | ||
|
||
allow_fragments = bool(allow_fragments) | ||
netloc = query = fragment = '' | ||
netloc = query = fragment = None | ||
i = url.find(':') | ||
if i > 0 and url[0].isascii() and url[0].isalpha(): | ||
for c in url[:i]: | ||
|
@@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True): | |
if '?' in url: | ||
url, query = url.split('?', 1) | ||
_checknetloc(netloc) | ||
v = SplitResult(scheme, netloc, url, query, fragment) | ||
return _coerce_result(v) | ||
return (scheme, netloc, url, query, fragment) | ||
|
||
def urlunparse(components): | ||
"""Put a parsed URL back together again. This may result in a | ||
|
@@ -513,9 +521,15 @@ def urlunparse(components): | |
(the draft states that these are equivalent).""" | ||
scheme, netloc, url, params, query, fragment, _coerce_result = ( | ||
_coerce_args(*components)) | ||
if not netloc: | ||
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): | ||
netloc = '' | ||
else: | ||
netloc = None | ||
if params: | ||
url = "%s;%s" % (url, params) | ||
return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment))) | ||
return _coerce_result(_urlunsplit(scheme or None, netloc, url, | ||
query or None, fragment or None)) | ||
|
||
def urlunsplit(components): | ||
"""Combine the elements of a tuple as returned by urlsplit() into a | ||
|
@@ -525,20 +539,27 @@ def urlunsplit(components): | |
empty query; the RFC states that these are equivalent).""" | ||
scheme, netloc, url, query, fragment, _coerce_result = ( | ||
_coerce_args(*components)) | ||
if netloc: | ||
if not netloc: | ||
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'): | ||
netloc = '' | ||
else: | ||
netloc = None | ||
return _coerce_result(_urlunsplit(scheme or None, netloc, url, | ||
query or None, fragment or None)) | ||
|
||
def _urlunsplit(scheme, netloc, url, query, fragment): | ||
if netloc is not None: | ||
if url and url[:1] != '/': url = '/' + url | ||
url = '//' + netloc + url | ||
elif url[:2] == '//': | ||
url = '//' + url | ||
elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'): | ||
url = '//' + url | ||
if scheme: | ||
url = scheme + ':' + url | ||
if query: | ||
if query is not None: | ||
url = url + '?' + query | ||
if fragment: | ||
if fragment is not None: | ||
url = url + '#' + fragment | ||
return _coerce_result(url) | ||
return url | ||
|
||
def urljoin(base, url, allow_fragments=True): | ||
"""Join a base URL and a possibly relative URL to form an absolute | ||
|
@@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True): | |
return base | ||
|
||
base, url, _coerce_result = _coerce_args(base, url) | ||
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \ | ||
urlparse(base, '', allow_fragments) | ||
scheme, netloc, path, params, query, fragment = \ | ||
urlparse(url, bscheme, allow_fragments) | ||
bscheme, bnetloc, bpath, bquery, bfragment = \ | ||
_urlsplit(base, None, allow_fragments) | ||
scheme, netloc, path, query, fragment = \ | ||
_urlsplit(url, None, allow_fragments) | ||
|
||
if scheme is None: | ||
scheme = bscheme | ||
if scheme != bscheme or scheme not in uses_relative: | ||
return _coerce_result(url) | ||
if scheme in uses_netloc: | ||
if netloc: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to RFC 3986, this should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Being consistent with Ruby, Go, and Java (and usually with browser/curl) somes seems to be more beneficial, and going with that is expected by the programmer. |
||
return _coerce_result(urlunparse((scheme, netloc, path, | ||
params, query, fragment))) | ||
return _coerce_result(_urlunsplit(scheme, netloc, path, | ||
query, fragment)) | ||
netloc = bnetloc | ||
|
||
if not path and not params: | ||
if not path: | ||
path = bpath | ||
params = bparams | ||
if not query: | ||
if query is None: | ||
query = bquery | ||
return _coerce_result(urlunparse((scheme, netloc, path, | ||
params, query, fragment))) | ||
if fragment is None: | ||
fragment = bfragment | ||
Comment on lines
+592
to
+593
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This contradicts RFC 3986 according to which the target fragment is always set to the relative reference fragment. But this means that an empty string (which has no defined fragment) should remove fragment. I think this is an error in RFC 3986. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Just to be explicit, this means we are keeping the existing behavior when a relative fragment is given an empty string (or None). Is that right? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Practically, this only affects 4 cases: an empty string,
But maybe RFC 3986 is wrong in other place. Previous RFCs contained special statement for empty relative URI. If it is missed in RFC 3986 by mistake, then we can only add special case for empty relative URI and preserve the current behavior for We can only add a special case for empty relative URI (it is already here for performance reasons), with possible addition of |
||
return _coerce_result(_urlunsplit(scheme, netloc, path, | ||
query, fragment)) | ||
|
||
base_parts = bpath.split('/') | ||
if base_parts[-1] != '': | ||
|
@@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True): | |
# then we need to append the trailing '/' | ||
resolved_path.append('') | ||
|
||
return _coerce_result(urlunparse((scheme, netloc, '/'.join( | ||
resolved_path) or '/', params, query, fragment))) | ||
return _coerce_result(_urlunsplit(scheme, netloc, '/'.join( | ||
resolved_path) or '/', query, fragment)) | ||
|
||
|
||
def urldefrag(url): | ||
|
@@ -618,12 +642,12 @@ def urldefrag(url): | |
""" | ||
url, _coerce_result = _coerce_args(url) | ||
if '#' in url: | ||
s, n, p, a, q, frag = urlparse(url) | ||
defrag = urlunparse((s, n, p, a, q, '')) | ||
s, n, p, q, frag = _urlsplit(url) | ||
defrag = _urlunsplit(s, n, p, q, None) | ||
else: | ||
frag = '' | ||
defrag = url | ||
return _coerce_result(DefragResult(defrag, frag)) | ||
return _coerce_result(DefragResult(defrag, frag or '')) | ||
|
||
_hexdig = '0123456789ABCDEFabcdef' | ||
_hextobyte = None | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Fix :func:`urllib.parse.urljoin` and :func:`urllib.parse.urldefrag` for URIs | ||
containing empty components. For example, :func:`!urljoin()` with relative | ||
reference "?" now sets empty query and removes fragment. | ||
Preserve empty components (authority, params, query, fragment) in :func:`!urljoin`. | ||
Preserve empty components (authority, params, query) in :func:`!urldefrag`. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Duplicated below.