Skip to content

gh-76960: Fix urljoin() and urldefrag() for URIs with empty components #123273

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Aug 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 73 additions & 14 deletions Lib/test/test_urlparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,18 +349,19 @@ def _encode(t):
split = (scheme,) + split
self.checkRoundtrips(url, parsed, split)

def checkJoin(self, base, relurl, expected):
def checkJoin(self, base, relurl, expected, *, relroundtrip=True):
with self.subTest(base=base, relurl=relurl):
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
baseb = base.encode('ascii')
relurlb = relurl.encode('ascii')
expectedb = expected.encode('ascii')
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)

relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)
if relroundtrip:
relurl = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurl))
self.assertEqual(urllib.parse.urljoin(base, relurl), expected)
relurlb = urllib.parse.urlunsplit(urllib.parse.urlsplit(relurlb))
self.assertEqual(urllib.parse.urljoin(baseb, relurlb), expectedb)

def test_unparse_parse(self):
str_cases = ['Python', './Python','x-newscheme://foo.com/stuff','x://y','x:/y','x:/','/',]
Expand Down Expand Up @@ -526,8 +527,6 @@ def test_RFC3986(self):

def test_urljoins(self):
self.checkJoin(SIMPLE_BASE, 'g:h','g:h')
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d')
Comment on lines -529 to -530
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated below.

self.checkJoin(SIMPLE_BASE, 'g','http://a/b/c/g')
self.checkJoin(SIMPLE_BASE, './g','http://a/b/c/g')
self.checkJoin(SIMPLE_BASE, 'g/','http://a/b/c/g/')
Expand All @@ -548,8 +547,6 @@ def test_urljoins(self):
self.checkJoin(SIMPLE_BASE, 'g/./h','http://a/b/c/g/h')
self.checkJoin(SIMPLE_BASE, 'g/../h','http://a/b/c/h')
self.checkJoin(SIMPLE_BASE, 'http:g','http://a/b/c/g')
self.checkJoin(SIMPLE_BASE, 'http:','http://a/b/c/d')
self.checkJoin(SIMPLE_BASE, 'http:?y','http://a/b/c/d?y')
Comment on lines -551 to -552
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Duplicated in new tests below.

self.checkJoin(SIMPLE_BASE, 'http:g?y','http://a/b/c/g?y')
self.checkJoin(SIMPLE_BASE, 'http:g?y/./x','http://a/b/c/g?y/./x')
self.checkJoin('http:///', '..','http:///')
Expand Down Expand Up @@ -579,6 +576,53 @@ def test_urljoins(self):
# issue 23703: don't duplicate filename
self.checkJoin('a', 'b', 'b')

# Test with empty (but defined) components.
self.checkJoin(RFC1808_BASE, '', 'http://a/b/c/d;p?q#f')
self.checkJoin(RFC1808_BASE, '#', 'http://a/b/c/d;p?q#', relroundtrip=False)
self.checkJoin(RFC1808_BASE, '#z', 'http://a/b/c/d;p?q#z')
self.checkJoin(RFC1808_BASE, '?', 'http://a/b/c/d;p?', relroundtrip=False)
self.checkJoin(RFC1808_BASE, '?#z', 'http://a/b/c/d;p?#z', relroundtrip=False)
self.checkJoin(RFC1808_BASE, '?y', 'http://a/b/c/d;p?y')
self.checkJoin(RFC1808_BASE, ';', 'http://a/b/c/;')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Making a note, using parameter ; removes the final portion of the path in the base url /d in this case.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And this as per the rfc3986 requirement as given in the 5.4. Reference Resolution Examples and these tests handle them properly for the empty ? cases correctly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From RFC3986's point of view, ; is not a delimiter, it is a normal character. So /b/c/d;p is the old path, d;p is its last part, and it will be replaced with relative path ;.

Other example:

>>> urljoin('http:/a/b/c/d;?#f', '#z')
'http:///a/b/c/d#z'

Currently it removes ; and ? which correspond to empty params and query. But for RFC3986 they should be preserved. It is not only a matter of representation of empty component, params is not a thing in RFC3986. Removing trailing ; from path /b/c/d; is a bug.

self.checkJoin(RFC1808_BASE, ';?y', 'http://a/b/c/;?y')
self.checkJoin(RFC1808_BASE, ';#z', 'http://a/b/c/;#z')
self.checkJoin(RFC1808_BASE, ';x', 'http://a/b/c/;x')
self.checkJoin(RFC1808_BASE, '/w', 'http://a/w')
self.checkJoin(RFC1808_BASE, '//', 'http://a/b/c/d;p?q#f')
self.checkJoin(RFC1808_BASE, '//#z', 'http://a/b/c/d;p?q#z')
self.checkJoin(RFC1808_BASE, '//?y', 'http://a/b/c/d;p?y')
self.checkJoin(RFC1808_BASE, '//;x', 'http://;x')
self.checkJoin(RFC1808_BASE, '///w', 'http://a/w')
self.checkJoin(RFC1808_BASE, '//v', 'http://v')
# For backward compatibility with RFC1630, the scheme name is allowed
# to be present in a relative reference if it is the same as the base
# URI scheme.
self.checkJoin(RFC1808_BASE, 'http:', 'http://a/b/c/d;p?q#f')
self.checkJoin(RFC1808_BASE, 'http:#', 'http://a/b/c/d;p?q#', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'http:#z', 'http://a/b/c/d;p?q#z')
self.checkJoin(RFC1808_BASE, 'http:?', 'http://a/b/c/d;p?', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'http:?#z', 'http://a/b/c/d;p?#z', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'http:?y', 'http://a/b/c/d;p?y')
self.checkJoin(RFC1808_BASE, 'http:;', 'http://a/b/c/;')
self.checkJoin(RFC1808_BASE, 'http:;?y', 'http://a/b/c/;?y')
self.checkJoin(RFC1808_BASE, 'http:;#z', 'http://a/b/c/;#z')
self.checkJoin(RFC1808_BASE, 'http:;x', 'http://a/b/c/;x')
self.checkJoin(RFC1808_BASE, 'http:/w', 'http://a/w')
self.checkJoin(RFC1808_BASE, 'http://', 'http://a/b/c/d;p?q#f')
self.checkJoin(RFC1808_BASE, 'http://#z', 'http://a/b/c/d;p?q#z')
self.checkJoin(RFC1808_BASE, 'http://?y', 'http://a/b/c/d;p?y')
self.checkJoin(RFC1808_BASE, 'http://;x', 'http://;x')
self.checkJoin(RFC1808_BASE, 'http:///w', 'http://a/w')
self.checkJoin(RFC1808_BASE, 'http://v', 'http://v')
# Different scheme is not ignored.
self.checkJoin(RFC1808_BASE, 'https:', 'https:', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'https:#', 'https:#', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'https:#z', 'https:#z', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'https:?', 'https:?', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'https:?y', 'https:?y', relroundtrip=False)
self.checkJoin(RFC1808_BASE, 'https:;', 'https:;')
self.checkJoin(RFC1808_BASE, 'https:;x', 'https:;x')

def test_RFC2732(self):
str_cases = [
('http://Test.python.org:5432/foo/', 'test.python.org', 5432),
Expand Down Expand Up @@ -641,16 +685,31 @@ def test_urldefrag(self):
('http://python.org/p?q', 'http://python.org/p?q', ''),
(RFC1808_BASE, 'http://a/b/c/d;p?q', 'f'),
(RFC2396_BASE, 'http://a/b/c/d;p?q', ''),
('http://a/b/c;p?q#f', 'http://a/b/c;p?q', 'f'),
('http://a/b/c;p?q#', 'http://a/b/c;p?q', ''),
('http://a/b/c;p?q', 'http://a/b/c;p?q', ''),
('http://a/b/c;p?#f', 'http://a/b/c;p?', 'f'),
('http://a/b/c;p#f', 'http://a/b/c;p', 'f'),
('http://a/b/c;?q#f', 'http://a/b/c;?q', 'f'),
('http://a/b/c?q#f', 'http://a/b/c?q', 'f'),
('http:///b/c;p?q#f', 'http:///b/c;p?q', 'f'),
('http:b/c;p?q#f', 'http:b/c;p?q', 'f'),
('http:;?q#f', 'http:;?q', 'f'),
('http:?q#f', 'http:?q', 'f'),
('//a/b/c;p?q#f', '//a/b/c;p?q', 'f'),
('://a/b/c;p?q#f', '://a/b/c;p?q', 'f'),
]
def _encode(t):
return type(t)(x.encode('ascii') for x in t)
bytes_cases = [_encode(x) for x in str_cases]
for url, defrag, frag in str_cases + bytes_cases:
result = urllib.parse.urldefrag(url)
self.assertEqual(result.geturl(), url)
self.assertEqual(result, (defrag, frag))
self.assertEqual(result.url, defrag)
self.assertEqual(result.fragment, frag)
with self.subTest(url):
result = urllib.parse.urldefrag(url)
hash = '#' if isinstance(url, str) else b'#'
self.assertEqual(result.geturl(), url.rstrip(hash))
self.assertEqual(result, (defrag, frag))
self.assertEqual(result.url, defrag)
self.assertEqual(result.fragment, frag)

def test_urlsplit_scoped_IPv6(self):
p = urllib.parse.urlsplit('http://[FE80::822a:a8ff:fe49:470c%tESt]:1234')
Expand Down
100 changes: 62 additions & 38 deletions Lib/urllib/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -392,20 +392,23 @@ def urlparse(url, scheme='', allow_fragments=True):
Note that % escapes are not expanded.
"""
url, scheme, _coerce_result = _coerce_args(url, scheme)
splitresult = urlsplit(url, scheme, allow_fragments)
scheme, netloc, url, query, fragment = splitresult
if scheme in uses_params and ';' in url:
url, params = _splitparams(url)
else:
params = ''
result = ParseResult(scheme, netloc, url, params, query, fragment)
scheme, netloc, url, params, query, fragment = _urlparse(url, scheme, allow_fragments)
result = ParseResult(scheme or '', netloc or '', url, params or '', query or '', fragment or '')
return _coerce_result(result)

def _splitparams(url):
def _urlparse(url, scheme=None, allow_fragments=True):
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
if (scheme or '') in uses_params and ';' in url:
url, params = _splitparams(url, allow_none=True)
else:
params = None
return (scheme, netloc, url, params, query, fragment)

def _splitparams(url, allow_none=False):
if '/' in url:
i = url.find(';', url.rfind('/'))
if i < 0:
return url, ''
return url, None if allow_none else ''
else:
i = url.find(';')
return url[:i], url[i+1:]
Expand Down Expand Up @@ -472,17 +475,23 @@ def urlsplit(url, scheme='', allow_fragments=True):
"""

url, scheme, _coerce_result = _coerce_args(url, scheme)
scheme, netloc, url, query, fragment = _urlsplit(url, scheme, allow_fragments)
v = SplitResult(scheme or '', netloc or '', url, query or '', fragment or '')
return _coerce_result(v)

def _urlsplit(url, scheme=None, allow_fragments=True):
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This refactoring is also a preparation for #67041. These private functions distinguish undefined components (None) from empty components (empty string), but public functions remove this difference. With new options added for #67041 this will be optional.

# Only lstrip url as some applications rely on preserving trailing space.
# (https://url.spec.whatwg.org/#concept-basic-url-parser would strip both)
url = url.lstrip(_WHATWG_C0_CONTROL_OR_SPACE)
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)

for b in _UNSAFE_URL_BYTES_TO_REMOVE:
url = url.replace(b, "")
scheme = scheme.replace(b, "")
if scheme is not None:
scheme = scheme.strip(_WHATWG_C0_CONTROL_OR_SPACE)
for b in _UNSAFE_URL_BYTES_TO_REMOVE:
scheme = scheme.replace(b, "")

allow_fragments = bool(allow_fragments)
netloc = query = fragment = ''
netloc = query = fragment = None
i = url.find(':')
if i > 0 and url[0].isascii() and url[0].isalpha():
for c in url[:i]:
Expand All @@ -503,8 +512,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
if '?' in url:
url, query = url.split('?', 1)
_checknetloc(netloc)
v = SplitResult(scheme, netloc, url, query, fragment)
return _coerce_result(v)
return (scheme, netloc, url, query, fragment)

def urlunparse(components):
"""Put a parsed URL back together again. This may result in a
Expand All @@ -513,9 +521,15 @@ def urlunparse(components):
(the draft states that these are equivalent)."""
scheme, netloc, url, params, query, fragment, _coerce_result = (
_coerce_args(*components))
if not netloc:
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
netloc = ''
else:
netloc = None
if params:
url = "%s;%s" % (url, params)
return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))
return _coerce_result(_urlunsplit(scheme or None, netloc, url,
query or None, fragment or None))

def urlunsplit(components):
"""Combine the elements of a tuple as returned by urlsplit() into a
Expand All @@ -525,20 +539,27 @@ def urlunsplit(components):
empty query; the RFC states that these are equivalent)."""
scheme, netloc, url, query, fragment, _coerce_result = (
_coerce_args(*components))
if netloc:
if not netloc:
if scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
netloc = ''
else:
netloc = None
return _coerce_result(_urlunsplit(scheme or None, netloc, url,
query or None, fragment or None))

def _urlunsplit(scheme, netloc, url, query, fragment):
if netloc is not None:
if url and url[:1] != '/': url = '/' + url
url = '//' + netloc + url
elif url[:2] == '//':
url = '//' + url
elif scheme and scheme in uses_netloc and (not url or url[:1] == '/'):
url = '//' + url
if scheme:
url = scheme + ':' + url
if query:
if query is not None:
url = url + '?' + query
if fragment:
if fragment is not None:
url = url + '#' + fragment
return _coerce_result(url)
return url

def urljoin(base, url, allow_fragments=True):
"""Join a base URL and a possibly relative URL to form an absolute
Expand All @@ -549,26 +570,29 @@ def urljoin(base, url, allow_fragments=True):
return base

base, url, _coerce_result = _coerce_args(base, url)
bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
urlparse(base, '', allow_fragments)
scheme, netloc, path, params, query, fragment = \
urlparse(url, bscheme, allow_fragments)
bscheme, bnetloc, bpath, bquery, bfragment = \
_urlsplit(base, None, allow_fragments)
scheme, netloc, path, query, fragment = \
_urlsplit(url, None, allow_fragments)

if scheme is None:
scheme = bscheme
if scheme != bscheme or scheme not in uses_relative:
return _coerce_result(url)
if scheme in uses_netloc:
if netloc:
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

According to RFC 3986, this should be netlog is not None (undefined). But this gives results too different from Ruby and Go (they may be wrong) and from the current Python code. I leave this for a separate issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Being consistent with Ruby, Go, and Java (and usually with browser/curl) somes seems to be more beneficial, and going with that is expected by the programmer.

return _coerce_result(urlunparse((scheme, netloc, path,
params, query, fragment)))
return _coerce_result(_urlunsplit(scheme, netloc, path,
query, fragment))
netloc = bnetloc

if not path and not params:
if not path:
path = bpath
params = bparams
if not query:
if query is None:
query = bquery
return _coerce_result(urlunparse((scheme, netloc, path,
params, query, fragment)))
if fragment is None:
fragment = bfragment
Comment on lines +592 to +593
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This contradicts RFC 3986 according to which the target fragment is always set to the relative reference fragment. But this means that an empty string (which has no defined fragment) should remove fragment. I think this is an error in RFC 3986.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But this means that an empty string (which has no defined fragment) should remove fragment. I think this is an error in RFC 3986.

Just to be explicit, this means we are keeping the existing behavior when a relative fragment is given an empty string (or None). Is that right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Practically, this only affects 4 cases: an empty string, '//'. and URIs like 'http:' and http:// where the scheme is the same as in the base URI.

urljoin('http:/a/b/c/d;p?q#f', '') currently returns the base URI 'http:/a/b/c/d;p?q#f'. The same in Ruby and Go. According to the RFC 3986 pseudocode it should drop fragment. According to previous RFCs it should return the base URI. I think it is important to keep the current behavior in this case, so there should be some error in RFC 3986.

urljoin('http:/a/b/c/d;p?q#f', '//') currently returns the base URI without fragment 'http:/a/b/c/d;p?q', which is consistent with RFC 3986 for non-strict parser. Ruby and Go do not distinguish between '//' and '', so they preserve fragment. The current PR preserves it too.

urljoin('http:/a/b/c/d;p?q#f', 'http:') currently returns the base URI without fragment 'http:/a/b/c/d;p?q', which is consistent with RFC 3986 for non-strict parser. But Ruby and Go return 'http: which is consistent with RFC 3986 for strict parser. With this PR it will return the base URI unchanged, this is a change from the current behavior and is not consistent with RFC 3986 unless we suppose that RFC 3986 was wrong in this place.

But maybe RFC 3986 is wrong in other place. Previous RFCs contained special statement for empty relative URI. If it is missed in RFC 3986 by mistake, then we can only add special case for empty relative URI and preserve the current behavior for urljoin('http:/a/b/c/d;p?q#f', 'http:') which is consistent with the pseudocode in RFC 3986.

We can only add a special case for empty relative URI (it is already here for performance reasons), with possible addition of '//'. Then the behavior for 'http:' and 'http://' will be consistent with 3986 and match the current behavior (but Ruby and Go go other way).

return _coerce_result(_urlunsplit(scheme, netloc, path,
query, fragment))

base_parts = bpath.split('/')
if base_parts[-1] != '':
Expand Down Expand Up @@ -605,8 +629,8 @@ def urljoin(base, url, allow_fragments=True):
# then we need to append the trailing '/'
resolved_path.append('')

return _coerce_result(urlunparse((scheme, netloc, '/'.join(
resolved_path) or '/', params, query, fragment)))
return _coerce_result(_urlunsplit(scheme, netloc, '/'.join(
resolved_path) or '/', query, fragment))


def urldefrag(url):
Expand All @@ -618,12 +642,12 @@ def urldefrag(url):
"""
url, _coerce_result = _coerce_args(url)
if '#' in url:
s, n, p, a, q, frag = urlparse(url)
defrag = urlunparse((s, n, p, a, q, ''))
s, n, p, q, frag = _urlsplit(url)
defrag = _urlunsplit(s, n, p, q, None)
else:
frag = ''
defrag = url
return _coerce_result(DefragResult(defrag, frag))
return _coerce_result(DefragResult(defrag, frag or ''))

_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = None
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
Fix :func:`urllib.parse.urljoin` and :func:`urllib.parse.urldefrag` for URIs
containing empty components. For example, :func:`!urljoin()` with relative
reference "?" now sets empty query and removes fragment.
Preserve empty components (authority, params, query, fragment) in :func:`!urljoin`.
Preserve empty components (authority, params, query) in :func:`!urldefrag`.
Loading