From fb92f4208df1dc2f7bf3d1a9a11f7057f5e54e74 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 21:39:08 +0000 Subject: [PATCH 01/11] GH-125866: RFC8089 file URIs in `urllib.request` Adjust `urllib.request.pathname2url` and `url2pathname()` to generate and accept file URIs as described in RFC8089. `pathname2url()` gains a new *include_scheme* argument, which defaults to false. When set to true, the returned URL includes a `file:` prefix. `url2pathname()` now automatically removes a `file:` prefix if present. On Windows, `pathname2url()` now generates URIs that begin with two slashes rather than four when given a UNC path. On other platforms, `pathname2url()` now generates URIs that begin with three slashes rather than one when given an absolute path. `url2pathname()` now performs the opposite transformation, so `file:///etc/hosts` becomes `/etc/hosts`. Furthermore, `url2pathname()` now ignores local hosts (like "localhost" or any alias) and raises `URLError` for non-local hosts. --- Doc/library/urllib.request.rst | 31 ++++++--- Doc/whatsnew/3.14.rst | 22 +++++++ Lib/nturl2path.py | 4 +- Lib/test/test_nturl2path.py | 111 +++++++++++++++++++++++++++++++++ Lib/test/test_urllib.py | 14 ++--- Lib/urllib/request.py | 90 +++++++++++++++----------- 6 files changed, 217 insertions(+), 55 deletions(-) create mode 100644 Lib/test/test_nturl2path.py diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index ce82552a3ae4be..26d7af5330537b 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -147,18 +147,33 @@ The :mod:`urllib.request` module defines the following functions: attribute to modify its position in the handlers list. -.. function:: pathname2url(path) +.. function:: pathname2url(path, include_scheme=False) - Convert the pathname *path* from the local syntax for a path to the form used in - the path component of a URL. This does not produce a complete URL. 
The return - value will already be quoted using the :func:`~urllib.parse.quote` function. + Convert the local pathname *path* to a percent-encoded URL. If + *include_scheme* is false (the default), the URL is returned without a + ``file:`` scheme prefix; set this argument to true to generate a complete + URL. + .. versionchanged:: 3.14 + The *include_scheme* argument was added. -.. function:: url2pathname(path) + .. versionchanged:: 3.14 + Generates :rfc:`8089`-compliant file URLs for absolute paths. URLs for + UNC paths on Windows systems begin with two slashes (previously four.) + URLs for absolute paths on non-Windows systems begin with three slashes + (previously one.) + + +.. function:: url2pathname(url) + + Convert the percent-encoded *url* to a local pathname. + + .. versionchanged:: 3.14 + Supports :rfc:`8089`-compliant file URLs. Raises :exc:`URLError` if a + scheme other than ``file:`` is used. If the URL uses a non-local + authority, then on Windows a UNC path is returned, and on other + platforms a :exc:`URLError` exception is raised. - Convert the path component *path* from a percent-encoded URL to the local syntax for a - path. This does not accept a complete URL. This function uses - :func:`~urllib.parse.unquote` to decode *path*. .. function:: getproxies() diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index a6f595ccf08bf4..f4d75d3fa4133c 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -447,6 +447,28 @@ unittest (Contributed by Jacob Walls in :gh:`80958`.) +urllib.request +-------------- + +* Improve support for ``file:`` URIs in :mod:`urllib.request`: + + * :func:`~urllib.request.pathname2url` accepts an *include_scheme* + argument, which defaults to false. When set to true, a complete URL + with a ``file:`` prefix is returned. + * :func:`~urllib.request.url2pathname` discards a ``file:`` prefix if given. 
+ * On Windows, :func:`~urllib.request.pathname2url` generates URIs that + begin with two slashes (rather than four) when given a UNC path. + * On non-Windows platforms, :func:`~urllib.request.pathname2url` generates + URIs that begin with three slashes (rather than one) when given an + absolute path. :func:`~urllib.request.url2pathname` performs the opposite + transformation, so ``file:///etc/hosts` becomes ``/etc/hosts``. + * On non-Windows platforms, :func:`~urllib.request.url2pathname` raises + :exc:`urllib.error.URLError` if the URI includes a non-local authority, + like ``file://other-machine/etc/hosts``. + + (Contributed by Barney Gale in :gh:`125866`.) + + .. Add improved modules above alphabetically, not here at the end. Optimizations diff --git a/Lib/nturl2path.py b/Lib/nturl2path.py index 61852aff58912d..f7313062919fd0 100644 --- a/Lib/nturl2path.py +++ b/Lib/nturl2path.py @@ -1,9 +1,9 @@ """Convert a NT pathname to a file URL and vice versa. -This module only exists to provide OS-specific code +This module previously provided OS-specific code for urllib.requests, thus do not use directly. """ -# Testing is done through test_urllib. +# Testing is done through test_nturl2path. 
def url2pathname(url): """OS-specific conversion from a relative URL of the 'file' scheme diff --git a/Lib/test/test_nturl2path.py b/Lib/test/test_nturl2path.py new file mode 100644 index 00000000000000..1bd77efd99ef4f --- /dev/null +++ b/Lib/test/test_nturl2path.py @@ -0,0 +1,111 @@ +import nturl2path +import unittest +import urllib.parse + + +class nturl2path_Tests(unittest.TestCase): + """Test pathname2url() and url2pathname()""" + + def test_basic(self): + # Make sure simple tests pass + expected_path = "parts\\of\\a\\path" + expected_url = "parts/of/a/path" + result = nturl2path.pathname2url(expected_path) + self.assertEqual(expected_url, result, + "pathname2url() failed; %s != %s" % + (result, expected_url)) + result = nturl2path.url2pathname(expected_url) + self.assertEqual(expected_path, result, + "url2pathame() failed; %s != %s" % + (result, expected_path)) + + def test_quoting(self): + # Test automatic quoting and unquoting works for pathnam2url() and + # url2pathname() respectively + given = "needs\\quot=ing\\here" + expect = "needs/%s/here" % urllib.parse.quote("quot=ing") + result = nturl2path.pathname2url(given) + self.assertEqual(expect, result, + "pathname2url() failed; %s != %s" % + (expect, result)) + expect = given + result = nturl2path.url2pathname(result) + self.assertEqual(expect, result, + "url2pathname() failed; %s != %s" % + (expect, result)) + given = "make sure\\using_quote" + expect = "%s/using_quote" % urllib.parse.quote("make sure") + result = nturl2path.pathname2url(given) + self.assertEqual(expect, result, + "pathname2url() failed; %s != %s" % + (expect, result)) + given = "make+sure/using_unquote" + expect = "make+sure\\using_unquote" + result = nturl2path.url2pathname(given) + self.assertEqual(expect, result, + "url2pathname() failed; %s != %s" % + (expect, result)) + + def test_pathname2url(self): + # Test special prefixes are correctly handled in pathname2url() + fn = nturl2path.pathname2url + 
self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') + self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '/server/share/dir') + self.assertEqual(fn("C:"), '///C:') + self.assertEqual(fn("C:\\"), '///C:') + self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') + self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') + self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') + self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") + # Long drive letter + self.assertRaises(IOError, fn, "XX:\\") + # No drive letter + self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') + self.assertEqual(fn("\\\\folder\\test\\"), '////folder/test/') + self.assertEqual(fn("\\\\\\folder\\test\\"), '/////folder/test/') + self.assertEqual(fn('\\\\some\\share\\'), '////some/share/') + self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '////some/share/a/b.c') + self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '////some/share/a/b%25%23c%C3%A9') + # Round-tripping + urls = ['///C:', + '/////folder/test/', + '///C:/foo/bar/spam.foo'] + for url in urls: + self.assertEqual(fn(nturl2path.url2pathname(url)), url) + + def test_url2pathname_win(self): + fn = nturl2path.url2pathname + self.assertEqual(fn('/C:/'), 'C:\\') + self.assertEqual(fn("///C|"), 'C:') + self.assertEqual(fn("///C:"), 'C:') + self.assertEqual(fn('///C:/'), 'C:\\') + self.assertEqual(fn('/C|//'), 'C:\\') + self.assertEqual(fn('///C|/path'), 'C:\\path') + # No DOS drive + self.assertEqual(fn("///C/test/"), '\\\\\\C\\test\\') + self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') + # DOS drive paths + self.assertEqual(fn('C:/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') + # Non-ASCII drive letter + self.assertRaises(IOError, fn, "///\u00e8|/") + # UNC 
paths + self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') + self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') + self.assertEqual(fn('/////server/path/to/file'), '\\\\\\server\\path\\to\\file') + # Localhost paths + self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') + # Round-tripping + paths = ['C:', + r'\\\C\test\\', + r'C:\foo\bar\spam.foo'] + for path in paths: + self.assertEqual(fn(nturl2path.pathname2url(path)), path) + + +if __name__ == '__main__': + unittest.main() diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index dc852c8f02758c..d26e35a24632ae 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1551,9 +1551,9 @@ def test_pathname2url_win(self): 'test specific to POSIX pathnames') def test_pathname2url_posix(self): fn = urllib.request.pathname2url - self.assertEqual(fn('/'), '/') - self.assertEqual(fn('/a/b.c'), '/a/b.c') - self.assertEqual(fn('/a/b%#c'), '/a/b%25%23c') + self.assertEqual(fn('/'), '///') + self.assertEqual(fn('/a/b.c'), '///a/b.c') + self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c') @unittest.skipUnless(sys.platform == 'win32', 'test specific to Windows pathnames.') @@ -1595,10 +1595,10 @@ def test_url2pathname_win(self): def test_url2pathname_posix(self): fn = urllib.request.url2pathname self.assertEqual(fn('/foo/bar'), '/foo/bar') - self.assertEqual(fn('//foo/bar'), '//foo/bar') - self.assertEqual(fn('///foo/bar'), '///foo/bar') - self.assertEqual(fn('////foo/bar'), '////foo/bar') - self.assertEqual(fn('//localhost/foo/bar'), '//localhost/foo/bar') + self.assertRaises(urllib.error.URLError, fn, '//foo/bar') + self.assertEqual(fn('///foo/bar'), '/foo/bar') + self.assertEqual(fn('////foo/bar'), '//foo/bar') + self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar') class Utility_Tests(unittest.TestCase): """Testcase to test the various utility functions in 
the urllib.""" diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index bc35d8a80e5d03..cc01034171a61f 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1448,16 +1448,6 @@ def parse_http_list(s): return [part.strip() for part in res] class FileHandler(BaseHandler): - # Use local file or FTP depending on form of URL - def file_open(self, req): - url = req.selector - if url[:2] == '//' and url[2:3] != '/' and (req.host and - req.host != 'localhost'): - if not req.host in self.get_names(): - raise URLError("file:// scheme is supported only on localhost") - else: - return self.open_local_file(req) - # names for the localhost names = None def get_names(self): @@ -1474,8 +1464,7 @@ def get_names(self): def open_local_file(self, req): import email.utils import mimetypes - host = req.host - filename = req.selector + filename = req.full_url localfile = url2pathname(filename) try: stats = os.stat(localfile) @@ -1485,24 +1474,22 @@ def open_local_file(self, req): headers = email.message_from_string( 'Content-type: %s\nContent-length: %d\nLast-modified: %s\n' % (mtype or 'text/plain', size, modified)) - if host: - host, port = _splitport(host) - if not host or \ - (not port and _safe_gethostbyname(host) in self.get_names()): - if host: - origurl = 'file://' + host + filename - else: - origurl = 'file://' + filename - return addinfourl(open(localfile, 'rb'), headers, origurl) + return addinfourl(open(localfile, 'rb'), headers, filename) except OSError as exp: raise URLError(exp) - raise URLError('file not on local host') -def _safe_gethostbyname(host): + file_open = open_local_file + + +def _is_local_host(host): + if not host or host == 'localhost': + return True try: - return socket.gethostbyname(host) + name = socket.gethostbyname(host) except socket.gaierror: - return None + return False + return name in FileHandler().get_names() + class FTPHandler(BaseHandler): def ftp_open(self, req): @@ -1649,19 +1636,46 @@ def data_open(self, req): MAXFTPCACHE = 
10 # Trim the ftp cache beyond this size -# Helper for non-unix systems -if os.name == 'nt': - from nturl2path import url2pathname, pathname2url -else: - def url2pathname(pathname): - """OS-specific conversion from a relative URL of the 'file' scheme - to a file system path; not recommended for general use.""" - return unquote(pathname) - - def pathname2url(pathname): - """OS-specific conversion from a file system path to a relative URL - of the 'file' scheme; not recommended for general use.""" - return quote(pathname) +def pathname2url(path, include_scheme=False): + """Convert the local pathname *path* to a percent-encoded URL.""" + prefix = 'file:' if include_scheme else '' + if os.name == 'nt': + path = path.replace('\\', '/') + drive, root, tail = os.path.splitroot(path) + if drive: + if drive[1:2] == ':': + prefix += '///' + elif root: + prefix += '//' + tail = quote(tail) + return prefix + drive + root + tail + +def url2pathname(url): + """Convert the percent-encoded URL *url* to a local pathname.""" + scheme, authority, path = urlsplit(url, scheme='file')[:3] + if scheme != 'file': + raise URLError(f'URI does not use "file" scheme: {url!r}') + if os.name == 'nt': + path = unquote(path) + if authority and authority != 'localhost': + # e.g. file://server/share/path + path = f'//{authority}{path}' + elif path.startswith('///'): + # e.g. file://///server/share/path + path = path[1:] + else: + if path[0:1] == '/' and path[2:3] in ':|': + # e.g. file:////c:/path + path = path[1:] + if path[1:2] == '|': + # e.g. 
file:///c|path + path = path[:1] + ':' + path[2:] + path = path.replace('/', '\\') + else: + if not _is_local_host(authority): + raise URLError(f'file URI not on local host: {url!r}') + path = unquote(path) + return path ftpcache = {} From 8086522ff6f06f091820d5c45c4e50bee994fd0f Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 22:26:54 +0000 Subject: [PATCH 02/11] Windows test fixes #1 --- Lib/test/test_nturl2path.py | 2 +- Lib/test/test_urllib.py | 26 +++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Lib/test/test_nturl2path.py b/Lib/test/test_nturl2path.py index 1bd77efd99ef4f..28e550f4b357a5 100644 --- a/Lib/test/test_nturl2path.py +++ b/Lib/test/test_nturl2path.py @@ -73,7 +73,7 @@ def test_pathname2url(self): for url in urls: self.assertEqual(fn(nturl2path.url2pathname(url)), url) - def test_url2pathname_win(self): + def test_url2pathname(self): fn = nturl2path.url2pathname self.assertEqual(fn('/C:/'), 'C:\\') self.assertEqual(fn("///C|"), 'C:') diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index d26e35a24632ae..b8ef50e8f6c339 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1523,26 +1523,26 @@ def test_quoting(self): def test_pathname2url_win(self): # Test special prefixes are correctly handled in pathname2url() fn = urllib.request.pathname2url - self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') - self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '/server/share/dir') + self.assertEqual(fn('\\\\?\\C:\\dir'), '//?/C:/dir') + self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//?/unc/server/share/dir') self.assertEqual(fn("C:"), '///C:') - self.assertEqual(fn("C:\\"), '///C:') + self.assertEqual(fn("C:\\"), '///C:/') self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") # Long 
drive letter - self.assertRaises(IOError, fn, "XX:\\") + self.assertEqual(fn("XX:\\"), "file:XX:/") # No drive letter self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') - self.assertEqual(fn("\\\\folder\\test\\"), '////folder/test/') - self.assertEqual(fn("\\\\\\folder\\test\\"), '/////folder/test/') - self.assertEqual(fn('\\\\some\\share\\'), '////some/share/') - self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '////some/share/a/b.c') - self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '////some/share/a/b%25%23c%C3%A9') + self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') + self.assertEqual(fn("\\\\\\folder\\test\\"), '///folder/test/') + self.assertEqual(fn('\\\\some\\share\\'), '//some/share/') + self.assertEqual(fn('\\\\some\\share\\a\\b.c'), '//some/share/a/b.c') + self.assertEqual(fn('\\\\some\\share\\a\\b%#c\xe9'), '//some/share/a/b%25%23c%C3%A9') # Round-tripping urls = ['///C:', - '/////folder/test/', + '//folder/test/', '///C:/foo/bar/spam.foo'] for url in urls: self.assertEqual(fn(urllib.request.url2pathname(url)), url) @@ -1563,7 +1563,7 @@ def test_url2pathname_win(self): self.assertEqual(fn("///C|"), 'C:') self.assertEqual(fn("///C:"), 'C:') self.assertEqual(fn('///C:/'), 'C:\\') - self.assertEqual(fn('/C|//'), 'C:\\') + self.assertEqual(fn('/C|//'), 'C:\\\\') self.assertEqual(fn('///C|/path'), 'C:\\path') # No DOS drive self.assertEqual(fn("///C/test/"), '\\\\\\C\\test\\') @@ -1575,7 +1575,7 @@ def test_url2pathname_win(self): self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') # Non-ASCII drive letter - self.assertRaises(IOError, fn, "///\u00e8|/") + self.assertEqual(fn("///\u00e8|/"), "\\\u00e8|\\") # UNC paths self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') @@ -1585,7 +1585,7 @@ def test_url2pathname_win(self): 
self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') # Round-tripping paths = ['C:', - r'\\\C\test\\', + r'\C\test\\', r'C:\foo\bar\spam.foo'] for path in paths: self.assertEqual(fn(urllib.request.pathname2url(path)), path) From 653b9ae485d46149911ee188848a10d0d3b47a69 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 22:47:37 +0000 Subject: [PATCH 03/11] Windows test fixes #2 --- Lib/test/test_urllib.py | 4 ++-- Lib/test/test_urllib2.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index b8ef50e8f6c339..a212202bbb9bcc 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1532,7 +1532,7 @@ def test_pathname2url_win(self): self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") # Long drive letter - self.assertEqual(fn("XX:\\"), "file:XX:/") + self.assertEqual(fn("XX:\\"), "XX%3A/") # No drive letter self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') @@ -1566,7 +1566,7 @@ def test_url2pathname_win(self): self.assertEqual(fn('/C|//'), 'C:\\\\') self.assertEqual(fn('///C|/path'), 'C:\\path') # No DOS drive - self.assertEqual(fn("///C/test/"), '\\\\\\C\\test\\') + self.assertEqual(fn("///C/test/"), '\\C\\test\\') self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') # DOS drive paths self.assertEqual(fn('C:/path/to/file'), 'C:\\path\\to\\file') diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index b90ccc2f125b93..d1e33f3b6cbbba 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -43,10 +43,6 @@ def test___all__(self): context = {} exec('from urllib.%s import *' % module, context) del context['__builtins__'] - if module == 'request' and os.name == 'nt': - u, p = context.pop('url2pathname'), context.pop('pathname2url') - self.assertEqual(u.__module__, 'nturl2path') - 
self.assertEqual(p.__module__, 'nturl2path') for k, v in context.items(): self.assertEqual(v.__module__, 'urllib.%s' % module, "%r is exposed in 'urllib.%s' but defined in %r" % From 73c8bcf08ec578d8aa3a96aedcacb2d29d8f1a36 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 22:56:55 +0000 Subject: [PATCH 04/11] Windows test fixes #3, also fix docs lint --- Doc/whatsnew/3.14.rst | 2 +- Lib/test/test_urllib.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/whatsnew/3.14.rst b/Doc/whatsnew/3.14.rst index f4d75d3fa4133c..5bf0bdb22f3864 100644 --- a/Doc/whatsnew/3.14.rst +++ b/Doc/whatsnew/3.14.rst @@ -461,7 +461,7 @@ urllib.request * On non-Windows platforms, :func:`~urllib.request.pathname2url` generates URIs that begin with three slashes (rather than one) when given an absolute path. :func:`~urllib.request.url2pathname` performs the opposite - transformation, so ``file:///etc/hosts` becomes ``/etc/hosts``. + transformation, so ``file:///etc/hosts`` becomes ``/etc/hosts``. * On non-Windows platforms, :func:`~urllib.request.url2pathname` raises :exc:`urllib.error.URLError` if the URI includes a non-local authority, like ``file://other-machine/etc/hosts``. 
diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index a212202bbb9bcc..958fd06725a56b 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1534,7 +1534,7 @@ def test_pathname2url_win(self): # Long drive letter self.assertEqual(fn("XX:\\"), "XX%3A/") # No drive letter - self.assertEqual(fn("\\folder\\test\\"), '/folder/test/') + self.assertEqual(fn("\\folder\\test\\"), '///folder/test/') self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') self.assertEqual(fn("\\\\\\folder\\test\\"), '///folder/test/') self.assertEqual(fn('\\\\some\\share\\'), '//some/share/') From d4befde7c5185fd6e974b883df63c02bf372df38 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 23:14:29 +0000 Subject: [PATCH 05/11] Windows test fixes #4 --- Lib/test/test_urllib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 958fd06725a56b..16aa1751b09203 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -713,7 +713,7 @@ def constructLocalFileUrl(self, filePath): filePath.encode("utf-8") except UnicodeEncodeError: raise unittest.SkipTest("filePath is not encodable to utf8") - return "file://%s" % urllib.request.pathname2url(filePath) + return urllib.request.pathname2url(filePath, include_scheme=True) def createNewTempFile(self, data=b""): """Creates a new temporary file containing the specified data, @@ -1569,7 +1569,7 @@ def test_url2pathname_win(self): self.assertEqual(fn("///C/test/"), '\\C\\test\\') self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') # DOS drive paths - self.assertEqual(fn('C:/path/to/file'), 'C:\\path\\to\\file') + self.assertEqual(fn('file:C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') From 1ff26c0ca289f48119b07f58876303686dddac61 Mon Sep 17 
00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 23:32:59 +0000 Subject: [PATCH 06/11] Windows test fixes #5 --- Lib/test/test_urllib.py | 4 ++-- Lib/urllib/request.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 16aa1751b09203..5527374020b80e 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1575,11 +1575,11 @@ def test_url2pathname_win(self): self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') # Non-ASCII drive letter - self.assertEqual(fn("///\u00e8|/"), "\\\u00e8|\\") + self.assertEqual(fn("///\u00e8|/"), "u00e8:\\") # UNC paths self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') - self.assertEqual(fn('/////server/path/to/file'), '\\\\\\server\\path\\to\\file') + self.assertEqual(fn('/////server/path/to/file'), '\\\\server\\path\\to\\file') # Localhost paths self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index cc01034171a61f..28839a40b7833d 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1654,7 +1654,7 @@ def url2pathname(url): """Convert the percent-encoded URL *url* to a local pathname.""" scheme, authority, path = urlsplit(url, scheme='file')[:3] if scheme != 'file': - raise URLError(f'URI does not use "file" scheme: {url!r}') + raise URLError(f'URL {url!r} uses non-`file` scheme {scheme!r}') if os.name == 'nt': path = unquote(path) if authority and authority != 'localhost': @@ -1673,7 +1673,7 @@ def url2pathname(url): path = path.replace('/', '\\') else: if not _is_local_host(authority): - raise URLError(f'file URI not on local host: {url!r}') + raise URLError(f'URL {url!r} uses non-local 
authority {authority!r}') path = unquote(path) return path From 26db297e1a0b9ead5036fd85a056e7e96c29ba0e Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 29 Oct 2024 23:54:30 +0000 Subject: [PATCH 07/11] Windows test fixes #6 --- Lib/test/test_urllib.py | 2 +- Lib/test/test_urllib2.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 5527374020b80e..62e21fba3b5a16 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1575,7 +1575,7 @@ def test_url2pathname_win(self): self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') # Non-ASCII drive letter - self.assertEqual(fn("///\u00e8|/"), "u00e8:\\") + self.assertEqual(fn("///\u00e8|/"), "\u00e8:\\") # UNC paths self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index d1e33f3b6cbbba..e4fa835ba6d98f 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -823,14 +823,15 @@ def test_file(self): urls = [ "file://localhost%s" % urlpath, "file://%s" % urlpath, - "file://%s%s" % (socket.gethostbyname('localhost'), urlpath), ] - try: - localaddr = socket.gethostbyname(socket.gethostname()) - except socket.gaierror: - localaddr = '' - if localaddr: - urls.append("file://%s%s" % (localaddr, urlpath)) + if os.name == 'nt': + urls.append("file://%s%s" % (socket.gethostbyname('localhost'), urlpath)) + try: + localaddr = socket.gethostbyname(socket.gethostname()) + except socket.gaierror: + localaddr = '' + if localaddr: + urls.append("file://%s%s" % (localaddr, urlpath)) for url in urls: f = open(TESTFN, "wb") From 7584b6538e89831dace0f5e21ce732dab9c9df15 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 30 Oct 2024 00:14:51 +0000 Subject: [PATCH 08/11] 
Windows test fixes #7, also fix missing docs ref --- Doc/library/urllib.request.rst | 9 +++++---- Lib/test/test_urllib2.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/Doc/library/urllib.request.rst b/Doc/library/urllib.request.rst index 26d7af5330537b..52f011e01fb085 100644 --- a/Doc/library/urllib.request.rst +++ b/Doc/library/urllib.request.rst @@ -169,10 +169,11 @@ The :mod:`urllib.request` module defines the following functions: Convert the percent-encoded *url* to a local pathname. .. versionchanged:: 3.14 - Supports :rfc:`8089`-compliant file URLs. Raises :exc:`URLError` if a - scheme other than ``file:`` is used. If the URL uses a non-local - authority, then on Windows a UNC path is returned, and on other - platforms a :exc:`URLError` exception is raised. + Supports :rfc:`8089`-compliant file URLs. Raises + :exc:`~urllib.error.URLError` if a scheme other than ``file:`` is used. + If the URL uses a non-local authority, then on Windows a UNC path is + returned, and on other platforms a :exc:`~urllib.error.URLError` + exception is raised. .. function:: getproxies() diff --git a/Lib/test/test_urllib2.py b/Lib/test/test_urllib2.py index e4fa835ba6d98f..312478d7afc0a4 100644 --- a/Lib/test/test_urllib2.py +++ b/Lib/test/test_urllib2.py @@ -824,7 +824,7 @@ def test_file(self): "file://localhost%s" % urlpath, "file://%s" % urlpath, ] - if os.name == 'nt': + if os.name != 'nt': urls.append("file://%s%s" % (socket.gethostbyname('localhost'), urlpath)) try: localaddr = socket.gethostbyname(socket.gethostname()) From a70584c2ba5bb07c45fe33d5fbf01cf41a8dd3c0 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 30 Oct 2024 01:01:34 +0000 Subject: [PATCH 09/11] Add NEWS blurb. 
--- ...2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst diff --git a/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst b/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst new file mode 100644 index 00000000000000..cb0cdaa8fc0962 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2024-10-30-01-00-24.gh-issue-125866.hj0R4P.rst @@ -0,0 +1,15 @@ +Improve support for ``file:`` URIs in :mod:`urllib.request`: + +* :func:`~urllib.request.pathname2url` accepts an *include_scheme* + argument, which defaults to false. When set to true, a complete URL + with a ``file:`` prefix is returned. +* :func:`~urllib.request.url2pathname` discards a ``file:`` prefix if given. +* On Windows, :func:`~urllib.request.pathname2url` generates URIs that + begin with two slashes (rather than four) when given a UNC path. +* On non-Windows platforms, :func:`~urllib.request.pathname2url` generates + URIs that begin with three slashes (rather than one) when given an + absolute path. :func:`~urllib.request.url2pathname` performs the opposite + transformation, so ``file:///etc/hosts`` becomes ``/etc/hosts``. +* On non-Windows platforms, :func:`~urllib.request.url2pathname` raises + :exc:`urllib.error.URLError` if the URI includes a non-local authority, + like ``file://other-machine/etc/hosts``. 
From 985ef04710f7d251934b7ae95b887986aafa7d11 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 30 Oct 2024 01:24:05 +0000 Subject: [PATCH 10/11] Comments --- Lib/test/test_urllib.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 62e21fba3b5a16..50172e8aaae2f6 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1526,15 +1526,17 @@ def test_pathname2url_win(self): self.assertEqual(fn('\\\\?\\C:\\dir'), '//?/C:/dir') self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//?/unc/server/share/dir') self.assertEqual(fn("C:"), '///C:') + # Path root is meaningful and should be preserved. self.assertEqual(fn("C:\\"), '///C:/') self.assertEqual(fn('C:\\a\\b.c'), '///C:/a/b.c') self.assertEqual(fn('C:\\a\\b%#c'), '///C:/a/b%25%23c') self.assertEqual(fn('C:\\a\\b\xe9'), '///C:/a/b%C3%A9') self.assertEqual(fn('C:\\foo\\bar\\spam.foo'), "///C:/foo/bar/spam.foo") - # Long drive letter + # Long drive letter: treat as relative path, like ntpath.isabs()/splitroot() self.assertEqual(fn("XX:\\"), "XX%3A/") - # No drive letter + # No drive letter: use empty authority self.assertEqual(fn("\\folder\\test\\"), '///folder/test/') + # UNC paths: UNC server becomes URL authority self.assertEqual(fn("\\\\folder\\test\\"), '//folder/test/') self.assertEqual(fn("\\\\\\folder\\test\\"), '///folder/test/') self.assertEqual(fn('\\\\some\\share\\'), '//some/share/') @@ -1551,6 +1553,7 @@ def test_pathname2url_win(self): 'test specific to POSIX pathnames') def test_pathname2url_posix(self): fn = urllib.request.pathname2url + # Absolute paths: use zero-length authority. 
self.assertEqual(fn('/'), '///') self.assertEqual(fn('/a/b.c'), '///a/b.c') self.assertEqual(fn('/a/b%#c'), '///a/b%25%23c') @@ -1568,19 +1571,19 @@ def test_url2pathname_win(self): # No DOS drive self.assertEqual(fn("///C/test/"), '\\C\\test\\') self.assertEqual(fn("////C/test/"), '\\\\C\\test\\') - # DOS drive paths + # DOS drive paths: see RFC 8089 (D.2.) self.assertEqual(fn('file:C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('/C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('///C|/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn("///C|/foo/bar/spam.foo"), 'C:\\foo\\bar\\spam.foo') - # Non-ASCII drive letter + # Non-ASCII drive letter: treat as real DOS drive, like ntpath.isabs()/splitroot() self.assertEqual(fn("///\u00e8|/"), "\u00e8:\\") - # UNC paths + # UNC paths: see RFC 8089 (E.3.) self.assertEqual(fn('//server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('////server/path/to/file'), '\\\\server\\path\\to\\file') self.assertEqual(fn('/////server/path/to/file'), '\\\\server\\path\\to\\file') - # Localhost paths + # Localhost paths: see RFC 8089 (2.) 
self.assertEqual(fn('//localhost/C:/path/to/file'), 'C:\\path\\to\\file') self.assertEqual(fn('//localhost/C|/path/to/file'), 'C:\\path\\to\\file') # Round-tripping @@ -1595,7 +1598,9 @@ def test_url2pathname_win(self): def test_url2pathname_posix(self): fn = urllib.request.url2pathname self.assertEqual(fn('/foo/bar'), '/foo/bar') + # URI from a machine called 'foo': should raise URLError self.assertRaises(urllib.error.URLError, fn, '//foo/bar') + # URI with empty or local authority: discard authority section self.assertEqual(fn('///foo/bar'), '/foo/bar') self.assertEqual(fn('////foo/bar'), '//foo/bar') self.assertEqual(fn('//localhost/foo/bar'), '/foo/bar') From 4b99bfc522f88deecb9d35f56764d58872375a8c Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 30 Oct 2024 19:44:29 +0000 Subject: [PATCH 11/11] Improve handling of special UNC prefixes --- Lib/test/test_urllib.py | 4 ++-- Lib/urllib/request.py | 7 +++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/Lib/test/test_urllib.py b/Lib/test/test_urllib.py index 50172e8aaae2f6..0cb481f9310c32 100644 --- a/Lib/test/test_urllib.py +++ b/Lib/test/test_urllib.py @@ -1523,8 +1523,8 @@ def test_quoting(self): def test_pathname2url_win(self): # Test special prefixes are correctly handled in pathname2url() fn = urllib.request.pathname2url - self.assertEqual(fn('\\\\?\\C:\\dir'), '//?/C:/dir') - self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//?/unc/server/share/dir') + self.assertEqual(fn('\\\\?\\C:\\dir'), '///C:/dir') + self.assertEqual(fn('\\\\?\\unc\\server\\share\\dir'), '//server/share/dir') self.assertEqual(fn("C:"), '///C:') # Path root is meaningful and should be preserved. 
self.assertEqual(fn("C:\\"), '///C:/') diff --git a/Lib/urllib/request.py b/Lib/urllib/request.py index 28839a40b7833d..5c8bceaa4b4be7 100644 --- a/Lib/urllib/request.py +++ b/Lib/urllib/request.py @@ -1643,9 +1643,16 @@ def pathname2url(path, include_scheme=False): path = path.replace('\\', '/') drive, root, tail = os.path.splitroot(path) if drive: + # Handle special UNC prefixes + if drive[:4] == '//?/': + drive = drive[4:] + if drive[:4].upper() == 'UNC/': + drive = '//' + drive[4:] + # DOS drives are preceded by three slashes if drive[1:2] == ':': prefix += '///' elif root: + # Rooted paths are preceded by two slashes prefix += '//' tail = quote(tail) return prefix + drive + root + tail