From 0a908755ea5523c71aa81a0a7d5ca443835bfd19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Fri, 12 Mar 2021 16:42:22 +0100 Subject: [PATCH 1/2] Expose the separator parameter added upstream to parse_qs and parse_qsl --- tests/test_url.py | 15 ++++++++ w3lib/url.py | 87 ++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 85 insertions(+), 17 deletions(-) diff --git a/tests/test_url.py b/tests/test_url.py index 07695500..ccaa5f8d 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -271,6 +271,8 @@ def test_url_query_parameter(self): None) self.assertEqual(url_query_parameter("product.html?id=", "id", keep_blank_values=1), '') + self.assertEqual(url_query_parameter("product.html?id=200;foo=bar", "id", separator=';'), + '200') def test_url_query_parameter_2(self): """ @@ -343,6 +345,14 @@ def test_add_or_replace_parameter_fail(self): 'http://domain/test?arg1=v3&arg2=v2' ) + @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164") + def test_add_or_replace_parameter_semicolon(self): + url = 'http://domain/test?arg1=v1;arg2=v2;arg3=v3' + self.assertEqual(add_or_replace_parameter(url, 'arg4', 'v4', separator=';'), + 'http://domain/test?arg1=v1;arg2=v2;arg3=v3;arg4=v4') + self.assertEqual(add_or_replace_parameter(url, 'arg3', 'nv3', separator=';'), + 'http://domain/test?arg1=v1;arg2=v2;arg3=nv3') + def test_add_or_replace_parameters(self): url = 'http://domain/test' self.assertEqual(add_or_replace_parameters(url, {'arg': 'v'}), @@ -480,6 +490,11 @@ def test_typical_usage(self): self.assertEqual(canonicalize_url("http://www.example.com/do?&a=1"), "http://www.example.com/do?a=1") + @pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164") + def test_typical_usage_semicolon(self): + self.assertEqual(canonicalize_url("http://www.example.com/do?c=1;b=2;a=3", query_separator=';'), + "http://www.example.com/do?a=3;b=2;c=1") + def test_port_number(self): self.assertEqual(canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"), "http://www.example.com:8888/do?a=1&b=2&c=3") diff --git a/w3lib/url.py b/w3lib/url.py index bf12745d..07c20ea5 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -10,15 +10,46 @@ import warnings import string from collections import namedtuple +from inspect import getfullargspec + import six -from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit, - urldefrag, urlencode, urlparse, - quote, parse_qs, parse_qsl, - ParseResult, unquote, urlunparse) +from six.moves.urllib.parse import ( + urljoin, + urlsplit, + urlunsplit, + urldefrag, + urlencode, + urlparse, + quote, + parse_qs as _parse_qs, + parse_qsl as _parse_qsl, + ParseResult, + unquote, + urlunparse, +) from six.moves.urllib.request import pathname2url, url2pathname + from w3lib.util import to_bytes, to_native_str, to_unicode +_REMOVE_SEPARATOR = 'separator' not in getfullargspec(_parse_qs)[0] + + +def _handle_separator(func, *args, **kwargs): + if _REMOVE_SEPARATOR: + kwargs.pop('separator', None) + return func(*args, **kwargs) + + +def parse_qs(*args, **kwargs): + return _handle_separator(_parse_qs, *args, **kwargs) + + +def parse_qsl(*args, **kwargs): + return _handle_separator(_parse_qsl, *args, **kwargs) + + + # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): return (to_unicode(quote(error.object[error.start:error.end])), error.end) @@ -117,7 +148,14 @@ def is_url(text): return text.partition("://")[0] in ('file', 'http', 'https') -def url_query_parameter(url, parameter, default=None, keep_blank_values=0): +def url_query_parameter( + url, + parameter, + default=None, + keep_blank_values=0, + *, + separator='&', +): """Return the value of a url parameter, given the url and parameter name General case: @@ -148,7 +186,8 @@ def url_query_parameter(url, parameter, default=None, keep_blank_values=0): queryparams = parse_qs( urlsplit(str(url))[3], - keep_blank_values=keep_blank_values + keep_blank_values=keep_blank_values, + separator=separator, ) return queryparams.get(parameter, [default])[0] @@ -209,9 +248,13 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u url += '#' + fragment return url -def _add_or_replace_parameters(url, params): +def _add_or_replace_parameters(url, params, *, separator='&'): parsed = urlsplit(url) - current_args = parse_qsl(parsed.query, keep_blank_values=True) + current_args = parse_qsl( + parsed.query, + keep_blank_values=True, + separator=separator, + ) new_args = [] seen_params = set() @@ -229,7 +272,7 @@ def _add_or_replace_parameters(url, params): return urlunsplit(parsed._replace(query=query)) -def add_or_replace_parameter(url, name, new_value): +def add_or_replace_parameter(url, name, new_value, *, separator='&'): """Add or remove a parameter to a given url >>> import w3lib.url @@ -242,10 +285,10 @@ def add_or_replace_parameter(url, name, new_value): >>> """ - return _add_or_replace_parameters(url, {name: new_value}) + return _add_or_replace_parameters(url, {name: new_value}, separator=separator) -def add_or_replace_parameters(url, new_parameters): +def add_or_replace_parameters(url, new_parameters, *, separator='&'): """Add or remove a parameters to a given url >>> import w3lib.url @@ -257,7 +300,7 @@ def add_or_replace_parameters(url, new_parameters): >>> """ - return _add_or_replace_parameters(url, new_parameters) + return _add_or_replace_parameters(url, new_parameters, separator=separator) def path_to_file_uri(path): @@ -425,8 +468,14 @@ def _safe_ParseResult(parts, encoding='utf8', path_encoding='utf8'): ) -def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, - encoding=None): +def canonicalize_url( + url, + keep_blank_values=True, + keep_fragments=False, + encoding=None, + *, + query_separator='&', +): r"""Canonicalize the given url by applying the following procedures: - sort query arguments, first by key, then by value @@ -471,7 +520,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, # sort values, # and percent-encode them back if six.PY2: - keyvals = parse_qsl(query, keep_blank_values) + keyvals = parse_qsl(query, keep_blank_values, separator=query_separator) else: # Python3's urllib.parse.parse_qsl does not work as wanted # for percent-encoded characters that do not match passed encoding, @@ -496,7 +545,11 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False, # Similar considerations apply to query parts. The functionality of # IRIs (namely, to be able to include non-ASCII characters) can only be # used if the query part is encoded in UTF-8. - keyvals = parse_qsl_to_bytes(query, keep_blank_values) + keyvals = parse_qsl_to_bytes( + query, + keep_blank_values, + separator=query_separator, + ) keyvals.sort() query = urlencode(keyvals) @@ -545,7 +598,7 @@ def parse_url(url, encoding=None): if not six.PY2: from urllib.parse import _coerce_args, unquote_to_bytes - def parse_qsl_to_bytes(qs, keep_blank_values=False): + def parse_qsl_to_bytes(qs, keep_blank_values=False, *, separator='&'): """Parse a query given as a string argument. Data are returned as a list of name, value pairs as bytes. From d81fad95da8cf8d8f2d8250951bc38b2f8df5610 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A1n=20Chaves?= Date: Sat, 20 Mar 2021 12:34:48 +0100 Subject: [PATCH 2/2] Fix style issue --- w3lib/url.py | 1 - 1 file changed, 1 deletion(-) diff --git a/w3lib/url.py b/w3lib/url.py index 07c20ea5..360da583 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -49,7 +49,6 @@ def parse_qsl(*args, **kwargs): return _handle_separator(_parse_qsl, *args, **kwargs) - # error handling function for bytes-to-Unicode decoding errors with URLs def _quote_byte(error): return (to_unicode(quote(error.object[error.start:error.end])), error.end)