Skip to content

Commit 6a48367

Browse files
authored
Merge a8e50d2 into a6e8c8d
2 parents a6e8c8d + a8e50d2 commit 6a48367

File tree

2 files changed

+64
-12
lines changed

2 files changed

+64
-12
lines changed

tests/test_url.py

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -849,6 +849,10 @@ def test_url_query_parameter(self):
849849
self.assertEqual(
850850
url_query_parameter("product.html?id=", "id", keep_blank_values=1), ""
851851
)
852+
self.assertEqual(
853+
url_query_parameter("product.html?id=200;foo=bar", "id", separator=';'),
854+
'200',
855+
)
852856

853857
def test_url_query_parameter_2(self):
854858
"""
@@ -958,6 +962,14 @@ def test_add_or_replace_parameter_fail(self):
958962
"http://domain/test?arg1=v3&arg2=v2",
959963
)
960964

965+
@pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
def test_add_or_replace_parameter_semicolon(self):
    """Check ``add_or_replace_parameter`` on a ``;``-separated query string.

    Marked as an expected failure; the linked issue is given in the
    ``xfail`` reason.
    """
    base_url = 'http://domain/test?arg1=v1;arg2=v2;arg3=v3'

    # Adding a parameter that is not present in the query yet.
    appended = add_or_replace_parameter(base_url, 'arg4', 'v4', separator=';')
    self.assertEqual(
        appended,
        'http://domain/test?arg1=v1;arg2=v2;arg3=v3;arg4=v4',
    )

    # Replacing the value of a parameter that already exists.
    replaced = add_or_replace_parameter(base_url, 'arg3', 'nv3', separator=';')
    self.assertEqual(
        replaced,
        'http://domain/test?arg1=v1;arg2=v2;arg3=nv3',
    )
972+
961973
def test_add_or_replace_parameters(self):
962974
url = "http://domain/test"
963975
self.assertEqual(
@@ -1157,6 +1169,11 @@ def test_typical_usage(self):
11571169
"http://www.example.com/do?a=1",
11581170
)
11591171

1172+
@pytest.mark.xfail(reason="https://github.com/scrapy/w3lib/issues/164")
def test_typical_usage_semicolon(self):
    """Check ``canonicalize_url`` on a ``;``-separated query string.

    The expected URL has the query arguments sorted by name and still
    joined with ``;``. Marked as an expected failure; the linked issue is
    given in the ``xfail`` reason.
    """
    canonical = canonicalize_url(
        "http://www.example.com/do?c=1;b=2;a=3",
        query_separator=';',
    )
    self.assertEqual(canonical, "http://www.example.com/do?a=3;b=2;c=1")
1176+
11601177
def test_port_number(self):
11611178
self.assertEqual(
11621179
canonicalize_url("http://www.example.com:8888/do?a=1&b=2&c=3"),

w3lib/url.py

Lines changed: 47 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
import posixpath
99
import re
1010
import string
11+
from inspect import getfullargspec
1112
from typing import (
1213
cast,
1314
Callable,
@@ -20,8 +21,8 @@
2021
Union,
2122
)
2223
from urllib.parse import (
23-
parse_qs,
24-
parse_qsl,
24+
parse_qs as _parse_qs,
25+
parse_qsl as _parse_qsl,
2526
ParseResult,
2627
quote,
2728
unquote_to_bytes,
@@ -41,6 +42,23 @@
4142
from ._url import _SPECIAL_SCHEMES
4243

4344

45+
def _accepts_separator(func):
    """Return True if ``func`` declares a parameter named ``separator``.

    Both positional-or-keyword and keyword-only parameters are inspected, so
    the result does not depend on how the parameter happens to be declared.
    """
    spec = getfullargspec(func)
    return "separator" in spec.args or "separator" in spec.kwonlyargs


# Not every Python release lets ``urllib.parse.parse_qs``/``parse_qsl`` take a
# ``separator`` argument. Probe the running interpreter once, at import time,
# so the wrappers below know whether the keyword has to be stripped.
_REMOVE_SEPARATOR = not _accepts_separator(_parse_qs)


def _handle_separator(func, *args, **kwargs):
    """Call ``func(*args, **kwargs)``, dropping ``separator`` if unsupported.

    When the running interpreter's ``parse_qs`` does not accept ``separator``
    the keyword is removed before the call, so ``func`` falls back to its
    built-in splitting rules instead of raising ``TypeError``.

    Only a ``separator`` passed by keyword is handled; positional arguments
    are forwarded untouched.
    """
    if _REMOVE_SEPARATOR:
        # NOTE: the requested separator is silently ignored in this case.
        kwargs.pop("separator", None)
    return func(*args, **kwargs)


def parse_qs(*args, **kwargs):
    """Version-tolerant wrapper around :func:`urllib.parse.parse_qs`.

    Accepts the same arguments as the standard library function; a
    ``separator`` keyword is forwarded only when the interpreter supports it.
    """
    return _handle_separator(_parse_qs, *args, **kwargs)


def parse_qsl(*args, **kwargs):
    """Version-tolerant wrapper around :func:`urllib.parse.parse_qsl`.

    Accepts the same arguments as the standard library function; a
    ``separator`` keyword is forwarded only when the interpreter supports it.
    """
    return _handle_separator(_parse_qsl, *args, **kwargs)
60+
61+
4462
# error handling function for bytes-to-Unicode decoding errors with URLs
4563
def _quote_byte(error: UnicodeError) -> Tuple[str, int]:
4664
error = cast(AnyUnicodeError, error)
@@ -200,6 +218,8 @@ def url_query_parameter(
200218
parameter: str,
201219
default: Optional[str] = None,
202220
keep_blank_values: Union[bool, int] = 0,
221+
*,
222+
separator: str = '&',
203223
) -> Optional[str]:
204224
"""Return the value of a url parameter, given the url and parameter name
205225
@@ -230,7 +250,9 @@ def url_query_parameter(
230250
"""
231251

232252
queryparams = parse_qs(
233-
urlsplit(str(url))[3], keep_blank_values=bool(keep_blank_values)
253+
urlsplit(str(url))[3],
254+
keep_blank_values=bool(keep_blank_values),
255+
separator=separator,
234256
)
235257
if parameter in queryparams:
236258
return queryparams[parameter][0]
@@ -305,9 +327,13 @@ def url_query_cleaner(
305327
return url
306328

307329

308-
def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
330+
def _add_or_replace_parameters(url: str, params: Dict[str, str], *, separator: str = '&') -> str:
309331
parsed = urlsplit(url)
310-
current_args = parse_qsl(parsed.query, keep_blank_values=True)
332+
current_args = parse_qsl(
333+
parsed.query,
334+
keep_blank_values=True,
335+
separator=separator,
336+
)
311337

312338
new_args = []
313339
seen_params = set()
@@ -327,7 +353,7 @@ def _add_or_replace_parameters(url: str, params: Dict[str, str]) -> str:
327353
return urlunsplit(parsed._replace(query=query))
328354

329355

330-
def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
356+
def add_or_replace_parameter(url: str, name: str, new_value: str, *, separator: str = '&') -> str:
331357
"""Add or remove a parameter to a given url
332358
333359
>>> import w3lib.url
@@ -340,10 +366,10 @@ def add_or_replace_parameter(url: str, name: str, new_value: str) -> str:
340366
>>>
341367
342368
"""
343-
return _add_or_replace_parameters(url, {name: new_value})
369+
return _add_or_replace_parameters(url, {name: new_value}, separator=separator)
344370

345371

346-
def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
372+
def add_or_replace_parameters(url: str, new_parameters: Dict[str, str], *, separator: str = '&') -> str:
347373
"""Add or remove a parameters to a given url
348374
349375
>>> import w3lib.url
@@ -355,7 +381,7 @@ def add_or_replace_parameters(url: str, new_parameters: Dict[str, str]) -> str:
355381
>>>
356382
357383
"""
358-
return _add_or_replace_parameters(url, new_parameters)
384+
return _add_or_replace_parameters(url, new_parameters, separator=separator)
359385

360386

361387
def path_to_file_uri(path: str) -> str:
@@ -528,6 +554,8 @@ def canonicalize_url(
528554
keep_blank_values: bool = True,
529555
keep_fragments: bool = False,
530556
encoding: Optional[str] = None,
557+
*,
558+
query_separator: str = '&'
531559
) -> str:
532560
r"""Canonicalize the given url by applying the following procedures:
533561
@@ -600,7 +628,11 @@ def canonicalize_url(
600628
# Similar considerations apply to query parts. The functionality of
601629
# IRIs (namely, to be able to include non-ASCII characters) can only be
602630
# used if the query part is encoded in UTF-8.
603-
keyvals = parse_qsl_to_bytes(query, keep_blank_values)
631+
keyvals = parse_qsl_to_bytes(
632+
query,
633+
keep_blank_values,
634+
separator=query_separator,
635+
)
604636

605637
keyvals.sort()
606638
query = urlencode(keyvals)
@@ -642,7 +674,10 @@ def parse_url(
642674

643675

644676
def parse_qsl_to_bytes(
645-
qs: str, keep_blank_values: bool = False
677+
qs: str,
678+
keep_blank_values: bool = False,
679+
*,
680+
separator: str = '&',
646681
) -> List[Tuple[bytes, bytes]]:
647682
"""Parse a query given as a string argument.
648683
@@ -665,7 +700,7 @@ def parse_qsl_to_bytes(
665700
# with unquote_to_bytes(s)
666701
coerce_args = cast(Callable[..., Tuple[str, Callable[..., bytes]]], _coerce_args)
667702
qs, _coerce_result = coerce_args(qs)
668-
pairs = [s2 for s1 in qs.split("&") for s2 in s1.split(";")]
703+
pairs = qs.split(separator)
669704
r = []
670705
for name_value in pairs:
671706
if not name_value:

0 commit comments

Comments
 (0)