From 27937581e7b551772823da2c3954ca2d07000fb3 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Tue, 9 Nov 2021 15:28:07 +0200 Subject: [PATCH 1/5] Add python_requires to help pip --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 743abe43..538f6cb8 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ include_package_data=True, zip_safe=False, platforms=["Any"], + python_requires=">=3.6", classifiers=[ "Development Status :: 5 - Production/Stable", "License :: OSI Approved :: BSD License", From 313a723ec01cb2023bda51b6cde72df8976a17b7 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Tue, 9 Nov 2021 15:26:26 +0200 Subject: [PATCH 2/5] Remove redundant code for EOL Python 2.7 and 3.5 --- setup.cfg | 2 -- tox.ini | 2 +- w3lib/encoding.py | 5 +---- 3 files changed, 2 insertions(+), 7 deletions(-) delete mode 100644 setup.cfg diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 2a9acf13..00000000 --- a/setup.cfg +++ /dev/null @@ -1,2 +0,0 @@ -[bdist_wheel] -universal = 1 diff --git a/tox.ini b/tox.ini index 4e8e4767..20883f68 100644 --- a/tox.ini +++ b/tox.ini @@ -4,7 +4,7 @@ # and then run "tox" from this directory. [tox] -envlist = py27, pypy, py35, py36, py37, py38, pypy3, docs, security, flake8, pylint, black +envlist = py36, py37, py38, pypy3, docs, security, flake8, pylint, black [testenv] deps = diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 32252105..74034adf 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -2,7 +2,6 @@ Functions for handling encoding of web pages """ import re, codecs, encodings -from sys import version_info from typing import Callable, Match, Optional, Tuple, Union, cast from w3lib._types import AnyUnicodeError, StrOrBytes from w3lib.util import to_native_str @@ -208,9 +207,7 @@ def to_unicode(data_str: bytes, encoding: str) -> str: Characters that cannot be converted will be converted to ``\\ufffd`` (the unicode replacement character). 
""" - return data_str.decode( - encoding, "replace" if version_info[0:2] >= (3, 3) else "w3lib_replace" - ) + return data_str.decode(encoding, "replace") def html_to_unicode( From e22a503a412088c46deb2e1838d6d284b01804c1 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Tue, 9 Nov 2021 15:49:36 +0200 Subject: [PATCH 3/5] Fix DeprecationWarning: The w3lib.utils.to_native_str function is deprecated and will be removed in a future release. Please use w3lib.utils.to_unicode instead. --- w3lib/encoding.py | 4 ++-- w3lib/http.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 74034adf..8a3adbfd 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -4,7 +4,7 @@ import re, codecs, encodings from typing import Callable, Match, Optional, Tuple, Union, cast from w3lib._types import AnyUnicodeError, StrOrBytes -from w3lib.util import to_native_str +import w3lib.util _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I) @@ -92,7 +92,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]: or match.group("xmlcharset") ) if encoding: - return resolve_encoding(to_native_str(encoding)) + return resolve_encoding(w3lib.util.to_unicode(encoding)) return None diff --git a/w3lib/http.py b/w3lib/http.py index 4ea31fad..bf87330b 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -1,6 +1,6 @@ from base64 import urlsafe_b64encode from typing import Any, List, MutableMapping, Optional, AnyStr, Sequence, Union, Mapping -from w3lib.util import to_bytes, to_native_str +from w3lib.util import to_bytes, to_unicode HeadersDictInput = Mapping[bytes, Union[Any, Sequence]] HeadersDictOutput = MutableMapping[bytes, List[bytes]] @@ -97,7 +97,7 @@ def basic_auth_header( """ - auth = "%s:%s" % (to_native_str(username), to_native_str(password)) + auth = "%s:%s" % (to_unicode(username), to_unicode(password)) # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used 
encoding here. See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html From 09d9ce249b4a3e43a5e3a718868c038bb8ef6a1f Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Tue, 9 Nov 2021 15:55:12 +0200 Subject: [PATCH 4/5] Upgrade Python syntax with pyupgrade --py36-plus --- tests/test_encoding.py | 2 +- tests/test_url.py | 2 +- w3lib/encoding.py | 2 +- w3lib/html.py | 4 +--- w3lib/http.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/tests/test_encoding.py b/tests/test_encoding.py index 33d7f110..dfda2032 100644 --- a/tests/test_encoding.py +++ b/tests/test_encoding.py @@ -149,7 +149,7 @@ def _assert_encoding(self, content_type, body, expected_encoding, expected_unico else: self.assertTrue( body_unicode in expected_unicode, - "%s is not in %s" % (body_unicode, expected_unicode), + f"{body_unicode} is not in {expected_unicode}", ) def test_content_type_and_conversion(self): diff --git a/tests/test_url.py b/tests/test_url.py index fe9ee999..b1299dee 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1033,7 +1033,7 @@ def test_bytes_uri(self): def test_unicode_uri(self): result = parse_data_uri("data:,é") - self.assertEqual(result.data, "é".encode("utf-8")) + self.assertEqual(result.data, "é".encode()) def test_default_mediatype(self): result = parse_data_uri("data:;charset=iso-8859-7,%be%d3%be") diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 8a3adbfd..84c0e28b 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -162,7 +162,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]: (codecs.BOM_UTF16_LE, "utf-16-le"), (codecs.BOM_UTF8, "utf-8"), ] -_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) +_FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE} def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]: diff --git a/w3lib/html.py b/w3lib/html.py index 634d90f5..8c5c32de 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -228,9 +228,7 @@ def remove_tags_with_content( 
utext = to_unicode(text, encoding) if which_ones: - tags = "|".join( - [r"<%s\b.*?</%s>|<%s\s*/>" % (tag, tag, tag) for tag in which_ones] - ) + tags = "|".join([fr"<{tag}\b.*?</{tag}>|<{tag}\s*/>" for tag in which_ones]) retags = re.compile(tags, re.DOTALL | re.IGNORECASE) utext = retags.sub("", utext) return utext diff --git a/w3lib/http.py b/w3lib/http.py index bf87330b..e14e4345 100644 --- a/w3lib/http.py +++ b/w3lib/http.py @@ -97,7 +97,7 @@ def basic_auth_header( """ - auth = "%s:%s" % (to_unicode(username), to_unicode(password)) + auth = f"{to_unicode(username)}:{to_unicode(password)}" # XXX: RFC 2617 doesn't define encoding, but ISO-8859-1 # seems to be the most widely used encoding here. See also: # http://greenbytes.de/tech/webdav/draft-ietf-httpauth-basicauth-enc-latest.html From f4ab57a18c92f5395a8e9075ca58201cf8095dfb Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Tue, 9 Nov 2021 16:23:09 +0200 Subject: [PATCH 5/5] Fix pylint --- tests/test_html.py | 2 +- tests/test_url.py | 16 ++++------------ w3lib/encoding.py | 1 + w3lib/url.py | 4 +++- 4 files changed, 9 insertions(+), 14 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index f6ca90d2..d61a15ca 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -124,7 +124,7 @@ def test_missing_semicolon(self): ): self.assertEqual(replace_entities(entity, encoding="cp1252"), result) self.assertEqual( - replace_entities("x%sy" % entity, encoding="cp1252"), "x%sy" % result + replace_entities(f"x{entity}y", encoding="cp1252"), f"x{result}y" ) def test_encoding(self): diff --git a/tests/test_url.py b/tests/test_url.py index b1299dee..f721bd62 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -266,12 +266,8 @@ def test_safe_url_idna_encoding_failure(self): # DNS label too long self.assertEqual( - safe_url_string( - "http://www.{label}.com/résumé?q=résumé".format(label="example" * 11) - ), - "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format( - label="example" * 11 - ),
safe_url_string(f"http://www.{'example' * 11}.com/résumé?q=résumé"), + f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9", ) def test_safe_url_port_number(self): @@ -971,12 +967,8 @@ def test_canonicalize_url_idna_exceptions(self): # DNS label too long self.assertEqual( - canonicalize_url( - "http://www.{label}.com/résumé?q=résumé".format(label="example" * 11) - ), - "http://www.{label}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9".format( - label="example" * 11 - ), + canonicalize_url(f"http://www.{'example' * 11}.com/résumé?q=résumé"), + f"http://www.{'example' * 11}.com/r%C3%A9sum%C3%A9?q=r%C3%A9sum%C3%A9", ) def test_preserve_nonfragment_hash(self): diff --git a/w3lib/encoding.py b/w3lib/encoding.py index 84c0e28b..86b678be 100644 --- a/w3lib/encoding.py +++ b/w3lib/encoding.py @@ -45,6 +45,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]: _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)") # check for meta tags, or xml decl. and stop search if a body tag is encountered +# pylint: disable=consider-using-f-string _BODY_ENCODING_PATTERN = ( r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)" % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE) diff --git a/w3lib/url.py b/w3lib/url.py index 71398516..0592a8bf 100644 --- a/w3lib/url.py +++ b/w3lib/url.py @@ -319,7 +319,7 @@ def path_to_file_uri(path: str) -> str: x = pathname2url(os.path.abspath(path)) if os.name == "nt": x = x.replace("|", ":") # http://bugs.python.org/issue5861 - return "file:///%s" % x.lstrip("/") + return f"file:///{x.lstrip('/')}" def file_uri_to_path(uri: str) -> str: @@ -344,6 +344,7 @@ def any_to_uri(uri_or_path: str) -> str: _char = set(map(chr, range(127))) # RFC 2045 token. +# pylint: disable=consider-using-f-string _token = r"[{}]+".format( re.escape( "".join( @@ -359,6 +360,7 @@ def any_to_uri(uri_or_path: str) -> str: ) # RFC 822 quoted-string, without surrounding quotation marks.
+# pylint: disable=consider-using-f-string _quoted_string = r"(?:[{}]|(?:\\[{}]))*".format( re.escape("".join(_char - {'"', "\\", "\r"})), re.escape("".join(_char)) )