diff --git a/tests/test_html.py b/tests/test_html.py index d4861ba7..1e637b0f 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -372,6 +372,30 @@ def test_get_base_url(self): get_base_url(text, baseurl.encode("ascii")), "http://example.org/something" ) + def test_base_url_in_comment(self): + self.assertEqual( + get_base_url(""""""), "" + ) + self.assertEqual( + get_base_url(""" """ + ), + "http://example_2.com/", + ) + + self.assertEqual( + get_base_url( + """ """ + ), + "http://example_3.com/", + ) + def test_relative_url_with_absolute_path(self): baseurl = "https://example.org" text = """\ diff --git a/w3lib/html.py b/w3lib/html.py index a4be0542..a31d42bd 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -311,7 +311,7 @@ def get_base_url( """ - utext = to_unicode(text, encoding) + utext: str = remove_comments(text, encoding=encoding) m = _baseurl_re.search(utext) if m: return urljoin(