From 11b5d2607ffd93b873d4e32c0b1eb8e213281922 Mon Sep 17 00:00:00 2001 From: Pengyu CHEN Date: Tue, 25 Oct 2016 14:35:07 +0800 Subject: [PATCH 1/4] Added: Removing comments before extracting base URLs. Not a solution to #70, but does help in some cases. --- tests/test_html.py | 2 ++ w3lib/html.py | 1 + 2 files changed, 3 insertions(+) diff --git a/tests/test_html.py b/tests/test_html.py index 68133cb5..0ead3709 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -248,6 +248,8 @@ def test_get_base_url(self): self.assertEqual(get_base_url(text, baseurl), 'http://example.org/something') self.assertEqual(get_base_url(text, baseurl.encode('ascii')), 'http://example.org/something') + def test_base_url_in_comment(self): + self.assertEqual(get_base_url(''''''), '') def test_relative_url_with_absolute_path(self): baseurl = 'https://example.org' diff --git a/w3lib/html.py b/w3lib/html.py index 24d01a55..90a9fd05 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -281,6 +281,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'): """ text = to_unicode(text, encoding) + text = remove_comments(text) m = _baseurl_re.search(text) if m: return moves.urllib.parse.urljoin( From 4d605f719125cf9e26e831d68f23861df9eb940d Mon Sep 17 00:00:00 2001 From: Felipe Boff Nunes Date: Fri, 4 Nov 2022 14:41:30 -0300 Subject: [PATCH 2/4] small refactor --- w3lib/html.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/w3lib/html.py b/w3lib/html.py index adda5ed4..a31d42bd 100644 --- a/w3lib/html.py +++ b/w3lib/html.py @@ -311,8 +311,7 @@ def get_base_url( """ - utext = to_unicode(text, encoding) - utext = remove_comments(utext) + utext: str = remove_comments(text, encoding=encoding) m = _baseurl_re.search(utext) if m: return urljoin( From 95e4e9796a398027f5bce4a91e445a7171be281e Mon Sep 17 00:00:00 2001 From: Felipe Boff Nunes Date: Fri, 4 Nov 2022 14:41:43 -0300 Subject: [PATCH 3/4] add unit_tests --- tests/test_html.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_html.py b/tests/test_html.py index bbf7f1bb..1336a769 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -374,7 +374,21 @@ def test_get_base_url(self): def test_base_url_in_comment(self): self.assertEqual(get_base_url(''''''), '') + self.assertEqual(get_base_url(''' ''' + ), + "http://example_2.com/" + ) + self.assertEqual( + get_base_url( + ''' ''' + ), + "http://example_3.com/" + ) def test_relative_url_with_absolute_path(self): baseurl = "https://example.org" From a0136634c57557da6fd6efe3964eea2abe21b8bd Mon Sep 17 00:00:00 2001 From: Felipe Boff Nunes Date: Fri, 4 Nov 2022 14:43:36 -0300 Subject: [PATCH 4/4] black --- tests/test_html.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/test_html.py b/tests/test_html.py index 1336a769..1e637b0f 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -373,21 +373,27 @@ def test_get_base_url(self): ) def test_base_url_in_comment(self): - self.assertEqual(get_base_url(''''''), '') - self.assertEqual(get_base_url('''"""), "" + ) + self.assertEqual( + get_base_url(""" ''' + """ """ ), - "http://example_2.com/" + "http://example_2.com/", ) self.assertEqual( get_base_url( - ''' ''' + """ """ ), - "http://example_3.com/" + "http://example_3.com/", ) def test_relative_url_with_absolute_path(self):