Skip to content

Commit fb70566

Browse files
authored
Merge pull request #77 from starrify/remove-comments-for-base-url
[MRG+1] Remove HTML comments before extracting the base URL. This does not fully fix #70, but it helps in some cases.
2 parents dc29296 + a013663 commit fb70566

File tree

2 files changed: 25 additions and 1 deletion.

tests/test_html.py

Lines changed: 24 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -372,6 +372,30 @@ def test_get_base_url(self):
372372
get_base_url(text, baseurl.encode("ascii")), "http://example.org/something"
373373
)
374374

375+
def test_base_url_in_comment(self):
376+
self.assertEqual(
377+
get_base_url("""<!-- <base href="http://example.com/"/> -->"""), ""
378+
)
379+
self.assertEqual(
380+
get_base_url("""<!-- <base href="http://example.com/"/>"""), ""
381+
)
382+
self.assertEqual(
383+
get_base_url("""<!-- <base href="http://example.com/"/> --"""), ""
384+
)
385+
self.assertEqual(
386+
get_base_url(
387+
"""<!-- <!-- <base href="http://example.com/"/> -- --> <base href="http://example_2.com/"/> """
388+
),
389+
"http://example_2.com/",
390+
)
391+
392+
self.assertEqual(
393+
get_base_url(
394+
"""<!-- <base href="http://example.com/"/> --> <!-- <base href="http://example_2.com/"/> --> <base href="http://example_3.com/"/>"""
395+
),
396+
"http://example_3.com/",
397+
)
398+
375399
def test_relative_url_with_absolute_path(self):
376400
baseurl = "https://example.org"
377401
text = """\

w3lib/html.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -311,7 +311,7 @@ def get_base_url(
311311
312312
"""
313313

314-
utext = to_unicode(text, encoding)
314+
utext: str = remove_comments(text, encoding=encoding)
315315
m = _baseurl_re.search(utext)
316316
if m:
317317
return urljoin(

0 commit comments

Comments
 (0)