"""Tests to ensure that the html.parser tree builder generates good
trees."""
from pdb import set_trace
import pickle
import warnings
from bs4.builder import (
    HTMLParserTreeBuilder,
    XMLParsedAsHTMLWarning,
)
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
from . import SoupTest, HTMLTreeBuilderSmokeTest
  12. class TestHTMLParserTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
  13. default_builder = HTMLParserTreeBuilder
  14. def test_namespaced_system_doctype(self):
  15. # html.parser can't handle namespaced doctypes, so skip this one.
  16. pass
  17. def test_namespaced_public_doctype(self):
  18. # html.parser can't handle namespaced doctypes, so skip this one.
  19. pass
  20. def test_builder_is_pickled(self):
  21. """Unlike most tree builders, HTMLParserTreeBuilder and will
  22. be restored after pickling.
  23. """
  24. tree = self.soup("<a><b>foo</a>")
  25. dumped = pickle.dumps(tree, 2)
  26. loaded = pickle.loads(dumped)
  27. assert isinstance(loaded.builder, type(tree.builder))
  28. def test_redundant_empty_element_closing_tags(self):
  29. self.assert_soup('<br></br><br></br><br></br>', "<br/><br/><br/>")
  30. self.assert_soup('</br></br></br>', "")
  31. def test_empty_element(self):
  32. # This verifies that any buffered data present when the parser
  33. # finishes working is handled.
  34. self.assert_soup("foo &# bar", "foo &amp;# bar")
  35. def test_tracking_line_numbers(self):
  36. # The html.parser TreeBuilder keeps track of line number and
  37. # position of each element.
  38. markup = "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>"
  39. soup = self.soup(markup)
  40. assert 2 == soup.p.sourceline
  41. assert 3 == soup.p.sourcepos
  42. assert "sourceline" == soup.p.find('sourceline').name
  43. # You can deactivate this behavior.
  44. soup = self.soup(markup, store_line_numbers=False)
  45. assert "sourceline" == soup.p.sourceline.name
  46. assert "sourcepos" == soup.p.sourcepos.name
  47. def test_on_duplicate_attribute(self):
  48. # The html.parser tree builder has a variety of ways of
  49. # handling a tag that contains the same attribute multiple times.
  50. markup = '<a class="cls" href="url1" href="url2" href="url3" id="id">'
  51. # If you don't provide any particular value for
  52. # on_duplicate_attribute, later values replace earlier values.
  53. soup = self.soup(markup)
  54. assert "url3" == soup.a['href']
  55. assert ["cls"] == soup.a['class']
  56. assert "id" == soup.a['id']
  57. # You can also get this behavior explicitly.
  58. def assert_attribute(on_duplicate_attribute, expected):
  59. soup = self.soup(
  60. markup, on_duplicate_attribute=on_duplicate_attribute
  61. )
  62. assert expected == soup.a['href']
  63. # Verify that non-duplicate attributes are treated normally.
  64. assert ["cls"] == soup.a['class']
  65. assert "id" == soup.a['id']
  66. assert_attribute(None, "url3")
  67. assert_attribute(BeautifulSoupHTMLParser.REPLACE, "url3")
  68. # You can ignore subsequent values in favor of the first.
  69. assert_attribute(BeautifulSoupHTMLParser.IGNORE, "url1")
  70. # And you can pass in a callable that does whatever you want.
  71. def accumulate(attrs, key, value):
  72. if not isinstance(attrs[key], list):
  73. attrs[key] = [attrs[key]]
  74. attrs[key].append(value)
  75. assert_attribute(accumulate, ["url1", "url2", "url3"])
  76. def test_html5_attributes(self):
  77. # The html.parser TreeBuilder can convert any entity named in
  78. # the HTML5 spec to a sequence of Unicode characters, and
  79. # convert those Unicode characters to a (potentially
  80. # different) named entity on the way out.
  81. for input_element, output_unicode, output_element in (
  82. ("&RightArrowLeftArrow;", '\u21c4', b'&rlarr;'),
  83. ('&models;', '\u22a7', b'&models;'),
  84. ('&Nfr;', '\U0001d511', b'&Nfr;'),
  85. ('&ngeqq;', '\u2267\u0338', b'&ngeqq;'),
  86. ('&not;', '\xac', b'&not;'),
  87. ('&Not;', '\u2aec', b'&Not;'),
  88. ('&quot;', '"', b'"'),
  89. ('&there4;', '\u2234', b'&there4;'),
  90. ('&Therefore;', '\u2234', b'&there4;'),
  91. ('&therefore;', '\u2234', b'&there4;'),
  92. ("&fjlig;", 'fj', b'fj'),
  93. ("&sqcup;", '\u2294', b'&sqcup;'),
  94. ("&sqcups;", '\u2294\ufe00', b'&sqcups;'),
  95. ("&apos;", "'", b"'"),
  96. ("&verbar;", "|", b"|"),
  97. ):
  98. markup = '<div>%s</div>' % input_element
  99. div = self.soup(markup).div
  100. without_element = div.encode()
  101. expect = b"<div>%s</div>" % output_unicode.encode("utf8")
  102. assert without_element == expect
  103. with_element = div.encode(formatter="html")
  104. expect = b"<div>%s</div>" % output_element
  105. assert with_element == expect
  106. class TestHTMLParserSubclass(SoupTest):
  107. def test_error(self):
  108. """Verify that our HTMLParser subclass implements error() in a way
  109. that doesn't cause a crash.
  110. """
  111. parser = BeautifulSoupHTMLParser()
  112. with warnings.catch_warnings(record=True) as warns:
  113. parser.error("don't crash")
  114. [warning] = warns
  115. assert "don't crash" == str(warning.message)