test_lxml.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199
  1. """Tests to ensure that the lxml tree builder generates good trees."""
  2. import pickle
  3. import re
  4. import warnings
  5. try:
  6. import lxml.etree
  7. LXML_PRESENT = True
  8. LXML_VERSION = lxml.etree.LXML_VERSION
  9. except ImportError as e:
  10. LXML_PRESENT = False
  11. LXML_VERSION = (0,)
  12. if LXML_PRESENT:
  13. from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
  14. from bs4 import (
  15. BeautifulSoup,
  16. BeautifulStoneSoup,
  17. )
  18. from bs4.element import Comment, Doctype, SoupStrainer
  19. from . import (
  20. HTMLTreeBuilderSmokeTest,
  21. XMLTreeBuilderSmokeTest,
  22. SoupTest,
  23. skipIf,
  24. )
  25. @skipIf(
  26. not LXML_PRESENT,
  27. "lxml seems not to be present, not testing its tree builder.")
  28. class TestLXMLTreeBuilder(SoupTest, HTMLTreeBuilderSmokeTest):
  29. """See ``HTMLTreeBuilderSmokeTest``."""
  30. @property
  31. def default_builder(self):
  32. return LXMLTreeBuilder
  33. def test_out_of_range_entity(self):
  34. self.assert_soup(
  35. "<p>foo&#10000000000000;bar</p>", "<p>foobar</p>")
  36. self.assert_soup(
  37. "<p>foo&#x10000000000000;bar</p>", "<p>foobar</p>")
  38. self.assert_soup(
  39. "<p>foo&#1000000000;bar</p>", "<p>foobar</p>")
  40. def test_entities_in_foreign_document_encoding(self):
  41. # We can't implement this case correctly because by the time we
  42. # hear about markup like "&#147;", it's been (incorrectly) converted into
  43. # a string like u'\x93'
  44. pass
  45. # In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
  46. # test if an old version of lxml is installed.
  47. @skipIf(
  48. not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
  49. "Skipping doctype test for old version of lxml to avoid segfault.")
  50. def test_empty_doctype(self):
  51. soup = self.soup("<!DOCTYPE>")
  52. doctype = soup.contents[0]
  53. assert "" == doctype.strip()
  54. def test_beautifulstonesoup_is_xml_parser(self):
  55. # Make sure that the deprecated BSS class uses an xml builder
  56. # if one is installed.
  57. with warnings.catch_warnings(record=True) as w:
  58. soup = BeautifulStoneSoup("<b />")
  59. assert "<b/>" == str(soup.b)
  60. assert "BeautifulStoneSoup class is deprecated" in str(w[0].message)
  61. def test_tracking_line_numbers(self):
  62. # The lxml TreeBuilder cannot keep track of line numbers from
  63. # the original markup. Even if you ask for line numbers, we
  64. # don't have 'em.
  65. #
  66. # This means that if you have a tag like <sourceline> or
  67. # <sourcepos>, attribute access will find it rather than
  68. # giving you a numeric answer.
  69. soup = self.soup(
  70. "\n <p>\n\n<sourceline>\n<b>text</b></sourceline><sourcepos></p>",
  71. store_line_numbers=True
  72. )
  73. assert "sourceline" == soup.p.sourceline.name
  74. assert "sourcepos" == soup.p.sourcepos.name
  75. @skipIf(
  76. not LXML_PRESENT,
  77. "lxml seems not to be present, not testing its XML tree builder.")
  78. class TestLXMLXMLTreeBuilder(SoupTest, XMLTreeBuilderSmokeTest):
  79. """See ``HTMLTreeBuilderSmokeTest``."""
  80. @property
  81. def default_builder(self):
  82. return LXMLTreeBuilderForXML
  83. def test_namespace_indexing(self):
  84. soup = self.soup(
  85. '<?xml version="1.1"?>\n'
  86. '<root>'
  87. '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
  88. '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</prefix:tag2>'
  89. '<prefix2:tag3 xmlns:prefix2="http://another-namespace.com">'
  90. '<subtag xmlns="http://another-unprefixed-namespace.com">'
  91. '<subsubtag xmlns="http://yet-another-unprefixed-namespace.com">'
  92. '</prefix2:tag3>'
  93. '</root>'
  94. )
  95. # The BeautifulSoup object includes every namespace prefix
  96. # defined in the entire document. This is the default set of
  97. # namespaces used by soupsieve.
  98. #
  99. # Un-prefixed namespaces are not included, and if a given
  100. # prefix is defined twice, only the first prefix encountered
  101. # in the document shows up here.
  102. assert soup._namespaces == {
  103. 'xml': 'http://www.w3.org/XML/1998/namespace',
  104. 'prefix': 'http://prefixed-namespace.com',
  105. 'prefix2': 'http://another-namespace.com'
  106. }
  107. # A Tag object includes only the namespace prefixes
  108. # that were in scope when it was parsed.
  109. # We do not track un-prefixed namespaces as we can only hold
  110. # one (the first one), and it will be recognized as the
  111. # default namespace by soupsieve, even when operating from a
  112. # tag with a different un-prefixed namespace.
  113. assert soup.tag._namespaces == {
  114. 'xml': 'http://www.w3.org/XML/1998/namespace',
  115. }
  116. assert soup.tag2._namespaces == {
  117. 'prefix': 'http://prefixed-namespace.com',
  118. 'xml': 'http://www.w3.org/XML/1998/namespace',
  119. }
  120. assert soup.subtag._namespaces == {
  121. 'prefix2': 'http://another-namespace.com',
  122. 'xml': 'http://www.w3.org/XML/1998/namespace',
  123. }
  124. assert soup.subsubtag._namespaces == {
  125. 'prefix2': 'http://another-namespace.com',
  126. 'xml': 'http://www.w3.org/XML/1998/namespace',
  127. }
  128. def test_namespace_interaction_with_select_and_find(self):
  129. # Demonstrate how namespaces interact with select* and
  130. # find* methods.
  131. soup = self.soup(
  132. '<?xml version="1.1"?>\n'
  133. '<root>'
  134. '<tag xmlns="http://unprefixed-namespace.com">content</tag>'
  135. '<prefix:tag2 xmlns:prefix="http://prefixed-namespace.com">content</tag>'
  136. '<subtag xmlns:prefix="http://another-namespace-same-prefix.com">'
  137. '<prefix:tag3>'
  138. '</subtag>'
  139. '</root>'
  140. )
  141. # soupselect uses namespace URIs.
  142. assert soup.select_one('tag').name == 'tag'
  143. assert soup.select_one('prefix|tag2').name == 'tag2'
  144. # If a prefix is declared more than once, only the first usage
  145. # is registered with the BeautifulSoup object.
  146. assert soup.select_one('prefix|tag3') is None
  147. # But you can always explicitly specify a namespace dictionary.
  148. assert soup.select_one(
  149. 'prefix|tag3', namespaces=soup.subtag._namespaces
  150. ).name == 'tag3'
  151. # And a Tag (as opposed to the BeautifulSoup object) will
  152. # have a set of default namespaces scoped to that Tag.
  153. assert soup.subtag.select_one('prefix|tag3').name=='tag3'
  154. # the find() methods aren't fully namespace-aware; they just
  155. # look at prefixes.
  156. assert soup.find('tag').name == 'tag'
  157. assert soup.find('prefix:tag2').name == 'tag2'
  158. assert soup.find('prefix:tag3').name == 'tag3'
  159. assert soup.subtag.find('prefix:tag3').name == 'tag3'
  160. def test_pickle_removes_builder(self):
  161. # The lxml TreeBuilder is not picklable, so it won't be
  162. # preserved in a pickle/unpickle operation.
  163. soup = self.soup("<a>some markup</a>")
  164. assert isinstance(soup.builder, self.default_builder)
  165. pickled = pickle.dumps(soup)
  166. unpickled = pickle.loads(pickled)
  167. assert "some markup" == unpickled.a.string
  168. assert unpickled.builder is None