# test_soup.py
  1. # -*- coding: utf-8 -*-
  2. """Tests of Beautiful Soup as a whole."""
  3. from pdb import set_trace
  4. import logging
  5. import os
  6. import pickle
  7. import pytest
  8. import sys
  9. import tempfile
  10. from bs4 import (
  11. BeautifulSoup,
  12. BeautifulStoneSoup,
  13. GuessedAtParserWarning,
  14. MarkupResemblesLocatorWarning,
  15. dammit,
  16. )
  17. from bs4.builder import (
  18. builder_registry,
  19. TreeBuilder,
  20. ParserRejectedMarkup,
  21. )
  22. from bs4.element import (
  23. Comment,
  24. SoupStrainer,
  25. Tag,
  26. NavigableString,
  27. )
  28. from . import (
  29. default_builder,
  30. SoupTest,
  31. skipIf,
  32. )
  33. import warnings
  34. try:
  35. from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
  36. LXML_PRESENT = True
  37. except ImportError as e:
  38. LXML_PRESENT = False
  39. PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
  40. class TestConstructor(SoupTest):
  41. def test_short_unicode_input(self):
  42. data = "<h1>éé</h1>"
  43. soup = self.soup(data)
  44. assert "éé" == soup.h1.string
  45. def test_embedded_null(self):
  46. data = "<h1>foo\0bar</h1>"
  47. soup = self.soup(data)
  48. assert "foo\0bar" == soup.h1.string
  49. def test_exclude_encodings(self):
  50. utf8_data = "Räksmörgås".encode("utf-8")
  51. soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
  52. assert "windows-1252" == soup.original_encoding
  53. def test_custom_builder_class(self):
  54. # Verify that you can pass in a custom Builder class and
  55. # it'll be instantiated with the appropriate keyword arguments.
  56. class Mock(object):
  57. def __init__(self, **kwargs):
  58. self.called_with = kwargs
  59. self.is_xml = True
  60. self.store_line_numbers = False
  61. self.cdata_list_attributes = []
  62. self.preserve_whitespace_tags = []
  63. self.string_containers = {}
  64. def initialize_soup(self, soup):
  65. pass
  66. def feed(self, markup):
  67. self.fed = markup
  68. def reset(self):
  69. pass
  70. def ignore(self, ignore):
  71. pass
  72. set_up_substitutions = can_be_empty_element = ignore
  73. def prepare_markup(self, *args, **kwargs):
  74. yield "prepared markup", "original encoding", "declared encoding", "contains replacement characters"
  75. kwargs = dict(
  76. var="value",
  77. # This is a deprecated BS3-era keyword argument, which
  78. # will be stripped out.
  79. convertEntities=True,
  80. )
  81. with warnings.catch_warnings(record=True):
  82. soup = BeautifulSoup('', builder=Mock, **kwargs)
  83. assert isinstance(soup.builder, Mock)
  84. assert dict(var="value") == soup.builder.called_with
  85. assert "prepared markup" == soup.builder.fed
  86. # You can also instantiate the TreeBuilder yourself. In this
  87. # case, that specific object is used and any keyword arguments
  88. # to the BeautifulSoup constructor are ignored.
  89. builder = Mock(**kwargs)
  90. with warnings.catch_warnings(record=True) as w:
  91. soup = BeautifulSoup(
  92. '', builder=builder, ignored_value=True,
  93. )
  94. msg = str(w[0].message)
  95. assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
  96. assert builder == soup.builder
  97. assert kwargs == builder.called_with
  98. def test_parser_markup_rejection(self):
  99. # If markup is completely rejected by the parser, an
  100. # explanatory ParserRejectedMarkup exception is raised.
  101. class Mock(TreeBuilder):
  102. def feed(self, *args, **kwargs):
  103. raise ParserRejectedMarkup("Nope.")
  104. def prepare_markup(self, *args, **kwargs):
  105. # We're going to try two different ways of preparing this markup,
  106. # but feed() will reject both of them.
  107. yield markup, None, None, False
  108. yield markup, None, None, False
  109. import re
  110. with pytest.raises(ParserRejectedMarkup) as exc_info:
  111. BeautifulSoup('', builder=Mock)
  112. assert "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help." in str(exc_info.value)
  113. def test_cdata_list_attributes(self):
  114. # Most attribute values are represented as scalars, but the
  115. # HTML standard says that some attributes, like 'class' have
  116. # space-separated lists as values.
  117. markup = '<a id=" an id " class=" a class "></a>'
  118. soup = self.soup(markup)
  119. # Note that the spaces are stripped for 'class' but not for 'id'.
  120. a = soup.a
  121. assert " an id " == a['id']
  122. assert ["a", "class"] == a['class']
  123. # TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
  124. # you customize or disable this. As always, you can customize the TreeBuilder
  125. # by passing in a keyword argument to the BeautifulSoup constructor.
  126. soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
  127. assert " a class " == soup.a['class']
  128. # Here are two ways of saying that `id` is a multi-valued
  129. # attribute in this context, but 'class' is not.
  130. for switcheroo in ({'*': 'id'}, {'a': 'id'}):
  131. with warnings.catch_warnings(record=True) as w:
  132. # This will create a warning about not explicitly
  133. # specifying a parser, but we'll ignore it.
  134. soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
  135. a = soup.a
  136. assert ["an", "id"] == a['id']
  137. assert " a class " == a['class']
  138. def test_replacement_classes(self):
  139. # Test the ability to pass in replacements for element classes
  140. # which will be used when building the tree.
  141. class TagPlus(Tag):
  142. pass
  143. class StringPlus(NavigableString):
  144. pass
  145. class CommentPlus(Comment):
  146. pass
  147. soup = self.soup(
  148. "<a><b>foo</b>bar</a><!--whee-->",
  149. element_classes = {
  150. Tag: TagPlus,
  151. NavigableString: StringPlus,
  152. Comment: CommentPlus,
  153. }
  154. )
  155. # The tree was built with TagPlus, StringPlus, and CommentPlus objects,
  156. # rather than Tag, String, and Comment objects.
  157. assert all(
  158. isinstance(x, (TagPlus, StringPlus, CommentPlus))
  159. for x in soup.recursiveChildGenerator()
  160. )
  161. def test_alternate_string_containers(self):
  162. # Test the ability to customize the string containers for
  163. # different types of tags.
  164. class PString(NavigableString):
  165. pass
  166. class BString(NavigableString):
  167. pass
  168. soup = self.soup(
  169. "<div>Hello.<p>Here is <b>some <i>bolded</i></b> text",
  170. string_containers = {
  171. 'b': BString,
  172. 'p': PString,
  173. }
  174. )
  175. # The string before the <p> tag is a regular NavigableString.
  176. assert isinstance(soup.div.contents[0], NavigableString)
  177. # The string inside the <p> tag, but not inside the <i> tag,
  178. # is a PString.
  179. assert isinstance(soup.p.contents[0], PString)
  180. # Every string inside the <b> tag is a BString, even the one that
  181. # was also inside an <i> tag.
  182. for s in soup.b.strings:
  183. assert isinstance(s, BString)
  184. # Now that parsing was complete, the string_container_stack
  185. # (where this information was kept) has been cleared out.
  186. assert [] == soup.string_container_stack
  187. class TestWarnings(SoupTest):
  188. def _assert_warning(self, warnings, cls):
  189. for w in warnings:
  190. if isinstance(w.message, cls):
  191. return w
  192. raise Exception("%s warning not found in %r" % (cls, warnings))
  193. def _assert_no_parser_specified(self, w):
  194. warning = self._assert_warning(w, GuessedAtParserWarning)
  195. message = str(warning.message)
  196. assert message.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:60])
  197. def test_warning_if_no_parser_specified(self):
  198. with warnings.catch_warnings(record=True) as w:
  199. soup = BeautifulSoup("<a><b></b></a>")
  200. self._assert_no_parser_specified(w)
  201. def test_warning_if_parser_specified_too_vague(self):
  202. with warnings.catch_warnings(record=True) as w:
  203. soup = BeautifulSoup("<a><b></b></a>", "html")
  204. self._assert_no_parser_specified(w)
  205. def test_no_warning_if_explicit_parser_specified(self):
  206. with warnings.catch_warnings(record=True) as w:
  207. soup = BeautifulSoup("<a><b></b></a>", "html.parser")
  208. assert [] == w
  209. def test_parseOnlyThese_renamed_to_parse_only(self):
  210. with warnings.catch_warnings(record=True) as w:
  211. soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
  212. msg = str(w[0].message)
  213. assert "parseOnlyThese" in msg
  214. assert "parse_only" in msg
  215. assert b"<b></b>" == soup.encode()
  216. def test_fromEncoding_renamed_to_from_encoding(self):
  217. with warnings.catch_warnings(record=True) as w:
  218. utf8 = b"\xc3\xa9"
  219. soup = self.soup(utf8, fromEncoding="utf8")
  220. msg = str(w[0].message)
  221. assert "fromEncoding" in msg
  222. assert "from_encoding" in msg
  223. assert "utf8" == soup.original_encoding
  224. def test_unrecognized_keyword_argument(self):
  225. with pytest.raises(TypeError):
  226. self.soup("<a>", no_such_argument=True)
  227. @pytest.mark.parametrize(
  228. "extension",
  229. ['markup.html', 'markup.htm', 'markup.HTML', 'markup.txt',
  230. 'markup.xhtml', 'markup.xml', "/home/user/file", "c:\\user\file"]
  231. )
  232. def test_resembles_filename_warning(self, extension):
  233. # A warning is issued if the "markup" looks like the name of
  234. # an HTML or text file, or a full path to a file on disk.
  235. with warnings.catch_warnings(record=True) as w:
  236. soup = self.soup("markup" + extension)
  237. warning = self._assert_warning(w, MarkupResemblesLocatorWarning)
  238. assert "looks more like a filename" in str(warning.message)
  239. @pytest.mark.parametrize(
  240. "extension",
  241. ['markuphtml', 'markup.com', '', 'markup.js']
  242. )
  243. def test_resembles_filename_no_warning(self, extension):
  244. # The 'looks more like a filename' warning is not issued if
  245. # the markup looks like a bare string, a domain name, or a
  246. # file that's not an HTML file.
  247. with warnings.catch_warnings(record=True) as w:
  248. soup = self.soup("markup" + extension)
  249. assert [] == w
  250. def test_url_warning_with_bytes_url(self):
  251. url = b"http://www.crummybytes.com/"
  252. with warnings.catch_warnings(record=True) as warning_list:
  253. soup = self.soup(url)
  254. warning = self._assert_warning(
  255. warning_list, MarkupResemblesLocatorWarning
  256. )
  257. assert "looks more like a URL" in str(warning.message)
  258. assert url not in str(warning.message).encode("utf8")
  259. def test_url_warning_with_unicode_url(self):
  260. url = "http://www.crummyunicode.com/"
  261. with warnings.catch_warnings(record=True) as warning_list:
  262. # note - this url must differ from the bytes one otherwise
  263. # python's warnings system swallows the second warning
  264. soup = self.soup(url)
  265. warning = self._assert_warning(
  266. warning_list, MarkupResemblesLocatorWarning
  267. )
  268. assert "looks more like a URL" in str(warning.message)
  269. assert url not in str(warning.message)
  270. def test_url_warning_with_bytes_and_space(self):
  271. # Here the markup contains something besides a URL, so no warning
  272. # is issued.
  273. with warnings.catch_warnings(record=True) as warning_list:
  274. soup = self.soup(b"http://www.crummybytes.com/ is great")
  275. assert not any("looks more like a URL" in str(w.message)
  276. for w in warning_list)
  277. def test_url_warning_with_unicode_and_space(self):
  278. with warnings.catch_warnings(record=True) as warning_list:
  279. soup = self.soup("http://www.crummyunicode.com/ is great")
  280. assert not any("looks more like a URL" in str(w.message)
  281. for w in warning_list)
  282. class TestSelectiveParsing(SoupTest):
  283. def test_parse_with_soupstrainer(self):
  284. markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
  285. strainer = SoupStrainer("b")
  286. soup = self.soup(markup, parse_only=strainer)
  287. assert soup.encode() == b"<b>Yes</b><b>Yes <c>Yes</c></b>"
  288. class TestNewTag(SoupTest):
  289. """Test the BeautifulSoup.new_tag() method."""
  290. def test_new_tag(self):
  291. soup = self.soup("")
  292. new_tag = soup.new_tag("foo", bar="baz", attrs={"name": "a name"})
  293. assert isinstance(new_tag, Tag)
  294. assert "foo" == new_tag.name
  295. assert dict(bar="baz", name="a name") == new_tag.attrs
  296. assert None == new_tag.parent
  297. def test_tag_inherits_self_closing_rules_from_builder(self):
  298. if LXML_PRESENT:
  299. xml_soup = BeautifulSoup("", "lxml-xml")
  300. xml_br = xml_soup.new_tag("br")
  301. xml_p = xml_soup.new_tag("p")
  302. # Both the <br> and <p> tag are empty-element, just because
  303. # they have no contents.
  304. assert b"<br/>" == xml_br.encode()
  305. assert b"<p/>" == xml_p.encode()
  306. html_soup = BeautifulSoup("", "html.parser")
  307. html_br = html_soup.new_tag("br")
  308. html_p = html_soup.new_tag("p")
  309. # The HTML builder users HTML's rules about which tags are
  310. # empty-element tags, and the new tags reflect these rules.
  311. assert b"<br/>" == html_br.encode()
  312. assert b"<p></p>" == html_p.encode()
  313. class TestNewString(SoupTest):
  314. """Test the BeautifulSoup.new_string() method."""
  315. def test_new_string_creates_navigablestring(self):
  316. soup = self.soup("")
  317. s = soup.new_string("foo")
  318. assert "foo" == s
  319. assert isinstance(s, NavigableString)
  320. def test_new_string_can_create_navigablestring_subclass(self):
  321. soup = self.soup("")
  322. s = soup.new_string("foo", Comment)
  323. assert "foo" == s
  324. assert isinstance(s, Comment)
  325. class TestPickle(SoupTest):
  326. # Test our ability to pickle the BeautifulSoup object itself.
  327. def test_normal_pickle(self):
  328. soup = self.soup("<a>some markup</a>")
  329. pickled = pickle.dumps(soup)
  330. unpickled = pickle.loads(pickled)
  331. assert "some markup" == unpickled.a.string
  332. def test_pickle_with_no_builder(self):
  333. # We had a bug that prevented pickling from working if
  334. # the builder wasn't set.
  335. soup = self.soup("some markup")
  336. soup.builder = None
  337. pickled = pickle.dumps(soup)
  338. unpickled = pickle.loads(pickled)
  339. assert "some markup" == unpickled.string
  340. class TestEncodingConversion(SoupTest):
  341. # Test Beautiful Soup's ability to decode and encode from various
  342. # encodings.
  343. def setup_method(self):
  344. self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
  345. self.utf8_data = self.unicode_data.encode("utf-8")
  346. # Just so you know what it looks like.
  347. assert self.utf8_data == b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>'
  348. def test_ascii_in_unicode_out(self):
  349. # ASCII input is converted to Unicode. The original_encoding
  350. # attribute is set to 'utf-8', a superset of ASCII.
  351. chardet = dammit.chardet_dammit
  352. logging.disable(logging.WARNING)
  353. try:
  354. def noop(str):
  355. return None
  356. # Disable chardet, which will realize that the ASCII is ASCII.
  357. dammit.chardet_dammit = noop
  358. ascii = b"<foo>a</foo>"
  359. soup_from_ascii = self.soup(ascii)
  360. unicode_output = soup_from_ascii.decode()
  361. assert isinstance(unicode_output, str)
  362. assert unicode_output == self.document_for(ascii.decode())
  363. assert soup_from_ascii.original_encoding.lower() == "utf-8"
  364. finally:
  365. logging.disable(logging.NOTSET)
  366. dammit.chardet_dammit = chardet
  367. def test_unicode_in_unicode_out(self):
  368. # Unicode input is left alone. The original_encoding attribute
  369. # is not set.
  370. soup_from_unicode = self.soup(self.unicode_data)
  371. assert soup_from_unicode.decode() == self.unicode_data
  372. assert soup_from_unicode.foo.string == 'Sacr\xe9 bleu!'
  373. assert soup_from_unicode.original_encoding == None
  374. def test_utf8_in_unicode_out(self):
  375. # UTF-8 input is converted to Unicode. The original_encoding
  376. # attribute is set.
  377. soup_from_utf8 = self.soup(self.utf8_data)
  378. assert soup_from_utf8.decode() == self.unicode_data
  379. assert soup_from_utf8.foo.string == 'Sacr\xe9 bleu!'
  380. def test_utf8_out(self):
  381. # The internal data structures can be encoded as UTF-8.
  382. soup_from_unicode = self.soup(self.unicode_data)
  383. assert soup_from_unicode.encode('utf-8') == self.utf8_data
  384. @skipIf(
  385. PYTHON_3_PRE_3_2,
  386. "Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
  387. def test_attribute_name_containing_unicode_characters(self):
  388. markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
  389. assert self.soup(markup).div.encode("utf8") == markup.encode("utf8")