  1. """Tests of the bs4.element.PageElement class"""
  2. import copy
  3. import pickle
  4. import pytest
  5. from soupsieve import SelectorSyntaxError
  6. from bs4 import BeautifulSoup
  7. from bs4.element import (
  8. Comment,
  9. SoupStrainer,
  10. )
  11. from . import SoupTest
  12. class TestEncoding(SoupTest):
  13. """Test the ability to encode objects into strings."""

    def test_unicode_string_can_be_encoded(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert soup.b.string.encode("utf-8") == "\N{SNOWMAN}".encode("utf-8")

    def test_tag_containing_unicode_string_can_be_encoded(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert soup.b.encode("utf-8") == html.encode("utf-8")

    def test_encoding_substitutes_unrecognized_characters_by_default(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert soup.b.encode("ascii") == b"<b>&#9731;</b>"

    def test_encoding_can_be_made_strict(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        with pytest.raises(UnicodeEncodeError):
            soup.encode("ascii", errors="strict")

    def test_decode_contents(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert "\N{SNOWMAN}" == soup.b.decode_contents()

    def test_encode_contents(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert "\N{SNOWMAN}".encode("utf8") == soup.b.encode_contents(
            encoding="utf8"
        )

    def test_deprecated_renderContents(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert "\N{SNOWMAN}".encode("utf8") == soup.b.renderContents()

    def test_repr(self):
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        assert html == repr(soup)


class TestFormatters(SoupTest):
    """Test the formatting feature, used by methods like decode() and
    prettify(), and the formatters themselves.
    """

    def test_default_formatter_is_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        assert decoded == self.document_for(
            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        )

    def test_formatter_html(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html")
        assert decoded == self.document_for(
            "<br/><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
        )

    def test_formatter_html5(self):
        markup = "<br><b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="html5")
        assert decoded == self.document_for(
            "<br><b>&lt;&lt;Sacr&eacute; bleu!&gt;&gt;</b>"
        )

    def test_formatter_minimal(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter="minimal")
        # The < is converted back into &lt; but the e-with-acute is left alone.
        assert decoded == self.document_for(
            "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        )

    def test_formatter_null(self):
        markup = "<b>&lt;&lt;Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!&gt;&gt;</b>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=None)
        # Neither the angle brackets nor the e-with-acute are converted.
        # This is not valid HTML, but it's what the user wanted.
        assert decoded == self.document_for(
            "<b><<Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></b>"
        )

    def test_formatter_custom(self):
        markup = "<b>&lt;foo&gt;</b><b>bar</b><br/>"
        soup = self.soup(markup)
        decoded = soup.decode(formatter=lambda x: x.upper())
        # Instead of normal entity conversion code, the custom
        # callable is called on every string.
        assert decoded == self.document_for("<b><FOO></b><b>BAR</b><br/>")

    def test_formatter_is_run_on_attribute_values(self):
        markup = '<a href="http://a.com?a=b&c=é">e</a>'
        soup = self.soup(markup)
        a = soup.a

        expect_minimal = '<a href="http://a.com?a=b&amp;c=é">e</a>'
        assert expect_minimal == a.decode()
        assert expect_minimal == a.decode(formatter="minimal")

        expect_html = '<a href="http://a.com?a=b&amp;c=&eacute;">e</a>'
        assert expect_html == a.decode(formatter="html")

        assert markup == a.decode(formatter=None)

        expect_upper = '<a href="HTTP://A.COM?A=B&C=É">E</a>'
        assert expect_upper == a.decode(formatter=lambda x: x.upper())

    def test_formatter_skips_script_tag_for_html_documents(self):
        doc = """
<script type="text/javascript">
console.log("< < hey > > ");
</script>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        assert b"< < hey > >" in encoded

    def test_formatter_skips_style_tag_for_html_documents(self):
        doc = """
<style type="text/css">
console.log("< < hey > > ");
</style>
"""
        encoded = BeautifulSoup(doc, 'html.parser').encode()
        assert b"< < hey > >" in encoded

    def test_prettify_leaves_preformatted_text_alone(self):
        soup = self.soup("<div> foo <pre> \tbar\n \n </pre> baz <textarea> eee\nfff\t</textarea></div>")
        # Everything outside the <pre> tag is reformatted, but everything
        # inside is left alone.
        assert '<div>\n foo\n <pre> \tbar\n \n </pre>\n baz\n <textarea> eee\nfff\t</textarea>\n</div>' == soup.div.prettify()

    def test_prettify_accepts_formatter_function(self):
        soup = BeautifulSoup("<html><body>foo</body></html>", 'html.parser')
        pretty = soup.prettify(formatter=lambda x: x.upper())
        assert "FOO" in pretty

    def test_prettify_outputs_unicode_by_default(self):
        soup = self.soup("<a></a>")
        assert str == type(soup.prettify())

    def test_prettify_can_encode_data(self):
        soup = self.soup("<a></a>")
        assert bytes == type(soup.prettify("utf-8"))
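
    # A minimal sketch (not collected by pytest; the helper name is
    # illustrative only), mirroring the tests above: prettify() writes one tag
    # or string per line, indenting one space per nesting level, and returns
    # bytes when an encoding is given.
    def _prettify_sketch(self):
        soup = BeautifulSoup("<a><b>text</b></a>", "html.parser")
        assert "<a>\n <b>\n  text\n" in soup.prettify()
        assert isinstance(soup.prettify("utf-8"), bytes)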

    def test_html_entity_substitution_off_by_default(self):
        markup = "<b>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</b>"
        soup = self.soup(markup)
        encoded = soup.b.encode("utf-8")
        assert encoded == markup.encode('utf-8')

    def test_encoding_substitution(self):
        # Here's the <meta> tag saying that a document is
        # encoded in Shift-JIS.
        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type"/>')
        soup = self.soup(meta_tag)

        # Parse the document, and the charset appears unchanged.
        assert soup.meta['content'] == 'text/html; charset=x-sjis'

        # Encode the document into some encoding, and the encoding is
        # substituted into the meta tag.
        utf_8 = soup.encode("utf-8")
        assert b"charset=utf-8" in utf_8

        euc_jp = soup.encode("euc_jp")
        assert b"charset=euc_jp" in euc_jp

        shift_jis = soup.encode("shift-jis")
        assert b"charset=shift-jis" in shift_jis

        utf_16_u = soup.encode("utf-16").decode("utf-16")
        assert "charset=utf-16" in utf_16_u

    def test_encoding_substitution_doesnt_happen_if_tag_is_strained(self):
        markup = ('<head><meta content="text/html; charset=x-sjis" '
                  'http-equiv="Content-type"/></head><pre>foo</pre>')
        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = self.soup(markup, parse_only=strainer)
        assert soup.contents[0].name == 'pre'


class TestCSSSelectors(SoupTest):
    """Test basic CSS selector functionality.

    This functionality is implemented in soupsieve, which has a much
    more comprehensive test suite, so this is basically an extra check
    that soupsieve works as expected.
    """

    HTML = """
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
"http://www.w3.org/TR/html4/strict.dtd">
<html>
<head>
<title>The title</title>
<link rel="stylesheet" href="blah.css" type="text/css" id="l1">
</head>
<body>
<custom-dashed-tag class="dashed" id="dash1">Hello there.</custom-dashed-tag>
<div id="main" class="fancy">
<div id="inner">
<h1 id="header1">An H1</h1>
<p>Some text</p>
<p class="onep" id="p1">Some more text</p>
<h2 id="header2">An H2</h2>
<p class="class1 class2 class3" id="pmulti">Another</p>
<a href="http://bob.example.org/" rel="friend met" id="bob">Bob</a>
<h2 id="header3">Another H2</h2>
<a id="me" href="http://simonwillison.net/" rel="me">me</a>
<span class="s1">
<a href="#" id="s1a1">span1a1</a>
<a href="#" id="s1a2">span1a2 <span id="s1a2s1">test</span></a>
<span class="span2">
<a href="#" id="s2a1">span2a1</a>
</span>
<span class="span3"></span>
<custom-dashed-tag class="dashed" id="dash2"/>
<div data-tag="dashedvalue" id="data1"/>
</span>
</div>
<x id="xid">
<z id="zida"/>
<z id="zidab"/>
<z id="zidac"/>
</x>
<y id="yid">
<z id="zidb"/>
</y>
<p lang="en" id="lang-en">English</p>
<p lang="en-gb" id="lang-en-gb">English UK</p>
<p lang="en-us" id="lang-en-us">English US</p>
<p lang="fr" id="lang-fr">French</p>
</div>
<div id="footer">
</div>
"""

    def setup_method(self):
        self.soup = BeautifulSoup(self.HTML, 'html.parser')

    def assert_selects(self, selector, expected_ids, **kwargs):
        el_ids = [el['id'] for el in self.soup.select(selector, **kwargs)]
        el_ids.sort()
        expected_ids.sort()
        assert expected_ids == el_ids, "Selector %s, expected [%s], got [%s]" % (
            selector, ', '.join(expected_ids), ', '.join(el_ids)
        )

    assertSelect = assert_selects

    def assert_select_multiple(self, *tests):
        for selector, expected_ids in tests:
            self.assert_selects(selector, expected_ids)

    def test_one_tag_one(self):
        els = self.soup.select('title')
        assert len(els) == 1
        assert els[0].name == 'title'
        assert els[0].contents == ['The title']

    def test_one_tag_many(self):
        els = self.soup.select('div')
        assert len(els) == 4
        for div in els:
            assert div.name == 'div'

        el = self.soup.select_one('div')
        assert 'main' == el['id']

    def test_select_one_returns_none_if_no_match(self):
        match = self.soup.select_one('nonexistenttag')
        assert None == match

    def test_tag_in_tag_one(self):
        els = self.soup.select('div div')
        self.assert_selects('div div', ['inner', 'data1'])

    def test_tag_in_tag_many(self):
        for selector in ('html div', 'html body div', 'body div'):
            self.assert_selects(selector, ['data1', 'main', 'inner', 'footer'])

    def test_limit(self):
        self.assert_selects('html div', ['main'], limit=1)
        self.assert_selects('html body div', ['inner', 'main'], limit=2)
        self.assert_selects('body div', ['data1', 'main', 'inner', 'footer'],
                            limit=10)

    def test_tag_no_match(self):
        assert len(self.soup.select('del')) == 0

    def test_invalid_tag(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('tag%t')

    def test_select_dashed_tag_ids(self):
        self.assert_selects('custom-dashed-tag', ['dash1', 'dash2'])

    def test_select_dashed_by_id(self):
        dashed = self.soup.select('custom-dashed-tag[id=\"dash2\"]')
        assert dashed[0].name == 'custom-dashed-tag'
        assert dashed[0]['id'] == 'dash2'

    def test_dashed_tag_text(self):
        assert self.soup.select('body > custom-dashed-tag')[0].text == 'Hello there.'

    def test_select_dashed_matches_find_all(self):
        assert self.soup.select('custom-dashed-tag') == self.soup.find_all('custom-dashed-tag')

    def test_header_tags(self):
        self.assert_select_multiple(
            ('h1', ['header1']),
            ('h2', ['header2', 'header3']),
        )

    def test_class_one(self):
        for selector in ('.onep', 'p.onep', 'html p.onep'):
            els = self.soup.select(selector)
            assert len(els) == 1
            assert els[0].name == 'p'
            assert els[0]['class'] == ['onep']

    def test_class_mismatched_tag(self):
        els = self.soup.select('div.onep')
        assert len(els) == 0

    def test_one_id(self):
        for selector in ('div#inner', '#inner', 'div div#inner'):
            self.assert_selects(selector, ['inner'])

    def test_bad_id(self):
        els = self.soup.select('#doesnotexist')
        assert len(els) == 0

    def test_items_in_id(self):
        els = self.soup.select('div#inner p')
        assert len(els) == 3
        for el in els:
            assert el.name == 'p'
        assert els[1]['class'] == ['onep']
        assert not els[0].has_attr('class')

    def test_a_bunch_of_emptys(self):
        for selector in ('div#main del', 'div#main div.oops', 'div div#main'):
            assert len(self.soup.select(selector)) == 0

    def test_multi_class_support(self):
        for selector in ('.class1', 'p.class1', '.class2', 'p.class2',
                         '.class3', 'p.class3', 'html p.class2', 'div#inner .class2'):
            self.assert_selects(selector, ['pmulti'])

    def test_multi_class_selection(self):
        for selector in ('.class1.class3', '.class3.class2',
                         '.class1.class2.class3'):
            self.assert_selects(selector, ['pmulti'])

    def test_child_selector(self):
        self.assert_selects('.s1 > a', ['s1a1', 's1a2'])
        self.assert_selects('.s1 > a span', ['s1a2s1'])

    def test_child_selector_id(self):
        self.assert_selects('.s1 > a#s1a2 span', ['s1a2s1'])

    def test_attribute_equals(self):
        self.assert_select_multiple(
            ('p[class="onep"]', ['p1']),
            ('p[id="p1"]', ['p1']),
            ('[class="onep"]', ['p1']),
            ('[id="p1"]', ['p1']),
            ('link[rel="stylesheet"]', ['l1']),
            ('link[type="text/css"]', ['l1']),
            ('link[href="blah.css"]', ['l1']),
            ('link[href="no-blah.css"]', []),
            ('[rel="stylesheet"]', ['l1']),
            ('[type="text/css"]', ['l1']),
            ('[href="blah.css"]', ['l1']),
            ('[href="no-blah.css"]', []),
            ('p[href="no-blah.css"]', []),
            ('[href="no-blah.css"]', []),
        )

    def test_attribute_tilde(self):
        self.assert_select_multiple(
            ('p[class~="class1"]', ['pmulti']),
            ('p[class~="class2"]', ['pmulti']),
            ('p[class~="class3"]', ['pmulti']),
            ('[class~="class1"]', ['pmulti']),
            ('[class~="class2"]', ['pmulti']),
            ('[class~="class3"]', ['pmulti']),
            ('a[rel~="friend"]', ['bob']),
            ('a[rel~="met"]', ['bob']),
            ('[rel~="friend"]', ['bob']),
            ('[rel~="met"]', ['bob']),
        )

    def test_attribute_startswith(self):
        self.assert_select_multiple(
            ('[rel^="style"]', ['l1']),
            ('link[rel^="style"]', ['l1']),
            ('notlink[rel^="notstyle"]', []),
            ('[rel^="notstyle"]', []),
            ('link[rel^="notstyle"]', []),
            ('link[href^="bla"]', ['l1']),
            ('a[href^="http://"]', ['bob', 'me']),
            ('[href^="http://"]', ['bob', 'me']),
            ('[id^="p"]', ['pmulti', 'p1']),
            ('[id^="m"]', ['me', 'main']),
            ('div[id^="m"]', ['main']),
            ('a[id^="m"]', ['me']),
            ('div[data-tag^="dashed"]', ['data1'])
        )

    def test_attribute_endswith(self):
        self.assert_select_multiple(
            ('[href$=".css"]', ['l1']),
            ('link[href$=".css"]', ['l1']),
            ('link[id$="1"]', ['l1']),
            ('[id$="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's2a1', 's1a2s1', 'dash1']),
            ('div[id$="1"]', ['data1']),
            ('[id$="noending"]', []),
        )

    def test_attribute_contains(self):
        self.assert_select_multiple(
            # From test_attribute_startswith
            ('[rel*="style"]', ['l1']),
            ('link[rel*="style"]', ['l1']),
            ('notlink[rel*="notstyle"]', []),
            ('[rel*="notstyle"]', []),
            ('link[rel*="notstyle"]', []),
            ('link[href*="bla"]', ['l1']),
            ('[href*="http://"]', ['bob', 'me']),
            ('[id*="p"]', ['pmulti', 'p1']),
            ('div[id*="m"]', ['main']),
            ('a[id*="m"]', ['me']),
            # From test_attribute_endswith
            ('[href*=".css"]', ['l1']),
            ('link[href*=".css"]', ['l1']),
            ('link[id*="1"]', ['l1']),
            ('[id*="1"]', ['data1', 'l1', 'p1', 'header1', 's1a1', 's1a2', 's2a1', 's1a2s1', 'dash1']),
            ('div[id*="1"]', ['data1']),
            ('[id*="noending"]', []),
            # New for this test
            ('[href*="."]', ['bob', 'me', 'l1']),
            ('a[href*="."]', ['bob', 'me']),
            ('link[href*="."]', ['l1']),
            ('div[id*="n"]', ['main', 'inner']),
            ('div[id*="nn"]', ['inner']),
            ('div[data-tag*="edval"]', ['data1'])
        )

    def test_attribute_exact_or_hypen(self):
        self.assert_select_multiple(
            ('p[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('[lang|="en"]', ['lang-en', 'lang-en-gb', 'lang-en-us']),
            ('p[lang|="fr"]', ['lang-fr']),
            ('p[lang|="gb"]', []),
        )

    def test_attribute_exists(self):
        self.assert_select_multiple(
            ('[rel]', ['l1', 'bob', 'me']),
            ('link[rel]', ['l1']),
            ('a[rel]', ['bob', 'me']),
            ('[lang]', ['lang-en', 'lang-en-gb', 'lang-en-us', 'lang-fr']),
            ('p[class]', ['p1', 'pmulti']),
            ('[blah]', []),
            ('p[blah]', []),
            ('div[data-tag]', ['data1'])
        )

    def test_quoted_space_in_selector_name(self):
        html = """<div style="display: wrong">nope</div>
<div style="display: right">yes</div>
"""
        soup = BeautifulSoup(html, 'html.parser')
        [chosen] = soup.select('div[style="display: right"]')
        assert "yes" == chosen.string

    def test_unsupported_pseudoclass(self):
        with pytest.raises(NotImplementedError):
            self.soup.select("a:no-such-pseudoclass")

        with pytest.raises(SelectorSyntaxError):
            self.soup.select("a:nth-of-type(a)")

    def test_nth_of_type(self):
        # Try to select first paragraph
        els = self.soup.select('div#inner p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

        # Try to select third paragraph
        els = self.soup.select('div#inner p:nth-of-type(3)')
        assert len(els) == 1
        assert els[0].string == 'Another'

        # Try to select (non-existent!) fourth paragraph
        els = self.soup.select('div#inner p:nth-of-type(4)')
        assert len(els) == 0

        # Zero will select no tags.
        els = self.soup.select('div p:nth-of-type(0)')
        assert len(els) == 0

    def test_nth_of_type_direct_descendant(self):
        els = self.soup.select('div#inner > p:nth-of-type(1)')
        assert len(els) == 1
        assert els[0].string == 'Some text'

    def test_id_child_selector_nth_of_type(self):
        self.assert_selects('#inner > p:nth-of-type(2)', ['p1'])

    def test_select_on_element(self):
        # Other tests operate on the tree; this operates on an element
        # within the tree.
        inner = self.soup.find("div", id="main")
        selected = inner.select("div")
        # The <div id="inner"> tag was selected. The <div id="footer">
        # tag was not.
        self.assert_selects_ids(selected, ['inner', 'data1'])

    def test_overspecified_child_id(self):
        self.assert_selects(".fancy #inner", ['inner'])
        self.assert_selects(".normal #inner", [])

    def test_adjacent_sibling_selector(self):
        self.assert_selects('#p1 + h2', ['header2'])
        self.assert_selects('#p1 + h2 + p', ['pmulti'])
        self.assert_selects('#p1 + #header2 + .class1', ['pmulti'])
        assert [] == self.soup.select('#p1 + p')

    def test_general_sibling_selector(self):
        self.assert_selects('#p1 ~ h2', ['header2', 'header3'])
        self.assert_selects('#p1 ~ #header2', ['header2'])
        self.assert_selects('#p1 ~ h2 + a', ['me'])
        self.assert_selects('#p1 ~ h2 + [rel="me"]', ['me'])
        assert [] == self.soup.select('#inner ~ h2')

    def test_dangling_combinator(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('h1 >')

    def test_sibling_combinator_wont_select_same_tag_twice(self):
        self.assert_selects('p[lang] ~ p', ['lang-en-gb', 'lang-en-us', 'lang-fr'])

    # Test the selector grouping operator (the comma)
    def test_multiple_select(self):
        self.assert_selects('x, y', ['xid', 'yid'])

    def test_multiple_select_with_no_space(self):
        self.assert_selects('x,y', ['xid', 'yid'])

    def test_multiple_select_with_more_space(self):
        self.assert_selects('x,    y', ['xid', 'yid'])

    def test_multiple_select_duplicated(self):
        self.assert_selects('x, x', ['xid'])

    def test_multiple_select_sibling(self):
        self.assert_selects('x, y ~ p[lang=fr]', ['xid', 'lang-fr'])

    def test_multiple_select_tag_and_direct_descendant(self):
        self.assert_selects('x, y > z', ['xid', 'zidb'])

    def test_multiple_select_direct_descendant_and_tags(self):
        self.assert_selects('div > x, y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_multiple_select_indirect_descendant(self):
        self.assert_selects('div x,y, z', ['xid', 'yid', 'zida', 'zidb', 'zidab', 'zidac'])

    def test_invalid_multiple_select(self):
        with pytest.raises(SelectorSyntaxError):
            self.soup.select(',x, y')
        with pytest.raises(SelectorSyntaxError):
            self.soup.select('x,,y')

    def test_multiple_select_attrs(self):
        self.assert_selects('p[lang=en], p[lang=en-gb]', ['lang-en', 'lang-en-gb'])

    def test_multiple_select_ids(self):
        self.assert_selects('x, y > z[id=zida], z[id=zidab], z[id=zidb]', ['xid', 'zidb', 'zidab'])

    def test_multiple_select_nested(self):
        self.assert_selects('body > div > x, y > z', ['xid', 'zidb'])

    def test_select_duplicate_elements(self):
        # When markup contains duplicate elements, a multiple select
        # will find all of them.
        markup = '<div class="c1"/><div class="c2"/><div class="c1"/>'
        soup = BeautifulSoup(markup, 'html.parser')
        selected = soup.select(".c1, .c2")
        assert 3 == len(selected)

        # Verify that find_all finds the same elements, though because
        # of an implementation detail it finds them in a different
        # order.
        for element in soup.find_all(class_=['c1', 'c2']):
            assert element in selected


class TestPersistence(SoupTest):
    "Testing features like pickle and deepcopy."

    def setup_method(self):
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.tree = self.soup(self.page)

    def test_pickle_and_unpickle_identity(self):
        # Pickling a tree, then unpickling it, yields a tree identical
        # to the original.
        dumped = pickle.dumps(self.tree, 2)
        loaded = pickle.loads(dumped)
        assert loaded.__class__ == BeautifulSoup
        assert loaded.decode() == self.tree.decode()

    def test_deepcopy_identity(self):
        # Making a deepcopy of a tree yields an identical tree.
        copied = copy.deepcopy(self.tree)
        assert copied.decode() == self.tree.decode()

    def test_copy_preserves_encoding(self):
        soup = BeautifulSoup(b'<p>&nbsp;</p>', 'html.parser')
        encoding = soup.original_encoding
        copy = soup.__copy__()
        assert "<p>\N{NO-BREAK SPACE}</p>" == str(copy)
        assert encoding == copy.original_encoding

    def test_copy_preserves_builder_information(self):
        tag = self.soup('<p></p>').p

        # Simulate a tag obtained from a source file.
        tag.sourceline = 10
        tag.sourcepos = 33

        copied = tag.__copy__()

        # The TreeBuilder object is no longer available, but information
        # obtained from it gets copied over to the new Tag object.
        assert tag.sourceline == copied.sourceline
        assert tag.sourcepos == copied.sourcepos
        assert tag.can_be_empty_element == copied.can_be_empty_element
        assert tag.cdata_list_attributes == copied.cdata_list_attributes
        assert tag.preserve_whitespace_tags == copied.preserve_whitespace_tags

    def test_unicode_pickle(self):
        # A tree containing Unicode characters can be pickled.
        html = "<b>\N{SNOWMAN}</b>"
        soup = self.soup(html)
        dumped = pickle.dumps(soup, pickle.HIGHEST_PROTOCOL)
        loaded = pickle.loads(dumped)
        assert loaded.decode() == soup.decode()

    def test_copy_navigablestring_is_not_attached_to_tree(self):
        html = "<b>Foo<a></a></b><b>Bar</b>"
        soup = self.soup(html)
        s1 = soup.find(string="Foo")
        s2 = copy.copy(s1)
        assert s1 == s2
        assert None == s2.parent
        assert None == s2.next_element
        assert None != s1.next_sibling
        assert None == s2.next_sibling
        assert None == s2.previous_element

    def test_copy_navigablestring_subclass_has_same_type(self):
        html = "<b><!--Foo--></b>"
        soup = self.soup(html)
        s1 = soup.string
        s2 = copy.copy(s1)
        assert s1 == s2
        assert isinstance(s2, Comment)

    def test_copy_entire_soup(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        soup_copy = copy.copy(soup)
        assert soup == soup_copy

    def test_copy_tag_copies_contents(self):
        html = "<div><b>Foo<a></a></b><b>Bar</b></div>end"
        soup = self.soup(html)
        div = soup.div
        div_copy = copy.copy(div)

        # The two tags look the same, and evaluate to equal.
        assert str(div) == str(div_copy)
        assert div == div_copy

        # But they're not the same object.
        assert div is not div_copy

        # And they don't have the same relation to the parse tree. The
        # copy is not associated with a parse tree at all.
        assert None == div_copy.parent
        assert None == div_copy.previous_element
        assert None == div_copy.find(string='Bar').next_element
        assert None != div.find(string='Bar').next_element
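
    # A minimal sketch (not collected by pytest; the helper name is
    # illustrative only), mirroring the assertions above: copy.copy() of a Tag
    # yields an equal but detached object.
    def _detached_copy_sketch(self):
        soup = BeautifulSoup("<div><b>x</b></div>", "html.parser")
        clone = copy.copy(soup.div)
        assert clone == soup.div and clone is not soup.div
        assert clone.parent is None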