# test_tag.py (8.8 KB)
  1. import warnings
  2. from bs4.element import (
  3. Comment,
  4. NavigableString,
  5. )
  6. from . import SoupTest
  7. class TestTag(SoupTest):
  8. """Test various methods of Tag which aren't so complicated they
  9. need their own classes.
  10. """
  11. def test__should_pretty_print(self):
  12. # Test the rules about when a tag should be pretty-printed.
  13. tag = self.soup("").new_tag("a_tag")
  14. # No list of whitespace-preserving tags -> pretty-print
  15. tag._preserve_whitespace_tags = None
  16. assert True == tag._should_pretty_print(0)
  17. # List exists but tag is not on the list -> pretty-print
  18. tag.preserve_whitespace_tags = ["some_other_tag"]
  19. assert True == tag._should_pretty_print(1)
  20. # Indent level is None -> don't pretty-print
  21. assert False == tag._should_pretty_print(None)
  22. # Tag is on the whitespace-preserving list -> don't pretty-print
  23. tag.preserve_whitespace_tags = ["some_other_tag", "a_tag"]
  24. assert False == tag._should_pretty_print(1)
  25. def test_len(self):
  26. """The length of a Tag is its number of children."""
  27. soup = self.soup("<top>1<b>2</b>3</top>")
  28. # The BeautifulSoup object itself contains one element: the
  29. # <top> tag.
  30. assert len(soup.contents) == 1
  31. assert len(soup) == 1
  32. # The <top> tag contains three elements: the text node "1", the
  33. # <b> tag, and the text node "3".
  34. assert len(soup.top) == 3
  35. assert len(soup.top.contents) == 3
  36. def test_member_access_invokes_find(self):
  37. """Accessing a Python member .foo invokes find('foo')"""
  38. soup = self.soup('<b><i></i></b>')
  39. assert soup.b == soup.find('b')
  40. assert soup.b.i == soup.find('b').find('i')
  41. assert soup.a == None
  42. def test_deprecated_member_access(self):
  43. soup = self.soup('<b><i></i></b>')
  44. with warnings.catch_warnings(record=True) as w:
  45. tag = soup.bTag
  46. assert soup.b == tag
  47. assert '.bTag is deprecated, use .find("b") instead. If you really were looking for a tag called bTag, use .find("bTag")' == str(w[0].message)
  48. def test_has_attr(self):
  49. """has_attr() checks for the presence of an attribute.
  50. Please note note: has_attr() is different from
  51. __in__. has_attr() checks the tag's attributes and __in__
  52. checks the tag's chidlren.
  53. """
  54. soup = self.soup("<foo attr='bar'>")
  55. assert soup.foo.has_attr('attr')
  56. assert not soup.foo.has_attr('attr2')
  57. def test_attributes_come_out_in_alphabetical_order(self):
  58. markup = '<b a="1" z="5" m="3" f="2" y="4"></b>'
  59. self.assertSoupEquals(markup, '<b a="1" f="2" m="3" y="4" z="5"></b>')
  60. def test_string(self):
  61. # A Tag that contains only a text node makes that node
  62. # available as .string.
  63. soup = self.soup("<b>foo</b>")
  64. assert soup.b.string == 'foo'
  65. def test_empty_tag_has_no_string(self):
  66. # A Tag with no children has no .stirng.
  67. soup = self.soup("<b></b>")
  68. assert soup.b.string == None
  69. def test_tag_with_multiple_children_has_no_string(self):
  70. # A Tag with no children has no .string.
  71. soup = self.soup("<a>foo<b></b><b></b></b>")
  72. assert soup.b.string == None
  73. soup = self.soup("<a>foo<b></b>bar</b>")
  74. assert soup.b.string == None
  75. # Even if all the children are strings, due to trickery,
  76. # it won't work--but this would be a good optimization.
  77. soup = self.soup("<a>foo</b>")
  78. soup.a.insert(1, "bar")
  79. assert soup.a.string == None
  80. def test_tag_with_recursive_string_has_string(self):
  81. # A Tag with a single child which has a .string inherits that
  82. # .string.
  83. soup = self.soup("<a><b>foo</b></a>")
  84. assert soup.a.string == "foo"
  85. assert soup.string == "foo"
  86. def test_lack_of_string(self):
  87. """Only a Tag containing a single text node has a .string."""
  88. soup = self.soup("<b>f<i>e</i>o</b>")
  89. assert soup.b.string is None
  90. soup = self.soup("<b></b>")
  91. assert soup.b.string is None
  92. def test_all_text(self):
  93. """Tag.text and Tag.get_text(sep=u"") -> all child text, concatenated"""
  94. soup = self.soup("<a>a<b>r</b> <r> t </r></a>")
  95. assert soup.a.text == "ar t "
  96. assert soup.a.get_text(strip=True) == "art"
  97. assert soup.a.get_text(",") == "a,r, , t "
  98. assert soup.a.get_text(",", strip=True) == "a,r,t"
  99. def test_get_text_ignores_special_string_containers(self):
  100. soup = self.soup("foo<!--IGNORE-->bar")
  101. assert soup.get_text() == "foobar"
  102. assert soup.get_text(types=(NavigableString, Comment)) == "fooIGNOREbar"
  103. assert soup.get_text(types=None) == "fooIGNOREbar"
  104. soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
  105. assert soup.get_text() == "foobar"
  106. def test_all_strings_ignores_special_string_containers(self):
  107. soup = self.soup("foo<!--IGNORE-->bar")
  108. assert ['foo', 'bar'] == list(soup.strings)
  109. soup = self.soup("foo<style>CSS</style><script>Javascript</script>bar")
  110. assert ['foo', 'bar'] == list(soup.strings)
  111. def test_string_methods_inside_special_string_container_tags(self):
  112. # Strings inside tags like <script> are generally ignored by
  113. # methods like get_text, because they're not what humans
  114. # consider 'text'. But if you call get_text on the <script>
  115. # tag itself, those strings _are_ considered to be 'text',
  116. # because there's nothing else you might be looking for.
  117. style = self.soup("<div>a<style>Some CSS</style></div>")
  118. template = self.soup("<div>a<template><p>Templated <b>text</b>.</p><!--With a comment.--></template></div>")
  119. script = self.soup("<div>a<script><!--a comment-->Some text</script></div>")
  120. assert style.div.get_text() == "a"
  121. assert list(style.div.strings) == ["a"]
  122. assert style.div.style.get_text() == "Some CSS"
  123. assert list(style.div.style.strings) == ['Some CSS']
  124. # The comment is not picked up here. That's because it was
  125. # parsed into a Comment object, which is not considered
  126. # interesting by template.strings.
  127. assert template.div.get_text() == "a"
  128. assert list(template.div.strings) == ["a"]
  129. assert template.div.template.get_text() == "Templated text."
  130. assert list(template.div.template.strings) == ["Templated ", "text", "."]
  131. # The comment is included here, because it didn't get parsed
  132. # into a Comment object--it's part of the Script string.
  133. assert script.div.get_text() == "a"
  134. assert list(script.div.strings) == ["a"]
  135. assert script.div.script.get_text() == "<!--a comment-->Some text"
  136. assert list(script.div.script.strings) == ['<!--a comment-->Some text']
  137. class TestMultiValuedAttributes(SoupTest):
  138. """Test the behavior of multi-valued attributes like 'class'.
  139. The values of such attributes are always presented as lists.
  140. """
  141. def test_single_value_becomes_list(self):
  142. soup = self.soup("<a class='foo'>")
  143. assert ["foo"] ==soup.a['class']
  144. def test_multiple_values_becomes_list(self):
  145. soup = self.soup("<a class='foo bar'>")
  146. assert ["foo", "bar"] == soup.a['class']
  147. def test_multiple_values_separated_by_weird_whitespace(self):
  148. soup = self.soup("<a class='foo\tbar\nbaz'>")
  149. assert ["foo", "bar", "baz"] ==soup.a['class']
  150. def test_attributes_joined_into_string_on_output(self):
  151. soup = self.soup("<a class='foo\tbar'>")
  152. assert b'<a class="foo bar"></a>' == soup.a.encode()
  153. def test_get_attribute_list(self):
  154. soup = self.soup("<a id='abc def'>")
  155. assert ['abc def'] == soup.a.get_attribute_list('id')
  156. def test_accept_charset(self):
  157. soup = self.soup('<form accept-charset="ISO-8859-1 UTF-8">')
  158. assert ['ISO-8859-1', 'UTF-8'] == soup.form['accept-charset']
  159. def test_cdata_attribute_applying_only_to_one_tag(self):
  160. data = '<a accept-charset="ISO-8859-1 UTF-8"></a>'
  161. soup = self.soup(data)
  162. # We saw in another test that accept-charset is a cdata-list
  163. # attribute for the <form> tag. But it's not a cdata-list
  164. # attribute for any other tag.
  165. assert 'ISO-8859-1 UTF-8' == soup.a['accept-charset']
  166. def test_customization(self):
  167. # It's possible to change which attributes of which tags
  168. # are treated as multi-valued attributes.
  169. #
  170. # Here, 'id' is a multi-valued attribute and 'class' is not.
  171. #
  172. # TODO: This code is in the builder and should be tested there.
  173. soup = self.soup(
  174. '<a class="foo" id="bar">', multi_valued_attributes={ '*' : 'id' }
  175. )
  176. assert soup.a['class'] == 'foo'
  177. assert soup.a['id'] == ['bar']