123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144 |
- import pytest
- from bs4.element import (
- CData,
- Comment,
- Declaration,
- Doctype,
- NavigableString,
- RubyParenthesisString,
- RubyTextString,
- Script,
- Stylesheet,
- TemplateString,
- )
- from . import SoupTest
class TestNavigableString(SoupTest):
    """Tests for NavigableString's text-acquisition methods and its
    read-only ``name`` property.
    """

    def test_text_acquisition_methods(self):
        """get_text(), strings, stripped_strings and _all_strings() work
        on NavigableString, CData and Comment objects.
        """
        # These methods are intended for use against Tag, but they
        # work on NavigableString as well.

        s = NavigableString("fee ")
        cdata = CData("fie ")
        comment = Comment("foe ")

        assert "fee " == s.get_text()
        assert "fee" == s.get_text(strip=True)
        assert ["fee "] == list(s.strings)
        assert ["fee"] == list(s.stripped_strings)
        assert ["fee "] == list(s._all_strings())

        assert "fie " == cdata.get_text()
        assert "fie" == cdata.get_text(strip=True)
        assert ["fie "] == list(cdata.strings)
        assert ["fie"] == list(cdata.stripped_strings)
        assert ["fie "] == list(cdata._all_strings())

        # Since a Comment isn't normally considered 'text',
        # these methods generally do nothing.
        assert "" == comment.get_text()
        assert [] == list(comment.strings)
        assert [] == list(comment.stripped_strings)
        assert [] == list(comment._all_strings())

        # Unless you specifically say that comments are okay.
        assert "foe" == comment.get_text(strip=True, types=Comment)
        assert "foe " == comment.get_text(types=(Comment, NavigableString))

    def test_string_has_immutable_name_property(self):
        """A string's ``name`` is None and assigning to it raises
        AttributeError.
        """
        # string.name is defined as None and can't be modified
        string = self.soup("s").string
        # None is a singleton, so check identity rather than equality.
        assert string.name is None
        with pytest.raises(AttributeError):
            string.name = 'foo'
class TestNavigableStringSubclasses(SoupTest):
    """Tests covering the specialized subclasses of NavigableString."""

    def test_cdata(self):
        """A hand-built CData object can be inserted, rendered and found."""
        # No current builder converts a CDATA section into a CData
        # object, so one is constructed manually here.
        document = self.soup("")
        section = CData("foo")
        document.insert(1, section)

        rendered = str(document)
        assert rendered == "<![CDATA[foo]]>"

        match = document.find(string="foo")
        assert match == "foo"

        first_child = document.contents[0]
        assert first_child == "foo"

    def test_cdata_is_never_formatted(self):
        """The formatter is handed the text of a CData object,
        but whatever it returns is thrown away.
        """
        self.count = 0

        def increment(*args):
            # Record the call, then return text that must never show
            # up in the encoded output.
            self.count += 1
            return "BITTER FAILURE"

        document = self.soup("")
        section = CData("<><><>")
        document.insert(1, section)

        encoded = document.encode(formatter=increment)
        assert encoded == b"<![CDATA[<><><>]]>"
        assert self.count == 1

    def test_doctype_ends_in_newline(self):
        """An encoded DOCTYPE is followed by a newline."""
        # A DOCTYPE differs from the other NavigableString subclasses
        # in that its output always ends with a newline.
        declaration = Doctype("foo")
        document = self.soup("")
        document.insert(1, declaration)

        encoded = document.encode()
        assert encoded == b"<!DOCTYPE foo>\n"

    def test_declaration(self):
        """A Declaration's output is wrapped in ``<?`` and ``?>``."""
        declaration = Declaration("foo")
        output = declaration.output_ready()
        assert output == "<?foo?>"

    def test_default_string_containers(self):
        """The class used for a string depends on the tag containing it."""
        # Identical text ends up in a different NavigableString
        # subclass depending on which tag holds it.
        document = self.soup(
            "<div>text</div><script>text</script><style>text</style>"
        )
        string_classes = [
            node.__class__ for node in document.find_all(string=True)
        ]
        assert string_classes == [NavigableString, Script, Stylesheet]

        # TemplateString is a bit different: it usually shows up
        # _inside_ the children of a <template> element rather than
        # as a direct child of the <template> element.
        document = self.soup(
            "<template>Some text<p>In a tag</p></template>Some text outside"
        )
        for node in document.template._all_strings(types=None):
            assert isinstance(node, TemplateString)

        # After the <template> tag is closed, plain NavigableString
        # is used again.
        after_template = document.template.next_sibling
        assert isinstance(after_template, NavigableString)
        assert not isinstance(after_template, TemplateString)

        # A <template> may also hold NavigableString subclasses of
        # _other_ types, such as Comment.
        markup = b"<template>Some text<p>In a tag</p><!--with a comment--></template>"
        document = self.soup(markup)
        encoded_template = document.template.encode("utf8")
        assert encoded_template == markup

    def test_ruby_strings(self):
        """Strings in <rp> and <rt> tags get their own string classes."""
        markup = "<ruby>漢 <rp>(</rp><rt>kan</rt><rp>)</rp> 字 <rp>(</rp><rt>ji</rt><rp>)</rp></ruby>"
        document = self.soup(markup)

        assert isinstance(document.rp.string, RubyParenthesisString)
        assert isinstance(document.rt.string, RubyTextString)

        # As a demonstration, this is the effect on get_text usage.
        default_text = document.get_text(strip=True)
        assert default_text == "漢字"

        annotated_text = document.get_text(
            strip=True,
            types=(NavigableString, RubyTextString, RubyParenthesisString),
        )
        assert annotated_text == "漢(kan)字(ji)"
|