formatter.py 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. from bs4.dammit import EntitySubstitution
  2. class Formatter(EntitySubstitution):
  3. """Describes a strategy to use when outputting a parse tree to a string.
  4. Some parts of this strategy come from the distinction between
  5. HTML4, HTML5, and XML. Others are configurable by the user.
  6. Formatters are passed in as the `formatter` argument to methods
  7. like `PageElement.encode`. Most people won't need to think about
  8. formatters, and most people who need to think about them can pass
  9. in one of these predefined strings as `formatter` rather than
  10. making a new Formatter object:
  11. For HTML documents:
  12. * 'html' - HTML entity substitution for generic HTML documents. (default)
  13. * 'html5' - HTML entity substitution for HTML5 documents, as
  14. well as some optimizations in the way tags are rendered.
  15. * 'minimal' - Only make the substitutions necessary to guarantee
  16. valid HTML.
  17. * None - Do not perform any substitution. This will be faster
  18. but may result in invalid markup.
  19. For XML documents:
  20. * 'html' - Entity substitution for XHTML documents.
  21. * 'minimal' - Only make the substitutions necessary to guarantee
  22. valid XML. (default)
  23. * None - Do not perform any substitution. This will be faster
  24. but may result in invalid markup.
  25. """
  26. # Registries of XML and HTML formatters.
  27. XML_FORMATTERS = {}
  28. HTML_FORMATTERS = {}
  29. HTML = 'html'
  30. XML = 'xml'
  31. HTML_DEFAULTS = dict(
  32. cdata_containing_tags=set(["script", "style"]),
  33. )
  34. def _default(self, language, value, kwarg):
  35. if value is not None:
  36. return value
  37. if language == self.XML:
  38. return set()
  39. return self.HTML_DEFAULTS[kwarg]
  40. def __init__(
  41. self, language=None, entity_substitution=None,
  42. void_element_close_prefix='/', cdata_containing_tags=None,
  43. empty_attributes_are_booleans=False, indent=1,
  44. ):
  45. """Constructor.
  46. :param language: This should be Formatter.XML if you are formatting
  47. XML markup and Formatter.HTML if you are formatting HTML markup.
  48. :param entity_substitution: A function to call to replace special
  49. characters with XML/HTML entities. For examples, see
  50. bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
  51. :param void_element_close_prefix: By default, void elements
  52. are represented as <tag/> (XML rules) rather than <tag>
  53. (HTML rules). To get <tag>, pass in the empty string.
  54. :param cdata_containing_tags: The list of tags that are defined
  55. as containing CDATA in this dialect. For example, in HTML,
  56. <script> and <style> tags are defined as containing CDATA,
  57. and their contents should not be formatted.
  58. :param blank_attributes_are_booleans: Render attributes whose value
  59. is the empty string as HTML-style boolean attributes.
  60. (Attributes whose value is None are always rendered this way.)
  61. :param indent: If indent is a non-negative integer or string,
  62. then the contents of elements will be indented
  63. appropriately when pretty-printing. An indent level of 0,
  64. negative, or "" will only insert newlines. Using a
  65. positive integer indent indents that many spaces per
  66. level. If indent is a string (such as "\t"), that string
  67. is used to indent each level. The default behavior to
  68. indent one space per level.
  69. """
  70. self.language = language
  71. self.entity_substitution = entity_substitution
  72. self.void_element_close_prefix = void_element_close_prefix
  73. self.cdata_containing_tags = self._default(
  74. language, cdata_containing_tags, 'cdata_containing_tags'
  75. )
  76. self.empty_attributes_are_booleans=empty_attributes_are_booleans
  77. if indent is None:
  78. indent = 0
  79. if isinstance(indent, int):
  80. if indent < 0:
  81. indent = 0
  82. indent = ' ' * indent
  83. elif isinstance(indent, str):
  84. indent = indent
  85. else:
  86. indent = ' '
  87. self.indent = indent
  88. def substitute(self, ns):
  89. """Process a string that needs to undergo entity substitution.
  90. This may be a string encountered in an attribute value or as
  91. text.
  92. :param ns: A string.
  93. :return: A string with certain characters replaced by named
  94. or numeric entities.
  95. """
  96. if not self.entity_substitution:
  97. return ns
  98. from .element import NavigableString
  99. if (isinstance(ns, NavigableString)
  100. and ns.parent is not None
  101. and ns.parent.name in self.cdata_containing_tags):
  102. # Do nothing.
  103. return ns
  104. # Substitute.
  105. return self.entity_substitution(ns)
  106. def attribute_value(self, value):
  107. """Process the value of an attribute.
  108. :param ns: A string.
  109. :return: A string with certain characters replaced by named
  110. or numeric entities.
  111. """
  112. return self.substitute(value)
  113. def attributes(self, tag):
  114. """Reorder a tag's attributes however you want.
  115. By default, attributes are sorted alphabetically. This makes
  116. behavior consistent between Python 2 and Python 3, and preserves
  117. backwards compatibility with older versions of Beautiful Soup.
  118. If `empty_boolean_attributes` is True, then attributes whose
  119. values are set to the empty string will be treated as boolean
  120. attributes.
  121. """
  122. if tag.attrs is None:
  123. return []
  124. return sorted(
  125. (k, (None if self.empty_attributes_are_booleans and v == '' else v))
  126. for k, v in list(tag.attrs.items())
  127. )
  128. class HTMLFormatter(Formatter):
  129. """A generic Formatter for HTML."""
  130. REGISTRY = {}
  131. def __init__(self, *args, **kwargs):
  132. return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
  133. class XMLFormatter(Formatter):
  134. """A generic Formatter for XML."""
  135. REGISTRY = {}
  136. def __init__(self, *args, **kwargs):
  137. return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
  138. # Set up aliases for the default formatters.
  139. HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
  140. entity_substitution=EntitySubstitution.substitute_html
  141. )
  142. HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
  143. entity_substitution=EntitySubstitution.substitute_html,
  144. void_element_close_prefix=None,
  145. empty_attributes_are_booleans=True,
  146. )
  147. HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
  148. entity_substitution=EntitySubstitution.substitute_xml
  149. )
  150. HTMLFormatter.REGISTRY[None] = HTMLFormatter(
  151. entity_substitution=None
  152. )
  153. XMLFormatter.REGISTRY["html"] = XMLFormatter(
  154. entity_substitution=EntitySubstitution.substitute_html
  155. )
  156. XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
  157. entity_substitution=EntitySubstitution.substitute_xml
  158. )
  159. XMLFormatter.REGISTRY[None] = Formatter(
  160. Formatter(Formatter.XML, entity_substitution=None)
  161. )