__init__.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. # Use of this source code is governed by the MIT license.
  2. __license__ = "MIT"
  3. from collections import defaultdict
  4. import itertools
  5. import re
  6. import warnings
  7. import sys
  8. from bs4.element import (
  9. CharsetMetaAttributeValue,
  10. ContentMetaAttributeValue,
  11. RubyParenthesisString,
  12. RubyTextString,
  13. Stylesheet,
  14. Script,
  15. TemplateString,
  16. nonwhitespace_re
  17. )
# Public names of this module. register_treebuilders_from() appends to
# this list at import time, once for every TreeBuilder subclass it
# copies in from a parser-specific module.
__all__ = [
    'HTMLTreeBuilder',
    'SAXTreeBuilder',
    'TreeBuilder',
    'TreeBuilderRegistry',
]

# Some useful features for a TreeBuilder to have.
# NOTE(review): presumably these are the strings a TreeBuilder subclass
# lists in its `features` attribute and that are passed to
# TreeBuilderRegistry.lookup() -- confirm against the parser modules.
FAST = 'fast'
PERMISSIVE = 'permissive'
STRICT = 'strict'
XML = 'xml'
HTML = 'html'
HTML_5 = 'html5'
  31. class XMLParsedAsHTMLWarning(UserWarning):
  32. """The warning issued when an HTML parser is used to parse
  33. XML that is not XHTML.
  34. """
  35. MESSAGE = """It looks like you're parsing an XML document using an HTML parser. If this really is an HTML document (maybe it's XHTML?), you can ignore or filter this warning. If it's XML, you should know that using an XML parser will be more reliable. To parse this document as XML, make sure you have the lxml package installed, and pass the keyword argument `features="xml"` into the BeautifulSoup constructor."""
  36. class TreeBuilderRegistry(object):
  37. """A way of looking up TreeBuilder subclasses by their name or by desired
  38. features.
  39. """
  40. def __init__(self):
  41. self.builders_for_feature = defaultdict(list)
  42. self.builders = []
  43. def register(self, treebuilder_class):
  44. """Register a treebuilder based on its advertised features.
  45. :param treebuilder_class: A subclass of Treebuilder. its .features
  46. attribute should list its features.
  47. """
  48. for feature in treebuilder_class.features:
  49. self.builders_for_feature[feature].insert(0, treebuilder_class)
  50. self.builders.insert(0, treebuilder_class)
  51. def lookup(self, *features):
  52. """Look up a TreeBuilder subclass with the desired features.
  53. :param features: A list of features to look for. If none are
  54. provided, the most recently registered TreeBuilder subclass
  55. will be used.
  56. :return: A TreeBuilder subclass, or None if there's no
  57. registered subclass with all the requested features.
  58. """
  59. if len(self.builders) == 0:
  60. # There are no builders at all.
  61. return None
  62. if len(features) == 0:
  63. # They didn't ask for any features. Give them the most
  64. # recently registered builder.
  65. return self.builders[0]
  66. # Go down the list of features in order, and eliminate any builders
  67. # that don't match every feature.
  68. features = list(features)
  69. features.reverse()
  70. candidates = None
  71. candidate_set = None
  72. while len(features) > 0:
  73. feature = features.pop()
  74. we_have_the_feature = self.builders_for_feature.get(feature, [])
  75. if len(we_have_the_feature) > 0:
  76. if candidates is None:
  77. candidates = we_have_the_feature
  78. candidate_set = set(candidates)
  79. else:
  80. # Eliminate any candidates that don't have this feature.
  81. candidate_set = candidate_set.intersection(
  82. set(we_have_the_feature))
  83. # The only valid candidates are the ones in candidate_set.
  84. # Go through the original list of candidates and pick the first one
  85. # that's in candidate_set.
  86. if candidate_set is None:
  87. return None
  88. for candidate in candidates:
  89. if candidate in candidate_set:
  90. return candidate
  91. return None
# The BeautifulSoup class will take feature lists from developers and use them
# to look up builders in this registry.
#
# This is the module-wide registry instance; register_treebuilders_from()
# adds every TreeBuilder subclass it finds to it.
builder_registry = TreeBuilderRegistry()
  95. class TreeBuilder(object):
  96. """Turn a textual document into a Beautiful Soup object tree."""
  97. NAME = "[Unknown tree builder]"
  98. ALTERNATE_NAMES = []
  99. features = []
  100. is_xml = False
  101. picklable = False
  102. empty_element_tags = None # A tag will be considered an empty-element
  103. # tag when and only when it has no contents.
  104. # A value for these tag/attribute combinations is a space- or
  105. # comma-separated list of CDATA, rather than a single CDATA.
  106. DEFAULT_CDATA_LIST_ATTRIBUTES = {}
  107. # Whitespace should be preserved inside these tags.
  108. DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
  109. # The textual contents of tags with these names should be
  110. # instantiated with some class other than NavigableString.
  111. DEFAULT_STRING_CONTAINERS = {}
  112. USE_DEFAULT = object()
  113. # Most parsers don't keep track of line numbers.
  114. TRACKS_LINE_NUMBERS = False
  115. def __init__(self, multi_valued_attributes=USE_DEFAULT,
  116. preserve_whitespace_tags=USE_DEFAULT,
  117. store_line_numbers=USE_DEFAULT,
  118. string_containers=USE_DEFAULT,
  119. ):
  120. """Constructor.
  121. :param multi_valued_attributes: If this is set to None, the
  122. TreeBuilder will not turn any values for attributes like
  123. 'class' into lists. Setting this to a dictionary will
  124. customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
  125. for an example.
  126. Internally, these are called "CDATA list attributes", but that
  127. probably doesn't make sense to an end-user, so the argument name
  128. is `multi_valued_attributes`.
  129. :param preserve_whitespace_tags: A list of tags to treat
  130. the way <pre> tags are treated in HTML. Tags in this list
  131. are immune from pretty-printing; their contents will always be
  132. output as-is.
  133. :param string_containers: A dictionary mapping tag names to
  134. the classes that should be instantiated to contain the textual
  135. contents of those tags. The default is to use NavigableString
  136. for every tag, no matter what the name. You can override the
  137. default by changing DEFAULT_STRING_CONTAINERS.
  138. :param store_line_numbers: If the parser keeps track of the
  139. line numbers and positions of the original markup, that
  140. information will, by default, be stored in each corresponding
  141. `Tag` object. You can turn this off by passing
  142. store_line_numbers=False. If the parser you're using doesn't
  143. keep track of this information, then setting store_line_numbers=True
  144. will do nothing.
  145. """
  146. self.soup = None
  147. if multi_valued_attributes is self.USE_DEFAULT:
  148. multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
  149. self.cdata_list_attributes = multi_valued_attributes
  150. if preserve_whitespace_tags is self.USE_DEFAULT:
  151. preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
  152. self.preserve_whitespace_tags = preserve_whitespace_tags
  153. if store_line_numbers == self.USE_DEFAULT:
  154. store_line_numbers = self.TRACKS_LINE_NUMBERS
  155. self.store_line_numbers = store_line_numbers
  156. if string_containers == self.USE_DEFAULT:
  157. string_containers = self.DEFAULT_STRING_CONTAINERS
  158. self.string_containers = string_containers
  159. def initialize_soup(self, soup):
  160. """The BeautifulSoup object has been initialized and is now
  161. being associated with the TreeBuilder.
  162. :param soup: A BeautifulSoup object.
  163. """
  164. self.soup = soup
  165. def reset(self):
  166. """Do any work necessary to reset the underlying parser
  167. for a new document.
  168. By default, this does nothing.
  169. """
  170. pass
  171. def can_be_empty_element(self, tag_name):
  172. """Might a tag with this name be an empty-element tag?
  173. The final markup may or may not actually present this tag as
  174. self-closing.
  175. For instance: an HTMLBuilder does not consider a <p> tag to be
  176. an empty-element tag (it's not in
  177. HTMLBuilder.empty_element_tags). This means an empty <p> tag
  178. will be presented as "<p></p>", not "<p/>" or "<p>".
  179. The default implementation has no opinion about which tags are
  180. empty-element tags, so a tag will be presented as an
  181. empty-element tag if and only if it has no children.
  182. "<foo></foo>" will become "<foo/>", and "<foo>bar</foo>" will
  183. be left alone.
  184. :param tag_name: The name of a markup tag.
  185. """
  186. if self.empty_element_tags is None:
  187. return True
  188. return tag_name in self.empty_element_tags
  189. def feed(self, markup):
  190. """Run some incoming markup through some parsing process,
  191. populating the `BeautifulSoup` object in self.soup.
  192. This method is not implemented in TreeBuilder; it must be
  193. implemented in subclasses.
  194. :return: None.
  195. """
  196. raise NotImplementedError()
  197. def prepare_markup(self, markup, user_specified_encoding=None,
  198. document_declared_encoding=None, exclude_encodings=None):
  199. """Run any preliminary steps necessary to make incoming markup
  200. acceptable to the parser.
  201. :param markup: Some markup -- probably a bytestring.
  202. :param user_specified_encoding: The user asked to try this encoding.
  203. :param document_declared_encoding: The markup itself claims to be
  204. in this encoding. NOTE: This argument is not used by the
  205. calling code and can probably be removed.
  206. :param exclude_encodings: The user asked _not_ to try any of
  207. these encodings.
  208. :yield: A series of 4-tuples:
  209. (markup, encoding, declared encoding,
  210. has undergone character replacement)
  211. Each 4-tuple represents a strategy for converting the
  212. document to Unicode and parsing it. Each strategy will be tried
  213. in turn.
  214. By default, the only strategy is to parse the markup
  215. as-is. See `LXMLTreeBuilderForXML` and
  216. `HTMLParserTreeBuilder` for implementations that take into
  217. account the quirks of particular parsers.
  218. """
  219. yield markup, None, None, False
  220. def test_fragment_to_document(self, fragment):
  221. """Wrap an HTML fragment to make it look like a document.
  222. Different parsers do this differently. For instance, lxml
  223. introduces an empty <head> tag, and html5lib
  224. doesn't. Abstracting this away lets us write simple tests
  225. which run HTML fragments through the parser and compare the
  226. results against other HTML fragments.
  227. This method should not be used outside of tests.
  228. :param fragment: A string -- fragment of HTML.
  229. :return: A string -- a full HTML document.
  230. """
  231. return fragment
  232. def set_up_substitutions(self, tag):
  233. """Set up any substitutions that will need to be performed on
  234. a `Tag` when it's output as a string.
  235. By default, this does nothing. See `HTMLTreeBuilder` for a
  236. case where this is used.
  237. :param tag: A `Tag`
  238. :return: Whether or not a substitution was performed.
  239. """
  240. return False
  241. def _replace_cdata_list_attribute_values(self, tag_name, attrs):
  242. """When an attribute value is associated with a tag that can
  243. have multiple values for that attribute, convert the string
  244. value to a list of strings.
  245. Basically, replaces class="foo bar" with class=["foo", "bar"]
  246. NOTE: This method modifies its input in place.
  247. :param tag_name: The name of a tag.
  248. :param attrs: A dictionary containing the tag's attributes.
  249. Any appropriate attribute values will be modified in place.
  250. """
  251. if not attrs:
  252. return attrs
  253. if self.cdata_list_attributes:
  254. universal = self.cdata_list_attributes.get('*', [])
  255. tag_specific = self.cdata_list_attributes.get(
  256. tag_name.lower(), None)
  257. for attr in list(attrs.keys()):
  258. if attr in universal or (tag_specific and attr in tag_specific):
  259. # We have a "class"-type attribute whose string
  260. # value is a whitespace-separated list of
  261. # values. Split it into a list.
  262. value = attrs[attr]
  263. if isinstance(value, str):
  264. values = nonwhitespace_re.findall(value)
  265. else:
  266. # html5lib sometimes calls setAttributes twice
  267. # for the same tag when rearranging the parse
  268. # tree. On the second call the attribute value
  269. # here is already a list. If this happens,
  270. # leave the value alone rather than trying to
  271. # split it again.
  272. values = value
  273. attrs[attr] = values
  274. return attrs
  275. class SAXTreeBuilder(TreeBuilder):
  276. """A Beautiful Soup treebuilder that listens for SAX events.
  277. This is not currently used for anything, but it demonstrates
  278. how a simple TreeBuilder would work.
  279. """
  280. def feed(self, markup):
  281. raise NotImplementedError()
  282. def close(self):
  283. pass
  284. def startElement(self, name, attrs):
  285. attrs = dict((key[1], value) for key, value in list(attrs.items()))
  286. #print("Start %s, %r" % (name, attrs))
  287. self.soup.handle_starttag(name, attrs)
  288. def endElement(self, name):
  289. #print("End %s" % name)
  290. self.soup.handle_endtag(name)
  291. def startElementNS(self, nsTuple, nodeName, attrs):
  292. # Throw away (ns, nodeName) for now.
  293. self.startElement(nodeName, attrs)
  294. def endElementNS(self, nsTuple, nodeName):
  295. # Throw away (ns, nodeName) for now.
  296. self.endElement(nodeName)
  297. #handler.endElementNS((ns, node.nodeName), node.nodeName)
  298. def startPrefixMapping(self, prefix, nodeValue):
  299. # Ignore the prefix for now.
  300. pass
  301. def endPrefixMapping(self, prefix):
  302. # Ignore the prefix for now.
  303. # handler.endPrefixMapping(prefix)
  304. pass
  305. def characters(self, content):
  306. self.soup.handle_data(content)
  307. def startDocument(self):
  308. pass
  309. def endDocument(self):
  310. pass
  311. class HTMLTreeBuilder(TreeBuilder):
  312. """This TreeBuilder knows facts about HTML.
  313. Such as which tags are empty-element tags.
  314. """
  315. empty_element_tags = set([
  316. # These are from HTML5.
  317. 'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
  318. # These are from earlier versions of HTML and are removed in HTML5.
  319. 'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
  320. ])
  321. # The HTML standard defines these as block-level elements. Beautiful
  322. # Soup does not treat these elements differently from other elements,
  323. # but it may do so eventually, and this information is available if
  324. # you need to use it.
  325. block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
  326. # These HTML tags need special treatment so they can be
  327. # represented by a string class other than NavigableString.
  328. #
  329. # For some of these tags, it's because the HTML standard defines
  330. # an unusual content model for them. I made this list by going
  331. # through the HTML spec
  332. # (https://html.spec.whatwg.org/#metadata-content) and looking for
  333. # "metadata content" elements that can contain strings.
  334. #
  335. # The Ruby tags (<rt> and <rp>) are here despite being normal
  336. # "phrasing content" tags, because the content they contain is
  337. # qualitatively different from other text in the document, and it
  338. # can be useful to be able to distinguish it.
  339. #
  340. # TODO: Arguably <noscript> could go here but it seems
  341. # qualitatively different from the other tags.
  342. DEFAULT_STRING_CONTAINERS = {
  343. 'rt' : RubyTextString,
  344. 'rp' : RubyParenthesisString,
  345. 'style': Stylesheet,
  346. 'script': Script,
  347. 'template': TemplateString,
  348. }
  349. # The HTML standard defines these attributes as containing a
  350. # space-separated list of values, not a single value. That is,
  351. # class="foo bar" means that the 'class' attribute has two values,
  352. # 'foo' and 'bar', not the single value 'foo bar'. When we
  353. # encounter one of these attributes, we will parse its value into
  354. # a list of values if possible. Upon output, the list will be
  355. # converted back into a string.
  356. DEFAULT_CDATA_LIST_ATTRIBUTES = {
  357. "*" : ['class', 'accesskey', 'dropzone'],
  358. "a" : ['rel', 'rev'],
  359. "link" : ['rel', 'rev'],
  360. "td" : ["headers"],
  361. "th" : ["headers"],
  362. "td" : ["headers"],
  363. "form" : ["accept-charset"],
  364. "object" : ["archive"],
  365. # These are HTML5 specific, as are *.accesskey and *.dropzone above.
  366. "area" : ["rel"],
  367. "icon" : ["sizes"],
  368. "iframe" : ["sandbox"],
  369. "output" : ["for"],
  370. }
  371. DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
  372. def set_up_substitutions(self, tag):
  373. """Replace the declared encoding in a <meta> tag with a placeholder,
  374. to be substituted when the tag is output to a string.
  375. An HTML document may come in to Beautiful Soup as one
  376. encoding, but exit in a different encoding, and the <meta> tag
  377. needs to be changed to reflect this.
  378. :param tag: A `Tag`
  379. :return: Whether or not a substitution was performed.
  380. """
  381. # We are only interested in <meta> tags
  382. if tag.name != 'meta':
  383. return False
  384. http_equiv = tag.get('http-equiv')
  385. content = tag.get('content')
  386. charset = tag.get('charset')
  387. # We are interested in <meta> tags that say what encoding the
  388. # document was originally in. This means HTML 5-style <meta>
  389. # tags that provide the "charset" attribute. It also means
  390. # HTML 4-style <meta> tags that provide the "content"
  391. # attribute and have "http-equiv" set to "content-type".
  392. #
  393. # In both cases we will replace the value of the appropriate
  394. # attribute with a standin object that can take on any
  395. # encoding.
  396. meta_encoding = None
  397. if charset is not None:
  398. # HTML 5 style:
  399. # <meta charset="utf8">
  400. meta_encoding = charset
  401. tag['charset'] = CharsetMetaAttributeValue(charset)
  402. elif (content is not None and http_equiv is not None
  403. and http_equiv.lower() == 'content-type'):
  404. # HTML 4 style:
  405. # <meta http-equiv="content-type" content="text/html; charset=utf8">
  406. tag['content'] = ContentMetaAttributeValue(content)
  407. return (meta_encoding is not None)
  408. class DetectsXMLParsedAsHTML(object):
  409. """A mixin class for any class (a TreeBuilder, or some class used by a
  410. TreeBuilder) that's in a position to detect whether an XML
  411. document is being incorrectly parsed as HTML, and issue an
  412. appropriate warning.
  413. This requires being able to observe an incoming processing
  414. instruction that might be an XML declaration, and also able to
  415. observe tags as they're opened. If you can't do that for a given
  416. TreeBuilder, there's a less reliable implementation based on
  417. examining the raw markup.
  418. """
  419. # Regular expression for seeing if markup has an <html> tag.
  420. LOOKS_LIKE_HTML = re.compile("<[^ +]html", re.I)
  421. LOOKS_LIKE_HTML_B = re.compile(b"<[^ +]html", re.I)
  422. XML_PREFIX = '<?xml'
  423. XML_PREFIX_B = b'<?xml'
  424. @classmethod
  425. def warn_if_markup_looks_like_xml(cls, markup):
  426. """Perform a check on some markup to see if it looks like XML
  427. that's not XHTML. If so, issue a warning.
  428. This is much less reliable than doing the check while parsing,
  429. but some of the tree builders can't do that.
  430. :return: True if the markup looks like non-XHTML XML, False
  431. otherwise.
  432. """
  433. if isinstance(markup, bytes):
  434. prefix = cls.XML_PREFIX_B
  435. looks_like_html = cls.LOOKS_LIKE_HTML_B
  436. else:
  437. prefix = cls.XML_PREFIX
  438. looks_like_html = cls.LOOKS_LIKE_HTML
  439. if (markup is not None
  440. and markup.startswith(prefix)
  441. and not looks_like_html.search(markup[:500])
  442. ):
  443. cls._warn()
  444. return True
  445. return False
  446. @classmethod
  447. def _warn(cls):
  448. """Issue a warning about XML being parsed as HTML."""
  449. warnings.warn(
  450. XMLParsedAsHTMLWarning.MESSAGE, XMLParsedAsHTMLWarning
  451. )
  452. def _initialize_xml_detector(self):
  453. """Call this method before parsing a document."""
  454. self._first_processing_instruction = None
  455. self._root_tag = None
  456. def _document_might_be_xml(self, processing_instruction):
  457. """Call this method when encountering an XML declaration, or a
  458. "processing instruction" that might be an XML declaration.
  459. """
  460. if (self._first_processing_instruction is not None
  461. or self._root_tag is not None):
  462. # The document has already started. Don't bother checking
  463. # anymore.
  464. return
  465. self._first_processing_instruction = processing_instruction
  466. # We won't know until we encounter the first tag whether or
  467. # not this is actually a problem.
  468. def _root_tag_encountered(self, name):
  469. """Call this when you encounter the document's root tag.
  470. This is where we actually check whether an XML document is
  471. being incorrectly parsed as HTML, and issue the warning.
  472. """
  473. if self._root_tag is not None:
  474. # This method was incorrectly called multiple times. Do
  475. # nothing.
  476. return
  477. self._root_tag = name
  478. if (name != 'html' and self._first_processing_instruction is not None
  479. and self._first_processing_instruction.lower().startswith('xml ')):
  480. # We encountered an XML declaration and then a tag other
  481. # than 'html'. This is a reliable indicator that a
  482. # non-XHTML document is being parsed as XML.
  483. self._warn()
  484. def register_treebuilders_from(module):
  485. """Copy TreeBuilders from the given module into this module."""
  486. this_module = sys.modules[__name__]
  487. for name in module.__all__:
  488. obj = getattr(module, name)
  489. if issubclass(obj, TreeBuilder):
  490. setattr(this_module, name, obj)
  491. this_module.__all__.append(name)
  492. # Register the builder while we're at it.
  493. this_module.builder_registry.register(obj)
  494. class ParserRejectedMarkup(Exception):
  495. """An Exception to be raised when the underlying parser simply
  496. refuses to parse the given markup.
  497. """
  498. def __init__(self, message_or_exception):
  499. """Explain why the parser rejected the given markup, either
  500. with a textual explanation or another exception.
  501. """
  502. if isinstance(message_or_exception, Exception):
  503. e = message_or_exception
  504. message_or_exception = "%s: %s" % (e.__class__.__name__, str(e))
  505. super(ParserRejectedMarkup, self).__init__(message_or_exception)
# Builders are registered in reverse order of priority, so that custom
# builder registrations will take precedence. In general, we want lxml
# to take precedence over html5lib, because it's faster. And we only
# want to use HTMLParser as a last resort.
#
# TreeBuilderRegistry.register() inserts each class at the front of
# its lists, so whatever is registered last is found first.
from . import _htmlparser
register_treebuilders_from(_htmlparser)
try:
    from . import _html5lib
    register_treebuilders_from(_html5lib)
except ImportError:
    # They don't have html5lib installed.
    pass
try:
    from . import _lxml
    register_treebuilders_from(_lxml)
except ImportError:
    # They don't have lxml installed.
    pass