# _htmlparser.py
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError as e:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
#
# Feature flags describing which constructor arguments the running
# interpreter's HTMLParser supports; consumed by HTMLParserTreeBuilder.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit
from bs4.builder import (
    DetectsXMLParsedAsHTML,
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )

# Feature name under which this tree builder is registered.
HTMLPARSER = 'html.parser'
  44. class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
  45. """A subclass of the Python standard library's HTMLParser class, which
  46. listens for HTMLParser events and translates them into calls
  47. to Beautiful Soup's tree construction API.
  48. """
  49. # Strategies for handling duplicate attributes
  50. IGNORE = 'ignore'
  51. REPLACE = 'replace'
  52. def __init__(self, *args, **kwargs):
  53. """Constructor.
  54. :param on_duplicate_attribute: A strategy for what to do if a
  55. tag includes the same attribute more than once. Accepted
  56. values are: REPLACE (replace earlier values with later
  57. ones, the default), IGNORE (keep the earliest value
  58. encountered), or a callable. A callable must take three
  59. arguments: the dictionary of attributes already processed,
  60. the name of the duplicate attribute, and the most recent value
  61. encountered.
  62. """
  63. self.on_duplicate_attribute = kwargs.pop(
  64. 'on_duplicate_attribute', self.REPLACE
  65. )
  66. HTMLParser.__init__(self, *args, **kwargs)
  67. # Keep a list of empty-element tags that were encountered
  68. # without an explicit closing tag. If we encounter a closing tag
  69. # of this type, we'll associate it with one of those entries.
  70. #
  71. # This isn't a stack because we don't care about the
  72. # order. It's a list of closing tags we've already handled and
  73. # will ignore, assuming they ever show up.
  74. self.already_closed_empty_element = []
  75. self._initialize_xml_detector()
  76. def error(self, msg):
  77. """In Python 3, HTMLParser subclasses must implement error(), although
  78. this requirement doesn't appear to be documented.
  79. In Python 2, HTMLParser implements error() by raising an exception,
  80. which we don't want to do.
  81. In any event, this method is called only on very strange
  82. markup and our best strategy is to pretend it didn't happen
  83. and keep going.
  84. """
  85. warnings.warn(msg)
  86. def handle_startendtag(self, name, attrs):
  87. """Handle an incoming empty-element tag.
  88. This is only called when the markup looks like <tag/>.
  89. :param name: Name of the tag.
  90. :param attrs: Dictionary of the tag's attributes.
  91. """
  92. # is_startend() tells handle_starttag not to close the tag
  93. # just because its name matches a known empty-element tag. We
  94. # know that this is an empty-element tag and we want to call
  95. # handle_endtag ourselves.
  96. tag = self.handle_starttag(name, attrs, handle_empty_element=False)
  97. self.handle_endtag(name)
  98. def handle_starttag(self, name, attrs, handle_empty_element=True):
  99. """Handle an opening tag, e.g. '<tag>'
  100. :param name: Name of the tag.
  101. :param attrs: Dictionary of the tag's attributes.
  102. :param handle_empty_element: True if this tag is known to be
  103. an empty-element tag (i.e. there is not expected to be any
  104. closing tag).
  105. """
  106. # XXX namespace
  107. attr_dict = {}
  108. for key, value in attrs:
  109. # Change None attribute values to the empty string
  110. # for consistency with the other tree builders.
  111. if value is None:
  112. value = ''
  113. if key in attr_dict:
  114. # A single attribute shows up multiple times in this
  115. # tag. How to handle it depends on the
  116. # on_duplicate_attribute setting.
  117. on_dupe = self.on_duplicate_attribute
  118. if on_dupe == self.IGNORE:
  119. pass
  120. elif on_dupe in (None, self.REPLACE):
  121. attr_dict[key] = value
  122. else:
  123. on_dupe(attr_dict, key, value)
  124. else:
  125. attr_dict[key] = value
  126. attrvalue = '""'
  127. #print("START", name)
  128. sourceline, sourcepos = self.getpos()
  129. tag = self.soup.handle_starttag(
  130. name, None, None, attr_dict, sourceline=sourceline,
  131. sourcepos=sourcepos
  132. )
  133. if tag and tag.is_empty_element and handle_empty_element:
  134. # Unlike other parsers, html.parser doesn't send separate end tag
  135. # events for empty-element tags. (It's handled in
  136. # handle_startendtag, but only if the original markup looked like
  137. # <tag/>.)
  138. #
  139. # So we need to call handle_endtag() ourselves. Since we
  140. # know the start event is identical to the end event, we
  141. # don't want handle_endtag() to cross off any previous end
  142. # events for tags of this name.
  143. self.handle_endtag(name, check_already_closed=False)
  144. # But we might encounter an explicit closing tag for this tag
  145. # later on. If so, we want to ignore it.
  146. self.already_closed_empty_element.append(name)
  147. if self._root_tag is None:
  148. self._root_tag_encountered(name)
  149. def handle_endtag(self, name, check_already_closed=True):
  150. """Handle a closing tag, e.g. '</tag>'
  151. :param name: A tag name.
  152. :param check_already_closed: True if this tag is expected to
  153. be the closing portion of an empty-element tag,
  154. e.g. '<tag></tag>'.
  155. """
  156. #print("END", name)
  157. if check_already_closed and name in self.already_closed_empty_element:
  158. # This is a redundant end tag for an empty-element tag.
  159. # We've already called handle_endtag() for it, so just
  160. # check it off the list.
  161. #print("ALREADY CLOSED", name)
  162. self.already_closed_empty_element.remove(name)
  163. else:
  164. self.soup.handle_endtag(name)
  165. def handle_data(self, data):
  166. """Handle some textual data that shows up between tags."""
  167. self.soup.handle_data(data)
  168. def handle_charref(self, name):
  169. """Handle a numeric character reference by converting it to the
  170. corresponding Unicode character and treating it as textual
  171. data.
  172. :param name: Character number, possibly in hexadecimal.
  173. """
  174. # XXX workaround for a bug in HTMLParser. Remove this once
  175. # it's fixed in all supported versions.
  176. # http://bugs.python.org/issue13633
  177. if name.startswith('x'):
  178. real_name = int(name.lstrip('x'), 16)
  179. elif name.startswith('X'):
  180. real_name = int(name.lstrip('X'), 16)
  181. else:
  182. real_name = int(name)
  183. data = None
  184. if real_name < 256:
  185. # HTML numeric entities are supposed to reference Unicode
  186. # code points, but sometimes they reference code points in
  187. # some other encoding (ahem, Windows-1252). E.g. &#147;
  188. # instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
  189. # code tries to detect this situation and compensate.
  190. for encoding in (self.soup.original_encoding, 'windows-1252'):
  191. if not encoding:
  192. continue
  193. try:
  194. data = bytearray([real_name]).decode(encoding)
  195. except UnicodeDecodeError as e:
  196. pass
  197. if not data:
  198. try:
  199. data = chr(real_name)
  200. except (ValueError, OverflowError) as e:
  201. pass
  202. data = data or "\N{REPLACEMENT CHARACTER}"
  203. self.handle_data(data)
  204. def handle_entityref(self, name):
  205. """Handle a named entity reference by converting it to the
  206. corresponding Unicode character(s) and treating it as textual
  207. data.
  208. :param name: Name of the entity reference.
  209. """
  210. character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
  211. if character is not None:
  212. data = character
  213. else:
  214. # If this were XML, it would be ambiguous whether "&foo"
  215. # was an character entity reference with a missing
  216. # semicolon or the literal string "&foo". Since this is
  217. # HTML, we have a complete list of all character entity references,
  218. # and this one wasn't found, so assume it's the literal string "&foo".
  219. data = "&%s" % name
  220. self.handle_data(data)
  221. def handle_comment(self, data):
  222. """Handle an HTML comment.
  223. :param data: The text of the comment.
  224. """
  225. self.soup.endData()
  226. self.soup.handle_data(data)
  227. self.soup.endData(Comment)
  228. def handle_decl(self, data):
  229. """Handle a DOCTYPE declaration.
  230. :param data: The text of the declaration.
  231. """
  232. self.soup.endData()
  233. data = data[len("DOCTYPE "):]
  234. self.soup.handle_data(data)
  235. self.soup.endData(Doctype)
  236. def unknown_decl(self, data):
  237. """Handle a declaration of unknown type -- probably a CDATA block.
  238. :param data: The text of the declaration.
  239. """
  240. if data.upper().startswith('CDATA['):
  241. cls = CData
  242. data = data[len('CDATA['):]
  243. else:
  244. cls = Declaration
  245. self.soup.endData()
  246. self.soup.handle_data(data)
  247. self.soup.endData(cls)
  248. def handle_pi(self, data):
  249. """Handle a processing instruction.
  250. :param data: The text of the instruction.
  251. """
  252. self.soup.endData()
  253. self.soup.handle_data(data)
  254. self._document_might_be_xml(data)
  255. self.soup.endData(ProcessingInstruction)
  256. class HTMLParserTreeBuilder(HTMLTreeBuilder):
  257. """A Beautiful soup `TreeBuilder` that uses the `HTMLParser` parser,
  258. found in the Python standard library.
  259. """
  260. is_xml = False
  261. picklable = True
  262. NAME = HTMLPARSER
  263. features = [NAME, HTML, STRICT]
  264. # The html.parser knows which line number and position in the
  265. # original file is the source of an element.
  266. TRACKS_LINE_NUMBERS = True
  267. def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
  268. """Constructor.
  269. :param parser_args: Positional arguments to pass into
  270. the BeautifulSoupHTMLParser constructor, once it's
  271. invoked.
  272. :param parser_kwargs: Keyword arguments to pass into
  273. the BeautifulSoupHTMLParser constructor, once it's
  274. invoked.
  275. :param kwargs: Keyword arguments for the superclass constructor.
  276. """
  277. # Some keyword arguments will be pulled out of kwargs and placed
  278. # into parser_kwargs.
  279. extra_parser_kwargs = dict()
  280. for arg in ('on_duplicate_attribute',):
  281. if arg in kwargs:
  282. value = kwargs.pop(arg)
  283. extra_parser_kwargs[arg] = value
  284. super(HTMLParserTreeBuilder, self).__init__(**kwargs)
  285. parser_args = parser_args or []
  286. parser_kwargs = parser_kwargs or {}
  287. parser_kwargs.update(extra_parser_kwargs)
  288. if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
  289. parser_kwargs['strict'] = False
  290. if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
  291. parser_kwargs['convert_charrefs'] = False
  292. self.parser_args = (parser_args, parser_kwargs)
  293. def prepare_markup(self, markup, user_specified_encoding=None,
  294. document_declared_encoding=None, exclude_encodings=None):
  295. """Run any preliminary steps necessary to make incoming markup
  296. acceptable to the parser.
  297. :param markup: Some markup -- probably a bytestring.
  298. :param user_specified_encoding: The user asked to try this encoding.
  299. :param document_declared_encoding: The markup itself claims to be
  300. in this encoding.
  301. :param exclude_encodings: The user asked _not_ to try any of
  302. these encodings.
  303. :yield: A series of 4-tuples:
  304. (markup, encoding, declared encoding,
  305. has undergone character replacement)
  306. Each 4-tuple represents a strategy for converting the
  307. document to Unicode and parsing it. Each strategy will be tried
  308. in turn.
  309. """
  310. if isinstance(markup, str):
  311. # Parse Unicode as-is.
  312. yield (markup, None, None, False)
  313. return
  314. # Ask UnicodeDammit to sniff the most likely encoding.
  315. # This was provided by the end-user; treat it as a known
  316. # definite encoding per the algorithm laid out in the HTML5
  317. # spec. (See the EncodingDetector class for details.)
  318. known_definite_encodings = [user_specified_encoding]
  319. # This was found in the document; treat it as a slightly lower-priority
  320. # user encoding.
  321. user_encodings = [document_declared_encoding]
  322. try_encodings = [user_specified_encoding, document_declared_encoding]
  323. dammit = UnicodeDammit(
  324. markup,
  325. known_definite_encodings=known_definite_encodings,
  326. user_encodings=user_encodings,
  327. is_html=True,
  328. exclude_encodings=exclude_encodings
  329. )
  330. yield (dammit.markup, dammit.original_encoding,
  331. dammit.declared_html_encoding,
  332. dammit.contains_replacement_characters)
  333. def feed(self, markup):
  334. """Run some incoming markup through some parsing process,
  335. populating the `BeautifulSoup` object in self.soup.
  336. """
  337. args, kwargs = self.parser_args
  338. parser = BeautifulSoupHTMLParser(*args, **kwargs)
  339. parser.soup = self.soup
  340. try:
  341. parser.feed(markup)
  342. parser.close()
  343. except HTMLParseError as e:
  344. warnings.warn(RuntimeWarning(
  345. "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
  346. raise e
  347. parser.already_closed_empty_element = []
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    # Tolerant version of HTMLParser's attribute-matching regex: accepts
    # attributes even when the markup around them is slightly malformed.
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    # Backported 3.2.3 regex that locates the end of a start tag.
    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        """Backport of HTMLParser.parse_starttag from Python 3.2.3.

        Parses a start tag beginning at index ``i`` of self.rawdata and
        dispatches to handle_starttag/handle_startendtag. Returns the
        index just past the tag, or a negative number if the tag is
        incomplete.

        NOTE(review): this function is defined at module level, so
        ``self.__starttag_text`` is NOT name-mangled the way it is
        inside the HTMLParser class body — confirm this matches the
        attribute the rest of HTMLParser reads.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            # In strict mode use the stdlib attribute regex; otherwise
            # fall back to the tolerant one defined above.
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute with no value, e.g. <input disabled>.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip matching quotes around the value.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the closing bracket.
            # Compute the position of the junk for the error message.
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # In tolerant mode, emit the whole malformed tag as data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # <script>/<style> content is raw text until the
                # matching end tag; switch the tokenizer accordingly.
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        """Backport: treat everything as raw data until ``</elem>``."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    # With the backport installed, strict=True is safe on 3.2.0-3.2.2.
    CONSTRUCTOR_TAKES_STRICT = True