diagnose.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. """Diagnostic functions, mainly for use when doing tech support."""
  2. # Use of this source code is governed by the MIT license.
  3. __license__ = "MIT"
  4. import cProfile
  5. from io import BytesIO
  6. from html.parser import HTMLParser
  7. import bs4
  8. from bs4 import BeautifulSoup, __version__
  9. from bs4.builder import builder_registry
  10. import os
  11. import pstats
  12. import random
  13. import tempfile
  14. import time
  15. import traceback
  16. import sys
  17. import cProfile
  18. def diagnose(data):
  19. """Diagnostic suite for isolating common problems.
  20. :param data: A string containing markup that needs to be explained.
  21. :return: None; diagnostics are printed to standard output.
  22. """
  23. print(("Diagnostic running on Beautiful Soup %s" % __version__))
  24. print(("Python version %s" % sys.version))
  25. basic_parsers = ["html.parser", "html5lib", "lxml"]
  26. for name in basic_parsers:
  27. for builder in builder_registry.builders:
  28. if name in builder.features:
  29. break
  30. else:
  31. basic_parsers.remove(name)
  32. print((
  33. "I noticed that %s is not installed. Installing it may help." %
  34. name))
  35. if 'lxml' in basic_parsers:
  36. basic_parsers.append("lxml-xml")
  37. try:
  38. from lxml import etree
  39. print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
  40. except ImportError as e:
  41. print(
  42. "lxml is not installed or couldn't be imported.")
  43. if 'html5lib' in basic_parsers:
  44. try:
  45. import html5lib
  46. print(("Found html5lib version %s" % html5lib.__version__))
  47. except ImportError as e:
  48. print(
  49. "html5lib is not installed or couldn't be imported.")
  50. if hasattr(data, 'read'):
  51. data = data.read()
  52. elif data.startswith("http:") or data.startswith("https:"):
  53. print(('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data))
  54. print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
  55. return
  56. else:
  57. try:
  58. if os.path.exists(data):
  59. print(('"%s" looks like a filename. Reading data from the file.' % data))
  60. with open(data) as fp:
  61. data = fp.read()
  62. except ValueError:
  63. # This can happen on some platforms when the 'filename' is
  64. # too long. Assume it's data and not a filename.
  65. pass
  66. print("")
  67. for parser in basic_parsers:
  68. print(("Trying to parse your markup with %s" % parser))
  69. success = False
  70. try:
  71. soup = BeautifulSoup(data, features=parser)
  72. success = True
  73. except Exception as e:
  74. print(("%s could not parse the markup." % parser))
  75. traceback.print_exc()
  76. if success:
  77. print(("Here's what %s did with the markup:" % parser))
  78. print((soup.prettify()))
  79. print(("-" * 80))
  80. def lxml_trace(data, html=True, **kwargs):
  81. """Print out the lxml events that occur during parsing.
  82. This lets you see how lxml parses a document when no Beautiful
  83. Soup code is running. You can use this to determine whether
  84. an lxml-specific problem is in Beautiful Soup's lxml tree builders
  85. or in lxml itself.
  86. :param data: Some markup.
  87. :param html: If True, markup will be parsed with lxml's HTML parser.
  88. if False, lxml's XML parser will be used.
  89. """
  90. from lxml import etree
  91. recover = kwargs.pop('recover', True)
  92. if isinstance(data, str):
  93. data = data.encode("utf8")
  94. reader = BytesIO(data)
  95. for event, element in etree.iterparse(
  96. reader, html=html, recover=recover, **kwargs
  97. ):
  98. print(("%s, %4s, %s" % (event, element.tag, element.text)))
  99. class AnnouncingParser(HTMLParser):
  100. """Subclass of HTMLParser that announces parse events, without doing
  101. anything else.
  102. You can use this to get a picture of how html.parser sees a given
  103. document. The easiest way to do this is to call `htmlparser_trace`.
  104. """
  105. def _p(self, s):
  106. print(s)
  107. def handle_starttag(self, name, attrs):
  108. self._p("%s START" % name)
  109. def handle_endtag(self, name):
  110. self._p("%s END" % name)
  111. def handle_data(self, data):
  112. self._p("%s DATA" % data)
  113. def handle_charref(self, name):
  114. self._p("%s CHARREF" % name)
  115. def handle_entityref(self, name):
  116. self._p("%s ENTITYREF" % name)
  117. def handle_comment(self, data):
  118. self._p("%s COMMENT" % data)
  119. def handle_decl(self, data):
  120. self._p("%s DECL" % data)
  121. def unknown_decl(self, data):
  122. self._p("%s UNKNOWN-DECL" % data)
  123. def handle_pi(self, data):
  124. self._p("%s PI" % data)
  125. def htmlparser_trace(data):
  126. """Print out the HTMLParser events that occur during parsing.
  127. This lets you see how HTMLParser parses a document when no
  128. Beautiful Soup code is running.
  129. :param data: Some markup.
  130. """
  131. parser = AnnouncingParser()
  132. parser.feed(data)
  133. _vowels = "aeiou"
  134. _consonants = "bcdfghjklmnpqrstvwxyz"
  135. def rword(length=5):
  136. "Generate a random word-like string."
  137. s = ''
  138. for i in range(length):
  139. if i % 2 == 0:
  140. t = _consonants
  141. else:
  142. t = _vowels
  143. s += random.choice(t)
  144. return s
  145. def rsentence(length=4):
  146. "Generate a random sentence-like string."
  147. return " ".join(rword(random.randint(4,9)) for i in range(length))
  148. def rdoc(num_elements=1000):
  149. """Randomly generate an invalid HTML document."""
  150. tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
  151. elements = []
  152. for i in range(num_elements):
  153. choice = random.randint(0,3)
  154. if choice == 0:
  155. # New tag.
  156. tag_name = random.choice(tag_names)
  157. elements.append("<%s>" % tag_name)
  158. elif choice == 1:
  159. elements.append(rsentence(random.randint(1,4)))
  160. elif choice == 2:
  161. # Close a tag.
  162. tag_name = random.choice(tag_names)
  163. elements.append("</%s>" % tag_name)
  164. return "<html>" + "\n".join(elements) + "</html>"
  165. def benchmark_parsers(num_elements=100000):
  166. """Very basic head-to-head performance benchmark."""
  167. print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
  168. data = rdoc(num_elements)
  169. print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
  170. for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
  171. success = False
  172. try:
  173. a = time.time()
  174. soup = BeautifulSoup(data, parser)
  175. b = time.time()
  176. success = True
  177. except Exception as e:
  178. print(("%s could not parse the markup." % parser))
  179. traceback.print_exc()
  180. if success:
  181. print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
  182. from lxml import etree
  183. a = time.time()
  184. etree.HTML(data)
  185. b = time.time()
  186. print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
  187. import html5lib
  188. parser = html5lib.HTMLParser()
  189. a = time.time()
  190. parser.parse(data)
  191. b = time.time()
  192. print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
  193. def profile(num_elements=100000, parser="lxml"):
  194. """Use Python's profiler on a randomly generated document."""
  195. filehandle = tempfile.NamedTemporaryFile()
  196. filename = filehandle.name
  197. data = rdoc(num_elements)
  198. vars = dict(bs4=bs4, data=data, parser=parser)
  199. cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
  200. stats = pstats.Stats(filename)
  201. # stats.strip_dirs()
  202. stats.sort_stats("cumulative")
  203. stats.print_stats('_html5lib|bs4', 50)
# If this file is run as a script, standard input is diagnosed.
if __name__ == '__main__':
    diagnose(sys.stdin.read())