# diagnose.py -- Beautiful Soup diagnostic helpers
  1. """Diagnostic functions, mainly for use when doing tech support."""
  2. # Use of this source code is governed by the MIT license.
  3. __license__ = "MIT"
  4. import cProfile
  5. from io import BytesIO
  6. from html.parser import HTMLParser
  7. import bs4
  8. from bs4 import BeautifulSoup, __version__
  9. from bs4.builder import builder_registry
  10. import os
  11. import pstats
  12. import random
  13. import tempfile
  14. import time
  15. import traceback
  16. import sys
  17. def diagnose(data):
  18. """Diagnostic suite for isolating common problems.
  19. :param data: A string containing markup that needs to be explained.
  20. :return: None; diagnostics are printed to standard output.
  21. """
  22. print(("Diagnostic running on Beautiful Soup %s" % __version__))
  23. print(("Python version %s" % sys.version))
  24. basic_parsers = ["html.parser", "html5lib", "lxml"]
  25. for name in basic_parsers:
  26. for builder in builder_registry.builders:
  27. if name in builder.features:
  28. break
  29. else:
  30. basic_parsers.remove(name)
  31. print((
  32. "I noticed that %s is not installed. Installing it may help." %
  33. name))
  34. if 'lxml' in basic_parsers:
  35. basic_parsers.append("lxml-xml")
  36. try:
  37. from lxml import etree
  38. print(("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))))
  39. except ImportError as e:
  40. print(
  41. "lxml is not installed or couldn't be imported.")
  42. if 'html5lib' in basic_parsers:
  43. try:
  44. import html5lib
  45. print(("Found html5lib version %s" % html5lib.__version__))
  46. except ImportError as e:
  47. print(
  48. "html5lib is not installed or couldn't be imported.")
  49. if hasattr(data, 'read'):
  50. data = data.read()
  51. for parser in basic_parsers:
  52. print(("Trying to parse your markup with %s" % parser))
  53. success = False
  54. try:
  55. soup = BeautifulSoup(data, features=parser)
  56. success = True
  57. except Exception as e:
  58. print(("%s could not parse the markup." % parser))
  59. traceback.print_exc()
  60. if success:
  61. print(("Here's what %s did with the markup:" % parser))
  62. print((soup.prettify()))
  63. print(("-" * 80))
  64. def lxml_trace(data, html=True, **kwargs):
  65. """Print out the lxml events that occur during parsing.
  66. This lets you see how lxml parses a document when no Beautiful
  67. Soup code is running. You can use this to determine whether
  68. an lxml-specific problem is in Beautiful Soup's lxml tree builders
  69. or in lxml itself.
  70. :param data: Some markup.
  71. :param html: If True, markup will be parsed with lxml's HTML parser.
  72. if False, lxml's XML parser will be used.
  73. """
  74. from lxml import etree
  75. recover = kwargs.pop('recover', True)
  76. if isinstance(data, str):
  77. data = data.encode("utf8")
  78. reader = BytesIO(data)
  79. for event, element in etree.iterparse(
  80. reader, html=html, recover=recover, **kwargs
  81. ):
  82. print(("%s, %4s, %s" % (event, element.tag, element.text)))
  83. class AnnouncingParser(HTMLParser):
  84. """Subclass of HTMLParser that announces parse events, without doing
  85. anything else.
  86. You can use this to get a picture of how html.parser sees a given
  87. document. The easiest way to do this is to call `htmlparser_trace`.
  88. """
  89. def _p(self, s):
  90. print(s)
  91. def handle_starttag(self, name, attrs):
  92. self._p("%s START" % name)
  93. def handle_endtag(self, name):
  94. self._p("%s END" % name)
  95. def handle_data(self, data):
  96. self._p("%s DATA" % data)
  97. def handle_charref(self, name):
  98. self._p("%s CHARREF" % name)
  99. def handle_entityref(self, name):
  100. self._p("%s ENTITYREF" % name)
  101. def handle_comment(self, data):
  102. self._p("%s COMMENT" % data)
  103. def handle_decl(self, data):
  104. self._p("%s DECL" % data)
  105. def unknown_decl(self, data):
  106. self._p("%s UNKNOWN-DECL" % data)
  107. def handle_pi(self, data):
  108. self._p("%s PI" % data)
  109. def htmlparser_trace(data):
  110. """Print out the HTMLParser events that occur during parsing.
  111. This lets you see how HTMLParser parses a document when no
  112. Beautiful Soup code is running.
  113. :param data: Some markup.
  114. """
  115. parser = AnnouncingParser()
  116. parser.feed(data)
  117. _vowels = "aeiou"
  118. _consonants = "bcdfghjklmnpqrstvwxyz"
  119. def rword(length=5):
  120. "Generate a random word-like string."
  121. s = ''
  122. for i in range(length):
  123. if i % 2 == 0:
  124. t = _consonants
  125. else:
  126. t = _vowels
  127. s += random.choice(t)
  128. return s
  129. def rsentence(length=4):
  130. "Generate a random sentence-like string."
  131. return " ".join(rword(random.randint(4,9)) for i in range(length))
  132. def rdoc(num_elements=1000):
  133. """Randomly generate an invalid HTML document."""
  134. tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
  135. elements = []
  136. for i in range(num_elements):
  137. choice = random.randint(0,3)
  138. if choice == 0:
  139. # New tag.
  140. tag_name = random.choice(tag_names)
  141. elements.append("<%s>" % tag_name)
  142. elif choice == 1:
  143. elements.append(rsentence(random.randint(1,4)))
  144. elif choice == 2:
  145. # Close a tag.
  146. tag_name = random.choice(tag_names)
  147. elements.append("</%s>" % tag_name)
  148. return "<html>" + "\n".join(elements) + "</html>"
  149. def benchmark_parsers(num_elements=100000):
  150. """Very basic head-to-head performance benchmark."""
  151. print(("Comparative parser benchmark on Beautiful Soup %s" % __version__))
  152. data = rdoc(num_elements)
  153. print(("Generated a large invalid HTML document (%d bytes)." % len(data)))
  154. for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
  155. success = False
  156. try:
  157. a = time.time()
  158. soup = BeautifulSoup(data, parser)
  159. b = time.time()
  160. success = True
  161. except Exception as e:
  162. print(("%s could not parse the markup." % parser))
  163. traceback.print_exc()
  164. if success:
  165. print(("BS4+%s parsed the markup in %.2fs." % (parser, b-a)))
  166. from lxml import etree
  167. a = time.time()
  168. etree.HTML(data)
  169. b = time.time()
  170. print(("Raw lxml parsed the markup in %.2fs." % (b-a)))
  171. import html5lib
  172. parser = html5lib.HTMLParser()
  173. a = time.time()
  174. parser.parse(data)
  175. b = time.time()
  176. print(("Raw html5lib parsed the markup in %.2fs." % (b-a)))
  177. def profile(num_elements=100000, parser="lxml"):
  178. """Use Python's profiler on a randomly generated document."""
  179. filehandle = tempfile.NamedTemporaryFile()
  180. filename = filehandle.name
  181. data = rdoc(num_elements)
  182. vars = dict(bs4=bs4, data=data, parser=parser)
  183. cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
  184. stats = pstats.Stats(filename)
  185. # stats.strip_dirs()
  186. stats.sort_stats("cumulative")
  187. stats.print_stats('_html5lib|bs4', 50)
# When run as a script, read markup from standard input and diagnose it.
if __name__ == '__main__':
    diagnose(sys.stdin.read())