# diagnose.py
"""Diagnostic functions, mainly for use when doing tech support."""
import cProfile
import os
import pstats
import random
import sys
import tempfile
import time
import traceback

# Python 2/3 compatibility: these modules were renamed in Python 3.
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
try:
    from HTMLParser import HTMLParser
except ImportError:
    from html.parser import HTMLParser

import bs4
from bs4 import BeautifulSoup, __version__
from bs4.builder import builder_registry
  16. def diagnose(data):
  17. """Diagnostic suite for isolating common problems."""
  18. print "Diagnostic running on Beautiful Soup %s" % __version__
  19. print "Python version %s" % sys.version
  20. basic_parsers = ["html.parser", "html5lib", "lxml"]
  21. for name in basic_parsers:
  22. for builder in builder_registry.builders:
  23. if name in builder.features:
  24. break
  25. else:
  26. basic_parsers.remove(name)
  27. print (
  28. "I noticed that %s is not installed. Installing it may help." %
  29. name)
  30. if 'lxml' in basic_parsers:
  31. basic_parsers.append(["lxml", "xml"])
  32. from lxml import etree
  33. print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
  34. if 'html5lib' in basic_parsers:
  35. import html5lib
  36. print "Found html5lib version %s" % html5lib.__version__
  37. if hasattr(data, 'read'):
  38. data = data.read()
  39. elif os.path.exists(data):
  40. print '"%s" looks like a filename. Reading data from the file.' % data
  41. data = open(data).read()
  42. elif data.startswith("http:") or data.startswith("https:"):
  43. print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
  44. print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
  45. return
  46. print
  47. for parser in basic_parsers:
  48. print "Trying to parse your markup with %s" % parser
  49. success = False
  50. try:
  51. soup = BeautifulSoup(data, parser)
  52. success = True
  53. except Exception, e:
  54. print "%s could not parse the markup." % parser
  55. traceback.print_exc()
  56. if success:
  57. print "Here's what %s did with the markup:" % parser
  58. print soup.prettify()
  59. print "-" * 80
  60. def lxml_trace(data, html=True, **kwargs):
  61. """Print out the lxml events that occur during parsing.
  62. This lets you see how lxml parses a document when no Beautiful
  63. Soup code is running.
  64. """
  65. from lxml import etree
  66. for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
  67. print("%s, %4s, %s" % (event, element.tag, element.text))
  68. class AnnouncingParser(HTMLParser):
  69. """Announces HTMLParser parse events, without doing anything else."""
  70. def _p(self, s):
  71. print(s)
  72. def handle_starttag(self, name, attrs):
  73. self._p("%s START" % name)
  74. def handle_endtag(self, name):
  75. self._p("%s END" % name)
  76. def handle_data(self, data):
  77. self._p("%s DATA" % data)
  78. def handle_charref(self, name):
  79. self._p("%s CHARREF" % name)
  80. def handle_entityref(self, name):
  81. self._p("%s ENTITYREF" % name)
  82. def handle_comment(self, data):
  83. self._p("%s COMMENT" % data)
  84. def handle_decl(self, data):
  85. self._p("%s DECL" % data)
  86. def unknown_decl(self, data):
  87. self._p("%s UNKNOWN-DECL" % data)
  88. def handle_pi(self, data):
  89. self._p("%s PI" % data)
  90. def htmlparser_trace(data):
  91. """Print out the HTMLParser events that occur during parsing.
  92. This lets you see how HTMLParser parses a document when no
  93. Beautiful Soup code is running.
  94. """
  95. parser = AnnouncingParser()
  96. parser.feed(data)
# Letter pools used by rword(): consonants and vowels are alternated to
# produce pronounceable nonsense words.
_vowels = "aeiou"
_consonants = "bcdfghjklmnpqrstvwxyz"
  99. def rword(length=5):
  100. "Generate a random word-like string."
  101. s = ''
  102. for i in range(length):
  103. if i % 2 == 0:
  104. t = _consonants
  105. else:
  106. t = _vowels
  107. s += random.choice(t)
  108. return s
  109. def rsentence(length=4):
  110. "Generate a random sentence-like string."
  111. return " ".join(rword(random.randint(4,9)) for i in range(length))
  112. def rdoc(num_elements=1000):
  113. """Randomly generate an invalid HTML document."""
  114. tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
  115. elements = []
  116. for i in range(num_elements):
  117. choice = random.randint(0,3)
  118. if choice == 0:
  119. # New tag.
  120. tag_name = random.choice(tag_names)
  121. elements.append("<%s>" % tag_name)
  122. elif choice == 1:
  123. elements.append(rsentence(random.randint(1,4)))
  124. elif choice == 2:
  125. # Close a tag.
  126. tag_name = random.choice(tag_names)
  127. elements.append("</%s>" % tag_name)
  128. return "<html>" + "\n".join(elements) + "</html>"
  129. def benchmark_parsers(num_elements=100000):
  130. """Very basic head-to-head performance benchmark."""
  131. print "Comparative parser benchmark on Beautiful Soup %s" % __version__
  132. data = rdoc(num_elements)
  133. print "Generated a large invalid HTML document (%d bytes)." % len(data)
  134. for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
  135. success = False
  136. try:
  137. a = time.time()
  138. soup = BeautifulSoup(data, parser)
  139. b = time.time()
  140. success = True
  141. except Exception, e:
  142. print "%s could not parse the markup." % parser
  143. traceback.print_exc()
  144. if success:
  145. print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
  146. from lxml import etree
  147. a = time.time()
  148. etree.HTML(data)
  149. b = time.time()
  150. print "Raw lxml parsed the markup in %.2fs." % (b-a)
  151. import html5lib
  152. parser = html5lib.HTMLParser()
  153. a = time.time()
  154. parser.parse(data)
  155. b = time.time()
  156. print "Raw html5lib parsed the markup in %.2fs." % (b-a)
  157. def profile(num_elements=100000, parser="lxml"):
  158. filehandle = tempfile.NamedTemporaryFile()
  159. filename = filehandle.name
  160. data = rdoc(num_elements)
  161. vars = dict(bs4=bs4, data=data, parser=parser)
  162. cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
  163. stats = pstats.Stats(filename)
  164. # stats.strip_dirs()
  165. stats.sort_stats("cumulative")
  166. stats.print_stats('_html5lib|bs4', 50)
# When run as a script, diagnose the markup piped in on standard input.
if __name__ == '__main__':
    diagnose(sys.stdin.read())