# _lxml.py (8.5 KB)
  1. __all__ = [
  2. 'LXMLTreeBuilderForXML',
  3. 'LXMLTreeBuilder',
  4. ]
  5. from io import BytesIO
  6. from StringIO import StringIO
  7. import collections
  8. from lxml import etree
  9. from bs4.element import Comment, Doctype, NamespacedAttribute
  10. from bs4.builder import (
  11. FAST,
  12. HTML,
  13. HTMLTreeBuilder,
  14. PERMISSIVE,
  15. ParserRejectedMarkup,
  16. TreeBuilder,
  17. XML)
  18. from bs4.dammit import EncodingDetector
  19. LXML = 'lxml'
  20. class LXMLTreeBuilderForXML(TreeBuilder):
  21. DEFAULT_PARSER_CLASS = etree.XMLParser
  22. is_xml = True
  23. # Well, it's permissive by XML parser standards.
  24. features = [LXML, XML, FAST, PERMISSIVE]
  25. CHUNK_SIZE = 512
  26. # This namespace mapping is specified in the XML Namespace
  27. # standard.
  28. DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
  29. def default_parser(self, encoding):
  30. # This can either return a parser object or a class, which
  31. # will be instantiated with default arguments.
  32. if self._default_parser is not None:
  33. return self._default_parser
  34. return etree.XMLParser(
  35. target=self, strip_cdata=False, recover=True, encoding=encoding)
  36. def parser_for(self, encoding):
  37. # Use the default parser.
  38. parser = self.default_parser(encoding)
  39. if isinstance(parser, collections.Callable):
  40. # Instantiate the parser with default arguments
  41. parser = parser(target=self, strip_cdata=False, encoding=encoding)
  42. return parser
  43. def __init__(self, parser=None, empty_element_tags=None):
  44. # TODO: Issue a warning if parser is present but not a
  45. # callable, since that means there's no way to create new
  46. # parsers for different encodings.
  47. self._default_parser = parser
  48. if empty_element_tags is not None:
  49. self.empty_element_tags = set(empty_element_tags)
  50. self.soup = None
  51. self.nsmaps = [self.DEFAULT_NSMAPS]
  52. def _getNsTag(self, tag):
  53. # Split the namespace URL out of a fully-qualified lxml tag
  54. # name. Copied from lxml's src/lxml/sax.py.
  55. if tag[0] == '{':
  56. return tuple(tag[1:].split('}', 1))
  57. else:
  58. return (None, tag)
  59. def prepare_markup(self, markup, user_specified_encoding=None,
  60. document_declared_encoding=None):
  61. """
  62. :yield: A series of 4-tuples.
  63. (markup, encoding, declared encoding,
  64. has undergone character replacement)
  65. Each 4-tuple represents a strategy for parsing the document.
  66. """
  67. if isinstance(markup, unicode):
  68. # We were given Unicode. Maybe lxml can parse Unicode on
  69. # this system?
  70. yield markup, None, document_declared_encoding, False
  71. if isinstance(markup, unicode):
  72. # No, apparently not. Convert the Unicode to UTF-8 and
  73. # tell lxml to parse it as UTF-8.
  74. yield (markup.encode("utf8"), "utf8",
  75. document_declared_encoding, False)
  76. # Instead of using UnicodeDammit to convert the bytestring to
  77. # Unicode using different encodings, use EncodingDetector to
  78. # iterate over the encodings, and tell lxml to try to parse
  79. # the document as each one in turn.
  80. is_html = not self.is_xml
  81. try_encodings = [user_specified_encoding, document_declared_encoding]
  82. detector = EncodingDetector(markup, try_encodings, is_html)
  83. for encoding in detector.encodings:
  84. yield (detector.markup, encoding, document_declared_encoding, False)
  85. def feed(self, markup):
  86. if isinstance(markup, bytes):
  87. markup = BytesIO(markup)
  88. elif isinstance(markup, unicode):
  89. markup = StringIO(markup)
  90. # Call feed() at least once, even if the markup is empty,
  91. # or the parser won't be initialized.
  92. data = markup.read(self.CHUNK_SIZE)
  93. try:
  94. self.parser = self.parser_for(self.soup.original_encoding)
  95. self.parser.feed(data)
  96. while len(data) != 0:
  97. # Now call feed() on the rest of the data, chunk by chunk.
  98. data = markup.read(self.CHUNK_SIZE)
  99. if len(data) != 0:
  100. self.parser.feed(data)
  101. self.parser.close()
  102. except (UnicodeDecodeError, LookupError, etree.ParserError), e:
  103. raise ParserRejectedMarkup(str(e))
  104. def close(self):
  105. self.nsmaps = [self.DEFAULT_NSMAPS]
  106. def start(self, name, attrs, nsmap={}):
  107. # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
  108. attrs = dict(attrs)
  109. nsprefix = None
  110. # Invert each namespace map as it comes in.
  111. if len(self.nsmaps) > 1:
  112. # There are no new namespaces for this tag, but
  113. # non-default namespaces are in play, so we need a
  114. # separate tag stack to know when they end.
  115. self.nsmaps.append(None)
  116. elif len(nsmap) > 0:
  117. # A new namespace mapping has come into play.
  118. inverted_nsmap = dict((value, key) for key, value in nsmap.items())
  119. self.nsmaps.append(inverted_nsmap)
  120. # Also treat the namespace mapping as a set of attributes on the
  121. # tag, so we can recreate it later.
  122. attrs = attrs.copy()
  123. for prefix, namespace in nsmap.items():
  124. attribute = NamespacedAttribute(
  125. "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
  126. attrs[attribute] = namespace
  127. # Namespaces are in play. Find any attributes that came in
  128. # from lxml with namespaces attached to their names, and
  129. # turn then into NamespacedAttribute objects.
  130. new_attrs = {}
  131. for attr, value in attrs.items():
  132. namespace, attr = self._getNsTag(attr)
  133. if namespace is None:
  134. new_attrs[attr] = value
  135. else:
  136. nsprefix = self._prefix_for_namespace(namespace)
  137. attr = NamespacedAttribute(nsprefix, attr, namespace)
  138. new_attrs[attr] = value
  139. attrs = new_attrs
  140. namespace, name = self._getNsTag(name)
  141. nsprefix = self._prefix_for_namespace(namespace)
  142. self.soup.handle_starttag(name, namespace, nsprefix, attrs)
  143. def _prefix_for_namespace(self, namespace):
  144. """Find the currently active prefix for the given namespace."""
  145. if namespace is None:
  146. return None
  147. for inverted_nsmap in reversed(self.nsmaps):
  148. if inverted_nsmap is not None and namespace in inverted_nsmap:
  149. return inverted_nsmap[namespace]
  150. return None
  151. def end(self, name):
  152. self.soup.endData()
  153. completed_tag = self.soup.tagStack[-1]
  154. namespace, name = self._getNsTag(name)
  155. nsprefix = None
  156. if namespace is not None:
  157. for inverted_nsmap in reversed(self.nsmaps):
  158. if inverted_nsmap is not None and namespace in inverted_nsmap:
  159. nsprefix = inverted_nsmap[namespace]
  160. break
  161. self.soup.handle_endtag(name, nsprefix)
  162. if len(self.nsmaps) > 1:
  163. # This tag, or one of its parents, introduced a namespace
  164. # mapping, so pop it off the stack.
  165. self.nsmaps.pop()
  166. def pi(self, target, data):
  167. pass
  168. def data(self, content):
  169. self.soup.handle_data(content)
  170. def doctype(self, name, pubid, system):
  171. self.soup.endData()
  172. doctype = Doctype.for_name_and_ids(name, pubid, system)
  173. self.soup.object_was_parsed(doctype)
  174. def comment(self, content):
  175. "Handle comments as Comment objects."
  176. self.soup.endData()
  177. self.soup.handle_data(content)
  178. self.soup.endData(Comment)
  179. def test_fragment_to_document(self, fragment):
  180. """See `TreeBuilder`."""
  181. return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
  182. class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
  183. features = [LXML, HTML, FAST, PERMISSIVE]
  184. is_xml = False
  185. def default_parser(self, encoding):
  186. return etree.HTMLParser
  187. def feed(self, markup):
  188. encoding = self.soup.original_encoding
  189. try:
  190. self.parser = self.parser_for(encoding)
  191. self.parser.feed(markup)
  192. self.parser.close()
  193. except (UnicodeDecodeError, LookupError, etree.ParserError), e:
  194. raise ParserRejectedMarkup(str(e))
  195. def test_fragment_to_document(self, fragment):
  196. """See `TreeBuilder`."""
  197. return u'<html><body>%s</body></html>' % fragment