element.py 91 KB


  1. # Use of this source code is governed by the MIT license.
  2. __license__ = "MIT"
  3. try:
  4. from collections.abc import Callable # Python 3.6
  5. except ImportError as e:
  6. from collections import Callable
  7. import re
  8. import sys
  9. import warnings
  10. from bs4.css import CSS
  11. from bs4.formatter import (
  12. Formatter,
  13. HTMLFormatter,
  14. XMLFormatter,
  15. )
# Name of the default output encoding. NOTE(review): within this part of
# the file it is only mentioned in NavigableString.__new__'s docstring
# (unpickling); confirm other users before changing it.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches one run of consecutive non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches one run of consecutive whitespace characters.
whitespace_re = re.compile(r"\s+")
  21. def _alias(attr):
  22. """Alias one attribute name to another for backward compatibility"""
  23. @property
  24. def alias(self):
  25. return getattr(self, attr)
  26. @alias.setter
  27. def alias(self):
  28. return setattr(self, attr)
  29. return alias
  30. # These encodings are recognized by Python (so PageElement.encode
  31. # could theoretically support them) but XML and HTML don't recognize
  32. # them (so they should not show up in an XML or HTML document as that
  33. # document's encoding).
  34. #
  35. # If an XML document is encoded in one of these encodings, no encoding
  36. # will be mentioned in the XML declaration. If an HTML document is
  37. # encoded in one of these encodings, and the HTML document has a
  38. # <meta> tag that mentions an encoding, the encoding will be given as
  39. # the empty string.
  40. #
  41. # Source:
  42. # https://docs.python.org/3/library/codecs.html#python-specific-encodings
  43. PYTHON_SPECIFIC_ENCODINGS = set([
  44. "idna",
  45. "mbcs",
  46. "oem",
  47. "palmos",
  48. "punycode",
  49. "raw_unicode_escape",
  50. "undefined",
  51. "unicode_escape",
  52. "raw-unicode-escape",
  53. "unicode-escape",
  54. "string-escape",
  55. "string_escape",
  56. ])
  57. class NamespacedAttribute(str):
  58. """A namespaced string (e.g. 'xml:lang') that remembers the namespace
  59. ('xml') and the name ('lang') that were used to create it.
  60. """
  61. def __new__(cls, prefix, name=None, namespace=None):
  62. if not name:
  63. # This is the default namespace. Its name "has no value"
  64. # per https://www.w3.org/TR/xml-names/#defaulting
  65. name = None
  66. if not name:
  67. obj = str.__new__(cls, prefix)
  68. elif not prefix:
  69. # Not really namespaced.
  70. obj = str.__new__(cls, name)
  71. else:
  72. obj = str.__new__(cls, prefix + ":" + name)
  73. obj.prefix = prefix
  74. obj.name = name
  75. obj.namespace = namespace
  76. return obj
class AttributeValueWithCharsetSubstitution(str):
    """A stand-in object for a character encoding specified in HTML.

    This class adds nothing to `str` itself; it is the common base of
    CharsetMetaAttributeValue and ContentMetaAttributeValue, each of
    which defines its own `encode` method.
    """
  79. class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  80. """A generic stand-in for the value of a meta tag's 'charset' attribute.
  81. When Beautiful Soup parses the markup '<meta charset="utf8">', the
  82. value of the 'charset' attribute will be one of these objects.
  83. """
  84. def __new__(cls, original_value):
  85. obj = str.__new__(cls, original_value)
  86. obj.original_value = original_value
  87. return obj
  88. def encode(self, encoding):
  89. """When an HTML document is being encoded to a given encoding, the
  90. value of a meta tag's 'charset' is the name of the encoding.
  91. """
  92. if encoding in PYTHON_SPECIFIC_ENCODINGS:
  93. return ''
  94. return encoding
  95. class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  96. """A generic stand-in for the value of a meta tag's 'content' attribute.
  97. When Beautiful Soup parses the markup:
  98. <meta http-equiv="content-type" content="text/html; charset=utf8">
  99. The value of the 'content' attribute will be one of these objects.
  100. """
  101. CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
  102. def __new__(cls, original_value):
  103. match = cls.CHARSET_RE.search(original_value)
  104. if match is None:
  105. # No substitution necessary.
  106. return str.__new__(str, original_value)
  107. obj = str.__new__(cls, original_value)
  108. obj.original_value = original_value
  109. return obj
  110. def encode(self, encoding):
  111. if encoding in PYTHON_SPECIFIC_ENCODINGS:
  112. return ''
  113. def rewrite(match):
  114. return match.group(1) + encoding
  115. return self.CHARSET_RE.sub(rewrite, self.original_value)
  116. class PageElement(object):
  117. """Contains the navigational information for some part of the page:
  118. that is, its current location in the parse tree.
  119. NavigableString, Tag, etc. are all subclasses of PageElement.
  120. """
  121. # In general, we can't tell just by looking at an element whether
  122. # it's contained in an XML document or an HTML document. But for
  123. # Tags (q.v.) we can store this information at parse time.
  124. known_xml = None
  125. def setup(self, parent=None, previous_element=None, next_element=None,
  126. previous_sibling=None, next_sibling=None):
  127. """Sets up the initial relations between this element and
  128. other elements.
  129. :param parent: The parent of this element.
  130. :param previous_element: The element parsed immediately before
  131. this one.
  132. :param next_element: The element parsed immediately before
  133. this one.
  134. :param previous_sibling: The most recently encountered element
  135. on the same level of the parse tree as this one.
  136. :param previous_sibling: The next element to be encountered
  137. on the same level of the parse tree as this one.
  138. """
  139. self.parent = parent
  140. self.previous_element = previous_element
  141. if previous_element is not None:
  142. self.previous_element.next_element = self
  143. self.next_element = next_element
  144. if self.next_element is not None:
  145. self.next_element.previous_element = self
  146. self.next_sibling = next_sibling
  147. if self.next_sibling is not None:
  148. self.next_sibling.previous_sibling = self
  149. if (previous_sibling is None
  150. and self.parent is not None and self.parent.contents):
  151. previous_sibling = self.parent.contents[-1]
  152. self.previous_sibling = previous_sibling
  153. if previous_sibling is not None:
  154. self.previous_sibling.next_sibling = self
  155. def format_string(self, s, formatter):
  156. """Format the given string using the given formatter.
  157. :param s: A string.
  158. :param formatter: A Formatter object, or a string naming one of the standard formatters.
  159. """
  160. if formatter is None:
  161. return s
  162. if not isinstance(formatter, Formatter):
  163. formatter = self.formatter_for_name(formatter)
  164. output = formatter.substitute(s)
  165. return output
  166. def formatter_for_name(self, formatter):
  167. """Look up or create a Formatter for the given identifier,
  168. if necessary.
  169. :param formatter: Can be a Formatter object (used as-is), a
  170. function (used as the entity substitution hook for an
  171. XMLFormatter or HTMLFormatter), or a string (used to look
  172. up an XMLFormatter or HTMLFormatter in the appropriate
  173. registry.
  174. """
  175. if isinstance(formatter, Formatter):
  176. return formatter
  177. if self._is_xml:
  178. c = XMLFormatter
  179. else:
  180. c = HTMLFormatter
  181. if isinstance(formatter, Callable):
  182. return c(entity_substitution=formatter)
  183. return c.REGISTRY[formatter]
  184. @property
  185. def _is_xml(self):
  186. """Is this element part of an XML tree or an HTML tree?
  187. This is used in formatter_for_name, when deciding whether an
  188. XMLFormatter or HTMLFormatter is more appropriate. It can be
  189. inefficient, but it should be called very rarely.
  190. """
  191. if self.known_xml is not None:
  192. # Most of the time we will have determined this when the
  193. # document is parsed.
  194. return self.known_xml
  195. # Otherwise, it's likely that this element was created by
  196. # direct invocation of the constructor from within the user's
  197. # Python code.
  198. if self.parent is None:
  199. # This is the top-level object. It should have .known_xml set
  200. # from tree creation. If not, take a guess--BS is usually
  201. # used on HTML markup.
  202. return getattr(self, 'is_xml', False)
  203. return self.parent._is_xml
    # camelCase names from Beautiful Soup 3, built with _alias so that
    # they forward to the snake_case sibling attributes.
    nextSibling = _alias("next_sibling") # BS3
    previousSibling = _alias("previous_sibling") # BS3

    # Unique sentinel used as the default for the `types` argument of
    # _all_strings() and get_text(); presumably so "not supplied" can be
    # told apart from any real value -- confirm in the subclasses.
    default = object()
  207. def _all_strings(self, strip=False, types=default):
  208. """Yield all strings of certain classes, possibly stripping them.
  209. This is implemented differently in Tag and NavigableString.
  210. """
  211. raise NotImplementedError()
  212. @property
  213. def stripped_strings(self):
  214. """Yield all strings in this PageElement, stripping them first.
  215. :yield: A sequence of stripped strings.
  216. """
  217. for string in self._all_strings(True):
  218. yield string
  219. def get_text(self, separator="", strip=False,
  220. types=default):
  221. """Get all child strings of this PageElement, concatenated using the
  222. given separator.
  223. :param separator: Strings will be concatenated using this separator.
  224. :param strip: If True, strings will be stripped before being
  225. concatenated.
  226. :param types: A tuple of NavigableString subclasses. Any
  227. strings of a subclass not found in this list will be
  228. ignored. Although there are exceptions, the default
  229. behavior in most cases is to consider only NavigableString
  230. and CData objects. That means no comments, processing
  231. instructions, etc.
  232. :return: A string.
  233. """
  234. return separator.join([s for s in self._all_strings(
  235. strip, types=types)])
  236. getText = get_text
  237. text = property(get_text)
  238. def replace_with(self, *args):
  239. """Replace this PageElement with one or more PageElements, keeping the
  240. rest of the tree the same.
  241. :param args: One or more PageElements.
  242. :return: `self`, no longer part of the tree.
  243. """
  244. if self.parent is None:
  245. raise ValueError(
  246. "Cannot replace one element with another when the "
  247. "element to be replaced is not part of a tree.")
  248. if len(args) == 1 and args[0] is self:
  249. return
  250. if any(x is self.parent for x in args):
  251. raise ValueError("Cannot replace a Tag with its parent.")
  252. old_parent = self.parent
  253. my_index = self.parent.index(self)
  254. self.extract(_self_index=my_index)
  255. for idx, replace_with in enumerate(args, start=my_index):
  256. old_parent.insert(idx, replace_with)
  257. return self
  258. replaceWith = replace_with # BS3
  259. def unwrap(self):
  260. """Replace this PageElement with its contents.
  261. :return: `self`, no longer part of the tree.
  262. """
  263. my_parent = self.parent
  264. if self.parent is None:
  265. raise ValueError(
  266. "Cannot replace an element with its contents when that"
  267. "element is not part of a tree.")
  268. my_index = self.parent.index(self)
  269. self.extract(_self_index=my_index)
  270. for child in reversed(self.contents[:]):
  271. my_parent.insert(my_index, child)
  272. return self
  273. replace_with_children = unwrap
  274. replaceWithChildren = unwrap # BS3
  275. def wrap(self, wrap_inside):
  276. """Wrap this PageElement inside another one.
  277. :param wrap_inside: A PageElement.
  278. :return: `wrap_inside`, occupying the position in the tree that used
  279. to be occupied by `self`, and with `self` inside it.
  280. """
  281. me = self.replace_with(wrap_inside)
  282. wrap_inside.append(me)
  283. return wrap_inside
    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        The element keeps its own descendants; only its links to the
        surrounding document (parent, siblings, and the element chain
        before and after its subtree) are cut and re-joined.

        :param _self_index: The location of this element in its parent's
            .contents, if known. Passing this in allows for a performance
            optimization.
        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        # The "is not" checks avoid linking an element to itself when
        # the element before and the element after are the same object.
        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Seal both ends of the extracted subtree's element chain.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other, with the same
        # self-link guard as above.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
  316. def _last_descendant(self, is_initialized=True, accept_self=True):
  317. """Finds the last element beneath this object to be parsed.
  318. :param is_initialized: Has `setup` been called on this PageElement
  319. yet?
  320. :param accept_self: Is `self` an acceptable answer to the question?
  321. """
  322. if is_initialized and self.next_sibling is not None:
  323. last_child = self.next_sibling.previous_element
  324. else:
  325. last_child = self
  326. while isinstance(last_child, Tag) and last_child.contents:
  327. last_child = last_child.contents[-1]
  328. if not accept_self and last_child is self:
  329. last_child = None
  330. return last_child
  331. # BS3: Not part of the API!
  332. _lastRecursiveChild = _last_descendant
    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.children` by the new PageElement. Values past the
           end are clamped to `len(self.contents)`.
        :param new_child: A PageElement. A plain `str` is converted to
           a NavigableString first; a BeautifulSoup object is not
           inserted itself, its children are inserted one by one.
        :raises ValueError: If `new_child` is None or is `self`.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported here rather than at module level -- presumably to
        # avoid a circular import with the bs4 package; confirm.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # We're 'inserting' an element that's already one
            # of this object's children.
            if new_child.parent is self:
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            # Detach from whatever tree currently holds the child.
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: nothing before it on this level, and the
            # element parsed just before it is this container.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        # End of the new child's own subtree; its next_element must
        # point at whatever follows the insertion point.
        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            new_child.next_sibling = None

            # Appending at the end: walk up the ancestors until one of
            # them has a next sibling.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        # Only now does the child actually enter the children list.
        self.contents.insert(position, new_child)
  406. def append(self, tag):
  407. """Appends the given PageElement to the contents of this one.
  408. :param tag: A PageElement.
  409. """
  410. self.insert(len(self.contents), tag)
  411. def extend(self, tags):
  412. """Appends the given PageElements to this one's contents.
  413. :param tags: A list of PageElements. If a single Tag is
  414. provided instead, this PageElement's contents will be extended
  415. with that Tag's contents.
  416. """
  417. if isinstance(tags, Tag):
  418. tags = tags.contents
  419. if isinstance(tags, list):
  420. # Moving items around the tree may change their position in
  421. # the original list. Make a list that won't change.
  422. tags = list(tags)
  423. for tag in tags:
  424. self.append(tag)
  425. def insert_before(self, *args):
  426. """Makes the given element(s) the immediate predecessor of this one.
  427. All the elements will have the same parent, and the given elements
  428. will be immediately before this one.
  429. :param args: One or more PageElements.
  430. """
  431. parent = self.parent
  432. if parent is None:
  433. raise ValueError(
  434. "Element has no parent, so 'before' has no meaning.")
  435. if any(x is self for x in args):
  436. raise ValueError("Can't insert an element before itself.")
  437. for predecessor in args:
  438. # Extract first so that the index won't be screwed up if they
  439. # are siblings.
  440. if isinstance(predecessor, PageElement):
  441. predecessor.extract()
  442. index = parent.index(self)
  443. parent.insert(index, predecessor)
  444. def insert_after(self, *args):
  445. """Makes the given element(s) the immediate successor of this one.
  446. The elements will have the same parent, and the given elements
  447. will be immediately after this one.
  448. :param args: One or more PageElements.
  449. """
  450. # Do all error checking before modifying the tree.
  451. parent = self.parent
  452. if parent is None:
  453. raise ValueError(
  454. "Element has no parent, so 'after' has no meaning.")
  455. if any(x is self for x in args):
  456. raise ValueError("Can't insert an element after itself.")
  457. offset = 0
  458. for successor in args:
  459. # Extract first so that the index won't be screwed up if they
  460. # are siblings.
  461. if isinstance(successor, PageElement):
  462. successor.extract()
  463. index = parent.index(self)
  464. parent.insert(index+1+offset, successor)
  465. offset += 1
  466. def find_next(self, name=None, attrs={}, string=None, **kwargs):
  467. """Find the first PageElement that matches the given criteria and
  468. appears later in the document than this PageElement.
  469. All find_* methods take a common set of arguments. See the online
  470. documentation for detailed explanations.
  471. :param name: A filter on tag name.
  472. :param attrs: A dictionary of filters on attribute values.
  473. :param string: A filter for a NavigableString with specific text.
  474. :kwargs: A dictionary of filters on attribute values.
  475. :return: A PageElement.
  476. :rtype: bs4.element.Tag | bs4.element.NavigableString
  477. """
  478. return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
  479. findNext = find_next # BS3
  480. def find_all_next(self, name=None, attrs={}, string=None, limit=None,
  481. **kwargs):
  482. """Find all PageElements that match the given criteria and appear
  483. later in the document than this PageElement.
  484. All find_* methods take a common set of arguments. See the online
  485. documentation for detailed explanations.
  486. :param name: A filter on tag name.
  487. :param attrs: A dictionary of filters on attribute values.
  488. :param string: A filter for a NavigableString with specific text.
  489. :param limit: Stop looking after finding this many results.
  490. :kwargs: A dictionary of filters on attribute values.
  491. :return: A ResultSet containing PageElements.
  492. """
  493. _stacklevel = kwargs.pop('_stacklevel', 2)
  494. return self._find_all(name, attrs, string, limit, self.next_elements,
  495. _stacklevel=_stacklevel+1, **kwargs)
  496. findAllNext = find_all_next # BS3
  497. def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
  498. """Find the closest sibling to this PageElement that matches the
  499. given criteria and appears later in the document.
  500. All find_* methods take a common set of arguments. See the
  501. online documentation for detailed explanations.
  502. :param name: A filter on tag name.
  503. :param attrs: A dictionary of filters on attribute values.
  504. :param string: A filter for a NavigableString with specific text.
  505. :kwargs: A dictionary of filters on attribute values.
  506. :return: A PageElement.
  507. :rtype: bs4.element.Tag | bs4.element.NavigableString
  508. """
  509. return self._find_one(self.find_next_siblings, name, attrs, string,
  510. **kwargs)
  511. findNextSibling = find_next_sibling # BS3
  512. def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
  513. **kwargs):
  514. """Find all siblings of this PageElement that match the given criteria
  515. and appear later in the document.
  516. All find_* methods take a common set of arguments. See the online
  517. documentation for detailed explanations.
  518. :param name: A filter on tag name.
  519. :param attrs: A dictionary of filters on attribute values.
  520. :param string: A filter for a NavigableString with specific text.
  521. :param limit: Stop looking after finding this many results.
  522. :kwargs: A dictionary of filters on attribute values.
  523. :return: A ResultSet of PageElements.
  524. :rtype: bs4.element.ResultSet
  525. """
  526. _stacklevel = kwargs.pop('_stacklevel', 2)
  527. return self._find_all(
  528. name, attrs, string, limit,
  529. self.next_siblings, _stacklevel=_stacklevel+1, **kwargs
  530. )
  531. findNextSiblings = find_next_siblings # BS3
  532. fetchNextSiblings = find_next_siblings # BS2
  533. def find_previous(self, name=None, attrs={}, string=None, **kwargs):
  534. """Look backwards in the document from this PageElement and find the
  535. first PageElement that matches the given criteria.
  536. All find_* methods take a common set of arguments. See the online
  537. documentation for detailed explanations.
  538. :param name: A filter on tag name.
  539. :param attrs: A dictionary of filters on attribute values.
  540. :param string: A filter for a NavigableString with specific text.
  541. :kwargs: A dictionary of filters on attribute values.
  542. :return: A PageElement.
  543. :rtype: bs4.element.Tag | bs4.element.NavigableString
  544. """
  545. return self._find_one(
  546. self.find_all_previous, name, attrs, string, **kwargs)
  547. findPrevious = find_previous # BS3
  548. def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
  549. **kwargs):
  550. """Look backwards in the document from this PageElement and find all
  551. PageElements that match the given criteria.
  552. All find_* methods take a common set of arguments. See the online
  553. documentation for detailed explanations.
  554. :param name: A filter on tag name.
  555. :param attrs: A dictionary of filters on attribute values.
  556. :param string: A filter for a NavigableString with specific text.
  557. :param limit: Stop looking after finding this many results.
  558. :kwargs: A dictionary of filters on attribute values.
  559. :return: A ResultSet of PageElements.
  560. :rtype: bs4.element.ResultSet
  561. """
  562. _stacklevel = kwargs.pop('_stacklevel', 2)
  563. return self._find_all(
  564. name, attrs, string, limit, self.previous_elements,
  565. _stacklevel=_stacklevel+1, **kwargs
  566. )
  567. findAllPrevious = find_all_previous # BS3
  568. fetchPrevious = find_all_previous # BS2
  569. def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
  570. """Returns the closest sibling to this PageElement that matches the
  571. given criteria and appears earlier in the document.
  572. All find_* methods take a common set of arguments. See the online
  573. documentation for detailed explanations.
  574. :param name: A filter on tag name.
  575. :param attrs: A dictionary of filters on attribute values.
  576. :param string: A filter for a NavigableString with specific text.
  577. :kwargs: A dictionary of filters on attribute values.
  578. :return: A PageElement.
  579. :rtype: bs4.element.Tag | bs4.element.NavigableString
  580. """
  581. return self._find_one(self.find_previous_siblings, name, attrs, string,
  582. **kwargs)
  583. findPreviousSibling = find_previous_sibling # BS3
  584. def find_previous_siblings(self, name=None, attrs={}, string=None,
  585. limit=None, **kwargs):
  586. """Returns all siblings to this PageElement that match the
  587. given criteria and appear earlier in the document.
  588. All find_* methods take a common set of arguments. See the online
  589. documentation for detailed explanations.
  590. :param name: A filter on tag name.
  591. :param attrs: A dictionary of filters on attribute values.
  592. :param string: A filter for a NavigableString with specific text.
  593. :param limit: Stop looking after finding this many results.
  594. :kwargs: A dictionary of filters on attribute values.
  595. :return: A ResultSet of PageElements.
  596. :rtype: bs4.element.ResultSet
  597. """
  598. _stacklevel = kwargs.pop('_stacklevel', 2)
  599. return self._find_all(
  600. name, attrs, string, limit,
  601. self.previous_siblings, _stacklevel=_stacklevel+1, **kwargs
  602. )
  603. findPreviousSiblings = find_previous_siblings # BS3
  604. fetchPreviousSiblings = find_previous_siblings # BS2
  605. def find_parent(self, name=None, attrs={}, **kwargs):
  606. """Find the closest parent of this PageElement that matches the given
  607. criteria.
  608. All find_* methods take a common set of arguments. See the online
  609. documentation for detailed explanations.
  610. :param name: A filter on tag name.
  611. :param attrs: A dictionary of filters on attribute values.
  612. :kwargs: A dictionary of filters on attribute values.
  613. :return: A PageElement.
  614. :rtype: bs4.element.Tag | bs4.element.NavigableString
  615. """
  616. # NOTE: We can't use _find_one because findParents takes a different
  617. # set of arguments.
  618. r = None
  619. l = self.find_parents(name, attrs, 1, _stacklevel=3, **kwargs)
  620. if l:
  621. r = l[0]
  622. return r
  623. findParent = find_parent # BS3
  624. def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
  625. """Find all parents of this PageElement that match the given criteria.
  626. All find_* methods take a common set of arguments. See the online
  627. documentation for detailed explanations.
  628. :param name: A filter on tag name.
  629. :param attrs: A dictionary of filters on attribute values.
  630. :param limit: Stop looking after finding this many results.
  631. :kwargs: A dictionary of filters on attribute values.
  632. :return: A PageElement.
  633. :rtype: bs4.element.Tag | bs4.element.NavigableString
  634. """
  635. _stacklevel = kwargs.pop('_stacklevel', 2)
  636. return self._find_all(name, attrs, None, limit, self.parents,
  637. _stacklevel=_stacklevel+1, **kwargs)
  638. findParents = find_parents # BS3
  639. fetchParents = find_parents # BS2
  640. @property
  641. def next(self):
  642. """The PageElement, if any, that was parsed just after this one.
  643. :return: A PageElement.
  644. :rtype: bs4.element.Tag | bs4.element.NavigableString
  645. """
  646. return self.next_element
  647. @property
  648. def previous(self):
  649. """The PageElement, if any, that was parsed just before this one.
  650. :return: A PageElement.
  651. :rtype: bs4.element.Tag | bs4.element.NavigableString
  652. """
  653. return self.previous_element
  654. #These methods do the real heavy lifting.
  655. def _find_one(self, method, name, attrs, string, **kwargs):
  656. r = None
  657. l = method(name, attrs, string, 1, _stacklevel=4, **kwargs)
  658. if l:
  659. r = l[0]
  660. return r
  661. def _find_all(self, name, attrs, string, limit, generator, **kwargs):
  662. "Iterates over a generator looking for things that match."
  663. _stacklevel = kwargs.pop('_stacklevel', 3)
  664. if string is None and 'text' in kwargs:
  665. string = kwargs.pop('text')
  666. warnings.warn(
  667. "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
  668. DeprecationWarning, stacklevel=_stacklevel
  669. )
  670. if isinstance(name, SoupStrainer):
  671. strainer = name
  672. else:
  673. strainer = SoupStrainer(name, attrs, string, **kwargs)
  674. if string is None and not limit and not attrs and not kwargs:
  675. if name is True or name is None:
  676. # Optimization to find all tags.
  677. result = (element for element in generator
  678. if isinstance(element, Tag))
  679. return ResultSet(strainer, result)
  680. elif isinstance(name, str):
  681. # Optimization to find all tags with a given name.
  682. if name.count(':') == 1:
  683. # This is a name with a prefix. If this is a namespace-aware document,
  684. # we need to match the local name against tag.name. If not,
  685. # we need to match the fully-qualified name against tag.name.
  686. prefix, local_name = name.split(':', 1)
  687. else:
  688. prefix = None
  689. local_name = name
  690. result = (element for element in generator
  691. if isinstance(element, Tag)
  692. and (
  693. element.name == name
  694. ) or (
  695. element.name == local_name
  696. and (prefix is None or element.prefix == prefix)
  697. )
  698. )
  699. return ResultSet(strainer, result)
  700. results = ResultSet(strainer)
  701. while True:
  702. try:
  703. i = next(generator)
  704. except StopIteration:
  705. break
  706. if i:
  707. found = strainer.search(i)
  708. if found:
  709. results.append(found)
  710. if limit and len(results) >= limit:
  711. break
  712. return results
  713. #These generators can be used to navigate starting from both
  714. #NavigableStrings and Tags.
  715. @property
  716. def next_elements(self):
  717. """All PageElements that were parsed after this one.
  718. :yield: A sequence of PageElements.
  719. """
  720. i = self.next_element
  721. while i is not None:
  722. yield i
  723. i = i.next_element
  @property
  def next_siblings(self):
      """Iterate over the siblings of this PageElement that were parsed
      after it.

      The walk follows the `next_sibling` links and stops at the first
      link that is None.

      :yield: A sequence of PageElements.
      """
      sibling = self.next_sibling
      while sibling is not None:
          yield sibling
          sibling = sibling.next_sibling
  @property
  def previous_elements(self):
      """Iterate over every PageElement that was parsed before this one.

      The walk follows the `previous_element` links and stops at the
      first link that is None.

      :yield: A sequence of PageElements.
      """
      node = self
      while True:
          node = node.previous_element
          if node is None:
              return
          yield node
  @property
  def previous_siblings(self):
      """Iterate over the siblings of this PageElement that were parsed
      before it.

      The walk follows the `previous_sibling` links and stops at the
      first link that is None.

      :yield: A sequence of PageElements.
      """
      sibling = self.previous_sibling
      while sibling is not None:
          yield sibling
          sibling = sibling.previous_sibling
  @property
  def parents(self):
      """Iterate over the ancestors of this PageElement, nearest first.

      The walk follows the `parent` links and stops at the first link
      that is None.

      :yield: A sequence of PageElements.
      """
      ancestor = self
      while True:
          ancestor = ancestor.parent
          if ancestor is None:
              return
          yield ancestor
  @property
  def decomposed(self):
      """Report whether this PageElement has been decomposed.

      Reads the `_decomposed` attribute; a missing or falsy value is
      reported as False.

      :rtype: bool
      """
      marker = getattr(self, '_decomposed', False)
      return marker if marker else False
  768. # Old non-property versions of the generators, for backwards
  769. # compatibility with BS3.
  def nextGenerator(self):
      """BS3-compatible method spelling of the `next_elements` property."""
      elements = self.next_elements
      return elements
  def nextSiblingGenerator(self):
      """BS3-compatible method spelling of the `next_siblings` property."""
      siblings = self.next_siblings
      return siblings
  def previousGenerator(self):
      """BS3-compatible method spelling of the `previous_elements` property."""
      elements = self.previous_elements
      return elements
  def previousSiblingGenerator(self):
      """BS3-compatible method spelling of the `previous_siblings` property."""
      siblings = self.previous_siblings
      return siblings
  def parentGenerator(self):
      """BS3-compatible method spelling of the `parents` property."""
      ancestors = self.parents
      return ancestors
  class NavigableString(str, PageElement):
      """A Python Unicode string that lives inside a parse tree.

      Parsing the markup <b>penguin</b> produces a NavigableString
      holding the text "penguin".
      """

      # Text placed before and after the string when it is rendered by
      # output_ready(). Subclasses override these.
      PREFIX = ''
      SUFFIX = ''

      def __new__(cls, value):
          """Create a new NavigableString.

          When unpickling a NavigableString, this method is called with
          the string in DEFAULT_OUTPUT_ENCODING. That encoding has to be
          handed to str.__new__, otherwise non-ASCII characters cannot
          be decoded.

          :param value: The text, either as a str or as an encoded value
              that str can decode using DEFAULT_OUTPUT_ENCODING.
          """
          # Only a non-str value needs the explicit encoding argument.
          decode_args = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
          instance = str.__new__(cls, value, *decode_args)
          instance.setup()
          return instance

      def __deepcopy__(self, memo, recursive=False):
          """Return a string of the same class and content that is not
          connected to the parse tree.

          :param memo: Unused.
          :param recursive: Ignored; it exists only so that this
              signature matches Tag.__deepcopy__.
          """
          string_class = type(self)
          return string_class(self)

      def __copy__(self):
          """Return a deep copy.

          A shallow copy is not possible, because only one PageElement
          can occupy a given place in a parse tree.
          """
          memo = {}
          return self.__deepcopy__(memo)

      def __getnewargs__(self):
          """Return the constructor arguments: the text as a plain str."""
          plain_text = str(self)
          return (plain_text,)

      def __getattr__(self, attr):
          """Resolve `.string` to the string itself.

          This is for backwards compatibility for Navigable*String, but
          for CData* it lets you get the string without the CData
          wrapper.

          :raise AttributeError: For every attribute other than 'string'.
          """
          if attr != 'string':
              raise AttributeError(
                  "'%s' object has no attribute '%s'" % (
                      self.__class__.__name__, attr))
          return self

      def output_ready(self, formatter="minimal"):
          """Pass the string through a formatter and wrap the result in
          PREFIX and SUFFIX.

          :param formatter: A Formatter object, or a string naming one of
              the standard formatters.
          """
          formatted = self.format_string(self, formatter)
          return self.PREFIX + formatted + self.SUFFIX

      @property
      def name(self):
          """Always None: a NavigableString is not a Tag and has no name.

          The property exists so that code like this doesn't crash when
          run on a mixture of Tag and NavigableString objects:

              [x.name for x in tag.children]
          """
          return None

      @name.setter
      def name(self, name):
          """Refuse every attempt to give a NavigableString a name."""
          raise AttributeError("A NavigableString cannot be given a name.")

      def _all_strings(self, strip=False, types=PageElement.default):
          """Yield this string if it is of a wanted class, possibly stripped.

          This lets NavigableString offer conveniences such as
          get_text(), so that all PageElements share one text-extraction
          API.

          :param strip: If True, the string is stripped before being
              yielded.
          :param types: A NavigableString subclass or a tuple of them. If
              this string's exact class isn't among them, nothing is
              yielded. None disables the filter. By default the classes
              in Tag.DEFAULT_INTERESTING_STRING_TYPES are used.
          :yield: A sequence that either contains this string, or is empty.
          """
          if types is self.default:
              # The default lives on Tag because it names subclasses of
              # this class, which are only defined later in the file.
              types = Tag.DEFAULT_INTERESTING_STRING_TYPES

          # Compare exact classes rather than using isinstance(): every
          # candidate class derives from NavigableString, so isinstance()
          # would also let through the specialised subclasses that a
          # caller asking for plain NavigableStrings does not want.
          own_class = type(self)
          if types is not None:
              if isinstance(types, type):
                  # Looking for a single type.
                  wanted = own_class is types
              else:
                  # Looking for one of a list of types.
                  wanted = own_class in types
              if not wanted:
                  return

          text = self.strip() if strip else self
          if len(text) > 0:
              yield text

      strings = property(_all_strings)
  class PreformattedString(NavigableString):
      """A NavigableString that bypasses the normal formatting rules.

      This is an abstract class used for special kinds of strings such
      as comments (the Comment class) and CDATA blocks (the CData
      class).
      """

      PREFIX = ''
      SUFFIX = ''

      def output_ready(self, formatter=None):
          """Wrap the unmodified string in this class's PREFIX and SUFFIX.

          :param formatter: A Formatter object, or a string naming one
              of the standard formatters. When given, the string is
              passed through it, but only to trigger any side effects:
              the formatted result is discarded.
          :return: The string, with any subclass-specific prefix and
              suffix added on.
          """
          if formatter is not None:
              # Called for its side effects only.
              self.format_string(self, formatter)
          return self.PREFIX + self + self.SUFFIX
  class CData(PreformattedString):
      """A CDATA block, rendered between '<![CDATA[' and ']]>'."""

      PREFIX = "<![CDATA["
      SUFFIX = "]]>"
  class ProcessingInstruction(PreformattedString):
      """An SGML processing instruction, rendered between '<?' and '>'."""

      PREFIX = "<?"
      SUFFIX = ">"
  class XMLProcessingInstruction(ProcessingInstruction):
      """An XML processing instruction, rendered between '<?' and '?>'."""

      PREFIX = "<?"
      SUFFIX = "?>"
  class Comment(PreformattedString):
      """An HTML or XML comment, rendered between '<!--' and '-->'."""

      PREFIX = "<!--"
      SUFFIX = "-->"
  class Declaration(PreformattedString):
      """An XML declaration, rendered between '<?' and '?>'."""

      PREFIX = "<?"
      SUFFIX = "?>"
  class Doctype(PreformattedString):
      """A document type declaration."""

      @classmethod
      def for_name_and_ids(cls, name, pub_id, system_id):
          """Generate an appropriate document type declaration for a given
          public ID and system ID.

          :param name: The name of the document's root element, e.g. 'html'.
              A falsy name is treated as the empty string.
          :param pub_id: The Formal Public Identifier for this document type,
              e.g. '-//W3C//DTD XHTML 1.1//EN', or None.
          :param system_id: The system identifier for this document type,
              e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd',
              or None.
          :return: An instance of the class this method was called on.
          """
          value = name or ''
          if pub_id is not None:
              value += ' PUBLIC "%s"' % pub_id
              # With a public ID present, the system ID follows it
              # directly, without a SYSTEM keyword.
              if system_id is not None:
                  value += ' "%s"' % system_id
          elif system_id is not None:
              value += ' SYSTEM "%s"' % system_id
          # Construct through `cls` rather than naming Doctype, so that a
          # subclass calling this alternate constructor gets an instance
          # of itself.
          return cls(value)

      PREFIX = '<!DOCTYPE '
      SUFFIX = '>\n'
  class Stylesheet(NavigableString):
      """A NavigableString holding a stylesheet (probably CSS).

      The separate class distinguishes embedded stylesheets from textual
      content.
      """
  class Script(NavigableString):
      """A NavigableString holding an executable script (probably
      Javascript).

      The separate class distinguishes executable code from textual
      content.
      """
  class TemplateString(NavigableString):
      """A NavigableString found inside an HTML template that is embedded
      in a larger document.

      The separate class distinguishes such strings from the main body of
      the document.
      """
  class RubyTextString(NavigableString):
      """A NavigableString holding the contents of the <rt> HTML element.

      https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

      The separate class makes it possible to tell such strings apart
      from the strings they annotate.
      """
  class RubyParenthesisString(NavigableString):
      """A NavigableString holding the contents of the <rp> HTML element.

      https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
      """
  979. class Tag(PageElement):
  980. """Represents an HTML or XML tag that is part of a parse tree, along
  981. with its attributes and contents.
  982. When Beautiful Soup parses the markup <b>penguin</b>, it will
  983. create a Tag object representing the <b> tag.
  984. """
  def __init__(self, parser=None, builder=None, name=None, namespace=None,
               prefix=None, attrs=None, parent=None, previous=None,
               is_xml=None, sourceline=None, sourcepos=None,
               can_be_empty_element=None, cdata_list_attributes=None,
               preserve_whitespace_tags=None,
               interesting_string_types=None,
               namespaces=None
  ):
      """Basic constructor.

      :param parser: A BeautifulSoup object. Only its class is kept, as
          `parser_class`.
      :param builder: A TreeBuilder. When given, it (rather than the
          corresponding keyword arguments) decides `known_xml`,
          `can_be_empty_element`, `cdata_list_attributes`,
          `preserve_whitespace_tags` and `interesting_string_types`.
      :param name: The name of the tag.
      :param namespace: The URI of this Tag's XML namespace, if any.
      :param prefix: The prefix for this Tag's XML namespace, if any.
      :param attrs: A dictionary of this Tag's attribute values. The
          Tag stores a new dictionary, not the object passed in.
      :param parent: The PageElement to use as this Tag's parent.
      :param previous: The PageElement that was parsed immediately before
          this tag.
      :param is_xml: If True, this is an XML tag. Otherwise, this is an
          HTML tag. Ignored when a truthy `builder` is given.
      :param sourceline: The line number where this tag was found in its
          source document.
      :param sourcepos: The character position within `sourceline` where this
          tag was found.
      :param can_be_empty_element: If True, this tag should be
          represented as <tag/>. If False, this tag should be represented
          as <tag></tag>.
      :param cdata_list_attributes: A list of attributes whose values should
          be treated as CDATA if they ever show up on this tag.
      :param preserve_whitespace_tags: A list of tag names whose contents
          should have their whitespace preserved.
      :param interesting_string_types: This is a NavigableString
          subclass or a tuple of them. When iterating over this
          Tag's strings in methods like Tag.strings or Tag.get_text,
          these are the types of strings that are interesting enough
          to be considered. The default is to consider
          NavigableString and CData the only interesting string
          subtypes.
      :param namespaces: A dictionary mapping currently active
          namespace prefixes to URIs. This can be used later to
          construct CSS selectors.
      :raises ValueError: If `name` is None.
      """
      if parser is None:
          self.parser_class = None
      else:
          # We don't actually store the parser object: that lets extracted
          # chunks be garbage-collected.
          self.parser_class = parser.__class__
      if name is None:
          raise ValueError("No value provided for new tag's name.")
      self.name = name
      self.namespace = namespace
      # A falsy `namespaces` (None or an empty dict) is replaced by a
      # fresh empty dict; a non-empty dict is stored as-is, not copied.
      self._namespaces = namespaces or {}
      self.prefix = prefix
      # sourceline/sourcepos are only set as instance attributes when at
      # least one of them was supplied and the builder (if any) stores
      # line numbers. Otherwise the attributes are simply not created.
      if ((not builder or builder.store_line_numbers)
          and (sourceline is not None or sourcepos is not None)):
          self.sourceline = sourceline
          self.sourcepos = sourcepos
      if attrs is None:
          attrs = {}
      elif attrs:
          if builder is not None and builder.cdata_list_attributes:
              # Let the builder produce the attribute dictionary, with
              # the values of CDATA-list attributes replaced.
              attrs = builder._replace_cdata_list_attribute_values(
                  self.name, attrs)
          else:
              attrs = dict(attrs)
      else:
          attrs = dict(attrs)
      # If possible, determine ahead of time whether this tag is an
      # XML tag.
      if builder:
          self.known_xml = builder.is_xml
      else:
          self.known_xml = is_xml
      self.attrs = attrs
      self.contents = []
      # setup() is defined outside this block; presumably it links this
      # tag to `parent` and `previous` -- TODO confirm.
      self.setup(parent, previous)
      self.hidden = False
      if builder is None:
          # In the absence of a TreeBuilder, use whatever values were
          # passed in here. They're probably None, unless this is a copy of some
          # other tag.
          self.can_be_empty_element = can_be_empty_element
          self.cdata_list_attributes = cdata_list_attributes
          self.preserve_whitespace_tags = preserve_whitespace_tags
          self.interesting_string_types = interesting_string_types
      else:
          # Set up any substitutions for this tag, such as the charset in a META tag.
          builder.set_up_substitutions(self)
          # Ask the TreeBuilder whether this tag might be an empty-element tag.
          self.can_be_empty_element = builder.can_be_empty_element(name)
          # Keep track of the list of attributes of this tag that
          # might need to be treated as a list.
          #
          # For performance reasons, we store the whole data structure
          # rather than asking the question of every tag. Asking would
          # require building a new data structure every time, and
          # (unlike can_be_empty_element), we almost never need
          # to check this.
          self.cdata_list_attributes = builder.cdata_list_attributes
          # Keep track of the names that might cause this tag to be treated as a
          # whitespace-preserved tag.
          self.preserve_whitespace_tags = builder.preserve_whitespace_tags
          if self.name in builder.string_containers:
              # This sort of tag uses a special string container
              # subclass for most of its strings. Record that subclass
              # as the type of string considered interesting for this
              # tag.
              self.interesting_string_types = builder.string_containers[self.name]
          else:
              self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES

  # BS3 spelling of the `parser_class` attribute.
  parserClass = _alias("parser_class") # BS3
  1095. def __deepcopy__(self, memo, recursive=True):
  1096. """A deepcopy of a Tag is a new Tag, unconnected to the parse tree.
  1097. Its contents are a copy of the old Tag's contents.
  1098. """
  1099. clone = self._clone()
  1100. if recursive:
  1101. # Clone this tag's descendants recursively, but without
  1102. # making any recursive function calls.
  1103. tag_stack = [clone]
  1104. for event, element in self._event_stream(self.descendants):
  1105. if event is Tag.END_ELEMENT_EVENT:
  1106. # Stop appending incoming Tags to the Tag that was
  1107. # just closed.
  1108. tag_stack.pop()
  1109. else:
  1110. descendant_clone = element.__deepcopy__(
  1111. memo, recursive=False
  1112. )
  1113. # Add to its parent's .contents
  1114. tag_stack[-1].append(descendant_clone)
  1115. if event is Tag.START_ELEMENT_EVENT:
  1116. # Add the Tag itself to the stack so that its
  1117. # children will be .appended to it.
  1118. tag_stack.append(descendant_clone)
  1119. return clone
  1120. def __copy__(self):
  1121. """A copy of a Tag must always be a deep copy, because a Tag's
  1122. children can only have one parent at a time.
  1123. """
  1124. return self.__deepcopy__({})
  1125. def _clone(self):
  1126. """Create a new Tag just like this one, but with no
  1127. contents and unattached to any parse tree.
  1128. This is the first step in the deepcopy process.
  1129. """
  1130. clone = type(self)(
  1131. None, None, self.name, self.namespace,
  1132. self.prefix, self.attrs, is_xml=self._is_xml,
  1133. sourceline=self.sourceline, sourcepos=self.sourcepos,
  1134. can_be_empty_element=self.can_be_empty_element,
  1135. cdata_list_attributes=self.cdata_list_attributes,
  1136. preserve_whitespace_tags=self.preserve_whitespace_tags,
  1137. interesting_string_types=self.interesting_string_types
  1138. )
  1139. for attr in ('can_be_empty_element', 'hidden'):
  1140. setattr(clone, attr, getattr(self, attr))
  1141. return clone
  1142. @property
  1143. def is_empty_element(self):
  1144. """Is this tag an empty-element tag? (aka a self-closing tag)
  1145. A tag that has contents is never an empty-element tag.
  1146. A tag that has no contents may or may not be an empty-element
  1147. tag. It depends on the builder used to create the tag. If the
  1148. builder has a designated list of empty-element tags, then only
  1149. a tag whose name shows up in that list is considered an
  1150. empty-element tag.
  1151. If the builder has no designated list of empty-element tags,
  1152. then any tag with no contents is an empty-element tag.
  1153. """
  1154. return len(self.contents) == 0 and self.can_be_empty_element
  1155. isSelfClosing = is_empty_element # BS3
  @property
  def string(self):
      """Convenience property to get the single string within this
      PageElement.

      TODO It might make sense to have NavigableString.string return
      itself.

      :return: If this element has exactly one child and that child is
          a string, that string. If the only child is a tag, the
          'string' attribute of that tag, recursively. If this element
          has no children or more than one child, None.
      """
      children = self.contents
      if len(children) == 1:
          only_child = children[0]
          if isinstance(only_child, NavigableString):
              return only_child
          return only_child.string
      return None

  @string.setter
  def string(self, string):
      """Replace this PageElement's contents with `string`.

      The new child is a fresh object of the same class as `string`.
      """
      self.clear()
      string_class = string.__class__
      self.append(string_class(string))
  1179. DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)
  1180. def _all_strings(self, strip=False, types=PageElement.default):
  1181. """Yield all strings of certain classes, possibly stripping them.
  1182. :param strip: If True, all strings will be stripped before being
  1183. yielded.
  1184. :param types: A tuple of NavigableString subclasses. Any strings of
  1185. a subclass not found in this list will be ignored. By
  1186. default, the subclasses considered are the ones found in
  1187. self.interesting_string_types. If that's not specified,
  1188. only NavigableString and CData objects will be
  1189. considered. That means no comments, processing
  1190. instructions, etc.
  1191. :yield: A sequence of strings.
  1192. """
  1193. if types is self.default:
  1194. types = self.interesting_string_types
  1195. for descendant in self.descendants:
  1196. if (types is None and not isinstance(descendant, NavigableString)):
  1197. continue
  1198. descendant_type = type(descendant)
  1199. if isinstance(types, type):
  1200. if descendant_type is not types:
  1201. # We're not interested in strings of this type.
  1202. continue
  1203. elif types is not None and descendant_type not in types:
  1204. # We're not interested in strings of this type.
  1205. continue
  1206. if strip:
  1207. descendant = descendant.strip()
  1208. if len(descendant) == 0:
  1209. continue
  1210. yield descendant
  1211. strings = property(_all_strings)
  1212. def decompose(self):
  1213. """Recursively destroys this PageElement and its children.
  1214. This element will be removed from the tree and wiped out; so
  1215. will everything beneath it.
  1216. The behavior of a decomposed PageElement is undefined and you
  1217. should never use one for anything, but if you need to _check_
  1218. whether an element has been decomposed, you can use the
  1219. `decomposed` property.
  1220. """
  1221. self.extract()
  1222. i = self
  1223. while i is not None:
  1224. n = i.next_element
  1225. i.__dict__.clear()
  1226. i.contents = []
  1227. i._decomposed = True
  1228. i = n
  1229. def clear(self, decompose=False):
  1230. """Wipe out all children of this PageElement by calling extract()
  1231. on them.
  1232. :param decompose: If this is True, decompose() (a more
  1233. destructive method) will be called instead of extract().
  1234. """
  1235. if decompose:
  1236. for element in self.contents[:]:
  1237. if isinstance(element, Tag):
  1238. element.decompose()
  1239. else:
  1240. element.extract()
  1241. else:
  1242. for element in self.contents[:]:
  1243. element.extract()
  def smooth(self):
      """Smooth out this element's children by consolidating consecutive
      strings.

      This makes pretty-printed output look more natural following a
      lot of operations that modified the tree.
      """
      # Record the position of the first member of every pair of
      # neighbouring children that has to be merged. This avoids
      # copying self.contents, which matters because in most cases
      # very few strings are affected.
      merge_points = []
      for position, current in enumerate(self.contents):
          if isinstance(current, Tag):
              # Recursively smooth children.
              current.smooth()
          if position + 1 == len(self.contents):
              # The last child has no right-hand neighbour to merge with.
              continue
          following = self.contents[position + 1]
          if all(
              isinstance(candidate, NavigableString)
              and not isinstance(candidate, PreformattedString)
              for candidate in (current, following)
          ):
              merge_points.append(position)

      # Work from the back, so that removing items from .contents
      # doesn't shift the positions still to be processed.
      for position in reversed(merge_points):
          first = self.contents[position]
          second = self.contents[position + 1]
          second.extract()
          first.replace_with(NavigableString(first + second))
  1279. def index(self, element):
  1280. """Find the index of a child by identity, not value.
  1281. Avoids issues with tag.contents.index(element) getting the
  1282. index of equal elements.
  1283. :param element: Look for this PageElement in `self.contents`.
  1284. """
  1285. for i, child in enumerate(self.contents):
  1286. if child is element:
  1287. return i
  1288. raise ValueError("Tag.index: element not in tag")
  1289. def get(self, key, default=None):
  1290. """Returns the value of the 'key' attribute for the tag, or
  1291. the value given for 'default' if it doesn't have that
  1292. attribute."""
  1293. return self.attrs.get(key, default)
  1294. def get_attribute_list(self, key, default=None):
  1295. """The same as get(), but always returns a list.
  1296. :param key: The attribute to look for.
  1297. :param default: Use this value if the attribute is not present
  1298. on this PageElement.
  1299. :return: A list of values, probably containing only a single
  1300. value.
  1301. """
  1302. value = self.get(key, default)
  1303. if not isinstance(value, list):
  1304. value = [value]
  1305. return value
  1306. def has_attr(self, key):
  1307. """Does this PageElement have an attribute with the given name?"""
  1308. return key in self.attrs
  1309. def __hash__(self):
  1310. return str(self).__hash__()
  1311. def __getitem__(self, key):
  1312. """tag[key] returns the value of the 'key' attribute for the Tag,
  1313. and throws an exception if it's not there."""
  1314. return self.attrs[key]
  1315. def __iter__(self):
  1316. "Iterating over a Tag iterates over its contents."
  1317. return iter(self.contents)
  1318. def __len__(self):
  1319. "The length of a Tag is the length of its list of contents."
  1320. return len(self.contents)
  1321. def __contains__(self, x):
  1322. return x in self.contents
  1323. def __bool__(self):
  1324. "A tag is non-None even if it has no contents."
  1325. return True
  1326. def __setitem__(self, key, value):
  1327. """Setting tag[key] sets the value of the 'key' attribute for the
  1328. tag."""
  1329. self.attrs[key] = value
  1330. def __delitem__(self, key):
  1331. "Deleting tag[key] deletes all 'key' attributes for the tag."
  1332. self.attrs.pop(key, None)
  1333. def __call__(self, *args, **kwargs):
  1334. """Calling a Tag like a function is the same as calling its
  1335. find_all() method. Eg. tag('a') returns a list of all the A tags
  1336. found within this tag."""
  1337. return self.find_all(*args, **kwargs)
  1338. def __getattr__(self, tag):
  1339. """Calling tag.subtag is the same as calling tag.find(name="subtag")"""
  1340. #print("Getattr %s.%s" % (self.__class__, tag))
  1341. if len(tag) > 3 and tag.endswith('Tag'):
  1342. # BS3: soup.aTag -> "soup.find("a")
  1343. tag_name = tag[:-3]
  1344. warnings.warn(
  1345. '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
  1346. name=tag_name
  1347. ),
  1348. DeprecationWarning, stacklevel=2
  1349. )
  1350. return self.find(tag_name)
  1351. # We special case contents to avoid recursion.
  1352. elif not tag.startswith("__") and not tag == "contents":
  1353. return self.find(tag)
  1354. raise AttributeError(
  1355. "'%s' object has no attribute '%s'" % (self.__class__, tag))
  1356. def __eq__(self, other):
  1357. """Returns true iff this Tag has the same name, the same attributes,
  1358. and the same contents (recursively) as `other`."""
  1359. if self is other:
  1360. return True
  1361. if (not hasattr(other, 'name') or
  1362. not hasattr(other, 'attrs') or
  1363. not hasattr(other, 'contents') or
  1364. self.name != other.name or
  1365. self.attrs != other.attrs or
  1366. len(self) != len(other)):
  1367. return False
  1368. for i, my_child in enumerate(self.contents):
  1369. if my_child != other.contents[i]:
  1370. return False
  1371. return True
  1372. def __ne__(self, other):
  1373. """Returns true iff this Tag is not identical to `other`,
  1374. as defined in __eq__."""
  1375. return not self == other
  1376. def __repr__(self, encoding="unicode-escape"):
  1377. """Renders this PageElement as a string.
  1378. :param encoding: The encoding to use (Python 2 only).
  1379. TODO: This is now ignored and a warning should be issued
  1380. if a value is provided.
  1381. :return: A (Unicode) string.
  1382. """
  1383. # "The return value must be a string object", i.e. Unicode
  1384. return self.decode()
  1385. def __unicode__(self):
  1386. """Renders this PageElement as a Unicode string."""
  1387. return self.decode()
  1388. __str__ = __repr__ = __unicode__
  1389. def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
  1390. indent_level=None, formatter="minimal",
  1391. errors="xmlcharrefreplace"):
  1392. """Render a bytestring representation of this PageElement and its
  1393. contents.
  1394. :param encoding: The destination encoding.
  1395. :param indent_level: Each line of the rendering will be
  1396. indented this many levels. (The formatter decides what a
  1397. 'level' means in terms of spaces or other characters
  1398. output.) Used internally in recursive calls while
  1399. pretty-printing.
  1400. :param formatter: A Formatter object, or a string naming one of
  1401. the standard formatters.
  1402. :param errors: An error handling strategy such as
  1403. 'xmlcharrefreplace'. This value is passed along into
  1404. encode() and its value should be one of the constants
  1405. defined by Python.
  1406. :return: A bytestring.
  1407. """
  1408. # Turn the data structure into Unicode, then encode the
  1409. # Unicode.
  1410. u = self.decode(indent_level, encoding, formatter)
  1411. return u.encode(encoding, errors)
  def decode(self, indent_level=None,
             eventual_encoding=DEFAULT_OUTPUT_ENCODING,
             formatter="minimal",
             iterator=None):
      """Render this element and its contents as a Unicode string.

      The rendering is driven by the events of _event_stream(), so no
      recursive method calls are needed.

      :param indent_level: None means no pretty-printing: the pieces
          are joined exactly as rendered. Any other value turns on
          pretty-printing and is the starting indentation level; True
          is treated as 0. The level goes up by one after every opening
          tag and down by one at every closing tag.
      :param eventual_encoding: Passed along to _format_tag() for every
          tag that is rendered.
      :param formatter: A Formatter object, or something that
          formatter_for_name() can turn into one.
      :param iterator: Passed along to _event_stream().
      :return: The concatenation of all rendered pieces.
      """
      pieces = []
      # First off, turn a non-Formatter `formatter` into a Formatter
      # object. This will stop the lookup from happening over and
      # over again.
      if not isinstance(formatter, Formatter):
          formatter = self.formatter_for_name(formatter)
      if indent_level is True:
          indent_level = 0
      # The currently active tag that put us into string literal
      # mode. Until this element is closed, children will be treated
      # as string literals and not pretty-printed. String literal
      # mode is turned on immediately after this tag begins, and
      # turned off immediately before it's closed. This means there
      # will be whitespace before and after the tag itself.
      string_literal_tag = None
      for event, element in self._event_stream(iterator):
          if event in (Tag.START_ELEMENT_EVENT, Tag.EMPTY_ELEMENT_EVENT):
              piece = element._format_tag(
                  eventual_encoding, formatter, opening=True
              )
          elif event is Tag.END_ELEMENT_EVENT:
              piece = element._format_tag(
                  eventual_encoding, formatter, opening=False
              )
              # A closing tag is indented one level less than the
              # contents it closes.
              if indent_level is not None:
                  indent_level -= 1
          else:
              # Anything else is a string event.
              piece = element.output_ready(formatter)
          # Now we need to apply the 'prettiness' -- extra
          # whitespace before and/or after this tag. This can get
          # complicated because certain tags, like <pre> and
          # <script>, can't be prettified, since adding whitespace would
          # change the meaning of the content.
          # The default behavior is to add whitespace before and
          # after an element when string literal mode is off, and to
          # leave things as they are when string literal mode is on.
          if string_literal_tag:
              indent_before = indent_after = False
          else:
              indent_before = indent_after = True
          # The only time the behavior is more complex than that is
          # when we encounter an opening or closing tag that might
          # put us into or out of string literal mode.
          if (event is Tag.START_ELEMENT_EVENT
              and not string_literal_tag
              and not element._should_pretty_print()):
              # We are about to enter string literal mode. Add
              # whitespace before this tag, but not after. We
              # will stay in string literal mode until this tag
              # is closed.
              indent_before = True
              indent_after = False
              string_literal_tag = element
          elif (event is Tag.END_ELEMENT_EVENT
                and element is string_literal_tag):
              # We are about to exit string literal mode by closing
              # the tag that sent us into that mode. Add whitespace
              # after this tag, but not before.
              indent_before = False
              indent_after = True
              string_literal_tag = None
          # Now we know whether to add whitespace before and/or
          # after this element.
          if indent_level is not None:
              if (indent_before or indent_after):
                  if isinstance(element, NavigableString):
                      piece = piece.strip()
                  # A piece that is empty (e.g. a string that was all
                  # whitespace) is not indented.
                  if piece:
                      piece = self._indent_string(
                          piece, indent_level, formatter,
                          indent_before, indent_after
                      )
              # The contents of a tag that was just opened are indented
              # one level deeper.
              if event == Tag.START_ELEMENT_EVENT:
                  indent_level += 1
          pieces.append(piece)
      return "".join(pieces)
# Names for the different events yielded by _event_stream. Each name is
# bound to its own fresh object() so that the four events are distinct
# from one another and from any other value.
# Yielded for a Tag that is not an empty-element tag.
START_ELEMENT_EVENT = object()
# Yielded for a previously started Tag once it has been closed.
END_ELEMENT_EVENT = object()
# Yielded for a Tag whose is_empty_element is true.
EMPTY_ELEMENT_EVENT = object()
# Yielded for any element that is not a Tag.
STRING_ELEMENT_EVENT = object()
  1497. def _event_stream(self, iterator=None):
  1498. """Yield a sequence of events that can be used to reconstruct the DOM
  1499. for this element.
  1500. This lets us recreate the nested structure of this element
  1501. (e.g. when formatting it as a string) without using recursive
  1502. method calls.
  1503. This is similar in concept to the SAX API, but it's a simpler
  1504. interface designed for internal use. The events are different
  1505. from SAX and the arguments associated with the events are Tags
  1506. and other Beautiful Soup objects.
  1507. :param iterator: An alternate iterator to use when traversing
  1508. the tree.
  1509. """
  1510. tag_stack = []
  1511. iterator = iterator or self.self_and_descendants
  1512. for c in iterator:
  1513. # If the parent of the element we're about to yield is not
  1514. # the tag currently on the stack, it means that the tag on
  1515. # the stack closed before this element appeared.
  1516. while tag_stack and c.parent != tag_stack[-1]:
  1517. now_closed_tag = tag_stack.pop()
  1518. yield Tag.END_ELEMENT_EVENT, now_closed_tag
  1519. if isinstance(c, Tag):
  1520. if c.is_empty_element:
  1521. yield Tag.EMPTY_ELEMENT_EVENT, c
  1522. else:
  1523. yield Tag.START_ELEMENT_EVENT, c
  1524. tag_stack.append(c)
  1525. continue
  1526. else:
  1527. yield Tag.STRING_ELEMENT_EVENT, c
  1528. while tag_stack:
  1529. now_closed_tag = tag_stack.pop()
  1530. yield Tag.END_ELEMENT_EVENT, now_closed_tag
  1531. def _indent_string(self, s, indent_level, formatter,
  1532. indent_before, indent_after):
  1533. """Add indentation whitespace before and/or after a string.
  1534. :param s: The string to amend with whitespace.
  1535. :param indent_level: The indentation level; affects how much
  1536. whitespace goes before the string.
  1537. :param indent_before: Whether or not to add whitespace
  1538. before the string.
  1539. :param indent_after: Whether or not to add whitespace
  1540. (a newline) after the string.
  1541. """
  1542. space_before = ''
  1543. if indent_before and indent_level:
  1544. space_before = (formatter.indent * indent_level)
  1545. space_after = ''
  1546. if indent_after:
  1547. space_after = "\n"
  1548. return space_before + s + space_after
  1549. def _format_tag(self, eventual_encoding, formatter, opening):
  1550. if self.hidden:
  1551. # A hidden tag is invisible, although its contents
  1552. # are visible.
  1553. return ''
  1554. # A tag starts with the < character (see below).
  1555. # Then the / character, if this is a closing tag.
  1556. closing_slash = ''
  1557. if not opening:
  1558. closing_slash = '/'
  1559. # Then an optional namespace prefix.
  1560. prefix = ''
  1561. if self.prefix:
  1562. prefix = self.prefix + ":"
  1563. # Then a list of attribute values, if this is an opening tag.
  1564. attribute_string = ''
  1565. if opening:
  1566. attributes = formatter.attributes(self)
  1567. attrs = []
  1568. for key, val in attributes:
  1569. if val is None:
  1570. decoded = key
  1571. else:
  1572. if isinstance(val, list) or isinstance(val, tuple):
  1573. val = ' '.join(val)
  1574. elif not isinstance(val, str):
  1575. val = str(val)
  1576. elif (
  1577. isinstance(val, AttributeValueWithCharsetSubstitution)
  1578. and eventual_encoding is not None
  1579. ):
  1580. val = val.encode(eventual_encoding)
  1581. text = formatter.attribute_value(val)
  1582. decoded = (
  1583. str(key) + '='
  1584. + formatter.quoted_attribute_value(text))
  1585. attrs.append(decoded)
  1586. if attrs:
  1587. attribute_string = ' ' + ' '.join(attrs)
  1588. # Then an optional closing slash (for a void element in an
  1589. # XML document).
  1590. void_element_closing_slash = ''
  1591. if self.is_empty_element:
  1592. void_element_closing_slash = formatter.void_element_close_prefix or ''
  1593. # Put it all together.
  1594. return '<' + closing_slash + prefix + self.name + attribute_string + void_element_closing_slash + '>'
  1595. def _should_pretty_print(self, indent_level=1):
  1596. """Should this tag be pretty-printed?
  1597. Most of them should, but some (such as <pre> in HTML
  1598. documents) should not.
  1599. """
  1600. return (
  1601. indent_level is not None
  1602. and (
  1603. not self.preserve_whitespace_tags
  1604. or self.name not in self.preserve_whitespace_tags
  1605. )
  1606. )
  1607. def prettify(self, encoding=None, formatter="minimal"):
  1608. """Pretty-print this PageElement as a string.
  1609. :param encoding: The eventual encoding of the string. If this is None,
  1610. a Unicode string will be returned.
  1611. :param formatter: A Formatter object, or a string naming one of
  1612. the standard formatters.
  1613. :return: A Unicode string (if encoding==None) or a bytestring
  1614. (otherwise).
  1615. """
  1616. if encoding is None:
  1617. return self.decode(True, formatter=formatter)
  1618. else:
  1619. return self.encode(encoding, True, formatter=formatter)
  1620. def decode_contents(self, indent_level=None,
  1621. eventual_encoding=DEFAULT_OUTPUT_ENCODING,
  1622. formatter="minimal"):
  1623. """Renders the contents of this tag as a Unicode string.
  1624. :param indent_level: Each line of the rendering will be
  1625. indented this many levels. (The formatter decides what a
  1626. 'level' means in terms of spaces or other characters
  1627. output.) Used internally in recursive calls while
  1628. pretty-printing.
  1629. :param eventual_encoding: The tag is destined to be
  1630. encoded into this encoding. decode_contents() is _not_
  1631. responsible for performing that encoding. This information
  1632. is passed in so that it can be substituted in if the
  1633. document contains a <META> tag that mentions the document's
  1634. encoding.
  1635. :param formatter: A Formatter object, or a string naming one of
  1636. the standard Formatters.
  1637. """
  1638. return self.decode(indent_level, eventual_encoding, formatter,
  1639. iterator=self.descendants)
  1640. def encode_contents(
  1641. self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
  1642. formatter="minimal"):
  1643. """Renders the contents of this PageElement as a bytestring.
  1644. :param indent_level: Each line of the rendering will be
  1645. indented this many levels. (The formatter decides what a
  1646. 'level' means in terms of spaces or other characters
  1647. output.) Used internally in recursive calls while
  1648. pretty-printing.
  1649. :param eventual_encoding: The bytestring will be in this encoding.
  1650. :param formatter: A Formatter object, or a string naming one of
  1651. the standard Formatters.
  1652. :return: A bytestring.
  1653. """
  1654. contents = self.decode_contents(indent_level, encoding, formatter)
  1655. return contents.encode(encoding)
  1656. # Old method for BS3 compatibility
  1657. def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
  1658. prettyPrint=False, indentLevel=0):
  1659. """Deprecated method for BS3 compatibility."""
  1660. if not prettyPrint:
  1661. indentLevel = None
  1662. return self.encode_contents(
  1663. indent_level=indentLevel, encoding=encoding)
  1664. #Soup methods
  1665. def find(self, name=None, attrs={}, recursive=True, string=None,
  1666. **kwargs):
  1667. """Look in the children of this PageElement and find the first
  1668. PageElement that matches the given criteria.
  1669. All find_* methods take a common set of arguments. See the online
  1670. documentation for detailed explanations.
  1671. :param name: A filter on tag name.
  1672. :param attrs: A dictionary of filters on attribute values.
  1673. :param recursive: If this is True, find() will perform a
  1674. recursive search of this PageElement's children. Otherwise,
  1675. only the direct children will be considered.
  1676. :param limit: Stop looking after finding this many results.
  1677. :kwargs: A dictionary of filters on attribute values.
  1678. :return: A PageElement.
  1679. :rtype: bs4.element.Tag | bs4.element.NavigableString
  1680. """
  1681. r = None
  1682. l = self.find_all(name, attrs, recursive, string, 1, _stacklevel=3,
  1683. **kwargs)
  1684. if l:
  1685. r = l[0]
  1686. return r
  1687. findChild = find #BS2
  1688. def find_all(self, name=None, attrs={}, recursive=True, string=None,
  1689. limit=None, **kwargs):
  1690. """Look in the children of this PageElement and find all
  1691. PageElements that match the given criteria.
  1692. All find_* methods take a common set of arguments. See the online
  1693. documentation for detailed explanations.
  1694. :param name: A filter on tag name.
  1695. :param attrs: A dictionary of filters on attribute values.
  1696. :param recursive: If this is True, find_all() will perform a
  1697. recursive search of this PageElement's children. Otherwise,
  1698. only the direct children will be considered.
  1699. :param limit: Stop looking after finding this many results.
  1700. :kwargs: A dictionary of filters on attribute values.
  1701. :return: A ResultSet of PageElements.
  1702. :rtype: bs4.element.ResultSet
  1703. """
  1704. generator = self.descendants
  1705. if not recursive:
  1706. generator = self.children
  1707. _stacklevel = kwargs.pop('_stacklevel', 2)
  1708. return self._find_all(name, attrs, string, limit, generator,
  1709. _stacklevel=_stacklevel+1, **kwargs)
  1710. findAll = find_all # BS3
  1711. findChildren = find_all # BS2
  1712. #Generator methods
  1713. @property
  1714. def children(self):
  1715. """Iterate over all direct children of this PageElement.
  1716. :yield: A sequence of PageElements.
  1717. """
  1718. # return iter() to make the purpose of the method clear
  1719. return iter(self.contents) # XXX This seems to be untested.
  1720. @property
  1721. def self_and_descendants(self):
  1722. """Iterate over this PageElement and its children in a
  1723. breadth-first sequence.
  1724. :yield: A sequence of PageElements.
  1725. """
  1726. if not self.hidden:
  1727. yield self
  1728. for i in self.descendants:
  1729. yield i
  1730. @property
  1731. def descendants(self):
  1732. """Iterate over all children of this PageElement in a
  1733. breadth-first sequence.
  1734. :yield: A sequence of PageElements.
  1735. """
  1736. if not len(self.contents):
  1737. return
  1738. stopNode = self._last_descendant().next_element
  1739. current = self.contents[0]
  1740. while current is not stopNode:
  1741. yield current
  1742. current = current.next_element
  1743. # CSS selector code
  1744. def select_one(self, selector, namespaces=None, **kwargs):
  1745. """Perform a CSS selection operation on the current element.
  1746. :param selector: A CSS selector.
  1747. :param namespaces: A dictionary mapping namespace prefixes
  1748. used in the CSS selector to namespace URIs. By default,
  1749. Beautiful Soup will use the prefixes it encountered while
  1750. parsing the document.
  1751. :param kwargs: Keyword arguments to be passed into Soup Sieve's
  1752. soupsieve.select() method.
  1753. :return: A Tag.
  1754. :rtype: bs4.element.Tag
  1755. """
  1756. return self.css.select_one(selector, namespaces, **kwargs)
  1757. def select(self, selector, namespaces=None, limit=None, **kwargs):
  1758. """Perform a CSS selection operation on the current element.
  1759. This uses the SoupSieve library.
  1760. :param selector: A string containing a CSS selector.
  1761. :param namespaces: A dictionary mapping namespace prefixes
  1762. used in the CSS selector to namespace URIs. By default,
  1763. Beautiful Soup will use the prefixes it encountered while
  1764. parsing the document.
  1765. :param limit: After finding this number of results, stop looking.
  1766. :param kwargs: Keyword arguments to be passed into SoupSieve's
  1767. soupsieve.select() method.
  1768. :return: A ResultSet of Tags.
  1769. :rtype: bs4.element.ResultSet
  1770. """
  1771. return self.css.select(selector, namespaces, limit, **kwargs)
  1772. @property
  1773. def css(self):
  1774. """Return an interface to the CSS selector API."""
  1775. return CSS(self)
  1776. # Old names for backwards compatibility
  1777. def childGenerator(self):
  1778. """Deprecated generator."""
  1779. return self.children
  1780. def recursiveChildGenerator(self):
  1781. """Deprecated generator."""
  1782. return self.descendants
  1783. def has_key(self, key):
  1784. """Deprecated method. This was kind of misleading because has_key()
  1785. (attributes) was different from __in__ (contents).
  1786. has_key() is gone in Python 3, anyway.
  1787. """
  1788. warnings.warn(
  1789. 'has_key is deprecated. Use has_attr(key) instead.',
  1790. DeprecationWarning, stacklevel=2
  1791. )
  1792. return self.has_attr(key)
  1793. # Next, a couple classes to represent queries and their results.
  1794. class SoupStrainer(object):
  1795. """Encapsulates a number of ways of matching a markup element (tag or
  1796. string).
  1797. This is primarily used to underpin the find_* methods, but you can
  1798. create one yourself and pass it in as `parse_only` to the
  1799. `BeautifulSoup` constructor, to parse a subset of a large
  1800. document.
  1801. """
  1802. def __init__(self, name=None, attrs={}, string=None, **kwargs):
  1803. """Constructor.
  1804. The SoupStrainer constructor takes the same arguments passed
  1805. into the find_* methods. See the online documentation for
  1806. detailed explanations.
  1807. :param name: A filter on tag name.
  1808. :param attrs: A dictionary of filters on attribute values.
  1809. :param string: A filter for a NavigableString with specific text.
  1810. :kwargs: A dictionary of filters on attribute values.
  1811. """
  1812. if string is None and 'text' in kwargs:
  1813. string = kwargs.pop('text')
  1814. warnings.warn(
  1815. "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
  1816. DeprecationWarning, stacklevel=2
  1817. )
  1818. self.name = self._normalize_search_value(name)
  1819. if not isinstance(attrs, dict):
  1820. # Treat a non-dict value for attrs as a search for the 'class'
  1821. # attribute.
  1822. kwargs['class'] = attrs
  1823. attrs = None
  1824. if 'class_' in kwargs:
  1825. # Treat class_="foo" as a search for the 'class'
  1826. # attribute, overriding any non-dict value for attrs.
  1827. kwargs['class'] = kwargs['class_']
  1828. del kwargs['class_']
  1829. if kwargs:
  1830. if attrs:
  1831. attrs = attrs.copy()
  1832. attrs.update(kwargs)
  1833. else:
  1834. attrs = kwargs
  1835. normalized_attrs = {}
  1836. for key, value in list(attrs.items()):
  1837. normalized_attrs[key] = self._normalize_search_value(value)
  1838. self.attrs = normalized_attrs
  1839. self.string = self._normalize_search_value(string)
  1840. # DEPRECATED but just in case someone is checking this.
  1841. self.text = self.string
  1842. def _normalize_search_value(self, value):
  1843. # Leave it alone if it's a Unicode string, a callable, a
  1844. # regular expression, a boolean, or None.
  1845. if (isinstance(value, str) or isinstance(value, Callable) or hasattr(value, 'match')
  1846. or isinstance(value, bool) or value is None):
  1847. return value
  1848. # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
  1849. if isinstance(value, bytes):
  1850. return value.decode("utf8")
  1851. # If it's listlike, convert it into a list of strings.
  1852. if hasattr(value, '__iter__'):
  1853. new_value = []
  1854. for v in value:
  1855. if (hasattr(v, '__iter__') and not isinstance(v, bytes)
  1856. and not isinstance(v, str)):
  1857. # This is almost certainly the user's mistake. In the
  1858. # interests of avoiding infinite loops, we'll let
  1859. # it through as-is rather than doing a recursive call.
  1860. new_value.append(v)
  1861. else:
  1862. new_value.append(self._normalize_search_value(v))
  1863. return new_value
  1864. # Otherwise, convert it into a Unicode string.
  1865. # The unicode(str()) thing is so this will do the same thing on Python 2
  1866. # and Python 3.
  1867. return str(str(value))
  1868. def __str__(self):
  1869. """A human-readable representation of this SoupStrainer."""
  1870. if self.string:
  1871. return self.string
  1872. else:
  1873. return "%s|%s" % (self.name, self.attrs)
  1874. def search_tag(self, markup_name=None, markup_attrs={}):
  1875. """Check whether a Tag with the given name and attributes would
  1876. match this SoupStrainer.
  1877. Used prospectively to decide whether to even bother creating a Tag
  1878. object.
  1879. :param markup_name: A tag name as found in some markup.
  1880. :param markup_attrs: A dictionary of attributes as found in some markup.
  1881. :return: True if the prospective tag would match this SoupStrainer;
  1882. False otherwise.
  1883. """
  1884. found = None
  1885. markup = None
  1886. if isinstance(markup_name, Tag):
  1887. markup = markup_name
  1888. markup_attrs = markup
  1889. if isinstance(self.name, str):
  1890. # Optimization for a very common case where the user is
  1891. # searching for a tag with one specific name, and we're
  1892. # looking at a tag with a different name.
  1893. if markup and not markup.prefix and self.name != markup.name:
  1894. return False
  1895. call_function_with_tag_data = (
  1896. isinstance(self.name, Callable)
  1897. and not isinstance(markup_name, Tag))
  1898. if ((not self.name)
  1899. or call_function_with_tag_data
  1900. or (markup and self._matches(markup, self.name))
  1901. or (not markup and self._matches(markup_name, self.name))):
  1902. if call_function_with_tag_data:
  1903. match = self.name(markup_name, markup_attrs)
  1904. else:
  1905. match = True
  1906. markup_attr_map = None
  1907. for attr, match_against in list(self.attrs.items()):
  1908. if not markup_attr_map:
  1909. if hasattr(markup_attrs, 'get'):
  1910. markup_attr_map = markup_attrs
  1911. else:
  1912. markup_attr_map = {}
  1913. for k, v in markup_attrs:
  1914. markup_attr_map[k] = v
  1915. attr_value = markup_attr_map.get(attr)
  1916. if not self._matches(attr_value, match_against):
  1917. match = False
  1918. break
  1919. if match:
  1920. if markup:
  1921. found = markup
  1922. else:
  1923. found = markup_name
  1924. if found and self.string and not self._matches(found.string, self.string):
  1925. found = None
  1926. return found
  1927. # For BS3 compatibility.
  1928. searchTag = search_tag
  1929. def search(self, markup):
  1930. """Find all items in `markup` that match this SoupStrainer.
  1931. Used by the core _find_all() method, which is ultimately
  1932. called by all find_* methods.
  1933. :param markup: A PageElement or a list of them.
  1934. """
  1935. # print('looking for %s in %s' % (self, markup))
  1936. found = None
  1937. # If given a list of items, scan it for a text element that
  1938. # matches.
  1939. if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
  1940. for element in markup:
  1941. if isinstance(element, NavigableString) \
  1942. and self.search(element):
  1943. found = element
  1944. break
  1945. # If it's a Tag, make sure its name or attributes match.
  1946. # Don't bother with Tags if we're searching for text.
  1947. elif isinstance(markup, Tag):
  1948. if not self.string or self.name or self.attrs:
  1949. found = self.search_tag(markup)
  1950. # If it's text, make sure the text matches.
  1951. elif isinstance(markup, NavigableString) or \
  1952. isinstance(markup, str):
  1953. if not self.name and not self.attrs and self._matches(markup, self.string):
  1954. found = markup
  1955. else:
  1956. raise Exception(
  1957. "I don't know how to match against a %s" % markup.__class__)
  1958. return found
  1959. def _matches(self, markup, match_against, already_tried=None):
  1960. # print(u"Matching %s against %s" % (markup, match_against))
  1961. result = False
  1962. if isinstance(markup, list) or isinstance(markup, tuple):
  1963. # This should only happen when searching a multi-valued attribute
  1964. # like 'class'.
  1965. for item in markup:
  1966. if self._matches(item, match_against):
  1967. return True
  1968. # We didn't match any particular value of the multivalue
  1969. # attribute, but maybe we match the attribute value when
  1970. # considered as a string.
  1971. if self._matches(' '.join(markup), match_against):
  1972. return True
  1973. return False
  1974. if match_against is True:
  1975. # True matches any non-None value.
  1976. return markup is not None
  1977. if isinstance(match_against, Callable):
  1978. return match_against(markup)
  1979. # Custom callables take the tag as an argument, but all
  1980. # other ways of matching match the tag name as a string.
  1981. original_markup = markup
  1982. if isinstance(markup, Tag):
  1983. markup = markup.name
  1984. # Ensure that `markup` is either a Unicode string, or None.
  1985. markup = self._normalize_search_value(markup)
  1986. if markup is None:
  1987. # None matches None, False, an empty string, an empty list, and so on.
  1988. return not match_against
  1989. if (hasattr(match_against, '__iter__')
  1990. and not isinstance(match_against, str)):
  1991. # We're asked to match against an iterable of items.
  1992. # The markup must be match at least one item in the
  1993. # iterable. We'll try each one in turn.
  1994. #
  1995. # To avoid infinite recursion we need to keep track of
  1996. # items we've already seen.
  1997. if not already_tried:
  1998. already_tried = set()
  1999. for item in match_against:
  2000. if item.__hash__:
  2001. key = item
  2002. else:
  2003. key = id(item)
  2004. if key in already_tried:
  2005. continue
  2006. else:
  2007. already_tried.add(key)
  2008. if self._matches(original_markup, item, already_tried):
  2009. return True
  2010. else:
  2011. return False
  2012. # Beyond this point we might need to run the test twice: once against
  2013. # the tag's name and once against its prefixed name.
  2014. match = False
  2015. if not match and isinstance(match_against, str):
  2016. # Exact string match
  2017. match = markup == match_against
  2018. if not match and hasattr(match_against, 'search'):
  2019. # Regexp match
  2020. return match_against.search(markup)
  2021. if (not match
  2022. and isinstance(original_markup, Tag)
  2023. and original_markup.prefix):
  2024. # Try the whole thing again with the prefixed tag name.
  2025. return self._matches(
  2026. original_markup.prefix + ':' + original_markup.name, match_against
  2027. )
  2028. return match
  2029. class ResultSet(list):
  2030. """A ResultSet is just a list that keeps track of the SoupStrainer
  2031. that created it."""
  2032. def __init__(self, source, result=()):
  2033. """Constructor.
  2034. :param source: A SoupStrainer.
  2035. :param result: A list of PageElements.
  2036. """
  2037. super(ResultSet, self).__init__(result)
  2038. self.source = source
  2039. def __getattr__(self, key):
  2040. """Raise a helpful exception to explain a common code fix."""
  2041. raise AttributeError(
  2042. "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
  2043. )