element.py 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291
  1. # Use of this source code is governed by the MIT license.
  2. __license__ = "MIT"
  3. try:
  4. from collections.abc import Callable # Python 3.6
  5. except ImportError as e:
  6. from collections import Callable
  7. import re
  8. import sys
  9. import warnings
  10. try:
  11. import soupsieve
  12. except ImportError as e:
  13. soupsieve = None
  14. warnings.warn(
  15. 'The soupsieve package is not installed. CSS selectors cannot be used.'
  16. )
  17. from bs4.formatter import (
  18. Formatter,
  19. HTMLFormatter,
  20. XMLFormatter,
  21. )
# Name of the default output encoding.
DEFAULT_OUTPUT_ENCODING = "utf-8"

# Matches a run of one or more non-whitespace characters.
nonwhitespace_re = re.compile(r"\S+")

# NOTE: This isn't used as of 4.7.0. I'm leaving it for a little bit on
# the off chance someone imported it for their own use.
# Matches a run of one or more whitespace characters.
whitespace_re = re.compile(r"\s+")
  27. def _alias(attr):
  28. """Alias one attribute name to another for backward compatibility"""
  29. @property
  30. def alias(self):
  31. return getattr(self, attr)
  32. @alias.setter
  33. def alias(self):
  34. return setattr(self, attr)
  35. return alias
  36. # These encodings are recognized by Python (so PageElement.encode
  37. # could theoretically support them) but XML and HTML don't recognize
  38. # them (so they should not show up in an XML or HTML document as that
  39. # document's encoding).
  40. #
  41. # If an XML document is encoded in one of these encodings, no encoding
  42. # will be mentioned in the XML declaration. If an HTML document is
  43. # encoded in one of these encodings, and the HTML document has a
  44. # <meta> tag that mentions an encoding, the encoding will be given as
  45. # the empty string.
  46. #
  47. # Source:
  48. # https://docs.python.org/3/library/codecs.html#python-specific-encodings
  49. PYTHON_SPECIFIC_ENCODINGS = set([
  50. "idna",
  51. "mbcs",
  52. "oem",
  53. "palmos",
  54. "punycode",
  55. "raw_unicode_escape",
  56. "undefined",
  57. "unicode_escape",
  58. "raw-unicode-escape",
  59. "unicode-escape",
  60. "string-escape",
  61. "string_escape",
  62. ])
  63. class NamespacedAttribute(str):
  64. """A namespaced string (e.g. 'xml:lang') that remembers the namespace
  65. ('xml') and the name ('lang') that were used to create it.
  66. """
  67. def __new__(cls, prefix, name=None, namespace=None):
  68. if not name:
  69. # This is the default namespace. Its name "has no value"
  70. # per https://www.w3.org/TR/xml-names/#defaulting
  71. name = None
  72. if not name:
  73. obj = str.__new__(cls, prefix)
  74. elif not prefix:
  75. # Not really namespaced.
  76. obj = str.__new__(cls, name)
  77. else:
  78. obj = str.__new__(cls, prefix + ":" + name)
  79. obj.prefix = prefix
  80. obj.name = name
  81. obj.namespace = namespace
  82. return obj
  83. class AttributeValueWithCharsetSubstitution(str):
  84. """A stand-in object for a character encoding specified in HTML."""
  85. class CharsetMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  86. """A generic stand-in for the value of a meta tag's 'charset' attribute.
  87. When Beautiful Soup parses the markup '<meta charset="utf8">', the
  88. value of the 'charset' attribute will be one of these objects.
  89. """
  90. def __new__(cls, original_value):
  91. obj = str.__new__(cls, original_value)
  92. obj.original_value = original_value
  93. return obj
  94. def encode(self, encoding):
  95. """When an HTML document is being encoded to a given encoding, the
  96. value of a meta tag's 'charset' is the name of the encoding.
  97. """
  98. if encoding in PYTHON_SPECIFIC_ENCODINGS:
  99. return ''
  100. return encoding
  101. class ContentMetaAttributeValue(AttributeValueWithCharsetSubstitution):
  102. """A generic stand-in for the value of a meta tag's 'content' attribute.
  103. When Beautiful Soup parses the markup:
  104. <meta http-equiv="content-type" content="text/html; charset=utf8">
  105. The value of the 'content' attribute will be one of these objects.
  106. """
  107. CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
  108. def __new__(cls, original_value):
  109. match = cls.CHARSET_RE.search(original_value)
  110. if match is None:
  111. # No substitution necessary.
  112. return str.__new__(str, original_value)
  113. obj = str.__new__(cls, original_value)
  114. obj.original_value = original_value
  115. return obj
  116. def encode(self, encoding):
  117. if encoding in PYTHON_SPECIFIC_ENCODINGS:
  118. return ''
  119. def rewrite(match):
  120. return match.group(1) + encoding
  121. return self.CHARSET_RE.sub(rewrite, self.original_value)
  122. class PageElement(object):
  123. """Contains the navigational information for some part of the page:
  124. that is, its current location in the parse tree.
  125. NavigableString, Tag, etc. are all subclasses of PageElement.
  126. """
    def setup(self, parent=None, previous_element=None, next_element=None,
              previous_sibling=None, next_sibling=None):
        """Sets up the initial relations between this element and
        other elements.

        Each link that is set is also mirrored on the neighbouring
        element, so the neighbour points back at this element.

        :param parent: The parent of this element.

        :param previous_element: The element parsed immediately before
            this one.

        :param next_element: The element parsed immediately after
            this one.

        :param previous_sibling: The most recently encountered element
            on the same level of the parse tree as this one. If omitted
            and the parent already has contents, the parent's last
            child is used.

        :param next_sibling: The next element to be encountered
            on the same level of the parse tree as this one.
        """
        self.parent = parent

        self.previous_element = previous_element
        if previous_element is not None:
            # Make the previous element point forward to this one.
            self.previous_element.next_element = self

        self.next_element = next_element
        if self.next_element is not None:
            # Make the next element point back to this one.
            self.next_element.previous_element = self

        self.next_sibling = next_sibling
        if self.next_sibling is not None:
            self.next_sibling.previous_sibling = self

        # With no explicit previous sibling, default to whatever is
        # currently last in the parent's contents.
        if (previous_sibling is None
            and self.parent is not None and self.parent.contents):
            previous_sibling = self.parent.contents[-1]

        self.previous_sibling = previous_sibling
        if previous_sibling is not None:
            self.previous_sibling.next_sibling = self
  157. def format_string(self, s, formatter):
  158. """Format the given string using the given formatter.
  159. :param s: A string.
  160. :param formatter: A Formatter object, or a string naming one of the standard formatters.
  161. """
  162. if formatter is None:
  163. return s
  164. if not isinstance(formatter, Formatter):
  165. formatter = self.formatter_for_name(formatter)
  166. output = formatter.substitute(s)
  167. return output
  168. def formatter_for_name(self, formatter):
  169. """Look up or create a Formatter for the given identifier,
  170. if necessary.
  171. :param formatter: Can be a Formatter object (used as-is), a
  172. function (used as the entity substitution hook for an
  173. XMLFormatter or HTMLFormatter), or a string (used to look
  174. up an XMLFormatter or HTMLFormatter in the appropriate
  175. registry.
  176. """
  177. if isinstance(formatter, Formatter):
  178. return formatter
  179. if self._is_xml:
  180. c = XMLFormatter
  181. else:
  182. c = HTMLFormatter
  183. if isinstance(formatter, Callable):
  184. return c(entity_substitution=formatter)
  185. return c.REGISTRY[formatter]
  186. @property
  187. def _is_xml(self):
  188. """Is this element part of an XML tree or an HTML tree?
  189. This is used in formatter_for_name, when deciding whether an
  190. XMLFormatter or HTMLFormatter is more appropriate. It can be
  191. inefficient, but it should be called very rarely.
  192. """
  193. if self.known_xml is not None:
  194. # Most of the time we will have determined this when the
  195. # document is parsed.
  196. return self.known_xml
  197. # Otherwise, it's likely that this element was created by
  198. # direct invocation of the constructor from within the user's
  199. # Python code.
  200. if self.parent is None:
  201. # This is the top-level object. It should have .known_xml set
  202. # from tree creation. If not, take a guess--BS is usually
  203. # used on HTML markup.
  204. return getattr(self, 'is_xml', False)
  205. return self.parent._is_xml
    # camelCase names kept as aliases of the snake_case attributes.
    nextSibling = _alias("next_sibling") # BS3
    previousSibling = _alias("previous_sibling") # BS3

    # Sentinel used as the default value of the `types` arguments of
    # _all_strings and get_text.
    default = object()
  209. def _all_strings(self, strip=False, types=default):
  210. """Yield all strings of certain classes, possibly stripping them.
  211. This is implemented differently in Tag and NavigableString.
  212. """
  213. raise NotImplementedError()
  214. @property
  215. def stripped_strings(self):
  216. """Yield all strings in this PageElement, stripping them first.
  217. :yield: A sequence of stripped strings.
  218. """
  219. for string in self._all_strings(True):
  220. yield string
  221. def get_text(self, separator="", strip=False,
  222. types=default):
  223. """Get all child strings of this PageElement, concatenated using the
  224. given separator.
  225. :param separator: Strings will be concatenated using this separator.
  226. :param strip: If True, strings will be stripped before being
  227. concatenated.
  228. :param types: A tuple of NavigableString subclasses. Any
  229. strings of a subclass not found in this list will be
  230. ignored. Although there are exceptions, the default
  231. behavior in most cases is to consider only NavigableString
  232. and CData objects. That means no comments, processing
  233. instructions, etc.
  234. :return: A string.
  235. """
  236. return separator.join([s for s in self._all_strings(
  237. strip, types=types)])
  238. getText = get_text
  239. text = property(get_text)
  240. def replace_with(self, *args):
  241. """Replace this PageElement with one or more PageElements, keeping the
  242. rest of the tree the same.
  243. :param args: One or more PageElements.
  244. :return: `self`, no longer part of the tree.
  245. """
  246. if self.parent is None:
  247. raise ValueError(
  248. "Cannot replace one element with another when the "
  249. "element to be replaced is not part of a tree.")
  250. if len(args) == 1 and args[0] is self:
  251. return
  252. if any(x is self.parent for x in args):
  253. raise ValueError("Cannot replace a Tag with its parent.")
  254. old_parent = self.parent
  255. my_index = self.parent.index(self)
  256. self.extract(_self_index=my_index)
  257. for idx, replace_with in enumerate(args, start=my_index):
  258. old_parent.insert(idx, replace_with)
  259. return self
  260. replaceWith = replace_with # BS3
  261. def unwrap(self):
  262. """Replace this PageElement with its contents.
  263. :return: `self`, no longer part of the tree.
  264. """
  265. my_parent = self.parent
  266. if self.parent is None:
  267. raise ValueError(
  268. "Cannot replace an element with its contents when that"
  269. "element is not part of a tree.")
  270. my_index = self.parent.index(self)
  271. self.extract(_self_index=my_index)
  272. for child in reversed(self.contents[:]):
  273. my_parent.insert(my_index, child)
  274. return self
  275. replace_with_children = unwrap
  276. replaceWithChildren = unwrap # BS3
  277. def wrap(self, wrap_inside):
  278. """Wrap this PageElement inside another one.
  279. :param wrap_inside: A PageElement.
  280. :return: `wrap_inside`, occupying the position in the tree that used
  281. to be occupied by `self`, and with `self` inside it.
  282. """
  283. me = self.replace_with(wrap_inside)
  284. wrap_inside.append(me)
  285. return wrap_inside
    def extract(self, _self_index=None):
        """Destructively rips this element out of the tree.

        The element is removed from its parent's contents, and both
        the parse-order links and the sibling links around it are
        reconnected so that its former neighbours point at each other.

        :param _self_index: The location of this element in its parent's
           .contents, if known. Passing this in allows for a performance
           optimization.

        :return: `self`, no longer part of the tree.
        """
        if self.parent is not None:
            if _self_index is None:
                # The caller did not supply the position; look it up.
                _self_index = self.parent.index(self)
            del self.parent.contents[_self_index]

        #Find the two elements that would be next to each other if
        #this element (and any children) hadn't been parsed. Connect
        #the two.
        last_child = self._last_descendant()
        next_element = last_child.next_element

        if (self.previous_element is not None and
            self.previous_element is not next_element):
            self.previous_element.next_element = next_element
        if next_element is not None and next_element is not self.previous_element:
            next_element.previous_element = self.previous_element
        # Cut the parse-order links at both ends of the removed subtree.
        self.previous_element = None
        last_child.next_element = None

        self.parent = None
        # Join the former siblings to each other, then clear this
        # element's own sibling links.
        if (self.previous_sibling is not None
            and self.previous_sibling is not self.next_sibling):
            self.previous_sibling.next_sibling = self.next_sibling
        if (self.next_sibling is not None
            and self.next_sibling is not self.previous_sibling):
            self.next_sibling.previous_sibling = self.previous_sibling
        self.previous_sibling = self.next_sibling = None
        return self
    def _last_descendant(self, is_initialized=True, accept_self=True):
        """Finds the last element beneath this object to be parsed.

        :param is_initialized: Has `setup` been called on this PageElement
            yet?
        :param accept_self: Is `self` an acceptable answer to the question?
        :return: The last descendant, or None when the answer would be
            `self` and `accept_self` is false.
        """
        if is_initialized and self.next_sibling is not None:
            # Shortcut: use the element just before the next sibling in
            # parse order.
            last_child = self.next_sibling.previous_element
        else:
            # Otherwise keep following the last entry of .contents for
            # as long as the current element is a Tag with contents.
            last_child = self
            while isinstance(last_child, Tag) and last_child.contents:
                last_child = last_child.contents[-1]
        if not accept_self and last_child is self:
            last_child = None
        return last_child
    # BS3: Not part of the API!
    _lastRecursiveChild = _last_descendant
    def insert(self, position, new_child):
        """Insert a new PageElement in the list of this PageElement's children.

        This works the same way as `list.insert`.

        :param position: The numeric position that should be occupied
           in `self.children` by the new PageElement.
        :param new_child: A PageElement. A plain string is converted to
           a NavigableString first; a BeautifulSoup object has its
           children inserted one at a time instead.
        :raise ValueError: If `new_child` is None or is this element.
        """
        if new_child is None:
            raise ValueError("Cannot insert None into a tag.")
        if new_child is self:
            raise ValueError("Cannot insert a tag into itself.")
        if (isinstance(new_child, str)
            and not isinstance(new_child, NavigableString)):
            new_child = NavigableString(new_child)

        # Imported here rather than at module level.
        from bs4 import BeautifulSoup
        if isinstance(new_child, BeautifulSoup):
            # We don't want to end up with a situation where one BeautifulSoup
            # object contains another. Insert the children one at a time.
            for subchild in list(new_child.contents):
                self.insert(position, subchild)
                position += 1
            return
        # Like list.insert, a position past the end means "append".
        position = min(position, len(self.contents))
        if hasattr(new_child, 'parent') and new_child.parent is not None:
            # The element already lives somewhere in a tree; it must be
            # detached before it can be placed here.
            if new_child.parent is self:
                # We're 'inserting' an element that's already one
                # of this object's children.
                current_index = self.index(new_child)
                if current_index < position:
                    # We're moving this element further down the list
                    # of this object's children. That means that when
                    # we extract this element, our target index will
                    # jump down one.
                    position -= 1
            new_child.extract()

        new_child.parent = self
        previous_child = None
        if position == 0:
            # First child: in parse order it comes right after this
            # element.
            new_child.previous_sibling = None
            new_child.previous_element = self
        else:
            # Otherwise it follows the last descendant of the child
            # that now precedes it.
            previous_child = self.contents[position - 1]
            new_child.previous_sibling = previous_child
            new_child.previous_sibling.next_sibling = new_child
            new_child.previous_element = previous_child._last_descendant(False)
        if new_child.previous_element is not None:
            new_child.previous_element.next_element = new_child

        new_childs_last_element = new_child._last_descendant(False)

        if position >= len(self.contents):
            # Appending at the end: there is no next sibling.
            new_child.next_sibling = None

            # Walk up through the ancestors until one of them has a
            # next sibling.
            parent = self
            parents_next_sibling = None
            while parents_next_sibling is None and parent is not None:
                parents_next_sibling = parent.next_sibling
                parent = parent.parent
                if parents_next_sibling is not None:
                    # We found the element that comes next in the document.
                    break
            if parents_next_sibling is not None:
                new_childs_last_element.next_element = parents_next_sibling
            else:
                # The last element of this tag is the last element in
                # the document.
                new_childs_last_element.next_element = None
        else:
            # Inserting before an existing child.
            next_child = self.contents[position]
            new_child.next_sibling = next_child
            if new_child.next_sibling is not None:
                new_child.next_sibling.previous_sibling = new_child
            new_childs_last_element.next_element = next_child

        if new_childs_last_element.next_element is not None:
            new_childs_last_element.next_element.previous_element = new_childs_last_element
        self.contents.insert(position, new_child)
  408. def append(self, tag):
  409. """Appends the given PageElement to the contents of this one.
  410. :param tag: A PageElement.
  411. """
  412. self.insert(len(self.contents), tag)
  413. def extend(self, tags):
  414. """Appends the given PageElements to this one's contents.
  415. :param tags: A list of PageElements.
  416. """
  417. if isinstance(tags, Tag):
  418. # Calling self.append() on another tag's contents will change
  419. # the list we're iterating over. Make a list that won't
  420. # change.
  421. tags = list(tags.contents)
  422. for tag in tags:
  423. self.append(tag)
  424. def insert_before(self, *args):
  425. """Makes the given element(s) the immediate predecessor of this one.
  426. All the elements will have the same parent, and the given elements
  427. will be immediately before this one.
  428. :param args: One or more PageElements.
  429. """
  430. parent = self.parent
  431. if parent is None:
  432. raise ValueError(
  433. "Element has no parent, so 'before' has no meaning.")
  434. if any(x is self for x in args):
  435. raise ValueError("Can't insert an element before itself.")
  436. for predecessor in args:
  437. # Extract first so that the index won't be screwed up if they
  438. # are siblings.
  439. if isinstance(predecessor, PageElement):
  440. predecessor.extract()
  441. index = parent.index(self)
  442. parent.insert(index, predecessor)
  443. def insert_after(self, *args):
  444. """Makes the given element(s) the immediate successor of this one.
  445. The elements will have the same parent, and the given elements
  446. will be immediately after this one.
  447. :param args: One or more PageElements.
  448. """
  449. # Do all error checking before modifying the tree.
  450. parent = self.parent
  451. if parent is None:
  452. raise ValueError(
  453. "Element has no parent, so 'after' has no meaning.")
  454. if any(x is self for x in args):
  455. raise ValueError("Can't insert an element after itself.")
  456. offset = 0
  457. for successor in args:
  458. # Extract first so that the index won't be screwed up if they
  459. # are siblings.
  460. if isinstance(successor, PageElement):
  461. successor.extract()
  462. index = parent.index(self)
  463. parent.insert(index+1+offset, successor)
  464. offset += 1
  465. def find_next(self, name=None, attrs={}, string=None, **kwargs):
  466. """Find the first PageElement that matches the given criteria and
  467. appears later in the document than this PageElement.
  468. All find_* methods take a common set of arguments. See the online
  469. documentation for detailed explanations.
  470. :param name: A filter on tag name.
  471. :param attrs: A dictionary of filters on attribute values.
  472. :param string: A filter for a NavigableString with specific text.
  473. :kwargs: A dictionary of filters on attribute values.
  474. :return: A PageElement.
  475. :rtype: bs4.element.Tag | bs4.element.NavigableString
  476. """
  477. return self._find_one(self.find_all_next, name, attrs, string, **kwargs)
  478. findNext = find_next # BS3
  479. def find_all_next(self, name=None, attrs={}, string=None, limit=None,
  480. **kwargs):
  481. """Find all PageElements that match the given criteria and appear
  482. later in the document than this PageElement.
  483. All find_* methods take a common set of arguments. See the online
  484. documentation for detailed explanations.
  485. :param name: A filter on tag name.
  486. :param attrs: A dictionary of filters on attribute values.
  487. :param string: A filter for a NavigableString with specific text.
  488. :param limit: Stop looking after finding this many results.
  489. :kwargs: A dictionary of filters on attribute values.
  490. :return: A ResultSet containing PageElements.
  491. """
  492. return self._find_all(name, attrs, string, limit, self.next_elements,
  493. **kwargs)
  494. findAllNext = find_all_next # BS3
  495. def find_next_sibling(self, name=None, attrs={}, string=None, **kwargs):
  496. """Find the closest sibling to this PageElement that matches the
  497. given criteria and appears later in the document.
  498. All find_* methods take a common set of arguments. See the
  499. online documentation for detailed explanations.
  500. :param name: A filter on tag name.
  501. :param attrs: A dictionary of filters on attribute values.
  502. :param string: A filter for a NavigableString with specific text.
  503. :kwargs: A dictionary of filters on attribute values.
  504. :return: A PageElement.
  505. :rtype: bs4.element.Tag | bs4.element.NavigableString
  506. """
  507. return self._find_one(self.find_next_siblings, name, attrs, string,
  508. **kwargs)
  509. findNextSibling = find_next_sibling # BS3
  510. def find_next_siblings(self, name=None, attrs={}, string=None, limit=None,
  511. **kwargs):
  512. """Find all siblings of this PageElement that match the given criteria
  513. and appear later in the document.
  514. All find_* methods take a common set of arguments. See the online
  515. documentation for detailed explanations.
  516. :param name: A filter on tag name.
  517. :param attrs: A dictionary of filters on attribute values.
  518. :param string: A filter for a NavigableString with specific text.
  519. :param limit: Stop looking after finding this many results.
  520. :kwargs: A dictionary of filters on attribute values.
  521. :return: A ResultSet of PageElements.
  522. :rtype: bs4.element.ResultSet
  523. """
  524. return self._find_all(name, attrs, string, limit,
  525. self.next_siblings, **kwargs)
  526. findNextSiblings = find_next_siblings # BS3
  527. fetchNextSiblings = find_next_siblings # BS2
  528. def find_previous(self, name=None, attrs={}, string=None, **kwargs):
  529. """Look backwards in the document from this PageElement and find the
  530. first PageElement that matches the given criteria.
  531. All find_* methods take a common set of arguments. See the online
  532. documentation for detailed explanations.
  533. :param name: A filter on tag name.
  534. :param attrs: A dictionary of filters on attribute values.
  535. :param string: A filter for a NavigableString with specific text.
  536. :kwargs: A dictionary of filters on attribute values.
  537. :return: A PageElement.
  538. :rtype: bs4.element.Tag | bs4.element.NavigableString
  539. """
  540. return self._find_one(
  541. self.find_all_previous, name, attrs, string, **kwargs)
  542. findPrevious = find_previous # BS3
  def find_all_previous(self, name=None, attrs={}, string=None, limit=None,
                        **kwargs):
      """Search backwards from this PageElement and collect every
      PageElement that satisfies the given criteria.

      Candidates come from ``self.previous_elements`` and are filtered by
      ``self._find_all``. All find_* methods take a common set of
      arguments; see the online documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param string: A filter for a NavigableString with specific text.
      :param limit: Stop looking after finding this many results.
      :kwargs: A dictionary of filters on attribute values.
      :return: A ResultSet of PageElements.
      :rtype: bs4.element.ResultSet
      """
      candidates = self.previous_elements
      return self._find_all(name, attrs, string, limit, candidates, **kwargs)

  findAllPrevious = find_all_previous  # BS3
  fetchPrevious = find_all_previous  # BS2
  def find_previous_sibling(self, name=None, attrs={}, string=None, **kwargs):
      """Return the closest earlier sibling of this PageElement that
      satisfies the given criteria.

      Delegates to ``self._find_one`` using ``self.find_previous_siblings``
      as the underlying search. All find_* methods take a common set of
      arguments; see the online documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param string: A filter for a NavigableString with specific text.
      :kwargs: A dictionary of filters on attribute values.
      :return: A PageElement, or None when nothing matches.
      :rtype: bs4.element.Tag | bs4.element.NavigableString
      """
      search = self.find_previous_siblings
      return self._find_one(search, name, attrs, string, **kwargs)

  findPreviousSibling = find_previous_sibling  # BS3
  def find_previous_siblings(self, name=None, attrs={}, string=None,
                             limit=None, **kwargs):
      """Collect the earlier siblings of this PageElement that satisfy
      the given criteria.

      Candidates come from ``self.previous_siblings`` and are filtered by
      ``self._find_all``. All find_* methods take a common set of
      arguments; see the online documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param string: A filter for a NavigableString with specific text.
      :param limit: Stop looking after finding this many results.
      :kwargs: A dictionary of filters on attribute values.
      :return: A ResultSet of PageElements.
      :rtype: bs4.element.ResultSet
      """
      candidates = self.previous_siblings
      return self._find_all(name, attrs, string, limit, candidates, **kwargs)

  findPreviousSiblings = find_previous_siblings  # BS3
  fetchPreviousSiblings = find_previous_siblings  # BS2
  def find_parent(self, name=None, attrs={}, **kwargs):
      """Return the closest parent of this PageElement that satisfies the
      given criteria.

      All find_* methods take a common set of arguments; see the online
      documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :kwargs: A dictionary of filters on attribute values.
      :return: The first match reported by find_parents(), or None when
          there is no match.
      :rtype: bs4.element.Tag | bs4.element.NavigableString
      """
      # _find_one can't be used here: find_parents takes no `string`
      # argument, so the limit (1) is its third positional argument.
      matches = self.find_parents(name, attrs, 1, **kwargs)
      return matches[0] if matches else None

  findParent = find_parent  # BS3
  def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
      """Find all parents of this PageElement that match the given criteria.

      Candidates come from ``self.parents`` and are filtered by
      ``self._find_all``; no string filter is applied (None is passed in
      that position). All find_* methods take a common set of arguments;
      see the online documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param limit: Stop looking after finding this many results.
      :kwargs: A dictionary of filters on attribute values.
      :return: A ResultSet of PageElements (not a single element).
      :rtype: bs4.element.ResultSet
      """
      return self._find_all(name, attrs, None, limit, self.parents,
                            **kwargs)

  findParents = find_parents  # BS3
  fetchParents = find_parents  # BS2
  @property
  def next(self):
      """Alias for ``next_element``: the PageElement, if any, that was
      parsed just after this one.

      :return: A PageElement.
      :rtype: bs4.element.Tag | bs4.element.NavigableString
      """
      following = self.next_element
      return following
  @property
  def previous(self):
      """Alias for ``previous_element``: the PageElement, if any, that
      was parsed just before this one.

      :return: A PageElement.
      :rtype: bs4.element.Tag | bs4.element.NavigableString
      """
      preceding = self.previous_element
      return preceding
  642. #These methods do the real heavy lifting.
  def _find_one(self, method, name, attrs, string, **kwargs):
      """Run a find_all-style `method` with a limit of 1 and return its
      first result.

      :param method: A callable accepting (name, attrs, string, limit,
          **kwargs) and returning a sequence of matches.
      :return: The first match, or None if `method` returned nothing truthy.
      """
      matches = method(name, attrs, string, 1, **kwargs)
      return matches[0] if matches else None
  def _find_all(self, name, attrs, string, limit, generator, **kwargs):
      """Iterate over a generator looking for things that match.

      :param name: A filter on tag name, or a ready-made SoupStrainer
          (in which case it is used as-is and `attrs`/`string`/`kwargs`
          are not used to build a new strainer).
      :param attrs: A dictionary of filters on attribute values.
      :param string: A filter for a NavigableString with specific text.
          The deprecated `text` keyword is accepted as a fallback.
      :param limit: Stop looking after finding this many results.
      :param generator: An iterable of candidate PageElements.
      :kwargs: A dictionary of filters on attribute values.
      :return: A ResultSet of the matching PageElements.
      """
      if string is None and 'text' in kwargs:
          string = kwargs.pop('text')
          warnings.warn(
              "The 'text' argument to find()-type methods is deprecated. Use 'string' instead.",
              DeprecationWarning
          )

      if isinstance(name, SoupStrainer):
          strainer = name
      else:
          strainer = SoupStrainer(name, attrs, string, **kwargs)

      if string is None and not limit and not attrs and not kwargs:
          if name is True or name is None:
              # Optimization to find all tags.
              result = (element for element in generator
                        if isinstance(element, Tag))
              return ResultSet(strainer, result)
          elif isinstance(name, str):
              # Optimization to find all tags with a given name.
              if name.count(':') == 1:
                  # This is a name with a prefix. If this is a
                  # namespace-aware document, we need to match the local
                  # name against tag.name. If not, we need to match the
                  # fully-qualified name against tag.name.
                  prefix, local_name = name.split(':', 1)
              else:
                  prefix = None
                  local_name = name
              # The isinstance() guard must cover BOTH name tests. Without
              # the enclosing parentheses, `and` binds tighter than `or`
              # and the local-name test would run on non-Tag elements too.
              result = (
                  element for element in generator
                  if isinstance(element, Tag) and (
                      element.name == name
                      or (
                          element.name == local_name
                          and (prefix is None or element.prefix == prefix)
                      )
                  )
              )
              return ResultSet(strainer, result)

      # General case: run every truthy candidate through the strainer.
      results = ResultSet(strainer)
      for candidate in generator:
          if not candidate:
              continue
          found = strainer.search(candidate)
          if found:
              results.append(found)
              if limit and len(results) >= limit:
                  break
      return results
  700. #These generators can be used to navigate starting from both
  701. #NavigableStrings and Tags.
  @property
  def next_elements(self):
      """Walk forward through the ``next_element`` links, starting just
      after this PageElement.

      :yield: A sequence of PageElements.
      """
      cursor = self.next_element
      while True:
          if cursor is None:
              return
          yield cursor
          cursor = cursor.next_element
  @property
  def next_siblings(self):
      """Walk forward through the ``next_sibling`` links, starting just
      after this PageElement.

      :yield: A sequence of PageElements.
      """
      cursor = self.next_sibling
      while True:
          if cursor is None:
              return
          yield cursor
          cursor = cursor.next_sibling
  @property
  def previous_elements(self):
      """Walk backward through the ``previous_element`` links, starting
      just before this PageElement.

      :yield: A sequence of PageElements.
      """
      cursor = self.previous_element
      while True:
          if cursor is None:
              return
          yield cursor
          cursor = cursor.previous_element
  @property
  def previous_siblings(self):
      """Walk backward through the ``previous_sibling`` links, starting
      just before this PageElement.

      :yield: A sequence of PageElements.
      """
      cursor = self.previous_sibling
      while True:
          if cursor is None:
              return
          yield cursor
          cursor = cursor.previous_sibling
  @property
  def parents(self):
      """Walk upward through the ``parent`` links, starting with this
      PageElement's immediate parent.

      :yield: A sequence of PageElements.
      """
      ancestor = self.parent
      while True:
          if ancestor is None:
              return
          yield ancestor
          ancestor = ancestor.parent
  @property
  def decomposed(self):
      """Report whether this PageElement has been decomposed.

      Reads the ``_decomposed`` attribute; a missing or falsy value is
      reported as False.

      :rtype: bool
      """
      flag = getattr(self, '_decomposed', False)
      return flag or False
  755. # Old non-property versions of the generators, for backwards
  756. # compatibility with BS3.
  def nextGenerator(self):
      """BS3-compatible method form of the ``next_elements`` property."""
      elements = self.next_elements
      return elements
  def nextSiblingGenerator(self):
      """BS3-compatible method form of the ``next_siblings`` property."""
      siblings = self.next_siblings
      return siblings
  def previousGenerator(self):
      """BS3-compatible method form of the ``previous_elements`` property."""
      elements = self.previous_elements
      return elements
  def previousSiblingGenerator(self):
      """BS3-compatible method form of the ``previous_siblings`` property."""
      siblings = self.previous_siblings
      return siblings
  def parentGenerator(self):
      """BS3-compatible method form of the ``parents`` property."""
      ancestors = self.parents
      return ancestors
  class NavigableString(str, PageElement):
      """A Python Unicode string that is part of a parse tree.

      When Beautiful Soup parses the markup <b>penguin</b>, it will
      create a NavigableString for the string "penguin".
      """

      # Text placed before and after the string by output_ready().
      PREFIX = ''
      SUFFIX = ''

      # We can't tell just by looking at a string whether it's contained
      # in an XML document or an HTML document.
      known_xml = None

      def __new__(cls, value):
          """Create a new NavigableString.

          When unpickling a NavigableString, this method is called with
          the string in DEFAULT_OUTPUT_ENCODING. That encoding needs to be
          passed in to the superclass's __new__ or the superclass won't
          know how to handle non-ASCII characters.
          """
          # A str needs no decoding; anything else is decoded using the
          # default output encoding.
          extra = () if isinstance(value, str) else (DEFAULT_OUTPUT_ENCODING,)
          instance = str.__new__(cls, value, *extra)
          instance.setup()
          return instance

      def __copy__(self):
          """Return a string of the same class and content that is not
          connected to the parse tree.
          """
          same_class = type(self)
          return same_class(self)

      def __getnewargs__(self):
          """Supply the plain-str value as the sole argument to __new__."""
          plain = str(self)
          return (plain,)

      def __getattr__(self, attr):
          """text.string gives you text. This is for backwards
          compatibility for Navigable*String, but for CData* it lets you
          get the string without the CData wrapper.

          :raise AttributeError: For every attribute other than 'string'.
          """
          if attr != 'string':
              raise AttributeError(
                  "'%s' object has no attribute '%s'" % (
                      self.__class__.__name__, attr))
          return self

      def output_ready(self, formatter="minimal"):
          """Run the string through the provided formatter and wrap the
          result in PREFIX and SUFFIX.

          :param formatter: A Formatter object, or a string naming one of
              the standard formatters.
          """
          formatted = self.format_string(self, formatter)
          return self.PREFIX + formatted + self.SUFFIX

      @property
      def name(self):
          """Since a NavigableString is not a Tag, it has no .name.

          This property is implemented so that code like this doesn't
          crash when run on a mixture of Tag and NavigableString objects:
              [x.name for x in tag.children]
          """
          return None

      @name.setter
      def name(self, name):
          """Prevent NavigableString.name from ever being set."""
          raise AttributeError("A NavigableString cannot be given a name.")

      def _all_strings(self, strip=False, types=PageElement.default):
          """Yield all strings of certain classes, possibly stripping them.

          This makes it easy for NavigableString to implement methods
          like get_text() as conveniences, creating a consistent
          text-extraction API across all PageElements.

          :param strip: If True, the string is stripped before being
              yielded.
          :param types: A NavigableString subclass or a tuple of them. If
              this string's exact class isn't among them, nothing is
              yielded. By default the types in
              Tag.DEFAULT_INTERESTING_STRING_TYPES are used. None
              disables the type filter.
          :yield: A sequence that either contains this string, or is empty.
          """
          if types is self.default:
              # This is kept in Tag because it's full of subclasses of
              # this class, which aren't defined until later in the file.
              types = Tag.DEFAULT_INTERESTING_STRING_TYPES

          # Exact class comparison is used rather than isinstance(),
          # because all of the specialized string classes subclass
          # NavigableString. Anyone using this feature probably wants
          # generic NavigableStrings but not other stuff.
          own_type = type(self)
          if types is not None:
              if isinstance(types, type):
                  # Looking for a single type.
                  wanted = own_type is types
              else:
                  # Looking for one of a collection of types.
                  wanted = own_type in types
              if not wanted:
                  return

          text = self.strip() if strip else self
          if len(text) > 0:
              yield text

      strings = property(_all_strings)
  class PreformattedString(NavigableString):
      """A NavigableString not subject to the normal formatting rules.

      This is an abstract class used for special kinds of strings such
      as comments (the Comment class) and CDATA blocks (the CData
      class).
      """

      PREFIX = ''
      SUFFIX = ''

      def output_ready(self, formatter=None):
          """Make this string ready for output by adding any
          subclass-specific prefix or suffix.

          :param formatter: A Formatter object, or a string naming one
              of the standard formatters. The string will be passed into
              the Formatter, but only to trigger any side effects: the
              return value is ignored.
          :return: The string, with any subclass-specific prefix and
              suffix added on.
          """
          if formatter is not None:
              # Called purely for its side effects; the result is dropped.
              self.format_string(self, formatter)
          return self.PREFIX + self + self.SUFFIX
  class CData(PreformattedString):
      """A CDATA block; output is wrapped in the CDATA start and end
      markers.
      """

      PREFIX = "<![CDATA["
      SUFFIX = "]]>"
  class ProcessingInstruction(PreformattedString):
      """A SGML processing instruction; output is wrapped as ``<?...>``."""

      PREFIX = "<?"
      SUFFIX = ">"
  class XMLProcessingInstruction(ProcessingInstruction):
      """An XML processing instruction; output is wrapped as ``<?...?>``."""

      PREFIX = "<?"
      SUFFIX = "?>"
  class Comment(PreformattedString):
      """An HTML or XML comment; output is wrapped in the comment start
      and end markers.
      """

      PREFIX = "<!--"
      SUFFIX = "-->"
  class Declaration(PreformattedString):
      """An XML declaration; output is wrapped as ``<?...?>``."""

      PREFIX = "<?"
      SUFFIX = "?>"
  class Doctype(PreformattedString):
      """A document type declaration; output is wrapped as
      ``<!DOCTYPE ...>`` followed by a newline.
      """

      @classmethod
      def for_name_and_ids(cls, name, pub_id, system_id):
          """Generate an appropriate document type declaration for a given
          public ID and system ID.

          :param name: The name of the document's root element, e.g. 'html'.
              A falsy name is treated as the empty string.
          :param pub_id: The Formal Public Identifier for this document
              type, e.g. '-//W3C//DTD XHTML 1.1//EN'. None means "no
              public ID".
          :param system_id: The system identifier for this document type,
              e.g. 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'.
              None means "no system ID".
          :return: An instance of `cls` (Doctype, or the subclass the
              method was called on).
          """
          value = name or ''
          if pub_id is not None:
              value += ' PUBLIC "%s"' % pub_id
              # With a public ID the system ID follows it directly, with
              # no SYSTEM keyword.
              if system_id is not None:
                  value += ' "%s"' % system_id
          elif system_id is not None:
              value += ' SYSTEM "%s"' % system_id

          # Build through `cls` so that subclasses get an instance of
          # their own class rather than a plain Doctype.
          return cls(value)

      PREFIX = '<!DOCTYPE '
      SUFFIX = '>\n'
  class Stylesheet(NavigableString):
      """A NavigableString representing a stylesheet (probably CSS).

      Used to distinguish embedded stylesheets from textual content.
      """
  class Script(NavigableString):
      """A NavigableString representing an executable script (probably
      Javascript).

      Used to distinguish executable code from textual content.
      """
  class TemplateString(NavigableString):
      """A NavigableString representing a string found inside an HTML
      template embedded in a larger document.

      Used to distinguish such strings from the main body of the document.
      """
  class RubyTextString(NavigableString):
      """A NavigableString representing the contents of the <rt> HTML
      element.

      https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rt-element

      Can be used to distinguish such strings from the strings they're
      annotating.
      """
  class RubyParenthesisString(NavigableString):
      """A NavigableString representing the contents of the <rp> HTML
      element.

      https://dev.w3.org/html5/spec-LC/text-level-semantics.html#the-rp-element
      """
  961. class Tag(PageElement):
  962. """Represents an HTML or XML tag that is part of a parse tree, along
  963. with its attributes and contents.
  964. When Beautiful Soup parses the markup <b>penguin</b>, it will
  965. create a Tag object representing the <b> tag.
  966. """
  def __init__(self, parser=None, builder=None, name=None, namespace=None,
               prefix=None, attrs=None, parent=None, previous=None,
               is_xml=None, sourceline=None, sourcepos=None,
               can_be_empty_element=None, cdata_list_attributes=None,
               preserve_whitespace_tags=None,
               interesting_string_types=None,
               namespaces=None
  ):
      """Basic constructor.

      :param parser: A BeautifulSoup object. Only its class is stored.
      :param builder: A TreeBuilder.
      :param name: The name of the tag.
      :param namespace: The URI of this Tag's XML namespace, if any.
      :param prefix: The prefix for this Tag's XML namespace, if any.
      :param attrs: A dictionary of this Tag's attribute values.
      :param parent: The PageElement to use as this Tag's parent.
      :param previous: The PageElement that was parsed immediately before
          this tag.
      :param is_xml: If True, this is an XML tag. Otherwise, this is an
          HTML tag. Ignored when a truthy `builder` is given; the
          builder's `is_xml` is used instead.
      :param sourceline: The line number where this tag was found in its
          source document.
      :param sourcepos: The character position within `sourceline` where
          this tag was found.
      :param can_be_empty_element: If True, this tag should be
          represented as <tag/>. If False, this tag should be represented
          as <tag></tag>. Used only when no builder is given.
      :param cdata_list_attributes: A list of attributes whose values
          should be treated as CDATA if they ever show up on this tag.
          Used only when no builder is given.
      :param preserve_whitespace_tags: A list of tag names whose contents
          should have their whitespace preserved. Used only when no
          builder is given.
      :param interesting_string_types: This is a NavigableString
          subclass or a tuple of them. When iterating over this
          Tag's strings in methods like Tag.strings or Tag.get_text,
          these are the types of strings that are interesting enough
          to be considered. The default is to consider
          NavigableString and CData the only interesting string
          subtypes. Used only when no builder is given.
      :param namespaces: A dictionary mapping currently active
          namespace prefixes to URIs. This can be used later to
          construct CSS selectors.
      :raise ValueError: If `name` is None.
      """
      # We don't actually store the parser object, only its class: that
      # lets extracted chunks be garbage-collected.
      self.parser_class = None if parser is None else parser.__class__
      if name is None:
          raise ValueError("No value provided for new tag's name.")
      self.name = name
      self.namespace = namespace
      self._namespaces = namespaces or {}
      self.prefix = prefix

      # Source position is recorded only when at least one coordinate
      # was supplied and the builder (if any) stores line numbers.
      if ((not builder or builder.store_line_numbers)
              and not (sourceline is None and sourcepos is None)):
          self.sourceline = sourceline
          self.sourcepos = sourcepos

      # Normalize the attributes into a dictionary owned by this tag.
      if attrs is None:
          attrs = {}
      elif attrs and builder is not None and builder.cdata_list_attributes:
          attrs = builder._replace_cdata_list_attribute_values(
              self.name, attrs)
      else:
          attrs = dict(attrs)

      # If possible, determine ahead of time whether this tag is an
      # XML tag.
      self.known_xml = builder.is_xml if builder else is_xml
      self.attrs = attrs
      self.contents = []
      self.setup(parent, previous)
      self.hidden = False

      if builder is not None:
          # Set up any substitutions for this tag, such as the charset
          # in a META tag.
          builder.set_up_substitutions(self)

          # Ask the TreeBuilder whether this tag might be an
          # empty-element tag.
          self.can_be_empty_element = builder.can_be_empty_element(name)

          # Keep track of the list of attributes of this tag that
          # might need to be treated as a list.
          #
          # For performance reasons, we store the whole data structure
          # rather than asking the question of every tag. Asking would
          # require building a new data structure every time, and
          # (unlike can_be_empty_element), we almost never need
          # to check this.
          self.cdata_list_attributes = builder.cdata_list_attributes

          # Keep track of the names that might cause this tag to be
          # treated as a whitespace-preserved tag.
          self.preserve_whitespace_tags = builder.preserve_whitespace_tags

          if self.name in builder.string_containers:
              # This sort of tag uses a special string container
              # subclass for most of its strings, so that class is
              # recorded as this tag's interesting string type.
              self.interesting_string_types = builder.string_containers[self.name]
          else:
              self.interesting_string_types = self.DEFAULT_INTERESTING_STRING_TYPES
      else:
          # In the absence of a TreeBuilder, use whatever values were
          # passed in here. They're probably None, unless this is a copy
          # of some other tag.
          self.can_be_empty_element = can_be_empty_element
          self.cdata_list_attributes = cdata_list_attributes
          self.preserve_whitespace_tags = preserve_whitespace_tags
          self.interesting_string_types = interesting_string_types

  parserClass = _alias("parser_class")  # BS3
  def __copy__(self):
      """A copy of a Tag is a new Tag, unconnected to the parse tree.
      Its contents are a copy of the old Tag's contents.

      The clone is built without a parser or TreeBuilder, so the
      builder-derived settings (empty-element flag, CDATA list
      attributes, whitespace-preserving tags, interesting string types)
      are copied over explicitly from this tag.

      :return: A new object of the same class as this one.
      """
      # A Tag never stores its builder, and ordinary attribute access on
      # a Tag falls back to find(). Reading `self.builder` would therefore
      # search for a child tag named <builder>, so None is passed instead.
      #
      # sourceline/sourcepos are only assigned when position data was
      # supplied, so they are read from the instance dictionary to avoid
      # the same fallback to find().
      own = self.__dict__
      clone = type(self)(
          None, None, self.name, self.namespace,
          self.prefix, self.attrs, is_xml=self._is_xml,
          sourceline=own.get('sourceline'), sourcepos=own.get('sourcepos'),
          can_be_empty_element=self.can_be_empty_element,
          cdata_list_attributes=self.cdata_list_attributes,
          preserve_whitespace_tags=self.preserve_whitespace_tags,
          interesting_string_types=self.interesting_string_types
      )
      for attr in ('can_be_empty_element', 'hidden'):
          setattr(clone, attr, getattr(self, attr))
      for child in self.contents:
          clone.append(child.__copy__())
      return clone
  @property
  def is_empty_element(self):
      """Is this tag an empty-element tag? (aka a self-closing tag)

      A tag that has contents is never an empty-element tag.

      A tag that has no contents may or may not be an empty-element
      tag. It depends on the builder used to create the tag. If the
      builder has a designated list of empty-element tags, then only
      a tag whose name shows up in that list is considered an
      empty-element tag.

      If the builder has no designated list of empty-element tags,
      then any tag with no contents is an empty-element tag.

      :return: False when the tag has contents; otherwise the value of
          ``self.can_be_empty_element``.
      """
      if len(self.contents) != 0:
          return False
      return self.can_be_empty_element

  isSelfClosing = is_empty_element  # BS3
  @property
  def string(self):
      """Convenience property to get the single string within this
      PageElement.

      TODO It might make sense to have NavigableString.string return
      itself.

      :return: If this element has a single string child, return
          value is that string. If this element has one child tag,
          return value is the 'string' attribute of the child tag,
          recursively. If this element has no children or more than
          one child, return value is None.
      """
      children = self.contents
      if len(children) != 1:
          return None
      only_child = children[0]
      if not isinstance(only_child, NavigableString):
          # A lone child tag: defer to that tag's own .string.
          return only_child.string
      return only_child

  @string.setter
  def string(self, string):
      """Replace this PageElement's contents with `string`."""
      self.clear()
      # Append a fresh object of the same class as the given value.
      replacement = string.__class__(string)
      self.append(replacement)
  # String classes considered by _all_strings() when a tag has no more
  # specific setting.
  DEFAULT_INTERESTING_STRING_TYPES = (NavigableString, CData)

  def _all_strings(self, strip=False, types=PageElement.default):
      """Yield all strings of certain classes, possibly stripping them.

      :param strip: If True, all strings will be stripped before being
          yielded, and strings that are empty after stripping are skipped.
      :param types: A NavigableString subclass or a tuple of them. Any
          string whose exact class is not among them will be ignored. By
          default, the types in self.interesting_string_types are used.
          If that is None, every NavigableString descendant is
          considered, whatever its subclass.
      :yield: A sequence of strings.
      """
      if types is self.default:
          types = self.interesting_string_types
      single_type = isinstance(types, type)

      for node in self.descendants:
          node_type = type(node)
          if types is None:
              # No type filter: accept any kind of string, skip tags.
              if not isinstance(node, NavigableString):
                  continue
          elif single_type:
              if node_type is not types:
                  # We're not interested in strings of this type.
                  continue
          elif node_type not in types:
              # We're not interested in strings of this type.
              continue

          if strip:
              node = node.strip()
              if len(node) == 0:
                  continue
          yield node

  strings = property(_all_strings)
  def decompose(self):
      """Recursively destroys this PageElement and its children.

      This element will be removed from the tree and wiped out; so
      will everything beneath it.

      The behavior of a decomposed PageElement is undefined and you
      should never use one for anything, but if you need to _check_
      whether an element has been decomposed, you can use the
      `decomposed` property.
      """
      self.extract()
      node = self
      while node is not None:
          # Grab the successor first: wiping __dict__ erases the link.
          successor = node.next_element
          node.__dict__.clear()
          node.contents = []
          node._decomposed = True
          node = successor
  def clear(self, decompose=False):
      """Wipe out all children of this PageElement by calling extract()
      on them.

      :param decompose: If this is True, decompose() (a more
          destructive method) will be called instead of extract() on
          every child that is a Tag; other children are still extracted.
      """
      # Iterate over a snapshot, since removing children mutates
      # self.contents.
      for child in list(self.contents):
          if decompose and isinstance(child, Tag):
              child.decompose()
          else:
              child.extract()
  def smooth(self):
      """Smooth out this element's children by consolidating consecutive
      strings.

      This makes pretty-printed output look more natural following a
      lot of operations that modified the tree. Child tags are smoothed
      recursively; PreformattedString objects are never merged.
      """
      # Record the first position of every pair of children that needs
      # to be consolidated. Do this rather than making a copy of
      # self.contents, since in most cases very few strings will be
      # affected.
      merge_at = []
      for position, current in enumerate(self.contents):
          if isinstance(current, Tag):
              # Recursively smooth children.
              current.smooth()
          if position == len(self.contents) - 1:
              # This is the last item in .contents; it has no right-hand
              # neighbour to be merged with.
              continue
          following = self.contents[position + 1]
          mergeable = all(
              isinstance(node, NavigableString)
              and not isinstance(node, PreformattedString)
              for node in (current, following)
          )
          if mergeable:
              merge_at.append(position)

      # Go over the recorded positions in reverse order, so that
      # removing items from .contents won't affect the remaining
      # positions.
      for position in reversed(merge_at):
          first = self.contents[position]
          second = self.contents[position + 1]
          second.extract()
          first.replace_with(NavigableString(first + second))
  def index(self, element):
      """Find the index of a child by identity, not value.

      Avoids issues with tag.contents.index(element) getting the
      index of equal elements.

      :param element: Look for this PageElement in `self.contents`.
      :return: The position of `element` within `self.contents`.
      :raise ValueError: If `element` is not one of the children.
      """
      position = next(
          (i for i, child in enumerate(self.contents) if child is element),
          None,
      )
      if position is None:
          raise ValueError("Tag.index: element not in tag")
      return position
  def get(self, key, default=None):
      """Look up the 'key' attribute of this tag.

      :param key: The attribute to look for.
      :param default: Returned when the tag doesn't have that attribute.
      :return: The attribute value, or `default`.
      """
      attributes = self.attrs
      return attributes.get(key, default)
  def get_attribute_list(self, key, default=None):
      """The same as get(), but always returns a list.

      :param key: The attribute to look for.
      :param default: Use this value if the attribute is not present
          on this PageElement.
      :return: A list of values, probably containing only a single
          value. A value that is already a list is returned unchanged.
      """
      value = self.get(key, default)
      return value if isinstance(value, list) else [value]
  def has_attr(self, key):
      """Report whether this PageElement has an attribute named `key`."""
      attributes = self.attrs
      return key in attributes
  def __hash__(self):
      """Hash a tag by its string rendering."""
      rendered = str(self)
      return rendered.__hash__()
  def __getitem__(self, key):
      """tag[key] returns the value of the 'key' attribute for the Tag.

      :raise KeyError: If the tag has no such attribute.
      """
      attributes = self.attrs
      return attributes[key]
  def __iter__(self):
      """Iterating over a Tag iterates over its contents."""
      children = self.contents
      return iter(children)
  def __len__(self):
      """The length of a Tag is the length of its list of contents."""
      children = self.contents
      return len(children)
  def __contains__(self, x):
      """``x in tag`` tests membership in the tag's list of contents."""
      children = self.contents
      return x in children
  def __bool__(self):
      """A tag is always truthy, even if it has no contents.

      Without this, truthiness would follow __len__ and an empty tag
      would be falsy.
      """
      return True
  def __setitem__(self, key, value):
      """Setting tag[key] sets the value of the 'key' attribute for the
      tag."""
      attributes = self.attrs
      attributes[key] = value
  def __delitem__(self, key):
      """Deleting tag[key] deletes all 'key' attributes for the tag.

      Deleting an attribute the tag doesn't have is not an error.
      """
      attributes = self.attrs
      attributes.pop(key, None)
  def __call__(self, *args, **kwargs):
      """Calling a Tag like a function is the same as calling its
      find_all() method. Eg. tag('a') returns a list of all the A tags
      found within this tag."""
      search = self.find_all
      return search(*args, **kwargs)
  def __getattr__(self, tag):
      """Calling tag.subtag is the same as calling tag.find(name="subtag").

      :param tag: The attribute name that normal lookup failed to find.
      :return: The result of find() for that name.
      :raise AttributeError: For names starting with a double underscore
          and for "contents".
      """
      if tag.endswith('Tag') and len(tag) > 3:
          # BS3: soup.aTag -> "soup.find("a")
          tag_name = tag[:-3]
          message = '.%(name)sTag is deprecated, use .find("%(name)s") instead. If you really were looking for a tag called %(name)sTag, use .find("%(name)sTag")' % dict(
              name=tag_name
          )
          warnings.warn(message, DeprecationWarning)
          return self.find(tag_name)

      # We special case contents to avoid recursion.
      if tag.startswith("__") or tag == "contents":
          raise AttributeError(
              "'%s' object has no attribute '%s'" % (self.__class__, tag))
      return self.find(tag)
  def __eq__(self, other):
      """Compare this Tag with `other` by structure.

      :return: True iff `other` is this same object, or it has the same
          name, the same attributes and pairwise-equal contents
          (compared with `!=`, so nested Tags recurse).
      """
      if self is other:
          return True
      # Anything lacking the Tag-like attributes can never be equal.
      for required in ('name', 'attrs', 'contents'):
          if not hasattr(other, required):
              return False
      if self.name != other.name:
          return False
      if self.attrs != other.attrs:
          return False
      if len(self) != len(other):
          return False
      # Lengths agree, so walk both content lists position by position.
      return not any(
          mine != other.contents[position]
          for position, mine in enumerate(self.contents)
      )
  def __ne__(self, other):
      """Return the negation of `self == other`."""
      are_equal = (self == other)
      return not are_equal
  def __repr__(self, encoding="unicode-escape"):
      """Render this PageElement as a string.

      :param encoding: Accepted for compatibility (Python 2 only); the
          body never reads it.
          TODO: issue a warning if a value is provided.
      :return: The (Unicode) string produced by `self.decode()`.
      """
      # __repr__ must hand back a str, which is what decode() yields.
      rendered = self.decode()
      return rendered
  def __unicode__(self):
      """Render this PageElement as a Unicode string via `decode()`."""
      rendered = self.decode()
      return rendered

  # str() and repr() share the __unicode__ renderer.
  __str__ = __repr__ = __unicode__
  def encode(self, encoding=DEFAULT_OUTPUT_ENCODING,
             indent_level=None, formatter="minimal",
             errors="xmlcharrefreplace"):
      """Render this PageElement and its contents as a bytestring.

      :param encoding: The destination encoding.
      :param indent_level: Indentation level handed to `decode()`;
          used internally in recursive calls while pretty-printing.
      :param formatter: A Formatter object, or a string naming one of
          the standard formatters.
      :param errors: Error-handling strategy passed to `str.encode()`,
          e.g. 'xmlcharrefreplace'.
      :return: A bytestring.
      """
      # Two steps: build the Unicode rendering, then convert it to bytes.
      as_text = self.decode(indent_level, encoding, formatter)
      as_bytes = as_text.encode(encoding, errors)
      return as_bytes
  def decode(self, indent_level=None,
             eventual_encoding=DEFAULT_OUTPUT_ENCODING,
             formatter="minimal"):
      """Render this PageElement and its contents as a Unicode string.

      :param indent_level: None for no pretty-printing. Otherwise the
          opening tag is prefixed with `formatter.indent` repeated
          `indent_level - 1` times. Used internally in recursive calls
          while pretty-printing.
      :param eventual_encoding: The encoding the output is destined
          for. This method does _not_ perform that encoding; the value
          is only handed to attribute values that substitute a charset
          and to `decode_contents()`.
      :param formatter: A Formatter object, or a string naming one of
          the standard formatters.
      :return: A Unicode string.
      """
      # Resolve a formatter name once so the recursive calls below
      # receive a ready-made Formatter object.
      if not isinstance(formatter, Formatter):
          formatter = self.formatter_for_name(formatter)

      rendered_attrs = []
      for key, val in formatter.attributes(self):
          if val is None:
              # A valueless attribute is rendered as its bare name.
              rendered_attrs.append(key)
              continue
          if isinstance(val, (list, tuple)):
              val = ' '.join(val)
          elif not isinstance(val, str):
              val = str(val)
          elif (
              isinstance(val, AttributeValueWithCharsetSubstitution)
              and eventual_encoding is not None
          ):
              val = val.encode(eventual_encoding)
          text = formatter.attribute_value(val)
          rendered_attrs.append(
              str(key) + '=' + formatter.quoted_attribute_value(text))

      prefix = (self.prefix + ":") if self.prefix else ''

      # An empty element gets a close marker inside its opening tag
      # and no closing tag; every other element gets a closing tag.
      if self.is_empty_element:
          close = formatter.void_element_close_prefix or ''
          closing_tag = ''
      else:
          close = ''
          closing_tag = '</%s%s>' % (prefix, self.name)

      pretty_print = self._should_pretty_print(indent_level)
      if indent_level is not None:
          indent_space = formatter.indent * (indent_level - 1)
      else:
          indent_space = ''
      indent_contents = indent_level + 1 if pretty_print else None
      contents = self.decode_contents(
          indent_contents, eventual_encoding, formatter
      )

      if self.hidden:
          # This is the 'document root' object: only its contents show.
          return contents

      attribute_string = ''
      if rendered_attrs:
          attribute_string = ' ' + ' '.join(rendered_attrs)

      pieces = []
      if indent_level is not None:
          # Even if this particular tag is not pretty-printed,
          # we should indent up to the start of the tag.
          pieces.append(indent_space)
      pieces.append('<%s%s%s%s>' % (
          prefix, self.name, attribute_string, close))
      if pretty_print:
          pieces.append("\n")
      pieces.append(contents)
      if pretty_print and contents and contents[-1] != "\n":
          pieces.append("\n")
      if pretty_print and closing_tag:
          # The closing tag lines up with the opening tag.
          pieces.append(indent_space)
      pieces.append(closing_tag)
      if indent_level is not None and closing_tag and self.next_sibling:
          # Even if this particular tag is not pretty-printed,
          # we're now done with the tag, and we should add a
          # newline if appropriate.
          pieces.append("\n")
      return ''.join(pieces)
  def _should_pretty_print(self, indent_level):
      """Decide whether this tag should be pretty-printed.

      :param indent_level: None means pretty-printing is off.
      :return: False when `indent_level` is None or this tag's name is
          listed in `self.preserve_whitespace_tags` (such as <pre> in
          HTML documents); True otherwise.
      """
      if indent_level is None:
          return False
      if not self.preserve_whitespace_tags:
          # Nothing is whitespace-sensitive, so every tag qualifies.
          return True
      return self.name not in self.preserve_whitespace_tags
  def prettify(self, encoding=None, formatter="minimal"):
      """Pretty-print this PageElement.

      :param encoding: The eventual encoding of the output. If None, a
          Unicode string is returned.
      :param formatter: A Formatter object, or a string naming one of
          the standard formatters.
      :return: The result of `decode()` when `encoding` is None,
          otherwise the result of `encode()`.
      """
      # Pretty-printing is switched on by passing True as the indent level.
      if encoding is not None:
          return self.encode(encoding, True, formatter=formatter)
      return self.decode(True, formatter=formatter)
  def decode_contents(self, indent_level=None,
                      eventual_encoding=DEFAULT_OUTPUT_ENCODING,
                      formatter="minimal"):
      """Render the contents of this tag as a Unicode string.

      :param indent_level: None for no pretty-printing. Otherwise each
          text line is prefixed with `formatter.indent` repeated
          `indent_level - 1` times. Used internally in recursive calls
          while pretty-printing.
      :param eventual_encoding: The encoding the output is destined
          for. decode_contents() does _not_ perform that encoding; the
          value is only passed on to each child Tag's `decode()`.
      :param formatter: A Formatter object, or a string naming one of
          the standard Formatters.
      :return: A Unicode string.
      """
      # Resolve a formatter name once so child renders reuse the object.
      if not isinstance(formatter, Formatter):
          formatter = self.formatter_for_name(formatter)

      pretty_print = indent_level is not None
      pieces = []
      for child in self:
          text = None
          if isinstance(child, NavigableString):
              text = child.output_ready(formatter)
          elif isinstance(child, Tag):
              # Child tags render (and indent) themselves.
              pieces.append(
                  child.decode(indent_level, eventual_encoding, formatter))
          preserve_whitespace = (
              self.preserve_whitespace_tags
              and self.name in self.preserve_whitespace_tags
          )
          if text and indent_level and not preserve_whitespace:
              text = text.strip()
          if not text:
              continue
          # Text gets its own indented line only when pretty-printing
          # outside a whitespace-preserving tag.
          on_own_line = pretty_print and not preserve_whitespace
          if on_own_line:
              pieces.append(formatter.indent * (indent_level - 1))
          pieces.append(text)
          if on_own_line:
              pieces.append("\n")
      return ''.join(pieces)
  def encode_contents(
          self, indent_level=None, encoding=DEFAULT_OUTPUT_ENCODING,
          formatter="minimal"):
      """Render the contents of this PageElement as a bytestring.

      :param indent_level: Indentation level handed to
          `decode_contents()`; used internally in recursive calls
          while pretty-printing.
      :param encoding: The bytestring will be in this encoding.
      :param formatter: A Formatter object, or a string naming one of
          the standard Formatters.
      :return: A bytestring.
      """
      as_text = self.decode_contents(indent_level, encoding, formatter)
      as_bytes = as_text.encode(encoding)
      return as_bytes
  # Old method for BS3 compatibility
  def renderContents(self, encoding=DEFAULT_OUTPUT_ENCODING,
                     prettyPrint=False, indentLevel=0):
      """Deprecated method for BS3 compatibility.

      Forwards to `encode_contents()`; `indentLevel` only takes effect
      when `prettyPrint` is true.
      """
      effective_indent = indentLevel if prettyPrint else None
      return self.encode_contents(
          indent_level=effective_indent, encoding=encoding)
  # Soup methods
  def find(self, name=None, attrs={}, recursive=True, string=None,
           **kwargs):
      """Look in the children of this PageElement and return the first
      PageElement that matches the given criteria.

      All find_* methods take a common set of arguments. See the online
      documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param recursive: If this is True, find() will perform a
          recursive search of this PageElement's children. Otherwise,
          only the direct children will be considered.
      :param string: A filter on string content, passed to find_all().
      :kwargs: A dictionary of filters on attribute values.
      :return: The first match, or None when nothing matches.
      :rtype: bs4.element.Tag | bs4.element.NavigableString
      """
      # A limit of 1 lets find_all() stop at the first match.
      matches = self.find_all(name, attrs, recursive, string, 1, **kwargs)
      return matches[0] if matches else None

  findChild = find  # BS2
  def find_all(self, name=None, attrs={}, recursive=True, string=None,
               limit=None, **kwargs):
      """Look in the children of this PageElement and find all
      PageElements that match the given criteria.

      All find_* methods take a common set of arguments. See the online
      documentation for detailed explanations.

      :param name: A filter on tag name.
      :param attrs: A dictionary of filters on attribute values.
      :param recursive: If this is True, find_all() searches
          `self.descendants`. Otherwise only `self.children` are
          considered.
      :param string: A filter on string content.
      :param limit: Stop looking after finding this many results.
      :kwargs: A dictionary of filters on attribute values.
      :return: Whatever `self._find_all()` returns for the chosen
          candidates.
      :rtype: bs4.element.ResultSet
      """
      candidates = self.descendants
      if not recursive:
          candidates = self.children
      return self._find_all(name, attrs, string, limit, candidates, **kwargs)

  findAll = find_all  # BS3
  findChildren = find_all  # BS2
  # Generator methods
  @property
  def children(self):
      """Iterate over all direct children of this PageElement.

      :return: A fresh iterator over `self.contents`.
      """
      # An explicit iterator, not the list itself, makes it clear that
      # callers are meant to loop over the result.
      direct_children = self.contents
      return iter(direct_children)  # XXX This seems to be untested.
  @property
  def descendants(self):
      """Iterate over every element nested inside this PageElement.

      Starts at the first item of `self.contents` and follows the
      `next_element` links, stopping just before the element that
      comes after this element's last descendant.

      :yield: A sequence of PageElements.
      """
      if not self.contents:
          return
      # The node right after the last descendant marks the end.
      boundary = self._last_descendant().next_element
      node = self.contents[0]
      while node is not boundary:
          yield node
          node = node.next_element
  # CSS selector code
  def select_one(self, selector, namespaces=None, **kwargs):
      """Perform a CSS selection operation on the current element and
      return only the first result.

      :param selector: A CSS selector.
      :param namespaces: A dictionary mapping namespace prefixes
          used in the CSS selector to namespace URIs; passed through
          to `select()`.
      :param kwargs: Keyword arguments passed through to `select()`.
      :return: The first selected Tag, or None if nothing matched.
      :rtype: bs4.element.Tag
      """
      # Ask select() for at most one result.
      matches = self.select(selector, namespaces, 1, **kwargs)
      return matches[0] if matches else None
  def select(self, selector, namespaces=None, limit=None, **kwargs):
      """Perform a CSS selection operation on the current element.

      This uses the SoupSieve library.

      :param selector: A string containing a CSS selector.
      :param namespaces: A dictionary mapping namespace prefixes
          used in the CSS selector to namespace URIs. When None,
          `self._namespaces` is used instead.
      :param limit: After finding this number of results, stop looking.
          None is forwarded to SoupSieve as 0.
      :param kwargs: Keyword arguments to be passed into SoupSieve's
          soupsieve.select() method.
      :return: A ResultSet of Tags.
      :rtype: bs4.element.ResultSet
      :raises NotImplementedError: If the soupsieve package is not
          installed.
      """
      namespaces = self._namespaces if namespaces is None else namespaces
      limit = 0 if limit is None else limit
      if soupsieve is None:
          raise NotImplementedError(
              "Cannot execute CSS selectors because the soupsieve package is not installed."
          )

      matched = soupsieve.select(selector, self, namespaces, limit, **kwargs)

      # We do this because it's more consistent and because
      # ResultSet.__getattr__ has a helpful error message.
      return ResultSet(None, matched)
  # Old names for backwards compatibility
  def childGenerator(self):
      """Deprecated generator; returns `self.children`."""
      direct_children = self.children
      return direct_children
  def recursiveChildGenerator(self):
      """Deprecated generator; returns `self.descendants`."""
      all_descendants = self.descendants
      return all_descendants
  def has_key(self, key):
      """Deprecated alias for `has_attr()`.

      This was kind of misleading because has_key() (which checks
      attributes) was different from `in` / __contains__ (which checks
      contents). has_key() is gone in Python 3, anyway.

      Emits a DeprecationWarning on every call.

      :param key: The attribute name to look for.
      :return: The result of `self.has_attr(key)`.
      """
      warnings.warn(
          'has_key is deprecated. Use has_attr(key) instead.',
          DeprecationWarning,
          # Attribute the warning to the caller of has_key(), not to
          # this method.
          stacklevel=2,
      )
      return self.has_attr(key)
  # Next, a couple of classes to represent queries and their results.
  class SoupStrainer(object):
      """Encapsulates a number of ways of matching a markup element (tag or
      string).

      This is primarily used to underpin the find_* methods, but you can
      create one yourself and pass it in as `parse_only` to the
      `BeautifulSoup` constructor, to parse a subset of a large
      document.
      """

      def __init__(self, name=None, attrs={}, string=None, **kwargs):
          """Constructor.

          The SoupStrainer constructor takes the same arguments passed
          into the find_* methods. See the online documentation for
          detailed explanations.

          :param name: A filter on tag name.
          :param attrs: A dictionary of filters on attribute values. A
              non-dict value is treated as a filter on the 'class'
              attribute.
          :param string: A filter for a NavigableString with specific text.
          :kwargs: A dictionary of filters on attribute values. `class_`
              is accepted as a spelling of 'class'; the deprecated `text`
              is accepted as a spelling of `string`.
          """
          if string is None and 'text' in kwargs:
              string = kwargs.pop('text')
              warnings.warn(
                  "The 'text' argument to the SoupStrainer constructor is deprecated. Use 'string' instead.",
                  DeprecationWarning,
                  # Attribute the warning to whoever built the strainer.
                  stacklevel=2,
              )

          self.name = self._normalize_search_value(name)
          if not isinstance(attrs, dict):
              # Treat a non-dict value for attrs as a search for the 'class'
              # attribute.
              kwargs['class'] = attrs
              attrs = None

          if 'class_' in kwargs:
              # Treat class_="foo" as a search for the 'class'
              # attribute, overriding any non-dict value for attrs.
              kwargs['class'] = kwargs.pop('class_')

          if kwargs:
              if attrs:
                  # Copy first so the caller's dict (or the shared default)
                  # is never modified.
                  attrs = attrs.copy()
                  attrs.update(kwargs)
              else:
                  attrs = kwargs

          self.attrs = {
              key: self._normalize_search_value(value)
              for key, value in attrs.items()
          }
          self.string = self._normalize_search_value(string)

          # DEPRECATED but just in case someone is checking this.
          self.text = self.string

      def _normalize_search_value(self, value):
          """Convert a search criterion into a canonical form.

          :param value: Any filter value handed to the constructor, or a
              piece of markup about to be compared.
          :return: `value` unchanged if it is a string, a callable, an
              object with a `match` attribute (e.g. a compiled regular
              expression), a boolean or None; a bytestring decoded as
              UTF-8; a list of normalized items for any other iterable;
              otherwise `str(value)`.
          """
          # Leave it alone if it's a Unicode string, a callable, a
          # regular expression, a boolean, or None.
          if (isinstance(value, str) or isinstance(value, Callable)
                  or hasattr(value, 'match') or isinstance(value, bool)
                  or value is None):
              return value

          # If it's a bytestring, convert it to Unicode, treating it as UTF-8.
          if isinstance(value, bytes):
              return value.decode("utf8")

          # If it's listlike, convert it into a list of strings.
          if hasattr(value, '__iter__'):
              new_value = []
              for v in value:
                  if (hasattr(v, '__iter__') and not isinstance(v, bytes)
                          and not isinstance(v, str)):
                      # This is almost certainly the user's mistake. In the
                      # interests of avoiding infinite loops, we'll let
                      # it through as-is rather than doing a recursive call.
                      new_value.append(v)
                  else:
                      new_value.append(self._normalize_search_value(v))
              return new_value

          # Otherwise, convert it into a Unicode string.
          return str(value)

      def __str__(self):
          """A human-readable representation of this SoupStrainer.

          :return: The string filter rendered as text when one is set,
              otherwise "<name>|<attrs>".
          """
          if self.string:
              # The filter may be a list, a regular expression, a callable
              # or True; __str__ must still return a real string.
              return str(self.string)
          return "%s|%s" % (self.name, self.attrs)

      def search_tag(self, markup_name=None, markup_attrs={}):
          """Check whether a Tag with the given name and attributes would
          match this SoupStrainer.

          Used prospectively to decide whether to even bother creating a Tag
          object.

          :param markup_name: A tag name as found in some markup, or a Tag
              object (in which case it also supplies the attributes).
          :param markup_attrs: The attributes as found in some markup:
              either an object with a `get` method or an iterable of
              (key, value) pairs.
          :return: The matching Tag or tag name on a match; None when
              nothing matches; False when a plain-string name filter
              rules out an unprefixed Tag immediately.
          """
          found = None
          markup = None
          if isinstance(markup_name, Tag):
              markup = markup_name
              markup_attrs = markup

          if isinstance(self.name, str):
              # Optimization for a very common case where the user is
              # searching for a tag with one specific name, and we're
              # looking at a tag with a different name.
              if markup and not markup.prefix and self.name != markup.name:
                  return False

          # A callable name filter gets the raw (name, attrs) pair when no
          # Tag object exists yet.
          call_function_with_tag_data = (
              isinstance(self.name, Callable)
              and not isinstance(markup_name, Tag))

          if ((not self.name)
                  or call_function_with_tag_data
                  or (markup and self._matches(markup, self.name))
                  or (not markup and self._matches(markup_name, self.name))):
              if call_function_with_tag_data:
                  match = self.name(markup_name, markup_attrs)
              else:
                  match = True
                  markup_attr_map = None
                  for attr, match_against in list(self.attrs.items()):
                      if markup_attr_map is None:
                          # Build the lookup lazily, and only once.
                          if hasattr(markup_attrs, 'get'):
                              markup_attr_map = markup_attrs
                          else:
                              markup_attr_map = dict(markup_attrs)
                      attr_value = markup_attr_map.get(attr)
                      if not self._matches(attr_value, match_against):
                          match = False
                          break
              if match:
                  if markup:
                      found = markup
                  else:
                      found = markup_name
          if found and self.string and not self._matches(found.string, self.string):
              found = None
          return found

      # For BS3 compatibility.
      searchTag = search_tag

      def search(self, markup):
          """Find all items in `markup` that match this SoupStrainer.

          Used by the core _find_all() method, which is ultimately
          called by all find_* methods.

          :param markup: A PageElement or a list of them.
          :return: The matching element, or None if nothing matched.
          :raises Exception: If `markup` is neither iterable, a Tag, a
              NavigableString nor a str.
          """
          found = None
          # If given a list of items, scan it for a text element that
          # matches.
          if hasattr(markup, '__iter__') and not isinstance(markup, (Tag, str)):
              for element in markup:
                  if isinstance(element, NavigableString) \
                         and self.search(element):
                      found = element
                      break
          # If it's a Tag, make sure its name or attributes match.
          # Don't bother with Tags if we're searching for text.
          elif isinstance(markup, Tag):
              if not self.string or self.name or self.attrs:
                  found = self.search_tag(markup)
          # If it's text, make sure the text matches.
          elif isinstance(markup, NavigableString) or \
                   isinstance(markup, str):
              if not self.name and not self.attrs and self._matches(markup, self.string):
                  found = markup
          else:
              raise Exception(
                  "I don't know how to match against a %s" % markup.__class__)
          return found

      def _matches(self, markup, match_against, already_tried=None):
          """Test one piece of markup against one filter value.

          :param markup: A Tag, a string, None, or (for a multi-valued
              attribute such as 'class') a list or tuple of strings.
          :param match_against: True, a callable, a string, an object
              with a `search` method (e.g. a compiled regular
              expression), or an iterable of any of these.
          :param already_tried: Set of filter items already examined
              while walking an iterable filter; guards against infinite
              recursion.
          :return: A truthy value on a match, a falsy value otherwise.
              For a regular-expression filter this is whatever its
              `search()` returns.
          """
          if isinstance(markup, (list, tuple)):
              # This should only happen when searching a multi-valued attribute
              # like 'class'.
              for item in markup:
                  if self._matches(item, match_against):
                      return True
              # We didn't match any particular value of the multivalue
              # attribute, but maybe we match the attribute value when
              # considered as a string.
              if self._matches(' '.join(markup), match_against):
                  return True
              return False

          if match_against is True:
              # True matches any non-None value.
              return markup is not None

          if isinstance(match_against, Callable):
              return match_against(markup)

          # Custom callables take the tag as an argument, but all
          # other ways of matching match the tag name as a string.
          original_markup = markup
          if isinstance(markup, Tag):
              markup = markup.name

          # Ensure that `markup` is either a Unicode string, or None.
          markup = self._normalize_search_value(markup)

          if markup is None:
              # None matches None, False, an empty string, an empty list, and so on.
              return not match_against

          if (hasattr(match_against, '__iter__')
                  and not isinstance(match_against, str)):
              # We're asked to match against an iterable of items.
              # The markup must match at least one item in the
              # iterable. We'll try each one in turn.
              #
              # To avoid infinite recursion we need to keep track of
              # items we've already seen.
              if not already_tried:
                  already_tried = set()
              for item in match_against:
                  # Unhashable items are tracked by identity instead.
                  if item.__hash__:
                      key = item
                  else:
                      key = id(item)
                  if key in already_tried:
                      continue
                  else:
                      already_tried.add(key)
                      if self._matches(original_markup, item, already_tried):
                          return True
              else:
                  return False

          # Beyond this point we might need to run the test twice: once against
          # the tag's name and once against its prefixed name.
          match = False

          if not match and isinstance(match_against, str):
              # Exact string match
              match = markup == match_against

          if not match and hasattr(match_against, 'search'):
              # Regexp match
              return match_against.search(markup)

          if (not match
                  and isinstance(original_markup, Tag)
                  and original_markup.prefix):
              # Try the whole thing again with the prefixed tag name.
              return self._matches(
                  original_markup.prefix + ':' + original_markup.name, match_against
              )

          return match
  class ResultSet(list):
      """A list of search results that also remembers the SoupStrainer
      that produced it."""

      def __init__(self, source, result=()):
          """Constructor.

          :param source: A SoupStrainer.
          :param result: A list of PageElements used to fill this list.
          """
          super().__init__(result)
          self.source = source

      def __getattr__(self, key):
          """Raise a helpful exception to explain a common code fix.

          :raises AttributeError: Always; this is only reached for
              attributes that normal lookup did not find.
          """
          message = (
              "ResultSet object has no attribute '%s'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?" % key
          )
          raise AttributeError(message)