css_match.py 57 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584
  1. """CSS matcher."""
  2. from datetime import datetime
  3. from . import util
  4. import re
  5. from . import css_types as ct
  6. import unicodedata
  7. import bs4 # type: ignore[import]
  8. from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast
# Empty tag pattern (whitespace okay)
# Matches one character that is not a space, tab, carriage return, line feed, or form feed.
RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
# Matches a run of such non-whitespace characters; `get_classes` uses it to split a class string.
RE_NOT_WS = re.compile('[^ \t\r\n\f]+')

# Relationships
# These look backwards from the element being tested (ancestors / previous siblings).
REL_PARENT = ' '
REL_CLOSE_PARENT = '>'
REL_SIBLING = '~'
REL_CLOSE_SIBLING = '+'

# Relationships for :has() (forward looking)
# The leading ':' is what `match_relations` checks to dispatch to the forward-looking matcher.
REL_HAS_PARENT = ': '
REL_HAS_CLOSE_PARENT = ':>'
REL_HAS_SIBLING = ':~'
REL_HAS_CLOSE_SIBLING = ':+'

# Namespace URIs; `NS_XHTML` is the value an element's namespace is compared to for "is HTML".
NS_XHTML = 'http://www.w3.org/1999/xhtml'
NS_XML = 'http://www.w3.org/XML/1998/namespace'

# Combined flag masks built from the individual selector flags in `css_types`.
DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE

# Maps a lower-cased `dir` attribute value to a direction flag; 'auto' maps to 0 (no fixed direction).
DIR_MAP = {
    'ltr': ct.SEL_DIR_LTR,
    'rtl': ct.SEL_DIR_RTL,
    'auto': 0
}

# Patterns used by `Inputs.parse_value` to parse input values by input type.
# Note: `RE_NUM` accepts an optional '-', digits with an optional fraction, or a bare fraction; no exponent.
RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
RE_DATETIME = re.compile(
    r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
)

# Used by `extended_language_filter`, which replaces every match with a single '-'.
# It matches '-*-' optionally followed by further '*-' (or a final '*') groups, or a trailing '-*'.
RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')

# Calendar facts used by `Inputs.validate_day`.
MONTHS_30 = (4, 6, 9, 11)  # April, June, September, and November
FEB = 2
SHORT_MONTH = 30
LONG_MONTH = 31
FEB_MONTH = 28
FEB_LEAP_MONTH = 29
DAYS_IN_WEEK = 7
  47. class _FakeParent:
  48. """
  49. Fake parent class.
  50. When we have a fragment with no `BeautifulSoup` document object,
  51. we can't evaluate `nth` selectors properly. Create a temporary
  52. fake parent so we can traverse the root element as a child.
  53. """
  54. def __init__(self, element: 'bs4.Tag') -> None:
  55. """Initialize."""
  56. self.contents = [element]
  57. def __len__(self) -> 'bs4.PageElement':
  58. """Length."""
  59. return len(self.contents)
  60. class _DocumentNav:
  61. """Navigate a Beautiful Soup document."""
  62. @classmethod
  63. def assert_valid_input(cls, tag: Any) -> None:
  64. """Check if valid input tag or document."""
  65. # Fail on unexpected types.
  66. if not cls.is_tag(tag):
  67. raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
  68. @staticmethod
  69. def is_doc(obj: 'bs4.Tag') -> bool:
  70. """Is `BeautifulSoup` object."""
  71. return isinstance(obj, bs4.BeautifulSoup)
  72. @staticmethod
  73. def is_tag(obj: 'bs4.PageElement') -> bool:
  74. """Is tag."""
  75. return isinstance(obj, bs4.Tag)
  76. @staticmethod
  77. def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover
  78. """Is declaration."""
  79. return isinstance(obj, bs4.Declaration)
  80. @staticmethod
  81. def is_cdata(obj: 'bs4.PageElement') -> bool:
  82. """Is CDATA."""
  83. return isinstance(obj, bs4.CData)
  84. @staticmethod
  85. def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover
  86. """Is processing instruction."""
  87. return isinstance(obj, bs4.ProcessingInstruction)
  88. @staticmethod
  89. def is_navigable_string(obj: 'bs4.PageElement') -> bool:
  90. """Is navigable string."""
  91. return isinstance(obj, bs4.NavigableString)
  92. @staticmethod
  93. def is_special_string(obj: 'bs4.PageElement') -> bool:
  94. """Is special string."""
  95. return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
  96. @classmethod
  97. def is_content_string(cls, obj: 'bs4.PageElement') -> bool:
  98. """Check if node is content string."""
  99. return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
  100. @staticmethod
  101. def create_fake_parent(el: 'bs4.Tag') -> _FakeParent:
  102. """Create fake parent for a given element."""
  103. return _FakeParent(el)
  104. @staticmethod
  105. def is_xml_tree(el: 'bs4.Tag') -> bool:
  106. """Check if element (or document) is from a XML tree."""
  107. return bool(el._is_xml)
  108. def is_iframe(self, el: 'bs4.Tag') -> bool:
  109. """Check if element is an `iframe`."""
  110. return bool(
  111. ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
  112. self.is_html_tag(el) # type: ignore[attr-defined]
  113. )
  114. def is_root(self, el: 'bs4.Tag') -> bool:
  115. """
  116. Return whether element is a root element.
  117. We check that the element is the root of the tree (which we have already pre-calculated),
  118. and we check if it is the root element under an `iframe`.
  119. """
  120. root = self.root and self.root is el # type: ignore[attr-defined]
  121. if not root:
  122. parent = self.get_parent(el)
  123. root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
  124. return root
  125. def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']:
  126. """Get contents or contents in reverse."""
  127. if not no_iframe or not self.is_iframe(el):
  128. for content in el.contents:
  129. yield content
  130. def get_children(
  131. self,
  132. el: 'bs4.Tag',
  133. start: Optional[int] = None,
  134. reverse: bool = False,
  135. tags: bool = True,
  136. no_iframe: bool = False
  137. ) -> Iterator['bs4.PageElement']:
  138. """Get children."""
  139. if not no_iframe or not self.is_iframe(el):
  140. last = len(el.contents) - 1
  141. if start is None:
  142. index = last if reverse else 0
  143. else:
  144. index = start
  145. end = -1 if reverse else last + 1
  146. incr = -1 if reverse else 1
  147. if 0 <= index <= last:
  148. while index != end:
  149. node = el.contents[index]
  150. index += incr
  151. if not tags or self.is_tag(node):
  152. yield node
  153. def get_descendants(
  154. self,
  155. el: 'bs4.Tag',
  156. tags: bool = True,
  157. no_iframe: bool = False
  158. ) -> Iterator['bs4.PageElement']:
  159. """Get descendants."""
  160. if not no_iframe or not self.is_iframe(el):
  161. next_good = None
  162. for child in el.descendants:
  163. if next_good is not None:
  164. if child is not next_good:
  165. continue
  166. next_good = None
  167. is_tag = self.is_tag(child)
  168. if no_iframe and is_tag and self.is_iframe(child):
  169. if child.next_sibling is not None:
  170. next_good = child.next_sibling
  171. else:
  172. last_child = child
  173. while self.is_tag(last_child) and last_child.contents:
  174. last_child = last_child.contents[-1]
  175. next_good = last_child.next_element
  176. yield child
  177. if next_good is None:
  178. break
  179. # Coverage isn't seeing this even though it's executed
  180. continue # pragma: no cover
  181. if not tags or is_tag:
  182. yield child
  183. def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag':
  184. """Get parent."""
  185. parent = el.parent
  186. if no_iframe and parent is not None and self.is_iframe(parent):
  187. parent = None
  188. return parent
  189. @staticmethod
  190. def get_tag_name(el: 'bs4.Tag') -> Optional[str]:
  191. """Get tag."""
  192. return cast(Optional[str], el.name)
  193. @staticmethod
  194. def get_prefix_name(el: 'bs4.Tag') -> Optional[str]:
  195. """Get prefix."""
  196. return cast(Optional[str], el.prefix)
  197. @staticmethod
  198. def get_uri(el: 'bs4.Tag') -> Optional[str]:
  199. """Get namespace `URI`."""
  200. return cast(Optional[str], el.namespace)
  201. @classmethod
  202. def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
  203. """Get next sibling tag."""
  204. sibling = el.next_sibling
  205. while tags and not cls.is_tag(sibling) and sibling is not None:
  206. sibling = sibling.next_sibling
  207. return sibling
  208. @classmethod
  209. def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
  210. """Get previous sibling tag."""
  211. sibling = el.previous_sibling
  212. while tags and not cls.is_tag(sibling) and sibling is not None:
  213. sibling = sibling.previous_sibling
  214. return sibling
  215. @staticmethod
  216. def has_html_ns(el: 'bs4.Tag') -> bool:
  217. """
  218. Check if element has an HTML namespace.
  219. This is a bit different than whether a element is treated as having an HTML namespace,
  220. like we do in the case of `is_html_tag`.
  221. """
  222. ns = getattr(el, 'namespace') if el else None
  223. return bool(ns and ns == NS_XHTML)
  224. @staticmethod
  225. def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]:
  226. """Return namespace and attribute name without the prefix."""
  227. return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
  228. @classmethod
  229. def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]:
  230. """Normalize the value to be a string or list of strings."""
  231. # Treat `None` as empty string.
  232. if value is None:
  233. return ''
  234. # Pass through strings
  235. if (isinstance(value, str)):
  236. return value
  237. # If it's a byte string, convert it to Unicode, treating it as UTF-8.
  238. if isinstance(value, bytes):
  239. return value.decode("utf8")
  240. # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
  241. if isinstance(value, Sequence):
  242. new_value = []
  243. for v in value:
  244. if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
  245. # This is most certainly a user error and will crash and burn later.
  246. # To keep things working, we'll do what we do with all objects,
  247. # And convert them to strings.
  248. new_value.append(str(v))
  249. else:
  250. # Convert the child to a string
  251. new_value.append(cast(str, cls.normalize_value(v)))
  252. return new_value
  253. # Try and make anything else a string
  254. return str(value)
  255. @classmethod
  256. def get_attribute_by_name(
  257. cls,
  258. el: 'bs4.Tag',
  259. name: str,
  260. default: Optional[Union[str, Sequence[str]]] = None
  261. ) -> Optional[Union[str, Sequence[str]]]:
  262. """Get attribute by name."""
  263. value = default
  264. if el._is_xml:
  265. try:
  266. value = cls.normalize_value(el.attrs[name])
  267. except KeyError:
  268. pass
  269. else:
  270. for k, v in el.attrs.items():
  271. if util.lower(k) == name:
  272. value = cls.normalize_value(v)
  273. break
  274. return value
  275. @classmethod
  276. def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]:
  277. """Iterate attributes."""
  278. for k, v in el.attrs.items():
  279. yield k, cls.normalize_value(v)
  280. @classmethod
  281. def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]:
  282. """Get classes."""
  283. classes = cls.get_attribute_by_name(el, 'class', [])
  284. if isinstance(classes, str):
  285. classes = RE_NOT_WS.findall(classes)
  286. return cast(Sequence[str], classes)
  287. def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str:
  288. """Get text."""
  289. return ''.join(
  290. [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
  291. )
  292. def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]:
  293. """Get Own Text."""
  294. return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
  295. class Inputs:
  296. """Class for parsing and validating input items."""
  297. @staticmethod
  298. def validate_day(year: int, month: int, day: int) -> bool:
  299. """Validate day."""
  300. max_days = LONG_MONTH
  301. if month == FEB:
  302. max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
  303. elif month in MONTHS_30:
  304. max_days = SHORT_MONTH
  305. return 1 <= day <= max_days
  306. @staticmethod
  307. def validate_week(year: int, week: int) -> bool:
  308. """Validate week."""
  309. max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1]
  310. if max_week == 1:
  311. max_week = 53
  312. return 1 <= week <= max_week
  313. @staticmethod
  314. def validate_month(month: int) -> bool:
  315. """Validate month."""
  316. return 1 <= month <= 12
  317. @staticmethod
  318. def validate_year(year: int) -> bool:
  319. """Validate year."""
  320. return 1 <= year
  321. @staticmethod
  322. def validate_hour(hour: int) -> bool:
  323. """Validate hour."""
  324. return 0 <= hour <= 23
  325. @staticmethod
  326. def validate_minutes(minutes: int) -> bool:
  327. """Validate minutes."""
  328. return 0 <= minutes <= 59
  329. @classmethod
  330. def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
  331. """Parse the input value."""
  332. parsed = None # type: Optional[Tuple[float, ...]]
  333. if value is None:
  334. return value
  335. if itype == "date":
  336. m = RE_DATE.match(value)
  337. if m:
  338. year = int(m.group('year'), 10)
  339. month = int(m.group('month'), 10)
  340. day = int(m.group('day'), 10)
  341. if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
  342. parsed = (year, month, day)
  343. elif itype == "month":
  344. m = RE_MONTH.match(value)
  345. if m:
  346. year = int(m.group('year'), 10)
  347. month = int(m.group('month'), 10)
  348. if cls.validate_year(year) and cls.validate_month(month):
  349. parsed = (year, month)
  350. elif itype == "week":
  351. m = RE_WEEK.match(value)
  352. if m:
  353. year = int(m.group('year'), 10)
  354. week = int(m.group('week'), 10)
  355. if cls.validate_year(year) and cls.validate_week(year, week):
  356. parsed = (year, week)
  357. elif itype == "time":
  358. m = RE_TIME.match(value)
  359. if m:
  360. hour = int(m.group('hour'), 10)
  361. minutes = int(m.group('minutes'), 10)
  362. if cls.validate_hour(hour) and cls.validate_minutes(minutes):
  363. parsed = (hour, minutes)
  364. elif itype == "datetime-local":
  365. m = RE_DATETIME.match(value)
  366. if m:
  367. year = int(m.group('year'), 10)
  368. month = int(m.group('month'), 10)
  369. day = int(m.group('day'), 10)
  370. hour = int(m.group('hour'), 10)
  371. minutes = int(m.group('minutes'), 10)
  372. if (
  373. cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
  374. cls.validate_hour(hour) and cls.validate_minutes(minutes)
  375. ):
  376. parsed = (year, month, day, hour, minutes)
  377. elif itype in ("number", "range"):
  378. m = RE_NUM.match(value)
  379. if m:
  380. parsed = (float(m.group('value')),)
  381. return parsed
  382. class CSSMatch(_DocumentNav):
  383. """Perform CSS matching."""
  384. def __init__(
  385. self,
  386. selectors: ct.SelectorList,
  387. scope: 'bs4.Tag',
  388. namespaces: Optional[ct.Namespaces],
  389. flags: int
  390. ) -> None:
  391. """Initialize."""
  392. self.assert_valid_input(scope)
  393. self.tag = scope
  394. self.cached_meta_lang = [] # type: List[Tuple[str, str]]
  395. self.cached_default_forms = [] # type: List[Tuple['bs4.Tag', 'bs4.Tag']]
  396. self.cached_indeterminate_forms = [] # type: List[Tuple['bs4.Tag', str, bool]]
  397. self.selectors = selectors
  398. self.namespaces = {} if namespaces is None else namespaces # type: Union[ct.Namespaces, Dict[str, str]]
  399. self.flags = flags
  400. self.iframe_restrict = False
  401. # Find the root element for the whole tree
  402. doc = scope
  403. parent = self.get_parent(doc)
  404. while parent:
  405. doc = parent
  406. parent = self.get_parent(doc)
  407. root = None
  408. if not self.is_doc(doc):
  409. root = doc
  410. else:
  411. for child in self.get_children(doc):
  412. root = child
  413. break
  414. self.root = root
  415. self.scope = scope if scope is not doc else root
  416. self.has_html_namespace = self.has_html_ns(root)
  417. # A document can be both XML and HTML (XHTML)
  418. self.is_xml = self.is_xml_tree(doc)
  419. self.is_html = not self.is_xml or self.has_html_namespace
  420. def supports_namespaces(self) -> bool:
  421. """Check if namespaces are supported in the HTML type."""
  422. return self.is_xml or self.has_html_namespace
  423. def get_tag_ns(self, el: 'bs4.Tag') -> str:
  424. """Get tag namespace."""
  425. if self.supports_namespaces():
  426. namespace = ''
  427. ns = self.get_uri(el)
  428. if ns:
  429. namespace = ns
  430. else:
  431. namespace = NS_XHTML
  432. return namespace
  433. def is_html_tag(self, el: 'bs4.Tag') -> bool:
  434. """Check if tag is in HTML namespace."""
  435. return self.get_tag_ns(el) == NS_XHTML
  436. def get_tag(self, el: 'bs4.Tag') -> Optional[str]:
  437. """Get tag."""
  438. name = self.get_tag_name(el)
  439. return util.lower(name) if name is not None and not self.is_xml else name
  440. def get_prefix(self, el: 'bs4.Tag') -> Optional[str]:
  441. """Get prefix."""
  442. prefix = self.get_prefix_name(el)
  443. return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
  444. def find_bidi(self, el: 'bs4.Tag') -> Optional[int]:
  445. """Get directionality from element text."""
  446. for node in self.get_children(el, tags=False):
  447. # Analyze child text nodes
  448. if self.is_tag(node):
  449. # Avoid analyzing certain elements specified in the specification.
  450. direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
  451. if (
  452. self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
  453. not self.is_html_tag(node) or
  454. direction is not None
  455. ):
  456. continue # pragma: no cover
  457. # Check directionality of this node's text
  458. value = self.find_bidi(node)
  459. if value is not None:
  460. return value
  461. # Direction could not be determined
  462. continue # pragma: no cover
  463. # Skip `doctype` comments, etc.
  464. if self.is_special_string(node):
  465. continue
  466. # Analyze text nodes for directionality.
  467. for c in node:
  468. bidi = unicodedata.bidirectional(c)
  469. if bidi in ('AL', 'R', 'L'):
  470. return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
  471. return None
  472. def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
  473. """Filter the language tags."""
  474. match = True
  475. lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
  476. ranges = lang_range.split('-')
  477. subtags = lang_tag.lower().split('-')
  478. length = len(ranges)
  479. rindex = 0
  480. sindex = 0
  481. r = ranges[rindex]
  482. s = subtags[sindex]
  483. # Primary tag needs to match
  484. if r != '*' and r != s:
  485. match = False
  486. rindex += 1
  487. sindex += 1
  488. # Match until we run out of ranges
  489. while match and rindex < length:
  490. r = ranges[rindex]
  491. try:
  492. s = subtags[sindex]
  493. except IndexError:
  494. # Ran out of subtags,
  495. # but we still have ranges
  496. match = False
  497. continue
  498. # Empty range
  499. if not r:
  500. match = False
  501. continue
  502. # Matched range
  503. elif s == r:
  504. rindex += 1
  505. # Implicit wildcard cannot match
  506. # singletons
  507. elif len(s) == 1:
  508. match = False
  509. continue
  510. # Implicitly matched, so grab next subtag
  511. sindex += 1
  512. return match
    def match_attribute_name(
        self,
        el: 'bs4.Tag',
        attr: str,
        prefix: Optional[str]
    ) -> Optional[Union[str, Sequence[str]]]:
        """
        Match attribute name and return value if it exists.

        Returns the normalized value of the first attribute of `el` that matches,
        or `None` when nothing matches.  Name comparison is case-sensitive for XML
        trees and case-insensitive otherwise.  When namespaces are supported and
        `prefix` resolves to a namespace, both the attribute's namespace and its
        local name must match; a prefix of '*' accepts any namespaced attribute.
        """

        value = None
        if self.supports_namespaces():
            value = None
            # If we have not defined namespaces, we can't very well find them, so don't bother trying.
            if prefix:
                ns = self.namespaces.get(prefix)
                # An unknown prefix can never match; '*' is allowed through even if it is not registered.
                if ns is None and prefix != '*':
                    return None
            else:
                ns = None

            for k, v in self.iter_attributes(el):

                # Get attribute parts
                namespace, name = self.split_namespace(el, k)

                # Can't match a prefix attribute as we haven't specified one to match
                # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
                if ns is None:
                    if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
                        value = v
                        break
                    # Coverage is not finding this even though it is executed.
                    # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
                    # Ignore the false positive message.
                    continue  # pragma: no cover

                # We can't match our desired prefix attribute as the attribute doesn't have a prefix
                # Note the precedence: this reads as `namespace is None or (ns != namespace and prefix != '*')`.
                if namespace is None or ns != namespace and prefix != '*':
                    continue

                # The attribute doesn't match.
                if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
                    continue

                value = v
                break
        else:
            # No namespace support: compare whole attribute names case-insensitively.
            for k, v in self.iter_attributes(el):
                if util.lower(attr) != util.lower(k):
                    continue
                value = v
                break
        return value
  558. def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
  559. """Match the namespace of the element."""
  560. match = True
  561. namespace = self.get_tag_ns(el)
  562. default_namespace = self.namespaces.get('')
  563. tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
  564. # We must match the default namespace if one is not provided
  565. if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
  566. match = False
  567. # If we specified `|tag`, we must not have a namespace.
  568. elif (tag.prefix is not None and tag.prefix == '' and namespace):
  569. match = False
  570. # Verify prefix matches
  571. elif (
  572. tag.prefix and
  573. tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
  574. ):
  575. match = False
  576. return match
  577. def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
  578. """Match attributes."""
  579. match = True
  580. if attributes:
  581. for a in attributes:
  582. temp = self.match_attribute_name(el, a.attribute, a.prefix)
  583. pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
  584. if temp is None:
  585. match = False
  586. break
  587. value = temp if isinstance(temp, str) else ' '.join(temp)
  588. if pattern is None:
  589. continue
  590. elif pattern.match(value) is None:
  591. match = False
  592. break
  593. return match
  594. def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
  595. """Match tag name."""
  596. name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
  597. return not (
  598. name is not None and
  599. name not in (self.get_tag(el), '*')
  600. )
  601. def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool:
  602. """Match the tag."""
  603. match = True
  604. if tag is not None:
  605. # Verify namespace
  606. if not self.match_namespace(el, tag):
  607. match = False
  608. if not self.match_tagname(el, tag):
  609. match = False
  610. return match
  611. def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
  612. """Match past relationship."""
  613. found = False
  614. # I don't think this can ever happen, but it makes `mypy` happy
  615. if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
  616. return found
  617. if relation[0].rel_type == REL_PARENT:
  618. parent = self.get_parent(el, no_iframe=self.iframe_restrict)
  619. while not found and parent:
  620. found = self.match_selectors(parent, relation)
  621. parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
  622. elif relation[0].rel_type == REL_CLOSE_PARENT:
  623. parent = self.get_parent(el, no_iframe=self.iframe_restrict)
  624. if parent:
  625. found = self.match_selectors(parent, relation)
  626. elif relation[0].rel_type == REL_SIBLING:
  627. sibling = self.get_previous(el)
  628. while not found and sibling:
  629. found = self.match_selectors(sibling, relation)
  630. sibling = self.get_previous(sibling)
  631. elif relation[0].rel_type == REL_CLOSE_SIBLING:
  632. sibling = self.get_previous(el)
  633. if sibling and self.is_tag(sibling):
  634. found = self.match_selectors(sibling, relation)
  635. return found
  636. def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
  637. """Match future child."""
  638. match = False
  639. if recursive:
  640. children = self.get_descendants # type: Callable[..., Iterator['bs4.Tag']]
  641. else:
  642. children = self.get_children
  643. for child in children(parent, no_iframe=self.iframe_restrict):
  644. match = self.match_selectors(child, relation)
  645. if match:
  646. break
  647. return match
  648. def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
  649. """Match future relationship."""
  650. found = False
  651. # I don't think this can ever happen, but it makes `mypy` happy
  652. if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
  653. return found
  654. if relation[0].rel_type == REL_HAS_PARENT:
  655. found = self.match_future_child(el, relation, True)
  656. elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
  657. found = self.match_future_child(el, relation)
  658. elif relation[0].rel_type == REL_HAS_SIBLING:
  659. sibling = self.get_next(el)
  660. while not found and sibling:
  661. found = self.match_selectors(sibling, relation)
  662. sibling = self.get_next(sibling)
  663. elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
  664. sibling = self.get_next(el)
  665. if sibling and self.is_tag(sibling):
  666. found = self.match_selectors(sibling, relation)
  667. return found
  668. def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
  669. """Match relationship to other elements."""
  670. found = False
  671. if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
  672. return found
  673. if relation[0].rel_type.startswith(':'):
  674. found = self.match_future_relations(el, relation)
  675. else:
  676. found = self.match_past_relations(el, relation)
  677. return found
  678. def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
  679. """Match element's ID."""
  680. found = True
  681. for i in ids:
  682. if i != self.get_attribute_by_name(el, 'id', ''):
  683. found = False
  684. break
  685. return found
  686. def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool:
  687. """Match element's classes."""
  688. current_classes = self.get_classes(el)
  689. found = True
  690. for c in classes:
  691. if c not in current_classes:
  692. found = False
  693. break
  694. return found
  695. def match_root(self, el: 'bs4.Tag') -> bool:
  696. """Match element as root."""
  697. is_root = self.is_root(el)
  698. if is_root:
  699. sibling = self.get_previous(el, tags=False)
  700. while is_root and sibling is not None:
  701. if (
  702. self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
  703. self.is_cdata(sibling)
  704. ):
  705. is_root = False
  706. else:
  707. sibling = self.get_previous(sibling, tags=False)
  708. if is_root:
  709. sibling = self.get_next(el, tags=False)
  710. while is_root and sibling is not None:
  711. if (
  712. self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
  713. self.is_cdata(sibling)
  714. ):
  715. is_root = False
  716. else:
  717. sibling = self.get_next(sibling, tags=False)
  718. return is_root
  719. def match_scope(self, el: 'bs4.Tag') -> bool:
  720. """Match element as scope."""
  721. return self.scope is el
  722. def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool:
  723. """Match tag type for `nth` matches."""
  724. return(
  725. (self.get_tag(child) == self.get_tag(el)) and
  726. (self.get_tag_ns(child) == self.get_tag_ns(el))
  727. )
    def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
        """
        Match `nth` elements.

        Every entry in `nth` must match for the element to match; an empty `nth` matches.
        For each entry, the candidate index is computed as `a * count + b` (or just `a` when
        the entry's `n` flag is falsy) and the parent's children are walked, forwards or
        backwards depending on `last`, counting only the children that qualify.

        NOTE(review): `nth` is iterated and each item is read for `.a`, `.b`, `.n`, `.last`,
        `.of_type` and `.selectors`, so the `bs4.Tag` annotation looks inaccurate - confirm
        the intended type.
        """

        matched = True

        for n in nth:
            # Each entry starts out unmatched and has to prove itself.
            matched = False
            # `of S`: the element itself has to satisfy the selector list before it is worth counting.
            if n.selectors and not self.match_selectors(el, n.selectors):
                break
            parent = self.get_parent(el)
            if parent is None:
                # No parent available, so a stand-in parent is created to iterate over.
                parent = self.create_fake_parent(el)
            last = n.last
            last_index = len(parent) - 1
            # Start from the end when counting from the last child, otherwise from the beginning.
            index = last_index if last else 0
            relative_index = 0
            a = n.a
            b = n.b
            var = n.n
            count = 0
            count_incr = 1
            # Direction in which `index` moves while walking the children.
            factor = -1 if last else 1
            idx = last_idx = a * count + b if var else a

            # We can only adjust bounds within a variable index
            if var:
                # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
                # Otherwise, increment to try to get in bounds.
                adjust = None
                while idx < 1 or idx > last_index:
                    if idx < 0:
                        diff_low = 0 - idx
                        if adjust is not None and adjust == 1:
                            break
                        adjust = -1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = 0 - idx
                        if diff >= diff_low:
                            break
                    else:
                        diff_high = idx - last_index
                        if adjust is not None and adjust == -1:
                            break
                        adjust = 1
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                        diff = idx - last_index
                        if diff >= diff_high:
                            break
                        diff_high = diff

                # If a < 0, our count is working backwards, so floor the index by increasing the count.
                # Find the count that yields the lowest, in bound value and use that.
                # Lastly reverse count increment so that we'll increase our index.
                lowest = count
                if a < 0:
                    while idx >= 1:
                        lowest = count
                        count += count_incr
                        idx = last_idx = a * count + b if var else a
                    count_incr = -1
                count = lowest
                idx = last_idx = a * count + b if var else a

            # Evaluate elements while our calculated nth index is still in range
            while 1 <= idx <= last_index + 1:
                child = None
                # Evaluate while our child index is still in range.
                for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
                    index += factor
                    # Non-tag nodes advance the raw index but are never counted.
                    if not self.is_tag(child):
                        continue
                    # Handle `of S` in `nth-child`
                    if n.selectors and not self.match_selectors(child, n.selectors):
                        continue
                    # Handle `of-type`
                    if n.of_type and not self.match_nth_tag_type(el, child):
                        continue
                    relative_index += 1
                    if relative_index == idx:
                        if child is el:
                            matched = True
                        else:
                            # Reached the target position but it is a different element; try the next index.
                            break
                    if child is el:
                        # Walked onto our element; nothing further along can change the outcome.
                        break
                if child is el:
                    break
                last_idx = idx
                count += count_incr
                if count < 0:
                    # Count is counting down and has now ventured into invalid territory.
                    break
                idx = a * count + b if var else a
                # The index stopped changing, so further iterations would only repeat.
                if last_idx == idx:
                    break
            if not matched:
                break
        return matched
  823. def match_empty(self, el: 'bs4.Tag') -> bool:
  824. """Check if element is empty (if requested)."""
  825. is_empty = True
  826. for child in self.get_children(el, tags=False):
  827. if self.is_tag(child):
  828. is_empty = False
  829. break
  830. elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
  831. is_empty = False
  832. break
  833. return is_empty
  834. def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool:
  835. """Match selectors."""
  836. match = True
  837. for sel in selectors:
  838. if not self.match_selectors(el, sel):
  839. match = False
  840. return match
  841. def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool:
  842. """Match element if it contains text."""
  843. match = True
  844. content = None # type: Optional[Union[str, Sequence[str]]]
  845. for contain_list in contains:
  846. if content is None:
  847. if contain_list.own:
  848. content = self.get_own_text(el, no_iframe=self.is_html)
  849. else:
  850. content = self.get_text(el, no_iframe=self.is_html)
  851. found = False
  852. for text in contain_list.text:
  853. if contain_list.own:
  854. for c in content:
  855. if text in c:
  856. found = True
  857. break
  858. if found:
  859. break
  860. else:
  861. if text in content:
  862. found = True
  863. break
  864. if not found:
  865. match = False
  866. return match
  867. def match_default(self, el: 'bs4.Tag') -> bool:
  868. """Match default."""
  869. match = False
  870. # Find this input's form
  871. form = None
  872. parent = self.get_parent(el, no_iframe=True)
  873. while parent and form is None:
  874. if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
  875. form = parent
  876. else:
  877. parent = self.get_parent(parent, no_iframe=True)
  878. # Look in form cache to see if we've already located its default button
  879. found_form = False
  880. for f, t in self.cached_default_forms:
  881. if f is form:
  882. found_form = True
  883. if t is el:
  884. match = True
  885. break
  886. # We didn't have the form cached, so look for its default button
  887. if not found_form:
  888. for child in self.get_descendants(form, no_iframe=True):
  889. name = self.get_tag(child)
  890. # Can't do nested forms (haven't figured out why we never hit this)
  891. if name == 'form': # pragma: no cover
  892. break
  893. if name in ('input', 'button'):
  894. v = self.get_attribute_by_name(child, 'type', '')
  895. if v and util.lower(v) == 'submit':
  896. self.cached_default_forms.append((form, child))
  897. if el is child:
  898. match = True
  899. break
  900. return match
    def match_indeterminate(self, el: 'bs4.Tag') -> bool:
        """
        Match indeterminate.

        The element matches when no *other* `input` in the same form has `type="radio"`,
        a `checked` attribute and the same `name` as `el`. Results are cached per
        `(form, name)` pair in `cached_indeterminate_forms`.
        """

        match = False
        name = cast(str, self.get_attribute_by_name(el, 'name'))

        def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']:
            """
            Find this input's form.

            Returns the closest HTML `form` ancestor, or the topmost ancestor reached
            when no form is found.
            """
            form = None
            parent = self.get_parent(el, no_iframe=True)
            while form is None:
                if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
                    form = parent
                    break
                last_parent = parent
                parent = self.get_parent(parent, no_iframe=True)
                if parent is None:
                    # Ran out of ancestors: fall back to the last one seen.
                    form = last_parent
                    break
            return form

        form = get_parent_form(el)

        # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
        found_form = False
        for f, n, i in self.cached_indeterminate_forms:
            if f is form and n == name:
                found_form = True
                if i is True:
                    match = True
                break

        # We didn't have the form cached, so validate that the radio button is indeterminate
        if not found_form:
            checked = False
            for child in self.get_descendants(form, no_iframe=True):
                # The element itself is not considered, only its fellow inputs.
                if child is el:
                    continue
                tag_name = self.get_tag(child)
                if tag_name == 'input':
                    is_radio = False
                    check = False
                    has_name = False
                    for k, v in self.iter_attributes(child):
                        if util.lower(k) == 'type' and util.lower(v) == 'radio':
                            is_radio = True
                        elif util.lower(k) == 'name' and v == name:
                            has_name = True
                        elif util.lower(k) == 'checked':
                            check = True
                        # Only counts if the checked radio resolves to the very same form (or fallback ancestor).
                        if is_radio and check and has_name and get_parent_form(child) is form:
                            checked = True
                            break
                if checked:
                    break
            if not checked:
                match = True
            # Remember the verdict for this form/name pair.
            self.cached_indeterminate_forms.append((form, name, match))

        return match
    def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool:
        """
        Match languages.

        The language is taken from the nearest `lang` (or XML-namespaced `lang`) attribute on
        the element or its ancestors. If none is found, a previously cached meta language for
        the root is used, and failing that the `head` is searched for a
        `<meta http-equiv="content-language" content="...">` tag. Every pattern group in
        `langs` must have at least one pattern accepted by `extended_language_filter`.
        """

        match = False
        has_ns = self.supports_namespaces()
        root = self.root
        has_html_namespace = self.has_html_namespace

        # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
        # NOTE(review): the loop tests truthiness, so an empty `lang` value does not stop the
        # walk and the search continues with the parent - confirm this is intended.
        parent = el
        found_lang = None
        last = None
        while not found_lang:
            has_html_ns = self.has_html_ns(parent)
            for k, v in self.iter_attributes(parent):
                attr_ns, attr = self.split_namespace(parent, k)
                if (
                    ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
                    (
                        has_ns and not has_html_ns and attr_ns == NS_XML and
                        (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
                    )
                ):
                    found_lang = v
                    break
            last = parent
            parent = self.get_parent(parent, no_iframe=self.is_html)

            if parent is None:
                # Ran out of ancestors: treat the last node seen as the root from here on.
                root = last
                has_html_namespace = self.has_html_ns(root)
                parent = last
                break

        # Use cached meta language.
        if not found_lang and self.cached_meta_lang:
            for cache in self.cached_meta_lang:
                if root is cache[0]:
                    found_lang = cache[1]

        # If we couldn't find a language, and the document is HTML, look to meta to determine language.
        if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
            # Find head
            found = False
            for tag in ('html', 'head'):
                found = False
                for child in self.get_children(parent, no_iframe=self.is_html):
                    if self.get_tag(child) == tag and self.is_html_tag(child):
                        found = True
                        parent = child
                        break
                if not found:  # pragma: no cover
                    break

            # Search meta tags
            if found:
                for child in parent:
                    if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
                        c_lang = False
                        content = None
                        for k, v in self.iter_attributes(child):
                            if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
                                c_lang = True
                            if util.lower(k) == 'content':
                                content = v
                            if c_lang and content:
                                found_lang = content
                                self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
                                break
                    if found_lang:
                        break
                if not found_lang:
                    # Cache the miss as an empty string so the meta search is not repeated for this root.
                    self.cached_meta_lang.append((cast(str, root), ''))

        # If we determined a language, compare.
        if found_lang:
            for patterns in langs:
                match = False
                for pattern in patterns:
                    if self.extended_language_filter(pattern, cast(str, found_lang)):
                        match = True
                if not match:
                    break

        return match
  1032. def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool:
  1033. """Check directionality."""
  1034. # If we have to match both left and right, we can't match either.
  1035. if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
  1036. return False
  1037. if el is None or not self.is_html_tag(el):
  1038. return False
  1039. # Element has defined direction of left to right or right to left
  1040. direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
  1041. if direction not in (None, 0):
  1042. return direction == directionality
  1043. # Element is the document element (the root) and no direction assigned, assume left to right.
  1044. is_root = self.is_root(el)
  1045. if is_root and direction is None:
  1046. return ct.SEL_DIR_LTR == directionality
  1047. # If `input[type=telephone]` and no direction is assigned, assume left to right.
  1048. name = self.get_tag(el)
  1049. is_input = name == 'input'
  1050. is_textarea = name == 'textarea'
  1051. is_bdi = name == 'bdi'
  1052. itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
  1053. if is_input and itype == 'tel' and direction is None:
  1054. return ct.SEL_DIR_LTR == directionality
  1055. # Auto handling for text inputs
  1056. if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
  1057. if is_textarea:
  1058. temp = []
  1059. for node in self.get_contents(el, no_iframe=True):
  1060. if self.is_content_string(node):
  1061. temp.append(node)
  1062. value = ''.join(temp)
  1063. else:
  1064. value = cast(str, self.get_attribute_by_name(el, 'value', ''))
  1065. if value:
  1066. for c in value:
  1067. bidi = unicodedata.bidirectional(c)
  1068. if bidi in ('AL', 'R', 'L'):
  1069. direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
  1070. return direction == directionality
  1071. # Assume left to right
  1072. return ct.SEL_DIR_LTR == directionality
  1073. elif is_root:
  1074. return ct.SEL_DIR_LTR == directionality
  1075. return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
  1076. # Auto handling for `bdi` and other non text inputs.
  1077. if (is_bdi and direction is None) or direction == 0:
  1078. direction = self.find_bidi(el)
  1079. if direction is not None:
  1080. return direction == directionality
  1081. elif is_root:
  1082. return ct.SEL_DIR_LTR == directionality
  1083. return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
  1084. # Match parents direction
  1085. return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
  1086. def match_range(self, el: 'bs4.Tag', condition: int) -> bool:
  1087. """
  1088. Match range.
  1089. Behavior is modeled after what we see in browsers. Browsers seem to evaluate
  1090. if the value is out of range, and if not, it is in range. So a missing value
  1091. will not evaluate out of range; therefore, value is in range. Personally, I
  1092. feel like this should evaluate as neither in or out of range.
  1093. """
  1094. out_of_range = False
  1095. itype = util.lower(self.get_attribute_by_name(el, 'type'))
  1096. mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
  1097. mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
  1098. # There is no valid min or max, so we cannot evaluate a range
  1099. if mn is None and mx is None:
  1100. return False
  1101. value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
  1102. if value is not None:
  1103. if itype in ("date", "datetime-local", "month", "week", "number", "range"):
  1104. if mn is not None and value < mn:
  1105. out_of_range = True
  1106. if not out_of_range and mx is not None and value > mx:
  1107. out_of_range = True
  1108. elif itype == "time":
  1109. if mn is not None and mx is not None and mn > mx:
  1110. # Time is periodic, so this is a reversed/discontinuous range
  1111. if value < mn and value > mx:
  1112. out_of_range = True
  1113. else:
  1114. if mn is not None and value < mn:
  1115. out_of_range = True
  1116. if not out_of_range and mx is not None and value > mx:
  1117. out_of_range = True
  1118. return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
  1119. def match_defined(self, el: 'bs4.Tag') -> bool:
  1120. """
  1121. Match defined.
  1122. `:defined` is related to custom elements in a browser.
  1123. - If the document is XML (not XHTML), all tags will match.
  1124. - Tags that are not custom (don't have a hyphen) are marked defined.
  1125. - If the tag has a prefix (without or without a namespace), it will not match.
  1126. This is of course requires the parser to provide us with the proper prefix and namespace info,
  1127. if it doesn't, there is nothing we can do.
  1128. """
  1129. name = self.get_tag(el)
  1130. return (
  1131. name is not None and (
  1132. name.find('-') == -1 or
  1133. name.find(':') != -1 or
  1134. self.get_prefix(el) is not None
  1135. )
  1136. )
  1137. def match_placeholder_shown(self, el: 'bs4.Tag') -> bool:
  1138. """
  1139. Match placeholder shown according to HTML spec.
  1140. - text area should be checked if they have content. A single newline does not count as content.
  1141. """
  1142. match = False
  1143. content = self.get_text(el)
  1144. if content in ('', '\n'):
  1145. match = True
  1146. return match
    def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool:
        """
        Check if element matches one of the selectors.

        Each selector in the list is tried in turn; the first one whose checks all pass decides
        the result. When the list's `is_not` flag is set the outcome is inverted: a selector
        that passes yields `False`, one that fails leaves the running result at `True`.

        NOTE(review): `namespaces` and `iframe_restrict` are swapped out for HTML-flagged lists
        and restored at the end without a `try`/`finally`, so an exception raised by a check
        leaves the temporary values in place - confirm whether that matters to callers.
        """

        match = False
        is_not = selectors.is_not
        is_html = selectors.is_html

        # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
        if is_html:
            namespaces = self.namespaces
            iframe_restrict = self.iframe_restrict
            self.namespaces = {'html': NS_XHTML}
            self.iframe_restrict = True

        # HTML-only selector lists are skipped entirely when this matcher is not in HTML mode.
        if not is_html or self.is_html:
            for selector in selectors:
                # Assume failure for this selector; for a `:not()` list a failure means a match.
                match = is_not
                # We have a un-matchable situation (like `:focus` as you cannot focus an element in this environment)
                if isinstance(selector, ct.SelectorNull):
                    continue
                # Verify tag matches
                if not self.match_tag(el, selector.tag):
                    continue
                # Verify tag is defined
                if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
                    continue
                # Verify element is root
                if selector.flags & ct.SEL_ROOT and not self.match_root(el):
                    continue
                # Verify element is scope
                if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
                    continue
                # Verify element has placeholder shown
                if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
                    continue
                # Verify `nth` matches
                if not self.match_nth(el, selector.nth):
                    continue
                # Verify element is empty
                if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
                    continue
                # Verify id matches
                if selector.ids and not self.match_id(el, selector.ids):
                    continue
                # Verify classes match
                if selector.classes and not self.match_classes(el, selector.classes):
                    continue
                # Verify attribute(s) match
                if not self.match_attributes(el, selector.attributes):
                    continue
                # Verify ranges
                if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
                    continue
                # Verify language patterns
                if selector.lang and not self.match_lang(el, selector.lang):
                    continue
                # Verify pseudo selector patterns
                if selector.selectors and not self.match_subselectors(el, selector.selectors):
                    continue
                # Verify relationship selectors
                if selector.relation and not self.match_relations(el, selector.relation):
                    continue
                # Validate that the current default selector match corresponds to the first submit button in the form
                if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
                    continue
                # Validate that the unset radio button is among radio buttons with the same name in a form that are
                # also not set.
                if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
                    continue
                # Validate element directionality
                if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
                    continue
                # Validate that the tag contains the specified text.
                if selector.contains and not self.match_contains(el, selector.contains):
                    continue
                # Every check passed: this selector decides the result, no need to try the rest.
                match = not is_not
                break

        # Restore actual namespaces being used for external selector lists
        if is_html:
            self.namespaces = namespaces
            self.iframe_restrict = iframe_restrict

        return match
  1225. def select(self, limit: int = 0) -> Iterator['bs4.Tag']:
  1226. """Match all tags under the targeted tag."""
  1227. lim = None if limit < 1 else limit
  1228. for child in self.get_descendants(self.tag):
  1229. if self.match(child):
  1230. yield child
  1231. if lim is not None:
  1232. lim -= 1
  1233. if lim < 1:
  1234. break
  1235. def closest(self) -> Optional['bs4.Tag']:
  1236. """Match closest ancestor."""
  1237. current = self.tag
  1238. closest = None
  1239. while closest is None and current is not None:
  1240. if self.match(current):
  1241. closest = current
  1242. else:
  1243. current = self.get_parent(current)
  1244. return closest
  1245. def filter(self) -> List['bs4.Tag']: # noqa A001
  1246. """Filter tag's children."""
  1247. return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
  1248. def match(self, el: 'bs4.Tag') -> bool:
  1249. """Match."""
  1250. return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
  1251. class SoupSieve(ct.Immutable):
  1252. """Compiled Soup Sieve selector matching object."""
  1253. pattern: str
  1254. selectors: ct.SelectorList
  1255. namespaces: Optional[ct.Namespaces]
  1256. custom: Dict[str, str]
  1257. flags: int
  1258. __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
  1259. def __init__(
  1260. self,
  1261. pattern: str,
  1262. selectors: ct.SelectorList,
  1263. namespaces: Optional[ct.Namespaces],
  1264. custom: Optional[ct.CustomSelectors],
  1265. flags: int
  1266. ):
  1267. """Initialize."""
  1268. super().__init__(
  1269. pattern=pattern,
  1270. selectors=selectors,
  1271. namespaces=namespaces,
  1272. custom=custom,
  1273. flags=flags
  1274. )
  1275. def match(self, tag: 'bs4.Tag') -> bool:
  1276. """Match."""
  1277. return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
  1278. def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag':
  1279. """Match closest ancestor."""
  1280. return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
  1281. def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001
  1282. """
  1283. Filter.
  1284. `CSSMatch` can cache certain searches for tags of the same document,
  1285. so if we are given a tag, all tags are from the same document,
  1286. and we can take advantage of the optimization.
  1287. Any other kind of iterable could have tags from different documents or detached tags,
  1288. so for those, we use a new `CSSMatch` for each item in the iterable.
  1289. """
  1290. if CSSMatch.is_tag(iterable):
  1291. return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
  1292. else:
  1293. return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
  1294. def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag':
  1295. """Select a single tag."""
  1296. tags = self.select(tag, limit=1)
  1297. return tags[0] if tags else None
  1298. def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']:
  1299. """Select the specified tags."""
  1300. return list(self.iselect(tag, limit))
  1301. def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']:
  1302. """Iterate the specified tags."""
  1303. for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
  1304. yield el
  1305. def __repr__(self) -> str: # pragma: no cover
  1306. """Representation."""
  1307. return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
  1308. self.pattern,
  1309. self.namespaces,
  1310. self.custom,
  1311. self.flags
  1312. )
  1313. __str__ = __repr__
# Register `SoupSieve` through `ct.pickle_register` - presumably so that compiled
# selector objects can be pickled; confirm against the `ct` module.
ct.pickle_register(SoupSieve)