# Source listing: css_parser.py (47 KB)
  1. """CSS selector parser."""
  2. import re
  3. from functools import lru_cache
  4. from . import util
  5. from . import css_match as cm
  6. from . import css_types as ct
  7. from .util import SelectorSyntaxError
  8. import warnings
  9. from typing import Optional, Dict, Match, Tuple, Type, Any, List, Union, Iterator, cast
  10. UNICODE_REPLACEMENT_CHAR = 0xFFFD
  11. # Simple pseudo classes that take no parameters
  12. PSEUDO_SIMPLE = {
  13. ":any-link",
  14. ":empty",
  15. ":first-child",
  16. ":first-of-type",
  17. ":in-range",
  18. ":out-of-range",
  19. ":last-child",
  20. ":last-of-type",
  21. ":link",
  22. ":only-child",
  23. ":only-of-type",
  24. ":root",
  25. ':checked',
  26. ':default',
  27. ':disabled',
  28. ':enabled',
  29. ':indeterminate',
  30. ':optional',
  31. ':placeholder-shown',
  32. ':read-only',
  33. ':read-write',
  34. ':required',
  35. ':scope',
  36. ':defined'
  37. }
  38. # Supported, simple pseudo classes that match nothing in the Soup Sieve environment
  39. PSEUDO_SIMPLE_NO_MATCH = {
  40. ':active',
  41. ':current',
  42. ':focus',
  43. ':focus-visible',
  44. ':focus-within',
  45. ':future',
  46. ':host',
  47. ':hover',
  48. ':local-link',
  49. ':past',
  50. ':paused',
  51. ':playing',
  52. ':target',
  53. ':target-within',
  54. ':user-invalid',
  55. ':visited'
  56. }
  57. # Complex pseudo classes that take selector lists
  58. PSEUDO_COMPLEX = {
  59. ':contains',
  60. ':-soup-contains',
  61. ':-soup-contains-own',
  62. ':has',
  63. ':is',
  64. ':matches',
  65. ':not',
  66. ':where'
  67. }
  68. PSEUDO_COMPLEX_NO_MATCH = {
  69. ':current',
  70. ':host',
  71. ':host-context'
  72. }
  73. # Complex pseudo classes that take very specific parameters and are handled special
  74. PSEUDO_SPECIAL = {
  75. ':dir',
  76. ':lang',
  77. ':nth-child',
  78. ':nth-last-child',
  79. ':nth-last-of-type',
  80. ':nth-of-type'
  81. }
  82. PSEUDO_SUPPORTED = PSEUDO_SIMPLE | PSEUDO_SIMPLE_NO_MATCH | PSEUDO_COMPLEX | PSEUDO_COMPLEX_NO_MATCH | PSEUDO_SPECIAL
# Sub-patterns parts
# These are plain strings that get spliced into larger patterns with `str.format`,
# which is why literal regex braces are doubled (`{{1,6}}`).

# Whitespace
# A newline is `\r\n` treated as a single unit, or a lone `\n`, `\f` or `\r`.
NEWLINE = r'(?:\r\n|(?!\r\n)[\n\f\r])'
WS = r'(?:[ \t]|{})'.format(NEWLINE)
# Comments
COMMENTS = r'(?:/\*[^*]*\*+(?:[^/*][^*]*\*+)*/)'
# Whitespace with comments included
WSC = r'(?:{ws}|{comments})'.format(ws=WS, comments=COMMENTS)
# CSS escapes
# A backslash followed by 1-6 hex digits (plus one optional whitespace), by any
# character that is not a newline, or by the end of the input.
CSS_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$))'.format(ws=WS)
# Same as above, but a backslash may also be followed by a newline.
CSS_STRING_ESCAPES = r'(?:\\(?:[a-f0-9]{{1,6}}{ws}?|[^\r\n\f]|$|{nl}))'.format(ws=WS, nl=NEWLINE)
# CSS Identifier
# Written across several lines, so it is only usable in patterns compiled with `re.X`.
IDENTIFIER = r'''
(?:(?:-?(?:[^\x00-\x2f\x30-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})+|--)
(?:[^\x00-\x2c\x2e\x2f\x3A-\x40\x5B-\x5E\x60\x7B-\x9f]|{esc})*)
'''.format(esc=CSS_ESCAPES)
# `nth` content
# Non-capturing counterpart of `RE_NTH` below (`an+b` style expressions).
NTH = r'(?:[-+])?(?:[0-9]+n?|n)(?:(?<=n){ws}*(?:[-+]){ws}*(?:[0-9]+))?'.format(ws=WSC)
# Value: quoted string or identifier
VALUE = r'''
(?:"(?:\\(?:.|{nl})|[^\\"\r\n\f]+)*?"|'(?:\\(?:.|{nl})|[^\\'\r\n\f]+)*?'|{ident}+)
'''.format(nl=NEWLINE, ident=IDENTIFIER)
# Attribute value comparison. `!=` is handled special as it is non-standard.
# Captures `cmp` (operator), `value` and the optional `case` flag (`i` or `s`),
# then consumes the closing `]`.
ATTR = r'''
(?:{ws}*(?P<cmp>[!~^|*$]?=){ws}*(?P<value>{value})(?:{ws}+(?P<case>[is]))?)?{ws}*\]
'''.format(ws=WSC, value=VALUE)

# Selector patterns
# IDs (`#id`)
PAT_ID = r'\#{ident}'.format(ident=IDENTIFIER)
# Classes (`.class`)
PAT_CLASS = r'\.{ident}'.format(ident=IDENTIFIER)
# Prefix:Tag (`prefix|tag`)
# `tag_ns` keeps the trailing `|`; callers strip it.
PAT_TAG = r'(?P<tag_ns>(?:{ident}|\*)?\|)?(?P<tag_name>{ident}|\*)'.format(ident=IDENTIFIER)
# Attributes (`[attr]`, `[attr=value]`, etc.)
# `attr_ns` keeps the trailing `|`; callers strip it.
PAT_ATTR = r'''
\[{ws}*(?P<attr_ns>(?:{ident}|\*)?\|)?(?P<attr_name>{ident}){attr}
'''.format(ws=WSC, ident=IDENTIFIER, attr=ATTR)
# Pseudo class (`:pseudo-class`, `:pseudo-class(`)
# The `open` group is only set when an opening parenthesis follows the name.
PAT_PSEUDO_CLASS = r'(?P<name>:{ident})(?P<open>\({ws}*)?'.format(ws=WSC, ident=IDENTIFIER)
# Pseudo class special patterns. Matches `:pseudo-class(` for special case pseudo classes.
PAT_PSEUDO_CLASS_SPECIAL = r'(?P<name>:{ident})(?P<open>\({ws}*)'.format(ws=WSC, ident=IDENTIFIER)
# Custom pseudo class (`:--custom-pseudo`)
PAT_PSEUDO_CLASS_CUSTOM = r'(?P<name>:(?=--){ident})'.format(ident=IDENTIFIER)
# Closing pseudo group (`)`)
PAT_PSEUDO_CLOSE = r'{ws}*\)'.format(ws=WSC)
# Pseudo element (`::pseudo-element`)
PAT_PSEUDO_ELEMENT = r':{}'.format(PAT_PSEUDO_CLASS)
# At rule (`@page`, etc.) (not supported)
# NOTE(review): the literal `P` after `@` means that, once compiled case-insensitively,
# only names starting with `p` (such as `@page`) can match; `@media` cannot. Confirm
# whether `@{ident}` was intended before changing it.
PAT_AT_RULE = r'@P{ident}'.format(ident=IDENTIFIER)
# Pseudo class `nth-child` (`:nth-child(an+b [of S]?)`, `:first-child`, etc.)
# Ends either at the closing `)` or right after the `of` keyword (group `of`), in
# which case the selector list that follows still has to be parsed by the caller.
PAT_PSEUDO_NTH_CHILD = r'''
(?P<pseudo_nth_child>{name}
(?P<nth_child>{nth}|even|odd))(?:{wsc}*\)|(?P<of>{comments}*{ws}{wsc}*of{comments}*{ws}{wsc}*))
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, wsc=WSC, comments=COMMENTS, ws=WS, nth=NTH)
# Pseudo class `nth-of-type` (`:nth-of-type(an+b)`, `:first-of-type`, etc.)
PAT_PSEUDO_NTH_TYPE = r'''
(?P<pseudo_nth_type>{name}
(?P<nth_type>{nth}|even|odd)){ws}*\)
'''.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, nth=NTH)
# Pseudo class language (`:lang("*-de", en)`)
PAT_PSEUDO_LANG = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)
# Pseudo class direction (`:dir(ltr)`)
PAT_PSEUDO_DIR = r'{name}(?P<dir>ltr|rtl){ws}*\)'.format(name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC)
# Combining characters (`>`, `~`, ` `, `+`, `,`)
# A bare whitespace only counts as the relation when no explicit combinator follows it.
PAT_COMBINE = r'{wsc}*?(?P<relation>[,+>~]|{ws}(?![,+>~])){wsc}*'.format(ws=WS, wsc=WSC)
# Extra: Contains (`:contains(text)`)
# Same shape as `PAT_PSEUDO_LANG`: one or more comma separated values.
PAT_PSEUDO_CONTAINS = r'{name}(?P<values>{value}(?:{ws}*,{ws}*{value})*){ws}*\)'.format(
    name=PAT_PSEUDO_CLASS_SPECIAL, ws=WSC, value=VALUE
)

# Regular expressions
# CSS escape pattern
# Group 1: hex escape, group 2: escaped character, group 3: backslash at end of input.
# NOTE(review): this uses `WSC` (whitespace or comment) after the hex digits while
# `CSS_ESCAPES` uses plain `WS`; confirm the difference is intentional.
RE_CSS_ESC = re.compile(r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$))'.format(ws=WSC), re.I)
# String variant; group 4 additionally matches an escaped newline.
RE_CSS_STR_ESC = re.compile(
    r'(?:(\\[a-f0-9]{{1,6}}{ws}?)|(\\[^\r\n\f])|(\\$)|(\\{nl}))'.format(ws=WS, nl=NEWLINE), re.I
)
# Pattern to break up `nth` specifiers
# Groups: `s1` (sign of `a`), `a` (digits with optional `n`, or bare `n`),
# `s2` (sign of `b`) and `b` (digits).
RE_NTH = re.compile(
    r'(?P<s1>[-+])?(?P<a>[0-9]+n?|n)(?:(?<=n){ws}*(?P<s2>[-+]){ws}*(?P<b>[0-9]+))?'.format(ws=WSC),
    re.I
)
# Pattern to iterate multiple values.
# Each match is either a `value` or a `split` (comma with surrounding whitespace).
RE_VALUES = re.compile(r'(?:(?P<value>{value})|(?P<split>{ws}*,{ws}*))'.format(ws=WSC, value=VALUE), re.X)
# Whitespace checks
RE_WS = re.compile(WS)
RE_WS_BEGIN = re.compile('^{}*'.format(WSC))
RE_WS_END = re.compile('{}*$'.format(WSC))
# Matches a string that consists solely of a custom pseudo-class name (`:--name`).
RE_CUSTOM = re.compile(r'^{}$'.format(PAT_PSEUDO_CLASS_CUSTOM), re.X)
  172. # Constants
  173. # List split token
  174. COMMA_COMBINATOR = ','
  175. # Relation token for descendant
  176. WS_COMBINATOR = " "
  177. # Parse flags
  178. FLG_PSEUDO = 0x01
  179. FLG_NOT = 0x02
  180. FLG_RELATIVE = 0x04
  181. FLG_DEFAULT = 0x08
  182. FLG_HTML = 0x10
  183. FLG_INDETERMINATE = 0x20
  184. FLG_OPEN = 0x40
  185. FLG_IN_RANGE = 0x80
  186. FLG_OUT_OF_RANGE = 0x100
  187. FLG_PLACEHOLDER_SHOWN = 0x200
  188. FLG_FORGIVE = 0x400
  189. # Maximum cached patterns to store
  190. _MAXCACHE = 500
  191. @lru_cache(maxsize=_MAXCACHE)
  192. def _cached_css_compile(
  193. pattern: str,
  194. namespaces: Optional[ct.Namespaces],
  195. custom: Optional[ct.CustomSelectors],
  196. flags: int
  197. ) -> cm.SoupSieve:
  198. """Cached CSS compile."""
  199. custom_selectors = process_custom(custom)
  200. return cm.SoupSieve(
  201. pattern,
  202. CSSParser(
  203. pattern,
  204. custom=custom_selectors,
  205. flags=flags
  206. ).process_selectors(),
  207. namespaces,
  208. custom,
  209. flags
  210. )
  211. def _purge_cache() -> None:
  212. """Purge the cache."""
  213. _cached_css_compile.cache_clear()
  214. def process_custom(custom: Optional[ct.CustomSelectors]) -> Dict[str, Union[str, ct.SelectorList]]:
  215. """Process custom."""
  216. custom_selectors = {}
  217. if custom is not None:
  218. for key, value in custom.items():
  219. name = util.lower(key)
  220. if RE_CUSTOM.match(name) is None:
  221. raise SelectorSyntaxError("The name '{}' is not a valid custom pseudo-class name".format(name))
  222. if name in custom_selectors:
  223. raise KeyError("The custom selector '{}' has already been registered".format(name))
  224. custom_selectors[css_unescape(name)] = value
  225. return custom_selectors
  226. def css_unescape(content: str, string: bool = False) -> str:
  227. """
  228. Unescape CSS value.
  229. Strings allow for spanning the value on multiple strings by escaping a new line.
  230. """
  231. def replace(m: Match[str]) -> str:
  232. """Replace with the appropriate substitute."""
  233. if m.group(1):
  234. codepoint = int(m.group(1)[1:], 16)
  235. if codepoint == 0:
  236. codepoint = UNICODE_REPLACEMENT_CHAR
  237. value = chr(codepoint)
  238. elif m.group(2):
  239. value = m.group(2)[1:]
  240. elif m.group(3):
  241. value = '\ufffd'
  242. else:
  243. value = ''
  244. return value
  245. return (RE_CSS_ESC if not string else RE_CSS_STR_ESC).sub(replace, content)
  246. def escape(ident: str) -> str:
  247. """Escape identifier."""
  248. string = []
  249. length = len(ident)
  250. start_dash = length > 0 and ident[0] == '-'
  251. if length == 1 and start_dash:
  252. # Need to escape identifier that is a single `-` with no other characters
  253. string.append('\\{}'.format(ident))
  254. else:
  255. for index, c in enumerate(ident):
  256. codepoint = ord(c)
  257. if codepoint == 0x00:
  258. string.append('\ufffd')
  259. elif (0x01 <= codepoint <= 0x1F) or codepoint == 0x7F:
  260. string.append('\\{:x} '.format(codepoint))
  261. elif (index == 0 or (start_dash and index == 1)) and (0x30 <= codepoint <= 0x39):
  262. string.append('\\{:x} '.format(codepoint))
  263. elif (
  264. codepoint in (0x2D, 0x5F) or codepoint >= 0x80 or (0x30 <= codepoint <= 0x39) or
  265. (0x30 <= codepoint <= 0x39) or (0x41 <= codepoint <= 0x5A) or (0x61 <= codepoint <= 0x7A)
  266. ):
  267. string.append(c)
  268. else:
  269. string.append('\\{}'.format(c))
  270. return ''.join(string)
  271. class SelectorPattern:
  272. """Selector pattern."""
  273. def __init__(self, name: str, pattern: str) -> None:
  274. """Initialize."""
  275. self.name = name
  276. self.re_pattern = re.compile(pattern, re.I | re.X | re.U)
  277. def get_name(self) -> str:
  278. """Get name."""
  279. return self.name
  280. def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
  281. """Match the selector."""
  282. return self.re_pattern.match(selector, index)
  283. class SpecialPseudoPattern(SelectorPattern):
  284. """Selector pattern."""
  285. def __init__(self, patterns: Tuple[Tuple[str, Tuple[str, ...], str, Type[SelectorPattern]], ...]) -> None:
  286. """Initialize."""
  287. self.patterns = {}
  288. for p in patterns:
  289. name = p[0]
  290. pattern = p[3](name, p[2])
  291. for pseudo in p[1]:
  292. self.patterns[pseudo] = pattern
  293. self.matched_name = None # type: Optional[SelectorPattern]
  294. self.re_pseudo_name = re.compile(PAT_PSEUDO_CLASS_SPECIAL, re.I | re.X | re.U)
  295. def get_name(self) -> str:
  296. """Get name."""
  297. return '' if self.matched_name is None else self.matched_name.get_name()
  298. def match(self, selector: str, index: int, flags: int) -> Optional[Match[str]]:
  299. """Match the selector."""
  300. pseudo = None
  301. m = self.re_pseudo_name.match(selector, index)
  302. if m:
  303. name = util.lower(css_unescape(m.group('name')))
  304. pattern = self.patterns.get(name)
  305. if pattern:
  306. pseudo = pattern.match(selector, index, flags)
  307. if pseudo:
  308. self.matched_name = pattern
  309. return pseudo
  310. class _Selector:
  311. """
  312. Intermediate selector class.
  313. This stores selector data for a compound selector as we are acquiring them.
  314. Once we are done collecting the data for a compound selector, we freeze
  315. the data in an object that can be pickled and hashed.
  316. """
  317. def __init__(self, **kwargs: Any) -> None:
  318. """Initialize."""
  319. self.tag = kwargs.get('tag', None) # type: Optional[ct.SelectorTag]
  320. self.ids = kwargs.get('ids', []) # type: List[str]
  321. self.classes = kwargs.get('classes', []) # type: List[str]
  322. self.attributes = kwargs.get('attributes', []) # type: List[ct.SelectorAttribute]
  323. self.nth = kwargs.get('nth', []) # type: List[ct.SelectorNth]
  324. self.selectors = kwargs.get('selectors', []) # type: List[ct.SelectorList]
  325. self.relations = kwargs.get('relations', []) # type: List[_Selector]
  326. self.rel_type = kwargs.get('rel_type', None) # type: Optional[str]
  327. self.contains = kwargs.get('contains', []) # type: List[ct.SelectorContains]
  328. self.lang = kwargs.get('lang', []) # type: List[ct.SelectorLang]
  329. self.flags = kwargs.get('flags', 0) # type: int
  330. self.no_match = kwargs.get('no_match', False) # type: bool
  331. def _freeze_relations(self, relations: List['_Selector']) -> ct.SelectorList:
  332. """Freeze relation."""
  333. if relations:
  334. sel = relations[0]
  335. sel.relations.extend(relations[1:])
  336. return ct.SelectorList([sel.freeze()])
  337. else:
  338. return ct.SelectorList()
  339. def freeze(self) -> Union[ct.Selector, ct.SelectorNull]:
  340. """Freeze self."""
  341. if self.no_match:
  342. return ct.SelectorNull()
  343. else:
  344. return ct.Selector(
  345. self.tag,
  346. tuple(self.ids),
  347. tuple(self.classes),
  348. tuple(self.attributes),
  349. tuple(self.nth),
  350. tuple(self.selectors),
  351. self._freeze_relations(self.relations),
  352. self.rel_type,
  353. tuple(self.contains),
  354. tuple(self.lang),
  355. self.flags
  356. )
  357. def __str__(self) -> str: # pragma: no cover
  358. """String representation."""
  359. return (
  360. '_Selector(tag={!r}, ids={!r}, classes={!r}, attributes={!r}, nth={!r}, selectors={!r}, '
  361. 'relations={!r}, rel_type={!r}, contains={!r}, lang={!r}, flags={!r}, no_match={!r})'
  362. ).format(
  363. self.tag, self.ids, self.classes, self.attributes, self.nth, self.selectors,
  364. self.relations, self.rel_type, self.contains, self.lang, self.flags, self.no_match
  365. )
  366. __repr__ = __str__
class CSSParser:
    """Parse CSS selectors."""

    # Token patterns known to the parser. Each entry pairs a token name with the
    # regular expression that recognizes it.
    # NOTE(review): presumably the tokenizer tries these in the listed order, so more
    # specific patterns must stay ahead of more general ones (e.g. `pseudo_class_custom`
    # before `pseudo_class`); verify against the code that iterates `css_tokens`.
    css_tokens = (
        SelectorPattern("pseudo_close", PAT_PSEUDO_CLOSE),
        # Special pseudo-classes are dispatched by name to their dedicated pattern.
        SpecialPseudoPattern(
            (
                (
                    "pseudo_contains",
                    (':contains', ':-soup-contains', ':-soup-contains-own'),
                    PAT_PSEUDO_CONTAINS,
                    SelectorPattern
                ),
                ("pseudo_nth_child", (':nth-child', ':nth-last-child'), PAT_PSEUDO_NTH_CHILD, SelectorPattern),
                ("pseudo_nth_type", (':nth-of-type', ':nth-last-of-type'), PAT_PSEUDO_NTH_TYPE, SelectorPattern),
                ("pseudo_lang", (':lang',), PAT_PSEUDO_LANG, SelectorPattern),
                ("pseudo_dir", (':dir',), PAT_PSEUDO_DIR, SelectorPattern)
            )
        ),
        SelectorPattern("pseudo_class_custom", PAT_PSEUDO_CLASS_CUSTOM),
        SelectorPattern("pseudo_class", PAT_PSEUDO_CLASS),
        SelectorPattern("pseudo_element", PAT_PSEUDO_ELEMENT),
        SelectorPattern("at_rule", PAT_AT_RULE),
        SelectorPattern("id", PAT_ID),
        SelectorPattern("class", PAT_CLASS),
        SelectorPattern("tag", PAT_TAG),
        SelectorPattern("attribute", PAT_ATTR),
        SelectorPattern("combine", PAT_COMBINE)
    )
  395. def __init__(
  396. self,
  397. selector: str,
  398. custom: Optional[Dict[str, Union[str, ct.SelectorList]]] = None,
  399. flags: int = 0
  400. ) -> None:
  401. """Initialize."""
  402. self.pattern = selector.replace('\x00', '\ufffd')
  403. self.flags = flags
  404. self.debug = self.flags & util.DEBUG
  405. self.custom = {} if custom is None else custom
  406. def parse_attribute_selector(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  407. """Create attribute selector from the returned regex match."""
  408. inverse = False
  409. op = m.group('cmp')
  410. case = util.lower(m.group('case')) if m.group('case') else None
  411. ns = css_unescape(m.group('attr_ns')[:-1]) if m.group('attr_ns') else ''
  412. attr = css_unescape(m.group('attr_name'))
  413. is_type = False
  414. pattern2 = None
  415. value = ''
  416. if case:
  417. flags = (re.I if case == 'i' else 0) | re.DOTALL
  418. elif util.lower(attr) == 'type':
  419. flags = re.I | re.DOTALL
  420. is_type = True
  421. else:
  422. flags = re.DOTALL
  423. if op:
  424. if m.group('value').startswith(('"', "'")):
  425. value = css_unescape(m.group('value')[1:-1], True)
  426. else:
  427. value = css_unescape(m.group('value'))
  428. if not op:
  429. # Attribute name
  430. pattern = None
  431. elif op.startswith('^'):
  432. # Value start with
  433. pattern = re.compile(r'^%s.*' % re.escape(value), flags)
  434. elif op.startswith('$'):
  435. # Value ends with
  436. pattern = re.compile(r'.*?%s$' % re.escape(value), flags)
  437. elif op.startswith('*'):
  438. # Value contains
  439. pattern = re.compile(r'.*?%s.*' % re.escape(value), flags)
  440. elif op.startswith('~'):
  441. # Value contains word within space separated list
  442. # `~=` should match nothing if it is empty or contains whitespace,
  443. # so if either of these cases is present, use `[^\s\S]` which cannot be matched.
  444. value = r'[^\s\S]' if not value or RE_WS.search(value) else re.escape(value)
  445. pattern = re.compile(r'.*?(?:(?<=^)|(?<=[ \t\r\n\f]))%s(?=(?:[ \t\r\n\f]|$)).*' % value, flags)
  446. elif op.startswith('|'):
  447. # Value starts with word in dash separated list
  448. pattern = re.compile(r'^%s(?:-.*)?$' % re.escape(value), flags)
  449. else:
  450. # Value matches
  451. pattern = re.compile(r'^%s$' % re.escape(value), flags)
  452. if op.startswith('!'):
  453. # Equivalent to `:not([attr=value])`
  454. inverse = True
  455. if is_type and pattern:
  456. pattern2 = re.compile(pattern.pattern)
  457. # Append the attribute selector
  458. sel_attr = ct.SelectorAttribute(attr, ns, pattern, pattern2)
  459. if inverse:
  460. # If we are using `!=`, we need to nest the pattern under a `:not()`.
  461. sub_sel = _Selector()
  462. sub_sel.attributes.append(sel_attr)
  463. not_list = ct.SelectorList([sub_sel.freeze()], True, False)
  464. sel.selectors.append(not_list)
  465. else:
  466. sel.attributes.append(sel_attr)
  467. has_selector = True
  468. return has_selector
  469. def parse_tag_pattern(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  470. """Parse tag pattern from regex match."""
  471. prefix = css_unescape(m.group('tag_ns')[:-1]) if m.group('tag_ns') else None
  472. tag = css_unescape(m.group('tag_name'))
  473. sel.tag = ct.SelectorTag(tag, prefix)
  474. has_selector = True
  475. return has_selector
  476. def parse_pseudo_class_custom(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  477. """
  478. Parse custom pseudo class alias.
  479. Compile custom selectors as we need them. When compiling a custom selector,
  480. set it to `None` in the dictionary so we can avoid an infinite loop.
  481. """
  482. pseudo = util.lower(css_unescape(m.group('name')))
  483. selector = self.custom.get(pseudo)
  484. if selector is None:
  485. raise SelectorSyntaxError(
  486. "Undefined custom selector '{}' found at position {}".format(pseudo, m.end(0)),
  487. self.pattern,
  488. m.end(0)
  489. )
  490. if not isinstance(selector, ct.SelectorList):
  491. del self.custom[pseudo]
  492. selector = CSSParser(
  493. selector, custom=self.custom, flags=self.flags
  494. ).process_selectors(flags=FLG_PSEUDO)
  495. self.custom[pseudo] = selector
  496. sel.selectors.append(selector)
  497. has_selector = True
  498. return has_selector
  499. def parse_pseudo_class(
  500. self,
  501. sel: _Selector,
  502. m: Match[str],
  503. has_selector: bool,
  504. iselector: Iterator[Tuple[str, Match[str]]],
  505. is_html: bool
  506. ) -> Tuple[bool, bool]:
  507. """Parse pseudo class."""
  508. complex_pseudo = False
  509. pseudo = util.lower(css_unescape(m.group('name')))
  510. if m.group('open'):
  511. complex_pseudo = True
  512. if complex_pseudo and pseudo in PSEUDO_COMPLEX:
  513. has_selector = self.parse_pseudo_open(sel, pseudo, has_selector, iselector, m.end(0))
  514. elif not complex_pseudo and pseudo in PSEUDO_SIMPLE:
  515. if pseudo == ':root':
  516. sel.flags |= ct.SEL_ROOT
  517. elif pseudo == ':defined':
  518. sel.flags |= ct.SEL_DEFINED
  519. is_html = True
  520. elif pseudo == ':scope':
  521. sel.flags |= ct.SEL_SCOPE
  522. elif pseudo == ':empty':
  523. sel.flags |= ct.SEL_EMPTY
  524. elif pseudo in (':link', ':any-link'):
  525. sel.selectors.append(CSS_LINK)
  526. elif pseudo == ':checked':
  527. sel.selectors.append(CSS_CHECKED)
  528. elif pseudo == ':default':
  529. sel.selectors.append(CSS_DEFAULT)
  530. elif pseudo == ':indeterminate':
  531. sel.selectors.append(CSS_INDETERMINATE)
  532. elif pseudo == ":disabled":
  533. sel.selectors.append(CSS_DISABLED)
  534. elif pseudo == ":enabled":
  535. sel.selectors.append(CSS_ENABLED)
  536. elif pseudo == ":required":
  537. sel.selectors.append(CSS_REQUIRED)
  538. elif pseudo == ":optional":
  539. sel.selectors.append(CSS_OPTIONAL)
  540. elif pseudo == ":read-only":
  541. sel.selectors.append(CSS_READ_ONLY)
  542. elif pseudo == ":read-write":
  543. sel.selectors.append(CSS_READ_WRITE)
  544. elif pseudo == ":in-range":
  545. sel.selectors.append(CSS_IN_RANGE)
  546. elif pseudo == ":out-of-range":
  547. sel.selectors.append(CSS_OUT_OF_RANGE)
  548. elif pseudo == ":placeholder-shown":
  549. sel.selectors.append(CSS_PLACEHOLDER_SHOWN)
  550. elif pseudo == ':first-child':
  551. sel.nth.append(ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()))
  552. elif pseudo == ':last-child':
  553. sel.nth.append(ct.SelectorNth(1, False, 0, False, True, ct.SelectorList()))
  554. elif pseudo == ':first-of-type':
  555. sel.nth.append(ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()))
  556. elif pseudo == ':last-of-type':
  557. sel.nth.append(ct.SelectorNth(1, False, 0, True, True, ct.SelectorList()))
  558. elif pseudo == ':only-child':
  559. sel.nth.extend(
  560. [
  561. ct.SelectorNth(1, False, 0, False, False, ct.SelectorList()),
  562. ct.SelectorNth(1, False, 0, False, True, ct.SelectorList())
  563. ]
  564. )
  565. elif pseudo == ':only-of-type':
  566. sel.nth.extend(
  567. [
  568. ct.SelectorNth(1, False, 0, True, False, ct.SelectorList()),
  569. ct.SelectorNth(1, False, 0, True, True, ct.SelectorList())
  570. ]
  571. )
  572. has_selector = True
  573. elif complex_pseudo and pseudo in PSEUDO_COMPLEX_NO_MATCH:
  574. self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
  575. sel.no_match = True
  576. has_selector = True
  577. elif not complex_pseudo and pseudo in PSEUDO_SIMPLE_NO_MATCH:
  578. sel.no_match = True
  579. has_selector = True
  580. elif pseudo in PSEUDO_SUPPORTED:
  581. raise SelectorSyntaxError(
  582. "Invalid syntax for pseudo class '{}'".format(pseudo),
  583. self.pattern,
  584. m.start(0)
  585. )
  586. else:
  587. raise NotImplementedError(
  588. "'{}' pseudo-class is not implemented at this time".format(pseudo)
  589. )
  590. return has_selector, is_html
  591. def parse_pseudo_nth(
  592. self,
  593. sel: _Selector,
  594. m: Match[str],
  595. has_selector: bool,
  596. iselector: Iterator[Tuple[str, Match[str]]]
  597. ) -> bool:
  598. """Parse `nth` pseudo."""
  599. mdict = m.groupdict()
  600. if mdict.get('pseudo_nth_child'):
  601. postfix = '_child'
  602. else:
  603. postfix = '_type'
  604. mdict['name'] = util.lower(css_unescape(mdict['name']))
  605. content = util.lower(mdict.get('nth' + postfix))
  606. if content == 'even':
  607. # 2n
  608. s1 = 2
  609. s2 = 0
  610. var = True
  611. elif content == 'odd':
  612. # 2n+1
  613. s1 = 2
  614. s2 = 1
  615. var = True
  616. else:
  617. nth_parts = cast(Match[str], RE_NTH.match(content))
  618. _s1 = '-' if nth_parts.group('s1') and nth_parts.group('s1') == '-' else ''
  619. a = nth_parts.group('a')
  620. var = a.endswith('n')
  621. if a.startswith('n'):
  622. _s1 += '1'
  623. elif var:
  624. _s1 += a[:-1]
  625. else:
  626. _s1 += a
  627. _s2 = '-' if nth_parts.group('s2') and nth_parts.group('s2') == '-' else ''
  628. if nth_parts.group('b'):
  629. _s2 += nth_parts.group('b')
  630. else:
  631. _s2 = '0'
  632. s1 = int(_s1, 10)
  633. s2 = int(_s2, 10)
  634. pseudo_sel = mdict['name']
  635. if postfix == '_child':
  636. if m.group('of'):
  637. # Parse the rest of `of S`.
  638. nth_sel = self.parse_selectors(iselector, m.end(0), FLG_PSEUDO | FLG_OPEN)
  639. else:
  640. # Use default `*|*` for `of S`.
  641. nth_sel = CSS_NTH_OF_S_DEFAULT
  642. if pseudo_sel == ':nth-child':
  643. sel.nth.append(ct.SelectorNth(s1, var, s2, False, False, nth_sel))
  644. elif pseudo_sel == ':nth-last-child':
  645. sel.nth.append(ct.SelectorNth(s1, var, s2, False, True, nth_sel))
  646. else:
  647. if pseudo_sel == ':nth-of-type':
  648. sel.nth.append(ct.SelectorNth(s1, var, s2, True, False, ct.SelectorList()))
  649. elif pseudo_sel == ':nth-last-of-type':
  650. sel.nth.append(ct.SelectorNth(s1, var, s2, True, True, ct.SelectorList()))
  651. has_selector = True
  652. return has_selector
  653. def parse_pseudo_open(
  654. self,
  655. sel: _Selector,
  656. name: str,
  657. has_selector: bool,
  658. iselector: Iterator[Tuple[str, Match[str]]],
  659. index: int
  660. ) -> bool:
  661. """Parse pseudo with opening bracket."""
  662. flags = FLG_PSEUDO | FLG_OPEN
  663. if name == ':not':
  664. flags |= FLG_NOT
  665. elif name == ':has':
  666. flags |= FLG_RELATIVE | FLG_FORGIVE
  667. elif name in (':where', ':is'):
  668. flags |= FLG_FORGIVE
  669. sel.selectors.append(self.parse_selectors(iselector, index, flags))
  670. has_selector = True
  671. return has_selector
  672. def parse_has_combinator(
  673. self,
  674. sel: _Selector,
  675. m: Match[str],
  676. has_selector: bool,
  677. selectors: List[_Selector],
  678. rel_type: str,
  679. index: int
  680. ) -> Tuple[bool, _Selector, str]:
  681. """Parse combinator tokens."""
  682. combinator = m.group('relation').strip()
  683. if not combinator:
  684. combinator = WS_COMBINATOR
  685. if combinator == COMMA_COMBINATOR:
  686. if not has_selector:
  687. # If we've not captured any selector parts, the comma is either at the beginning of the pattern
  688. # or following another comma, both of which are unexpected. But shouldn't fail the pseudo-class.
  689. sel.no_match = True
  690. sel.rel_type = rel_type
  691. selectors[-1].relations.append(sel)
  692. rel_type = ":" + WS_COMBINATOR
  693. selectors.append(_Selector())
  694. else:
  695. if has_selector:
  696. # End the current selector and associate the leading combinator with this selector.
  697. sel.rel_type = rel_type
  698. selectors[-1].relations.append(sel)
  699. elif rel_type[1:] != WS_COMBINATOR:
  700. # It's impossible to have two whitespace combinators after each other as the patterns
  701. # will gobble up trailing whitespace. It is also impossible to have a whitespace
  702. # combinator after any other kind for the same reason. But we could have
  703. # multiple non-whitespace combinators. So if the current combinator is not a whitespace,
  704. # then we've hit the multiple combinator case, so we should fail.
  705. raise SelectorSyntaxError(
  706. 'The multiple combinators at position {}'.format(index),
  707. self.pattern,
  708. index
  709. )
  710. # Set the leading combinator for the next selector.
  711. rel_type = ':' + combinator
  712. sel = _Selector()
  713. has_selector = False
  714. return has_selector, sel, rel_type
  715. def parse_combinator(
  716. self,
  717. sel: _Selector,
  718. m: Match[str],
  719. has_selector: bool,
  720. selectors: List[_Selector],
  721. relations: List[_Selector],
  722. is_pseudo: bool,
  723. is_forgive: bool,
  724. index: int
  725. ) -> Tuple[bool, _Selector]:
  726. """Parse combinator tokens."""
  727. combinator = m.group('relation').strip()
  728. if not combinator:
  729. combinator = WS_COMBINATOR
  730. if not has_selector:
  731. if not is_forgive or combinator != COMMA_COMBINATOR:
  732. raise SelectorSyntaxError(
  733. "The combinator '{}' at position {}, must have a selector before it".format(combinator, index),
  734. self.pattern,
  735. index
  736. )
  737. # If we are in a forgiving pseudo class, just make the selector a "no match"
  738. if combinator == COMMA_COMBINATOR:
  739. sel.no_match = True
  740. del relations[:]
  741. selectors.append(sel)
  742. else:
  743. if combinator == COMMA_COMBINATOR:
  744. if not sel.tag and not is_pseudo:
  745. # Implied `*`
  746. sel.tag = ct.SelectorTag('*', None)
  747. sel.relations.extend(relations)
  748. selectors.append(sel)
  749. del relations[:]
  750. else:
  751. sel.relations.extend(relations)
  752. sel.rel_type = combinator
  753. del relations[:]
  754. relations.append(sel)
  755. sel = _Selector()
  756. has_selector = False
  757. return has_selector, sel
  758. def parse_class_id(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  759. """Parse HTML classes and ids."""
  760. selector = m.group(0)
  761. if selector.startswith('.'):
  762. sel.classes.append(css_unescape(selector[1:]))
  763. else:
  764. sel.ids.append(css_unescape(selector[1:]))
  765. has_selector = True
  766. return has_selector
  767. def parse_pseudo_contains(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  768. """Parse contains."""
  769. pseudo = util.lower(css_unescape(m.group('name')))
  770. if pseudo == ":contains":
  771. warnings.warn(
  772. "The pseudo class ':contains' is deprecated, ':-soup-contains' should be used moving forward.",
  773. FutureWarning
  774. )
  775. contains_own = pseudo == ":-soup-contains-own"
  776. values = css_unescape(m.group('values'))
  777. patterns = []
  778. for token in RE_VALUES.finditer(values):
  779. if token.group('split'):
  780. continue
  781. value = token.group('value')
  782. if value.startswith(("'", '"')):
  783. value = css_unescape(value[1:-1], True)
  784. else:
  785. value = css_unescape(value)
  786. patterns.append(value)
  787. sel.contains.append(ct.SelectorContains(patterns, contains_own))
  788. has_selector = True
  789. return has_selector
  790. def parse_pseudo_lang(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  791. """Parse pseudo language."""
  792. values = m.group('values')
  793. patterns = []
  794. for token in RE_VALUES.finditer(values):
  795. if token.group('split'):
  796. continue
  797. value = token.group('value')
  798. if value.startswith(('"', "'")):
  799. value = css_unescape(value[1:-1], True)
  800. else:
  801. value = css_unescape(value)
  802. patterns.append(value)
  803. sel.lang.append(ct.SelectorLang(patterns))
  804. has_selector = True
  805. return has_selector
  806. def parse_pseudo_dir(self, sel: _Selector, m: Match[str], has_selector: bool) -> bool:
  807. """Parse pseudo direction."""
  808. value = ct.SEL_DIR_LTR if util.lower(m.group('dir')) == 'ltr' else ct.SEL_DIR_RTL
  809. sel.flags |= value
  810. has_selector = True
  811. return has_selector
def parse_selectors(
    self,
    iselector: Iterator[Tuple[str, Match[str]]],
    index: int = 0,
    flags: int = 0
) -> ct.SelectorList:
    """
    Parse a stream of selector tokens into a `ct.SelectorList`.

    Tokens are pulled from `iselector` as `(key, match)` pairs and dispatched on `key` to the
    matching `parse_*` helper until the iterator is exhausted or, when `FLG_OPEN` is set, until
    a `pseudo_close` token ends the nested list.

    Parameters:
        iselector: iterator of `(token name, regex match)` pairs. The same iterator is handed
            to `parse_pseudo_class` and `parse_pseudo_nth`.
        index: pattern position reported in error messages; it is advanced to the end of each
            token once that token has been handled.
        flags: bitwise OR of `FLG_*` values controlling how the list is parsed.

    Returns:
        A `ct.SelectorList` built from the frozen selectors plus the `is_not` and `is_html`
        state.

    Raises:
        NotImplementedError: on `at_rule` or `pseudo_element` tokens.
        SelectorSyntaxError: on a missing selector, a tag name that is not first in its
            compound selector, an unmatched close, or an unclosed pseudo-class.
    """

    # Initialize important variables
    sel = _Selector()
    selectors = []
    has_selector = False
    closed = False
    relations = []  # type: List[_Selector]
    rel_type = ":" + WS_COMBINATOR

    # Setup various flags
    is_open = bool(flags & FLG_OPEN)
    is_pseudo = bool(flags & FLG_PSEUDO)
    is_relative = bool(flags & FLG_RELATIVE)
    is_not = bool(flags & FLG_NOT)
    is_html = bool(flags & FLG_HTML)
    is_default = bool(flags & FLG_DEFAULT)
    is_indeterminate = bool(flags & FLG_INDETERMINATE)
    is_in_range = bool(flags & FLG_IN_RANGE)
    is_out_of_range = bool(flags & FLG_OUT_OF_RANGE)
    is_placeholder_shown = bool(flags & FLG_PLACEHOLDER_SHOWN)
    is_forgive = bool(flags & FLG_FORGIVE)

    # Print out useful debug stuff
    if self.debug:  # pragma: no cover
        if is_pseudo:
            print(' is_pseudo: True')
        if is_open:
            print(' is_open: True')
        if is_relative:
            print(' is_relative: True')
        if is_not:
            print(' is_not: True')
        if is_html:
            print(' is_html: True')
        if is_default:
            print(' is_default: True')
        if is_indeterminate:
            print(' is_indeterminate: True')
        if is_in_range:
            print(' is_in_range: True')
        if is_out_of_range:
            print(' is_out_of_range: True')
        if is_placeholder_shown:
            print(' is_placeholder_shown: True')
        if is_forgive:
            print(' is_forgive: True')

    # The algorithm for relative selectors require an initial selector in the selector list
    if is_relative:
        selectors.append(_Selector())

    try:
        while True:
            key, m = next(iselector)

            # Handle parts
            if key == "at_rule":
                raise NotImplementedError("At-rules found at position {}".format(m.start(0)))
            elif key == 'pseudo_class_custom':
                has_selector = self.parse_pseudo_class_custom(sel, m, has_selector)
            elif key == 'pseudo_class':
                has_selector, is_html = self.parse_pseudo_class(sel, m, has_selector, iselector, is_html)
            elif key == 'pseudo_element':
                raise NotImplementedError("Pseudo-element found at position {}".format(m.start(0)))
            elif key == 'pseudo_contains':
                has_selector = self.parse_pseudo_contains(sel, m, has_selector)
            elif key in ('pseudo_nth_type', 'pseudo_nth_child'):
                has_selector = self.parse_pseudo_nth(sel, m, has_selector, iselector)
            elif key == 'pseudo_lang':
                has_selector = self.parse_pseudo_lang(sel, m, has_selector)
            elif key == 'pseudo_dir':
                has_selector = self.parse_pseudo_dir(sel, m, has_selector)
                # Currently only supports HTML
                is_html = True
            elif key == 'pseudo_close':
                if not has_selector:
                    # An empty slot before the close is only tolerated in forgiving lists.
                    if not is_forgive:
                        raise SelectorSyntaxError(
                            "Expected a selector at position {}".format(m.start(0)),
                            self.pattern,
                            m.start(0)
                        )
                    sel.no_match = True
                if is_open:
                    # End of this nested list; `index` is intentionally not advanced past the close.
                    closed = True
                    break
                else:
                    raise SelectorSyntaxError(
                        "Unmatched pseudo-class close at position {}".format(m.start(0)),
                        self.pattern,
                        m.start(0)
                    )
            elif key == 'combine':
                # Relative lists track a leading combinator per selector; normal lists
                # accumulate the left-hand selectors in `relations`.
                if is_relative:
                    has_selector, sel, rel_type = self.parse_has_combinator(
                        sel, m, has_selector, selectors, rel_type, index
                    )
                else:
                    has_selector, sel = self.parse_combinator(
                        sel, m, has_selector, selectors, relations, is_pseudo, is_forgive, index
                    )
            elif key == 'attribute':
                has_selector = self.parse_attribute_selector(sel, m, has_selector)
            elif key == 'tag':
                # A tag is only valid as the first part of a compound selector.
                if has_selector:
                    raise SelectorSyntaxError(
                        "Tag name found at position {} instead of at the start".format(m.start(0)),
                        self.pattern,
                        m.start(0)
                    )
                has_selector = self.parse_tag_pattern(sel, m, has_selector)
            elif key in ('class', 'id'):
                has_selector = self.parse_class_id(sel, m, has_selector)

            # Remember where the handled token ended for later error reporting.
            index = m.end(0)
    except StopIteration:
        # Token stream exhausted: fall through to finalize what has been collected.
        pass

    # Handle selectors that are not closed
    if is_open and not closed:
        raise SelectorSyntaxError(
            "Unclosed pseudo-class at position {}".format(index),
            self.pattern,
            index
        )

    # Cleanup completed selector piece
    if has_selector:
        if not sel.tag and not is_pseudo:
            # Implied `*`
            sel.tag = ct.SelectorTag('*', None)
        if is_relative:
            sel.rel_type = rel_type
            selectors[-1].relations.append(sel)
        else:
            sel.relations.extend(relations)
            del relations[:]
            selectors.append(sel)

    # Forgive empty slots in pseudo-classes that have lists (and are forgiving)
    elif is_forgive:
        if is_relative:
            # Handle relative selectors pseudo-classes with empty slots like `:has()`
            if selectors and selectors[-1].rel_type is None and rel_type == ': ':
                sel.rel_type = rel_type
                sel.no_match = True
                selectors[-1].relations.append(sel)
                has_selector = True
        else:
            # Handle normal pseudo-classes with empty slots
            if not selectors or not relations:
                # Others like `:is()` etc.
                sel.no_match = True
                del relations[:]
                selectors.append(sel)
                has_selector = True

    if not has_selector:
        # We will always need to finish a selector when `:has()` is used as it leads with combining.
        # May apply to others as well.
        raise SelectorSyntaxError(
            'Expected a selector at position {}'.format(index),
            self.pattern,
            index
        )

    # Some patterns require additional logic, such as default. We try to make these the
    # last pattern, and append the appropriate flag to that selector which communicates
    # to the matcher what additional logic is required.
    # Note that each assignment below replaces the last selector's flags rather than OR-ing into them.
    if is_default:
        selectors[-1].flags = ct.SEL_DEFAULT
    if is_indeterminate:
        selectors[-1].flags = ct.SEL_INDETERMINATE
    if is_in_range:
        selectors[-1].flags = ct.SEL_IN_RANGE
    if is_out_of_range:
        selectors[-1].flags = ct.SEL_OUT_OF_RANGE
    if is_placeholder_shown:
        selectors[-1].flags = ct.SEL_PLACEHOLDER_SHOWN

    # Return selector list
    return ct.SelectorList([s.freeze() for s in selectors], is_not, is_html)
  988. def selector_iter(self, pattern: str) -> Iterator[Tuple[str, Match[str]]]:
  989. """Iterate selector tokens."""
  990. # Ignore whitespace and comments at start and end of pattern
  991. m = RE_WS_BEGIN.search(pattern)
  992. index = m.end(0) if m else 0
  993. m = RE_WS_END.search(pattern)
  994. end = (m.start(0) - 1) if m else (len(pattern) - 1)
  995. if self.debug: # pragma: no cover
  996. print('## PARSING: {!r}'.format(pattern))
  997. while index <= end:
  998. m = None
  999. for v in self.css_tokens:
  1000. m = v.match(pattern, index, self.flags)
  1001. if m:
  1002. name = v.get_name()
  1003. if self.debug: # pragma: no cover
  1004. print("TOKEN: '{}' --> {!r} at position {}".format(name, m.group(0), m.start(0)))
  1005. index = m.end(0)
  1006. yield name, m
  1007. break
  1008. if m is None:
  1009. c = pattern[index]
  1010. # If the character represents the start of one of the known selector types,
  1011. # throw an exception mentioning that the known selector type is in error;
  1012. # otherwise, report the invalid character.
  1013. if c == '[':
  1014. msg = "Malformed attribute selector at position {}".format(index)
  1015. elif c == '.':
  1016. msg = "Malformed class selector at position {}".format(index)
  1017. elif c == '#':
  1018. msg = "Malformed id selector at position {}".format(index)
  1019. elif c == ':':
  1020. msg = "Malformed pseudo-class selector at position {}".format(index)
  1021. else:
  1022. msg = "Invalid character {!r} position {}".format(c, index)
  1023. raise SelectorSyntaxError(msg, self.pattern, index)
  1024. if self.debug: # pragma: no cover
  1025. print('## END PARSING')
  1026. def process_selectors(self, index: int = 0, flags: int = 0) -> ct.SelectorList:
  1027. """Process selectors."""
  1028. return self.parse_selectors(self.selector_iter(self.pattern), index, flags)
# Precompile CSS selector lists for pseudo-classes (additional logic may be required beyond the pattern).
# A few patterns are order dependent as they use previously compiled patterns.
# CSS pattern for `:link` and `:any-link`
# Matches `a` and `area` elements (under the `html` namespace prefix) that carry an `href` attribute.
CSS_LINK = CSSParser(
    'html|*:is(a, area)[href]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:checked`
# Matches checkbox/radio inputs with a `checked` attribute and `option` elements with `selected`.
CSS_CHECKED = CSSParser(
    '''
html|*:is(input[type=checkbox], input[type=radio])[checked], html|option[selected]
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:default` (must compile CSS_CHECKED first)
# `FLG_DEFAULT` makes `parse_selectors` set `ct.SEL_DEFAULT` on the last selector in the list,
# which is why the submit-button selector has to stay last.
CSS_DEFAULT = CSSParser(
    '''
:checked,
/*
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|form html|*:is(button, input)[type="submit"]
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_DEFAULT)
# CSS pattern for `:indeterminate`
# `FLG_INDETERMINATE` makes `parse_selectors` set `ct.SEL_INDETERMINATE` on the last selector
# in the list, which is why the named-radio selector has to stay last.
CSS_INDETERMINATE = CSSParser(
    '''
html|input[type="checkbox"][indeterminate],
html|input[type="radio"]:is(:not([name]), [name=""]):not([checked]),
html|progress:not([value]),
/*
This pattern must be at the end.
Special logic is applied to the last selector.
*/
html|input[type="radio"][name]:not([name='']):not([checked])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_INDETERMINATE)
# CSS pattern for `:disabled`
# Covers four cases: a form control that itself has `disabled`, an `option` directly under a
# disabled `optgroup`, a control directly under a disabled `fieldset`, and a control nested
# deeper in a disabled `fieldset` below a child that is not matched by `legend:nth-of-type(1)`.
# NOTE(review): `fieldset` is listed twice in the first `:is()` list; the duplicate looks redundant.
CSS_DISABLED = CSSParser(
    '''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset)[disabled],
html|optgroup[disabled] > html|option,
html|fieldset[disabled] > html|*:is(input:not([type=hidden]), button, select, textarea, fieldset),
html|fieldset[disabled] >
html|*:not(legend:nth-of-type(1)) html|*:is(input:not([type=hidden]), button, select, textarea, fieldset)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:enabled`
# The same set of form controls as the first `:disabled` selector, minus anything matching `:disabled`.
# NOTE(review): `fieldset` is listed twice in the `:is()` list; the duplicate looks redundant.
CSS_ENABLED = CSSParser(
    '''
html|*:is(input:not([type=hidden]), button, select, textarea, fieldset, optgroup, option, fieldset):not(:disabled)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:required`
# Matches `input`, `textarea` and `select` elements that carry a `required` attribute.
CSS_REQUIRED = CSSParser(
    'html|*:is(input, textarea, select)[required]'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:optional`
# Matches `input`, `textarea` and `select` elements that do not carry a `required` attribute.
CSS_OPTIONAL = CSSParser(
    'html|*:is(input, textarea, select):not([required])'
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:placeholder-shown`
# Matches text-like inputs with a non-empty `placeholder` and a missing or empty `value`, and
# `textarea` elements with a non-empty `placeholder`.
# `FLG_PLACEHOLDER_SHOWN` makes `parse_selectors` set `ct.SEL_PLACEHOLDER_SHOWN` on the last
# selector in the list (the `textarea` selector).
CSS_PLACEHOLDER_SHOWN = CSSParser(
    '''
html|input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=password],
[type=number]
)[placeholder]:not([placeholder='']):is(:not([value]), [value=""]),
html|textarea[placeholder]:not([placeholder=''])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML | FLG_PLACEHOLDER_SHOWN)
# CSS pattern default for `:nth-child` "of S" feature
# `*|*` is a wildcard tag with a wildcard namespace prefix.
CSS_NTH_OF_S_DEFAULT = CSSParser(
    '*|*'
).process_selectors(flags=FLG_PSEUDO)
# CSS pattern for `:read-write` (CSS_DISABLED must be compiled first)
# Matches `textarea` and text-like/date-like inputs that are neither `readonly` nor `:disabled`,
# plus any element whose `contenteditable` is empty or (case-insensitively) "true".
CSS_READ_WRITE = CSSParser(
    '''
html|*:is(
textarea,
input:is(
:not([type]),
[type=""],
[type=text],
[type=search],
[type=url],
[type=tel],
[type=email],
[type=number],
[type=password],
[type=date],
[type=datetime-local],
[type=month],
[type=time],
[type=week]
)
):not([readonly], :disabled),
html|*:is([contenteditable=""], [contenteditable="true" i])
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:read-only`
# Defined as the negation of `:read-write`, so it depends on that pseudo-class being resolvable.
CSS_READ_ONLY = CSSParser(
    '''
html|*:not(:read-write)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_HTML)
# CSS pattern for `:in-range`
# Selects date/time/number/range inputs that declare a `min` or `max`.
# `FLG_IN_RANGE` makes `parse_selectors` set `ct.SEL_IN_RANGE` on the last selector, signalling
# the matcher that additional logic is required beyond this pattern.
CSS_IN_RANGE = CSSParser(
    '''
html|input:is(
[type="date"],
[type="month"],
[type="week"],
[type="time"],
[type="datetime-local"],
[type="number"],
[type="range"]
):is(
[min],
[max]
)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_IN_RANGE | FLG_HTML)
# CSS pattern for `:out-of-range`
# Same element pattern as `:in-range`; only the flag differs.
# `FLG_OUT_OF_RANGE` makes `parse_selectors` set `ct.SEL_OUT_OF_RANGE` on the last selector,
# signalling the matcher that additional logic is required beyond this pattern.
CSS_OUT_OF_RANGE = CSSParser(
    '''
html|input:is(
[type="date"],
[type="month"],
[type="week"],
[type="time"],
[type="datetime-local"],
[type="number"],
[type="range"]
):is(
[min],
[max]
)
'''
).process_selectors(flags=FLG_PSEUDO | FLG_OUT_OF_RANGE | FLG_HTML)