123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584 |
- """CSS matcher."""
- from datetime import datetime
- from . import util
- import re
- from . import css_types as ct
- import unicodedata
- import bs4 # type: ignore[import]
- from typing import Iterator, Iterable, List, Any, Optional, Tuple, Union, Dict, Callable, Sequence, cast
- # Empty tag pattern (whitespace okay)
- RE_NOT_EMPTY = re.compile('[^ \t\r\n\f]')
- RE_NOT_WS = re.compile('[^ \t\r\n\f]+')
- # Relationships
- REL_PARENT = ' '
- REL_CLOSE_PARENT = '>'
- REL_SIBLING = '~'
- REL_CLOSE_SIBLING = '+'
- # Relationships for :has() (forward looking)
- REL_HAS_PARENT = ': '
- REL_HAS_CLOSE_PARENT = ':>'
- REL_HAS_SIBLING = ':~'
- REL_HAS_CLOSE_SIBLING = ':+'
- NS_XHTML = 'http://www.w3.org/1999/xhtml'
- NS_XML = 'http://www.w3.org/XML/1998/namespace'
- DIR_FLAGS = ct.SEL_DIR_LTR | ct.SEL_DIR_RTL
- RANGES = ct.SEL_IN_RANGE | ct.SEL_OUT_OF_RANGE
- DIR_MAP = {
- 'ltr': ct.SEL_DIR_LTR,
- 'rtl': ct.SEL_DIR_RTL,
- 'auto': 0
- }
- RE_NUM = re.compile(r"^(?P<value>-?(?:[0-9]{1,}(\.[0-9]+)?|\.[0-9]+))$")
- RE_TIME = re.compile(r'^(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$')
- RE_MONTH = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})$')
- RE_WEEK = re.compile(r'^(?P<year>[0-9]{4,})-W(?P<week>[0-9]{2})$')
- RE_DATE = re.compile(r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})$')
- RE_DATETIME = re.compile(
- r'^(?P<year>[0-9]{4,})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})T(?P<hour>[0-9]{2}):(?P<minutes>[0-9]{2})$'
- )
- RE_WILD_STRIP = re.compile(r'(?:(?:-\*-)(?:\*(?:-|$))*|-\*$)')
- MONTHS_30 = (4, 6, 9, 11) # April, June, September, and November
- FEB = 2
- SHORT_MONTH = 30
- LONG_MONTH = 31
- FEB_MONTH = 28
- FEB_LEAP_MONTH = 29
- DAYS_IN_WEEK = 7
- class _FakeParent:
- """
- Fake parent class.
- When we have a fragment with no `BeautifulSoup` document object,
- we can't evaluate `nth` selectors properly. Create a temporary
- fake parent so we can traverse the root element as a child.
- """
- def __init__(self, element: 'bs4.Tag') -> None:
- """Initialize."""
- self.contents = [element]
- def __len__(self) -> 'bs4.PageElement':
- """Length."""
- return len(self.contents)
- class _DocumentNav:
- """Navigate a Beautiful Soup document."""
- @classmethod
- def assert_valid_input(cls, tag: Any) -> None:
- """Check if valid input tag or document."""
- # Fail on unexpected types.
- if not cls.is_tag(tag):
- raise TypeError("Expected a BeautifulSoup 'Tag', but instead received type {}".format(type(tag)))
- @staticmethod
- def is_doc(obj: 'bs4.Tag') -> bool:
- """Is `BeautifulSoup` object."""
- return isinstance(obj, bs4.BeautifulSoup)
- @staticmethod
- def is_tag(obj: 'bs4.PageElement') -> bool:
- """Is tag."""
- return isinstance(obj, bs4.Tag)
- @staticmethod
- def is_declaration(obj: 'bs4.PageElement') -> bool: # pragma: no cover
- """Is declaration."""
- return isinstance(obj, bs4.Declaration)
- @staticmethod
- def is_cdata(obj: 'bs4.PageElement') -> bool:
- """Is CDATA."""
- return isinstance(obj, bs4.CData)
- @staticmethod
- def is_processing_instruction(obj: 'bs4.PageElement') -> bool: # pragma: no cover
- """Is processing instruction."""
- return isinstance(obj, bs4.ProcessingInstruction)
- @staticmethod
- def is_navigable_string(obj: 'bs4.PageElement') -> bool:
- """Is navigable string."""
- return isinstance(obj, bs4.NavigableString)
- @staticmethod
- def is_special_string(obj: 'bs4.PageElement') -> bool:
- """Is special string."""
- return isinstance(obj, (bs4.Comment, bs4.Declaration, bs4.CData, bs4.ProcessingInstruction, bs4.Doctype))
- @classmethod
- def is_content_string(cls, obj: 'bs4.PageElement') -> bool:
- """Check if node is content string."""
- return cls.is_navigable_string(obj) and not cls.is_special_string(obj)
- @staticmethod
- def create_fake_parent(el: 'bs4.Tag') -> _FakeParent:
- """Create fake parent for a given element."""
- return _FakeParent(el)
- @staticmethod
- def is_xml_tree(el: 'bs4.Tag') -> bool:
- """Check if element (or document) is from a XML tree."""
- return bool(el._is_xml)
- def is_iframe(self, el: 'bs4.Tag') -> bool:
- """Check if element is an `iframe`."""
- return bool(
- ((el.name if self.is_xml_tree(el) else util.lower(el.name)) == 'iframe') and
- self.is_html_tag(el) # type: ignore[attr-defined]
- )
- def is_root(self, el: 'bs4.Tag') -> bool:
- """
- Return whether element is a root element.
- We check that the element is the root of the tree (which we have already pre-calculated),
- and we check if it is the root element under an `iframe`.
- """
- root = self.root and self.root is el # type: ignore[attr-defined]
- if not root:
- parent = self.get_parent(el)
- root = parent is not None and self.is_html and self.is_iframe(parent) # type: ignore[attr-defined]
- return root
- def get_contents(self, el: 'bs4.Tag', no_iframe: bool = False) -> Iterator['bs4.PageElement']:
- """Get contents or contents in reverse."""
- if not no_iframe or not self.is_iframe(el):
- for content in el.contents:
- yield content
- def get_children(
- self,
- el: 'bs4.Tag',
- start: Optional[int] = None,
- reverse: bool = False,
- tags: bool = True,
- no_iframe: bool = False
- ) -> Iterator['bs4.PageElement']:
- """Get children."""
- if not no_iframe or not self.is_iframe(el):
- last = len(el.contents) - 1
- if start is None:
- index = last if reverse else 0
- else:
- index = start
- end = -1 if reverse else last + 1
- incr = -1 if reverse else 1
- if 0 <= index <= last:
- while index != end:
- node = el.contents[index]
- index += incr
- if not tags or self.is_tag(node):
- yield node
- def get_descendants(
- self,
- el: 'bs4.Tag',
- tags: bool = True,
- no_iframe: bool = False
- ) -> Iterator['bs4.PageElement']:
- """Get descendants."""
- if not no_iframe or not self.is_iframe(el):
- next_good = None
- for child in el.descendants:
- if next_good is not None:
- if child is not next_good:
- continue
- next_good = None
- is_tag = self.is_tag(child)
- if no_iframe and is_tag and self.is_iframe(child):
- if child.next_sibling is not None:
- next_good = child.next_sibling
- else:
- last_child = child
- while self.is_tag(last_child) and last_child.contents:
- last_child = last_child.contents[-1]
- next_good = last_child.next_element
- yield child
- if next_good is None:
- break
- # Coverage isn't seeing this even though it's executed
- continue # pragma: no cover
- if not tags or is_tag:
- yield child
- def get_parent(self, el: 'bs4.Tag', no_iframe: bool = False) -> 'bs4.Tag':
- """Get parent."""
- parent = el.parent
- if no_iframe and parent is not None and self.is_iframe(parent):
- parent = None
- return parent
- @staticmethod
- def get_tag_name(el: 'bs4.Tag') -> Optional[str]:
- """Get tag."""
- return cast(Optional[str], el.name)
- @staticmethod
- def get_prefix_name(el: 'bs4.Tag') -> Optional[str]:
- """Get prefix."""
- return cast(Optional[str], el.prefix)
- @staticmethod
- def get_uri(el: 'bs4.Tag') -> Optional[str]:
- """Get namespace `URI`."""
- return cast(Optional[str], el.namespace)
- @classmethod
- def get_next(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
- """Get next sibling tag."""
- sibling = el.next_sibling
- while tags and not cls.is_tag(sibling) and sibling is not None:
- sibling = sibling.next_sibling
- return sibling
- @classmethod
- def get_previous(cls, el: 'bs4.Tag', tags: bool = True) -> 'bs4.PageElement':
- """Get previous sibling tag."""
- sibling = el.previous_sibling
- while tags and not cls.is_tag(sibling) and sibling is not None:
- sibling = sibling.previous_sibling
- return sibling
- @staticmethod
- def has_html_ns(el: 'bs4.Tag') -> bool:
- """
- Check if element has an HTML namespace.
- This is a bit different than whether a element is treated as having an HTML namespace,
- like we do in the case of `is_html_tag`.
- """
- ns = getattr(el, 'namespace') if el else None
- return bool(ns and ns == NS_XHTML)
- @staticmethod
- def split_namespace(el: 'bs4.Tag', attr_name: str) -> Tuple[Optional[str], Optional[str]]:
- """Return namespace and attribute name without the prefix."""
- return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
- @classmethod
- def normalize_value(cls, value: Any) -> Union[str, Sequence[str]]:
- """Normalize the value to be a string or list of strings."""
- # Treat `None` as empty string.
- if value is None:
- return ''
- # Pass through strings
- if (isinstance(value, str)):
- return value
- # If it's a byte string, convert it to Unicode, treating it as UTF-8.
- if isinstance(value, bytes):
- return value.decode("utf8")
- # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
- if isinstance(value, Sequence):
- new_value = []
- for v in value:
- if not isinstance(v, (str, bytes)) and isinstance(v, Sequence):
- # This is most certainly a user error and will crash and burn later.
- # To keep things working, we'll do what we do with all objects,
- # And convert them to strings.
- new_value.append(str(v))
- else:
- # Convert the child to a string
- new_value.append(cast(str, cls.normalize_value(v)))
- return new_value
- # Try and make anything else a string
- return str(value)
- @classmethod
- def get_attribute_by_name(
- cls,
- el: 'bs4.Tag',
- name: str,
- default: Optional[Union[str, Sequence[str]]] = None
- ) -> Optional[Union[str, Sequence[str]]]:
- """Get attribute by name."""
- value = default
- if el._is_xml:
- try:
- value = cls.normalize_value(el.attrs[name])
- except KeyError:
- pass
- else:
- for k, v in el.attrs.items():
- if util.lower(k) == name:
- value = cls.normalize_value(v)
- break
- return value
- @classmethod
- def iter_attributes(cls, el: 'bs4.Tag') -> Iterator[Tuple[str, Optional[Union[str, Sequence[str]]]]]:
- """Iterate attributes."""
- for k, v in el.attrs.items():
- yield k, cls.normalize_value(v)
- @classmethod
- def get_classes(cls, el: 'bs4.Tag') -> Sequence[str]:
- """Get classes."""
- classes = cls.get_attribute_by_name(el, 'class', [])
- if isinstance(classes, str):
- classes = RE_NOT_WS.findall(classes)
- return cast(Sequence[str], classes)
- def get_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> str:
- """Get text."""
- return ''.join(
- [node for node in self.get_descendants(el, tags=False, no_iframe=no_iframe) if self.is_content_string(node)]
- )
- def get_own_text(self, el: 'bs4.Tag', no_iframe: bool = False) -> List[str]:
- """Get Own Text."""
- return [node for node in self.get_contents(el, no_iframe=no_iframe) if self.is_content_string(node)]
- class Inputs:
- """Class for parsing and validating input items."""
- @staticmethod
- def validate_day(year: int, month: int, day: int) -> bool:
- """Validate day."""
- max_days = LONG_MONTH
- if month == FEB:
- max_days = FEB_LEAP_MONTH if ((year % 4 == 0) and (year % 100 != 0)) or (year % 400 == 0) else FEB_MONTH
- elif month in MONTHS_30:
- max_days = SHORT_MONTH
- return 1 <= day <= max_days
- @staticmethod
- def validate_week(year: int, week: int) -> bool:
- """Validate week."""
- max_week = datetime.strptime("{}-{}-{}".format(12, 31, year), "%m-%d-%Y").isocalendar()[1]
- if max_week == 1:
- max_week = 53
- return 1 <= week <= max_week
- @staticmethod
- def validate_month(month: int) -> bool:
- """Validate month."""
- return 1 <= month <= 12
- @staticmethod
- def validate_year(year: int) -> bool:
- """Validate year."""
- return 1 <= year
- @staticmethod
- def validate_hour(hour: int) -> bool:
- """Validate hour."""
- return 0 <= hour <= 23
- @staticmethod
- def validate_minutes(minutes: int) -> bool:
- """Validate minutes."""
- return 0 <= minutes <= 59
- @classmethod
- def parse_value(cls, itype: str, value: Optional[str]) -> Optional[Tuple[float, ...]]:
- """Parse the input value."""
- parsed = None # type: Optional[Tuple[float, ...]]
- if value is None:
- return value
- if itype == "date":
- m = RE_DATE.match(value)
- if m:
- year = int(m.group('year'), 10)
- month = int(m.group('month'), 10)
- day = int(m.group('day'), 10)
- if cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day):
- parsed = (year, month, day)
- elif itype == "month":
- m = RE_MONTH.match(value)
- if m:
- year = int(m.group('year'), 10)
- month = int(m.group('month'), 10)
- if cls.validate_year(year) and cls.validate_month(month):
- parsed = (year, month)
- elif itype == "week":
- m = RE_WEEK.match(value)
- if m:
- year = int(m.group('year'), 10)
- week = int(m.group('week'), 10)
- if cls.validate_year(year) and cls.validate_week(year, week):
- parsed = (year, week)
- elif itype == "time":
- m = RE_TIME.match(value)
- if m:
- hour = int(m.group('hour'), 10)
- minutes = int(m.group('minutes'), 10)
- if cls.validate_hour(hour) and cls.validate_minutes(minutes):
- parsed = (hour, minutes)
- elif itype == "datetime-local":
- m = RE_DATETIME.match(value)
- if m:
- year = int(m.group('year'), 10)
- month = int(m.group('month'), 10)
- day = int(m.group('day'), 10)
- hour = int(m.group('hour'), 10)
- minutes = int(m.group('minutes'), 10)
- if (
- cls.validate_year(year) and cls.validate_month(month) and cls.validate_day(year, month, day) and
- cls.validate_hour(hour) and cls.validate_minutes(minutes)
- ):
- parsed = (year, month, day, hour, minutes)
- elif itype in ("number", "range"):
- m = RE_NUM.match(value)
- if m:
- parsed = (float(m.group('value')),)
- return parsed
- class CSSMatch(_DocumentNav):
- """Perform CSS matching."""
- def __init__(
- self,
- selectors: ct.SelectorList,
- scope: 'bs4.Tag',
- namespaces: Optional[ct.Namespaces],
- flags: int
- ) -> None:
- """Initialize."""
- self.assert_valid_input(scope)
- self.tag = scope
- self.cached_meta_lang = [] # type: List[Tuple[str, str]]
- self.cached_default_forms = [] # type: List[Tuple['bs4.Tag', 'bs4.Tag']]
- self.cached_indeterminate_forms = [] # type: List[Tuple['bs4.Tag', str, bool]]
- self.selectors = selectors
- self.namespaces = {} if namespaces is None else namespaces # type: Union[ct.Namespaces, Dict[str, str]]
- self.flags = flags
- self.iframe_restrict = False
- # Find the root element for the whole tree
- doc = scope
- parent = self.get_parent(doc)
- while parent:
- doc = parent
- parent = self.get_parent(doc)
- root = None
- if not self.is_doc(doc):
- root = doc
- else:
- for child in self.get_children(doc):
- root = child
- break
- self.root = root
- self.scope = scope if scope is not doc else root
- self.has_html_namespace = self.has_html_ns(root)
- # A document can be both XML and HTML (XHTML)
- self.is_xml = self.is_xml_tree(doc)
- self.is_html = not self.is_xml or self.has_html_namespace
- def supports_namespaces(self) -> bool:
- """Check if namespaces are supported in the HTML type."""
- return self.is_xml or self.has_html_namespace
- def get_tag_ns(self, el: 'bs4.Tag') -> str:
- """Get tag namespace."""
- if self.supports_namespaces():
- namespace = ''
- ns = self.get_uri(el)
- if ns:
- namespace = ns
- else:
- namespace = NS_XHTML
- return namespace
- def is_html_tag(self, el: 'bs4.Tag') -> bool:
- """Check if tag is in HTML namespace."""
- return self.get_tag_ns(el) == NS_XHTML
- def get_tag(self, el: 'bs4.Tag') -> Optional[str]:
- """Get tag."""
- name = self.get_tag_name(el)
- return util.lower(name) if name is not None and not self.is_xml else name
- def get_prefix(self, el: 'bs4.Tag') -> Optional[str]:
- """Get prefix."""
- prefix = self.get_prefix_name(el)
- return util.lower(prefix) if prefix is not None and not self.is_xml else prefix
- def find_bidi(self, el: 'bs4.Tag') -> Optional[int]:
- """Get directionality from element text."""
- for node in self.get_children(el, tags=False):
- # Analyze child text nodes
- if self.is_tag(node):
- # Avoid analyzing certain elements specified in the specification.
- direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(node, 'dir', '')), None)
- if (
- self.get_tag(node) in ('bdi', 'script', 'style', 'textarea', 'iframe') or
- not self.is_html_tag(node) or
- direction is not None
- ):
- continue # pragma: no cover
- # Check directionality of this node's text
- value = self.find_bidi(node)
- if value is not None:
- return value
- # Direction could not be determined
- continue # pragma: no cover
- # Skip `doctype` comments, etc.
- if self.is_special_string(node):
- continue
- # Analyze text nodes for directionality.
- for c in node:
- bidi = unicodedata.bidirectional(c)
- if bidi in ('AL', 'R', 'L'):
- return ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
- return None
- def extended_language_filter(self, lang_range: str, lang_tag: str) -> bool:
- """Filter the language tags."""
- match = True
- lang_range = RE_WILD_STRIP.sub('-', lang_range).lower()
- ranges = lang_range.split('-')
- subtags = lang_tag.lower().split('-')
- length = len(ranges)
- rindex = 0
- sindex = 0
- r = ranges[rindex]
- s = subtags[sindex]
- # Primary tag needs to match
- if r != '*' and r != s:
- match = False
- rindex += 1
- sindex += 1
- # Match until we run out of ranges
- while match and rindex < length:
- r = ranges[rindex]
- try:
- s = subtags[sindex]
- except IndexError:
- # Ran out of subtags,
- # but we still have ranges
- match = False
- continue
- # Empty range
- if not r:
- match = False
- continue
- # Matched range
- elif s == r:
- rindex += 1
- # Implicit wildcard cannot match
- # singletons
- elif len(s) == 1:
- match = False
- continue
- # Implicitly matched, so grab next subtag
- sindex += 1
- return match
- def match_attribute_name(
- self,
- el: 'bs4.Tag',
- attr: str,
- prefix: Optional[str]
- ) -> Optional[Union[str, Sequence[str]]]:
- """Match attribute name and return value if it exists."""
- value = None
- if self.supports_namespaces():
- value = None
- # If we have not defined namespaces, we can't very well find them, so don't bother trying.
- if prefix:
- ns = self.namespaces.get(prefix)
- if ns is None and prefix != '*':
- return None
- else:
- ns = None
- for k, v in self.iter_attributes(el):
- # Get attribute parts
- namespace, name = self.split_namespace(el, k)
- # Can't match a prefix attribute as we haven't specified one to match
- # Try to match it normally as a whole `p:a` as selector may be trying `p\:a`.
- if ns is None:
- if (self.is_xml and attr == k) or (not self.is_xml and util.lower(attr) == util.lower(k)):
- value = v
- break
- # Coverage is not finding this even though it is executed.
- # Adding a print statement before this (and erasing coverage) causes coverage to find the line.
- # Ignore the false positive message.
- continue # pragma: no cover
- # We can't match our desired prefix attribute as the attribute doesn't have a prefix
- if namespace is None or ns != namespace and prefix != '*':
- continue
- # The attribute doesn't match.
- if (util.lower(attr) != util.lower(name)) if not self.is_xml else (attr != name):
- continue
- value = v
- break
- else:
- for k, v in self.iter_attributes(el):
- if util.lower(attr) != util.lower(k):
- continue
- value = v
- break
- return value
- def match_namespace(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
- """Match the namespace of the element."""
- match = True
- namespace = self.get_tag_ns(el)
- default_namespace = self.namespaces.get('')
- tag_ns = '' if tag.prefix is None else self.namespaces.get(tag.prefix)
- # We must match the default namespace if one is not provided
- if tag.prefix is None and (default_namespace is not None and namespace != default_namespace):
- match = False
- # If we specified `|tag`, we must not have a namespace.
- elif (tag.prefix is not None and tag.prefix == '' and namespace):
- match = False
- # Verify prefix matches
- elif (
- tag.prefix and
- tag.prefix != '*' and (tag_ns is None or namespace != tag_ns)
- ):
- match = False
- return match
- def match_attributes(self, el: 'bs4.Tag', attributes: Tuple[ct.SelectorAttribute, ...]) -> bool:
- """Match attributes."""
- match = True
- if attributes:
- for a in attributes:
- temp = self.match_attribute_name(el, a.attribute, a.prefix)
- pattern = a.xml_type_pattern if self.is_xml and a.xml_type_pattern else a.pattern
- if temp is None:
- match = False
- break
- value = temp if isinstance(temp, str) else ' '.join(temp)
- if pattern is None:
- continue
- elif pattern.match(value) is None:
- match = False
- break
- return match
- def match_tagname(self, el: 'bs4.Tag', tag: ct.SelectorTag) -> bool:
- """Match tag name."""
- name = (util.lower(tag.name) if not self.is_xml and tag.name is not None else tag.name)
- return not (
- name is not None and
- name not in (self.get_tag(el), '*')
- )
- def match_tag(self, el: 'bs4.Tag', tag: Optional[ct.SelectorTag]) -> bool:
- """Match the tag."""
- match = True
- if tag is not None:
- # Verify namespace
- if not self.match_namespace(el, tag):
- match = False
- if not self.match_tagname(el, tag):
- match = False
- return match
- def match_past_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
- """Match past relationship."""
- found = False
- # I don't think this can ever happen, but it makes `mypy` happy
- if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
- return found
- if relation[0].rel_type == REL_PARENT:
- parent = self.get_parent(el, no_iframe=self.iframe_restrict)
- while not found and parent:
- found = self.match_selectors(parent, relation)
- parent = self.get_parent(parent, no_iframe=self.iframe_restrict)
- elif relation[0].rel_type == REL_CLOSE_PARENT:
- parent = self.get_parent(el, no_iframe=self.iframe_restrict)
- if parent:
- found = self.match_selectors(parent, relation)
- elif relation[0].rel_type == REL_SIBLING:
- sibling = self.get_previous(el)
- while not found and sibling:
- found = self.match_selectors(sibling, relation)
- sibling = self.get_previous(sibling)
- elif relation[0].rel_type == REL_CLOSE_SIBLING:
- sibling = self.get_previous(el)
- if sibling and self.is_tag(sibling):
- found = self.match_selectors(sibling, relation)
- return found
- def match_future_child(self, parent: 'bs4.Tag', relation: ct.SelectorList, recursive: bool = False) -> bool:
- """Match future child."""
- match = False
- if recursive:
- children = self.get_descendants # type: Callable[..., Iterator['bs4.Tag']]
- else:
- children = self.get_children
- for child in children(parent, no_iframe=self.iframe_restrict):
- match = self.match_selectors(child, relation)
- if match:
- break
- return match
- def match_future_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
- """Match future relationship."""
- found = False
- # I don't think this can ever happen, but it makes `mypy` happy
- if isinstance(relation[0], ct.SelectorNull): # pragma: no cover
- return found
- if relation[0].rel_type == REL_HAS_PARENT:
- found = self.match_future_child(el, relation, True)
- elif relation[0].rel_type == REL_HAS_CLOSE_PARENT:
- found = self.match_future_child(el, relation)
- elif relation[0].rel_type == REL_HAS_SIBLING:
- sibling = self.get_next(el)
- while not found and sibling:
- found = self.match_selectors(sibling, relation)
- sibling = self.get_next(sibling)
- elif relation[0].rel_type == REL_HAS_CLOSE_SIBLING:
- sibling = self.get_next(el)
- if sibling and self.is_tag(sibling):
- found = self.match_selectors(sibling, relation)
- return found
- def match_relations(self, el: 'bs4.Tag', relation: ct.SelectorList) -> bool:
- """Match relationship to other elements."""
- found = False
- if isinstance(relation[0], ct.SelectorNull) or relation[0].rel_type is None:
- return found
- if relation[0].rel_type.startswith(':'):
- found = self.match_future_relations(el, relation)
- else:
- found = self.match_past_relations(el, relation)
- return found
- def match_id(self, el: 'bs4.Tag', ids: Tuple[str, ...]) -> bool:
- """Match element's ID."""
- found = True
- for i in ids:
- if i != self.get_attribute_by_name(el, 'id', ''):
- found = False
- break
- return found
- def match_classes(self, el: 'bs4.Tag', classes: Tuple[str, ...]) -> bool:
- """Match element's classes."""
- current_classes = self.get_classes(el)
- found = True
- for c in classes:
- if c not in current_classes:
- found = False
- break
- return found
- def match_root(self, el: 'bs4.Tag') -> bool:
- """Match element as root."""
- is_root = self.is_root(el)
- if is_root:
- sibling = self.get_previous(el, tags=False)
- while is_root and sibling is not None:
- if (
- self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
- self.is_cdata(sibling)
- ):
- is_root = False
- else:
- sibling = self.get_previous(sibling, tags=False)
- if is_root:
- sibling = self.get_next(el, tags=False)
- while is_root and sibling is not None:
- if (
- self.is_tag(sibling) or (self.is_content_string(sibling) and sibling.strip()) or
- self.is_cdata(sibling)
- ):
- is_root = False
- else:
- sibling = self.get_next(sibling, tags=False)
- return is_root
- def match_scope(self, el: 'bs4.Tag') -> bool:
- """Match element as scope."""
- return self.scope is el
- def match_nth_tag_type(self, el: 'bs4.Tag', child: 'bs4.Tag') -> bool:
- """Match tag type for `nth` matches."""
- return(
- (self.get_tag(child) == self.get_tag(el)) and
- (self.get_tag_ns(child) == self.get_tag_ns(el))
- )
- def match_nth(self, el: 'bs4.Tag', nth: 'bs4.Tag') -> bool:
- """Match `nth` elements."""
- matched = True
- for n in nth:
- matched = False
- if n.selectors and not self.match_selectors(el, n.selectors):
- break
- parent = self.get_parent(el)
- if parent is None:
- parent = self.create_fake_parent(el)
- last = n.last
- last_index = len(parent) - 1
- index = last_index if last else 0
- relative_index = 0
- a = n.a
- b = n.b
- var = n.n
- count = 0
- count_incr = 1
- factor = -1 if last else 1
- idx = last_idx = a * count + b if var else a
- # We can only adjust bounds within a variable index
- if var:
- # Abort if our nth index is out of bounds and only getting further out of bounds as we increment.
- # Otherwise, increment to try to get in bounds.
- adjust = None
- while idx < 1 or idx > last_index:
- if idx < 0:
- diff_low = 0 - idx
- if adjust is not None and adjust == 1:
- break
- adjust = -1
- count += count_incr
- idx = last_idx = a * count + b if var else a
- diff = 0 - idx
- if diff >= diff_low:
- break
- else:
- diff_high = idx - last_index
- if adjust is not None and adjust == -1:
- break
- adjust = 1
- count += count_incr
- idx = last_idx = a * count + b if var else a
- diff = idx - last_index
- if diff >= diff_high:
- break
- diff_high = diff
- # If a < 0, our count is working backwards, so floor the index by increasing the count.
- # Find the count that yields the lowest, in bound value and use that.
- # Lastly reverse count increment so that we'll increase our index.
- lowest = count
- if a < 0:
- while idx >= 1:
- lowest = count
- count += count_incr
- idx = last_idx = a * count + b if var else a
- count_incr = -1
- count = lowest
- idx = last_idx = a * count + b if var else a
- # Evaluate elements while our calculated nth index is still in range
- while 1 <= idx <= last_index + 1:
- child = None
- # Evaluate while our child index is still in range.
- for child in self.get_children(parent, start=index, reverse=factor < 0, tags=False):
- index += factor
- if not self.is_tag(child):
- continue
- # Handle `of S` in `nth-child`
- if n.selectors and not self.match_selectors(child, n.selectors):
- continue
- # Handle `of-type`
- if n.of_type and not self.match_nth_tag_type(el, child):
- continue
- relative_index += 1
- if relative_index == idx:
- if child is el:
- matched = True
- else:
- break
- if child is el:
- break
- if child is el:
- break
- last_idx = idx
- count += count_incr
- if count < 0:
- # Count is counting down and has now ventured into invalid territory.
- break
- idx = a * count + b if var else a
- if last_idx == idx:
- break
- if not matched:
- break
- return matched
- def match_empty(self, el: 'bs4.Tag') -> bool:
- """Check if element is empty (if requested)."""
- is_empty = True
- for child in self.get_children(el, tags=False):
- if self.is_tag(child):
- is_empty = False
- break
- elif self.is_content_string(child) and RE_NOT_EMPTY.search(child):
- is_empty = False
- break
- return is_empty
- def match_subselectors(self, el: 'bs4.Tag', selectors: Tuple[ct.SelectorList, ...]) -> bool:
- """Match selectors."""
- match = True
- for sel in selectors:
- if not self.match_selectors(el, sel):
- match = False
- return match
- def match_contains(self, el: 'bs4.Tag', contains: Tuple[ct.SelectorContains, ...]) -> bool:
- """Match element if it contains text."""
- match = True
- content = None # type: Optional[Union[str, Sequence[str]]]
- for contain_list in contains:
- if content is None:
- if contain_list.own:
- content = self.get_own_text(el, no_iframe=self.is_html)
- else:
- content = self.get_text(el, no_iframe=self.is_html)
- found = False
- for text in contain_list.text:
- if contain_list.own:
- for c in content:
- if text in c:
- found = True
- break
- if found:
- break
- else:
- if text in content:
- found = True
- break
- if not found:
- match = False
- return match
- def match_default(self, el: 'bs4.Tag') -> bool:
- """Match default."""
- match = False
- # Find this input's form
- form = None
- parent = self.get_parent(el, no_iframe=True)
- while parent and form is None:
- if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
- form = parent
- else:
- parent = self.get_parent(parent, no_iframe=True)
- # Look in form cache to see if we've already located its default button
- found_form = False
- for f, t in self.cached_default_forms:
- if f is form:
- found_form = True
- if t is el:
- match = True
- break
- # We didn't have the form cached, so look for its default button
- if not found_form:
- for child in self.get_descendants(form, no_iframe=True):
- name = self.get_tag(child)
- # Can't do nested forms (haven't figured out why we never hit this)
- if name == 'form': # pragma: no cover
- break
- if name in ('input', 'button'):
- v = self.get_attribute_by_name(child, 'type', '')
- if v and util.lower(v) == 'submit':
- self.cached_default_forms.append((form, child))
- if el is child:
- match = True
- break
- return match
- def match_indeterminate(self, el: 'bs4.Tag') -> bool:
- """Match default."""
- match = False
- name = cast(str, self.get_attribute_by_name(el, 'name'))
- def get_parent_form(el: 'bs4.Tag') -> Optional['bs4.Tag']:
- """Find this input's form."""
- form = None
- parent = self.get_parent(el, no_iframe=True)
- while form is None:
- if self.get_tag(parent) == 'form' and self.is_html_tag(parent):
- form = parent
- break
- last_parent = parent
- parent = self.get_parent(parent, no_iframe=True)
- if parent is None:
- form = last_parent
- break
- return form
- form = get_parent_form(el)
- # Look in form cache to see if we've already evaluated that its fellow radio buttons are indeterminate
- found_form = False
- for f, n, i in self.cached_indeterminate_forms:
- if f is form and n == name:
- found_form = True
- if i is True:
- match = True
- break
- # We didn't have the form cached, so validate that the radio button is indeterminate
- if not found_form:
- checked = False
- for child in self.get_descendants(form, no_iframe=True):
- if child is el:
- continue
- tag_name = self.get_tag(child)
- if tag_name == 'input':
- is_radio = False
- check = False
- has_name = False
- for k, v in self.iter_attributes(child):
- if util.lower(k) == 'type' and util.lower(v) == 'radio':
- is_radio = True
- elif util.lower(k) == 'name' and v == name:
- has_name = True
- elif util.lower(k) == 'checked':
- check = True
- if is_radio and check and has_name and get_parent_form(child) is form:
- checked = True
- break
- if checked:
- break
- if not checked:
- match = True
- self.cached_indeterminate_forms.append((form, name, match))
- return match
- def match_lang(self, el: 'bs4.Tag', langs: Tuple[ct.SelectorLang, ...]) -> bool:
- """Match languages."""
- match = False
- has_ns = self.supports_namespaces()
- root = self.root
- has_html_namespace = self.has_html_namespace
- # Walk parents looking for `lang` (HTML) or `xml:lang` XML property.
- parent = el
- found_lang = None
- last = None
- while not found_lang:
- has_html_ns = self.has_html_ns(parent)
- for k, v in self.iter_attributes(parent):
- attr_ns, attr = self.split_namespace(parent, k)
- if (
- ((not has_ns or has_html_ns) and (util.lower(k) if not self.is_xml else k) == 'lang') or
- (
- has_ns and not has_html_ns and attr_ns == NS_XML and
- (util.lower(attr) if not self.is_xml and attr is not None else attr) == 'lang'
- )
- ):
- found_lang = v
- break
- last = parent
- parent = self.get_parent(parent, no_iframe=self.is_html)
- if parent is None:
- root = last
- has_html_namespace = self.has_html_ns(root)
- parent = last
- break
- # Use cached meta language.
- if not found_lang and self.cached_meta_lang:
- for cache in self.cached_meta_lang:
- if root is cache[0]:
- found_lang = cache[1]
- # If we couldn't find a language, and the document is HTML, look to meta to determine language.
- if found_lang is None and (not self.is_xml or (has_html_namespace and root.name == 'html')):
- # Find head
- found = False
- for tag in ('html', 'head'):
- found = False
- for child in self.get_children(parent, no_iframe=self.is_html):
- if self.get_tag(child) == tag and self.is_html_tag(child):
- found = True
- parent = child
- break
- if not found: # pragma: no cover
- break
- # Search meta tags
- if found:
- for child in parent:
- if self.is_tag(child) and self.get_tag(child) == 'meta' and self.is_html_tag(parent):
- c_lang = False
- content = None
- for k, v in self.iter_attributes(child):
- if util.lower(k) == 'http-equiv' and util.lower(v) == 'content-language':
- c_lang = True
- if util.lower(k) == 'content':
- content = v
- if c_lang and content:
- found_lang = content
- self.cached_meta_lang.append((cast(str, root), cast(str, found_lang)))
- break
- if found_lang:
- break
- if not found_lang:
- self.cached_meta_lang.append((cast(str, root), ''))
- # If we determined a language, compare.
- if found_lang:
- for patterns in langs:
- match = False
- for pattern in patterns:
- if self.extended_language_filter(pattern, cast(str, found_lang)):
- match = True
- if not match:
- break
- return match
- def match_dir(self, el: 'bs4.Tag', directionality: int) -> bool:
- """Check directionality."""
- # If we have to match both left and right, we can't match either.
- if directionality & ct.SEL_DIR_LTR and directionality & ct.SEL_DIR_RTL:
- return False
- if el is None or not self.is_html_tag(el):
- return False
- # Element has defined direction of left to right or right to left
- direction = DIR_MAP.get(util.lower(self.get_attribute_by_name(el, 'dir', '')), None)
- if direction not in (None, 0):
- return direction == directionality
- # Element is the document element (the root) and no direction assigned, assume left to right.
- is_root = self.is_root(el)
- if is_root and direction is None:
- return ct.SEL_DIR_LTR == directionality
- # If `input[type=telephone]` and no direction is assigned, assume left to right.
- name = self.get_tag(el)
- is_input = name == 'input'
- is_textarea = name == 'textarea'
- is_bdi = name == 'bdi'
- itype = util.lower(self.get_attribute_by_name(el, 'type', '')) if is_input else ''
- if is_input and itype == 'tel' and direction is None:
- return ct.SEL_DIR_LTR == directionality
- # Auto handling for text inputs
- if ((is_input and itype in ('text', 'search', 'tel', 'url', 'email')) or is_textarea) and direction == 0:
- if is_textarea:
- temp = []
- for node in self.get_contents(el, no_iframe=True):
- if self.is_content_string(node):
- temp.append(node)
- value = ''.join(temp)
- else:
- value = cast(str, self.get_attribute_by_name(el, 'value', ''))
- if value:
- for c in value:
- bidi = unicodedata.bidirectional(c)
- if bidi in ('AL', 'R', 'L'):
- direction = ct.SEL_DIR_LTR if bidi == 'L' else ct.SEL_DIR_RTL
- return direction == directionality
- # Assume left to right
- return ct.SEL_DIR_LTR == directionality
- elif is_root:
- return ct.SEL_DIR_LTR == directionality
- return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
- # Auto handling for `bdi` and other non text inputs.
- if (is_bdi and direction is None) or direction == 0:
- direction = self.find_bidi(el)
- if direction is not None:
- return direction == directionality
- elif is_root:
- return ct.SEL_DIR_LTR == directionality
- return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
- # Match parents direction
- return self.match_dir(self.get_parent(el, no_iframe=True), directionality)
- def match_range(self, el: 'bs4.Tag', condition: int) -> bool:
- """
- Match range.
- Behavior is modeled after what we see in browsers. Browsers seem to evaluate
- if the value is out of range, and if not, it is in range. So a missing value
- will not evaluate out of range; therefore, value is in range. Personally, I
- feel like this should evaluate as neither in or out of range.
- """
- out_of_range = False
- itype = util.lower(self.get_attribute_by_name(el, 'type'))
- mn = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'min', None)))
- mx = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'max', None)))
- # There is no valid min or max, so we cannot evaluate a range
- if mn is None and mx is None:
- return False
- value = Inputs.parse_value(itype, cast(str, self.get_attribute_by_name(el, 'value', None)))
- if value is not None:
- if itype in ("date", "datetime-local", "month", "week", "number", "range"):
- if mn is not None and value < mn:
- out_of_range = True
- if not out_of_range and mx is not None and value > mx:
- out_of_range = True
- elif itype == "time":
- if mn is not None and mx is not None and mn > mx:
- # Time is periodic, so this is a reversed/discontinuous range
- if value < mn and value > mx:
- out_of_range = True
- else:
- if mn is not None and value < mn:
- out_of_range = True
- if not out_of_range and mx is not None and value > mx:
- out_of_range = True
- return not out_of_range if condition & ct.SEL_IN_RANGE else out_of_range
- def match_defined(self, el: 'bs4.Tag') -> bool:
- """
- Match defined.
- `:defined` is related to custom elements in a browser.
- - If the document is XML (not XHTML), all tags will match.
- - Tags that are not custom (don't have a hyphen) are marked defined.
- - If the tag has a prefix (without or without a namespace), it will not match.
- This is of course requires the parser to provide us with the proper prefix and namespace info,
- if it doesn't, there is nothing we can do.
- """
- name = self.get_tag(el)
- return (
- name is not None and (
- name.find('-') == -1 or
- name.find(':') != -1 or
- self.get_prefix(el) is not None
- )
- )
- def match_placeholder_shown(self, el: 'bs4.Tag') -> bool:
- """
- Match placeholder shown according to HTML spec.
- - text area should be checked if they have content. A single newline does not count as content.
- """
- match = False
- content = self.get_text(el)
- if content in ('', '\n'):
- match = True
- return match
- def match_selectors(self, el: 'bs4.Tag', selectors: ct.SelectorList) -> bool:
- """Check if element matches one of the selectors."""
- match = False
- is_not = selectors.is_not
- is_html = selectors.is_html
- # Internal selector lists that use the HTML flag, will automatically get the `html` namespace.
- if is_html:
- namespaces = self.namespaces
- iframe_restrict = self.iframe_restrict
- self.namespaces = {'html': NS_XHTML}
- self.iframe_restrict = True
- if not is_html or self.is_html:
- for selector in selectors:
- match = is_not
- # We have a un-matchable situation (like `:focus` as you can focus an element in this environment)
- if isinstance(selector, ct.SelectorNull):
- continue
- # Verify tag matches
- if not self.match_tag(el, selector.tag):
- continue
- # Verify tag is defined
- if selector.flags & ct.SEL_DEFINED and not self.match_defined(el):
- continue
- # Verify element is root
- if selector.flags & ct.SEL_ROOT and not self.match_root(el):
- continue
- # Verify element is scope
- if selector.flags & ct.SEL_SCOPE and not self.match_scope(el):
- continue
- # Verify element has placeholder shown
- if selector.flags & ct.SEL_PLACEHOLDER_SHOWN and not self.match_placeholder_shown(el):
- continue
- # Verify `nth` matches
- if not self.match_nth(el, selector.nth):
- continue
- if selector.flags & ct.SEL_EMPTY and not self.match_empty(el):
- continue
- # Verify id matches
- if selector.ids and not self.match_id(el, selector.ids):
- continue
- # Verify classes match
- if selector.classes and not self.match_classes(el, selector.classes):
- continue
- # Verify attribute(s) match
- if not self.match_attributes(el, selector.attributes):
- continue
- # Verify ranges
- if selector.flags & RANGES and not self.match_range(el, selector.flags & RANGES):
- continue
- # Verify language patterns
- if selector.lang and not self.match_lang(el, selector.lang):
- continue
- # Verify pseudo selector patterns
- if selector.selectors and not self.match_subselectors(el, selector.selectors):
- continue
- # Verify relationship selectors
- if selector.relation and not self.match_relations(el, selector.relation):
- continue
- # Validate that the current default selector match corresponds to the first submit button in the form
- if selector.flags & ct.SEL_DEFAULT and not self.match_default(el):
- continue
- # Validate that the unset radio button is among radio buttons with the same name in a form that are
- # also not set.
- if selector.flags & ct.SEL_INDETERMINATE and not self.match_indeterminate(el):
- continue
- # Validate element directionality
- if selector.flags & DIR_FLAGS and not self.match_dir(el, selector.flags & DIR_FLAGS):
- continue
- # Validate that the tag contains the specified text.
- if selector.contains and not self.match_contains(el, selector.contains):
- continue
- match = not is_not
- break
- # Restore actual namespaces being used for external selector lists
- if is_html:
- self.namespaces = namespaces
- self.iframe_restrict = iframe_restrict
- return match
- def select(self, limit: int = 0) -> Iterator['bs4.Tag']:
- """Match all tags under the targeted tag."""
- lim = None if limit < 1 else limit
- for child in self.get_descendants(self.tag):
- if self.match(child):
- yield child
- if lim is not None:
- lim -= 1
- if lim < 1:
- break
- def closest(self) -> Optional['bs4.Tag']:
- """Match closest ancestor."""
- current = self.tag
- closest = None
- while closest is None and current is not None:
- if self.match(current):
- closest = current
- else:
- current = self.get_parent(current)
- return closest
- def filter(self) -> List['bs4.Tag']: # noqa A001
- """Filter tag's children."""
- return [tag for tag in self.get_contents(self.tag) if not self.is_navigable_string(tag) and self.match(tag)]
- def match(self, el: 'bs4.Tag') -> bool:
- """Match."""
- return not self.is_doc(el) and self.is_tag(el) and self.match_selectors(el, self.selectors)
- class SoupSieve(ct.Immutable):
- """Compiled Soup Sieve selector matching object."""
- pattern: str
- selectors: ct.SelectorList
- namespaces: Optional[ct.Namespaces]
- custom: Dict[str, str]
- flags: int
- __slots__ = ("pattern", "selectors", "namespaces", "custom", "flags", "_hash")
- def __init__(
- self,
- pattern: str,
- selectors: ct.SelectorList,
- namespaces: Optional[ct.Namespaces],
- custom: Optional[ct.CustomSelectors],
- flags: int
- ):
- """Initialize."""
- super().__init__(
- pattern=pattern,
- selectors=selectors,
- namespaces=namespaces,
- custom=custom,
- flags=flags
- )
- def match(self, tag: 'bs4.Tag') -> bool:
- """Match."""
- return CSSMatch(self.selectors, tag, self.namespaces, self.flags).match(tag)
- def closest(self, tag: 'bs4.Tag') -> 'bs4.Tag':
- """Match closest ancestor."""
- return CSSMatch(self.selectors, tag, self.namespaces, self.flags).closest()
- def filter(self, iterable: Iterable['bs4.Tag']) -> List['bs4.Tag']: # noqa A001
- """
- Filter.
- `CSSMatch` can cache certain searches for tags of the same document,
- so if we are given a tag, all tags are from the same document,
- and we can take advantage of the optimization.
- Any other kind of iterable could have tags from different documents or detached tags,
- so for those, we use a new `CSSMatch` for each item in the iterable.
- """
- if CSSMatch.is_tag(iterable):
- return CSSMatch(self.selectors, iterable, self.namespaces, self.flags).filter()
- else:
- return [node for node in iterable if not CSSMatch.is_navigable_string(node) and self.match(node)]
- def select_one(self, tag: 'bs4.Tag') -> 'bs4.Tag':
- """Select a single tag."""
- tags = self.select(tag, limit=1)
- return tags[0] if tags else None
- def select(self, tag: 'bs4.Tag', limit: int = 0) -> List['bs4.Tag']:
- """Select the specified tags."""
- return list(self.iselect(tag, limit))
- def iselect(self, tag: 'bs4.Tag', limit: int = 0) -> Iterator['bs4.Tag']:
- """Iterate the specified tags."""
- for el in CSSMatch(self.selectors, tag, self.namespaces, self.flags).select(limit):
- yield el
- def __repr__(self) -> str: # pragma: no cover
- """Representation."""
- return "SoupSieve(pattern={!r}, namespaces={!r}, custom={!r}, flags={!r})".format(
- self.pattern,
- self.namespaces,
- self.custom,
- self.flags
- )
- __str__ = __repr__
- ct.pickle_register(SoupSieve)
|