12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095 |
- # -*- coding: utf-8 -*-
- """Beautiful Soup bonus library: Unicode, Dammit
- This library converts a bytestream to Unicode through any means
- necessary. It is heavily based on code from Mark Pilgrim's Universal
- Feed Parser. It works best on XML and HTML, but it does not rewrite the
- XML or HTML to reflect a new encoding; that's the tree builder's job.
- """
- # Use of this source code is governed by the MIT license.
- __license__ = "MIT"
- from html.entities import codepoint2name
- from collections import defaultdict
- import codecs
- import re
- import logging
- import string
- # Import a library to autodetect character encodings. We'll support
- # any of a number of libraries that all support the same API:
- #
- # * cchardet
- # * chardet
- # * charset-normalizer
# Pick the first available character-set detection library. All three
# expose the same ``detect(bytes) -> {'encoding': ...}`` API, so whichever
# one imports first is bound to ``chardet_module``.
chardet_module = None
try:
    # PyPI package: cchardet
    import cchardet as chardet_module
except ImportError:
    pass

if chardet_module is None:
    try:
        # Debian package: python-chardet
        # PyPI package: chardet
        import chardet as chardet_module
    except ImportError:
        pass

if chardet_module is None:
    try:
        # PyPI package: charset-normalizer
        import charset_normalizer as chardet_module
    except ImportError:
        # No chardet available.
        chardet_module = None

if chardet_module:
    def chardet_dammit(s):
        """Guess the encoding of a bytestring using the detection library.

        :param s: The data to inspect.
        :return: The detected encoding name, or None if `s` is already a
            Unicode string (there is nothing to detect).
        """
        if isinstance(s, str):
            return None
        return chardet_module.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Fallback used when no detection library is installed.

        :param s: Ignored.
        :return: Always None.
        """
        return None
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# ``xml_encoding`` captures the value of ``encoding=`` in an XML
# declaration at the start of the document; ``html_meta`` captures the
# value of ``charset=`` inside a <meta> tag.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# Keyed first by the type of the markup being searched (bytes or str),
# then by the kind of document ('html' or 'xml').
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
- from html.entities import html5
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Initialize variables used by this class to manage the plethora of
        HTML5 named entities.

        This function returns a 3-tuple containing two dictionaries
        and a regular expression:

        unicode_to_name - A mapping of Unicode strings like "⦨" to
        entity names like "angmsdaa". When a single Unicode string has
        multiple entity names, we try to choose the most commonly-used
        name.

        name_to_unicode: A mapping of entity names like "angmsdaa" to
        Unicode strings like "⦨".

        named_entity_re: A regular expression matching (almost) any
        Unicode string that corresponds to an HTML5 named entity.
        """
        unicode_to_name = {}
        name_to_unicode = {}

        short_entities = set()
        long_entities_by_first_character = defaultdict(set)

        for name_with_semicolon, character in sorted(html5.items()):
            # "It is intentional, for legacy compatibility, that many
            # code points have multiple character reference names. For
            # example, some appear both with and without the trailing
            # semicolon, or with different capitalizations."
            # - https://html.spec.whatwg.org/multipage/named-characters.html#named-character-references
            #
            # The parsers are in charge of handling (or not) character
            # references with no trailing semicolon, so we remove the
            # semicolon whenever it appears.
            if name_with_semicolon.endswith(';'):
                name = name_with_semicolon[:-1]
            else:
                name = name_with_semicolon

            # When parsing HTML, we want to recognize any known named
            # entity and convert it to a sequence of Unicode
            # characters.
            if name not in name_to_unicode:
                name_to_unicode[name] = character

            # When _generating_ HTML, we want to recognize special
            # character sequences that _could_ be converted to named
            # entities.
            unicode_to_name[character] = name

            # We also need to build a regular expression that lets us
            # _find_ those characters in output strings so we can
            # replace them.
            #
            # This is tricky, for two reasons.

            if (len(character) == 1 and ord(character) < 128
                    and character not in '<>&'):
                # First, it would be annoying to turn single ASCII
                # characters like | into named entities like
                # &verbar;. The exceptions are <>&, which we _must_
                # turn into named entities to produce valid HTML.
                continue

            if len(character) > 1 and all(ord(x) < 128 for x in character):
                # We also do not want to turn _combinations_ of ASCII
                # characters like 'fj' into named entities like '&fjlig;',
                # though that's more debatable.
                continue

            # Second, some named entities have a Unicode value that's
            # a subset of the Unicode value for some _other_ named
            # entity. As an example, '\u2267' is &GreaterFullEqual;,
            # but '\u2267\u0338' is &NotGreaterFullEqual;. Our regular
            # expression needs to match the first two characters of
            # "\u2267\u0338foo", but only the first character of
            # "\u2267foo".
            #
            # In this step, we build two sets of characters that
            # _eventually_ need to go into the regular expression. But
            # we won't know exactly what the regular expression needs
            # to look like until we've gone through the entire list of
            # named entities.
            if len(character) == 1:
                short_entities.add(character)
            else:
                long_entities_by_first_character[character[0]].add(character)

        # Now that we've been through the entire list of entities, we
        # can create a regular expression that matches any of them.
        particles = set()
        for short in short_entities:
            long_versions = long_entities_by_first_character[short]
            if not long_versions:
                particles.add(short)
            else:
                ignore = "".join([x[1] for x in long_versions])
                # This finds, e.g. \u2267 but only if it is _not_
                # followed by \u0338.
                particles.add("%s(?![%s])" % (short, ignore))

        for long_entities in long_entities_by_first_character.values():
            for long_entity in long_entities:
                particles.add(long_entity)

        re_definition = "(%s)" % "|".join(particles)

        # If an entity shows up in both html5 and codepoint2name, it's
        # likely that HTML5 gives it several different names, such as
        # 'rsquo' and 'rsquor'. When converting Unicode characters to
        # named entities, the codepoint2name name should take
        # precedence where possible, since that's the more easily
        # recognizable one.
        for codepoint, name in codepoint2name.items():
            character = chr(codepoint)
            unicode_to_name[character] = name

        return unicode_to_name, name_to_unicode, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches angle brackets, and ampersands that do not begin something
    # that looks like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character string."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character string."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quote characters.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with named entities substituted in.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the known_definite_encodings argument to the constructor).

    2. An encoding determined by sniffing the document's byte-order mark.

    3. Encodings you specifically tell EncodingDetector to try if
    byte-order mark sniffing fails (the user_encodings argument to the
    constructor).

    4. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    5. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    6. UTF-8.

    7. Windows-1252.
    """

    def __init__(self, markup, known_definite_encodings=None,
                 is_html=False, exclude_encodings=None,
                 user_encodings=None, override_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.

        :param known_definite_encodings: When determining the encoding
            of `markup`, these encodings will be tried first, in
            order. In HTML terms, this corresponds to the "known
            definite encoding" step defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

        :param user_encodings: These encodings will be tried after the
            `known_definite_encodings` have been tried and failed, and
            after an attempt to sniff the encoding by looking at a
            byte order mark has failed. In HTML terms, this
            corresponds to the step "user has explicitly instructed
            the user agent to override the document's character
            encoding", defined here:
            https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

        :param override_encodings: A deprecated alias for
            known_definite_encodings. Any encodings here will be tried
            immediately after the encodings in
            known_definite_encodings.

        :param is_html: If True, this markup is considered to be
            HTML. Otherwise it's assumed to be XML.

        :param exclude_encodings: These encodings will not be tried,
            even if they otherwise would be.
        """
        # Copy the caller's list so appending override_encodings below
        # never mutates the argument.
        self.known_definite_encodings = list(known_definite_encodings or [])
        if override_encodings:
            self.known_definite_encodings += override_encodings
        self.user_encodings = user_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be
            modified as a side effect.
        :return: True if the encoding is not None, not excluded and has
            not been tried before.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()

        # First, try the known definite encodings
        for e in self.known_definite_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Sniffing the byte-order mark did nothing; try the user
        # encodings.
        for e in self.user_encodings:
            if self._usable(e, tried):
                yield e

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding

        # The UTF-16 checks must reject data whose next two bytes are
        # NUL: FF FE 00 00 is the UTF-32LE byte-order mark, which shares
        # its first two bytes with the UTF-16LE one. The comparison has
        # to be against a bytes literal; bytes never compare equal to str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be
            declared near the beginning of the document, most of the time it's
            only necessary to search a few kilobytes of data.  Set this to True
            to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the pattern set matching the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
- class UnicodeDammit:
- """A class for detecting the encoding of a *ML document and
- converting it to a Unicode string. If the source encoding is
- windows-1252, can replace MS smart quotes with their HTML or XML
- equivalents."""
# This dictionary maps commonly seen values for "charset" in HTML
# meta tags to the corresponding Python codec names. It only covers
# values that aren't in Python's aliases and can't be determined
# by the heuristics in find_codec.
CHARSET_ALIASES = {
    "macintosh": "mac-roman",
    "x-sjis": "shift-jis",
}

# Encodings in which bytes 0x80-0x9f are treated as possible Microsoft
# "smart quote" characters when smart-quote conversion is requested.
ENCODINGS_WITH_SMART_QUOTES = [
    "windows-1252",
    "iso-8859-1",
    "iso-8859-2",
]
def __init__(self, markup, known_definite_encodings=None,
             smart_quotes_to=None, is_html=False, exclude_encodings=None,
             user_encodings=None, override_encodings=None
             ):
    """Constructor.

    :param markup: A bytestring representing markup in an unknown encoding.

    :param known_definite_encodings: When determining the encoding
        of `markup`, these encodings will be tried first, in
        order. In HTML terms, this corresponds to the "known
        definite encoding" step defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#parsing-with-a-known-character-encoding

    :param user_encodings: These encodings will be tried after the
        `known_definite_encodings` have been tried and failed, and
        after an attempt to sniff the encoding by looking at a
        byte order mark has failed. In HTML terms, this
        corresponds to the step "user has explicitly instructed
        the user agent to override the document's character
        encoding", defined here:
        https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding

    :param override_encodings: A deprecated alias for
        known_definite_encodings. Any encodings here will be tried
        immediately after the encodings in
        known_definite_encodings.

    :param smart_quotes_to: By default, Microsoft smart quotes will,
        like all other characters, be converted to Unicode
        characters. Setting this to 'ascii' will convert them to ASCII
        quotes instead.  Setting it to 'xml' will convert them to XML
        entity references, and setting it to 'html' will convert them
        to HTML entity references.

    :param is_html: If True, this markup is considered to be HTML. Otherwise
        it's assumed to be XML.

    :param exclude_encodings: These encodings will not be considered, even
        if the sniffing code thinks they might make sense.
    """
    self.smart_quotes_to = smart_quotes_to
    self.tried_encodings = []
    self.contains_replacement_characters = False
    self.is_html = is_html
    self.log = logging.getLogger(__name__)
    # Stays None unless a conversion attempt succeeds.
    self.original_encoding = None
    self.detector = EncodingDetector(
        markup, known_definite_encodings, is_html, exclude_encodings,
        user_encodings, override_encodings
    )

    # Short-circuit if the data is in Unicode to begin with.
    if isinstance(markup, str):
        self.markup = markup
        self.unicode_markup = str(markup)
        return

    # The encoding detector may have stripped a byte-order mark.
    # Use the stripped markup from this point on.
    self.markup = self.detector.markup

    u = None
    for encoding in self.detector.encodings:
        u = self._convert_from(encoding)
        if u is not None:
            break

    # Test against None rather than truthiness: an empty bytestring
    # decodes successfully to "", which must not count as a failure.
    if u is None:
        # None of the encodings worked. As an absolute last resort,
        # try them again with character replacement.
        for encoding in self.detector.encodings:
            if encoding != "ascii":
                u = self._convert_from(encoding, "replace")
            if u is not None:
                self.log.warning(
                    "Some characters could not be decoded, and were "
                    "replaced with REPLACEMENT CHARACTER."
                )
                self.contains_replacement_characters = True
                break

    # If none of that worked, we could at this point force it to
    # ASCII, but that would destroy so much data that I think
    # giving up is better.
    self.unicode_markup = u
def _sub_ms_char(self, match):
    """Changes a MS smart quote character to an XML or HTML
    entity, or an ASCII character.

    :param match: A regular expression match whose first group is the
        single byte to replace.
    :return: The replacement, as a bytestring.
    """
    orig = match.group(1)
    if self.smart_quotes_to == 'ascii':
        return self.MS_CHARS_TO_ASCII.get(orig).encode()

    sub = self.MS_CHARS.get(orig)
    if isinstance(sub, tuple):
        # Tuples are (HTML entity name, hexadecimal code point).
        if self.smart_quotes_to == 'xml':
            return '&#x'.encode() + sub[1].encode() + ';'.encode()
        return '&'.encode() + sub[0].encode() + ';'.encode()
    # Anything else is a literal replacement string.
    return sub.encode()
def _convert_from(self, proposed, errors="strict"):
    """Attempt to convert the markup to the proposed encoding.

    On success, `self.markup` is replaced with the decoded string and
    `self.original_encoding` records the codec that worked.

    :param proposed: The name of a character encoding.
    :param errors: The error-handling scheme passed to the decoder.
    :return: The decoded markup, or None if the encoding is unknown,
        was already tried with these `errors`, or decoding failed.
    """
    proposed = self.find_codec(proposed)
    attempt = (proposed, errors)
    if not proposed or attempt in self.tried_encodings:
        return None
    self.tried_encodings.append(attempt)
    markup = self.markup

    # Convert smart quotes to HTML if coming from an encoding
    # that might have them.
    if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
        smart_quotes_re = b"([\x80-\x9f])"
        smart_quotes_compiled = re.compile(smart_quotes_re)
        markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

    try:
        u = self._to_unicode(markup, proposed, errors)
    except Exception:
        # Deliberately broad: any failure just means this encoding is
        # not the right one, and the caller moves on to the next.
        return None

    self.markup = u
    self.original_encoding = proposed
    return self.markup
- def _to_unicode(self, data, encoding, errors="strict"):
- """Given a string and its encoding, decodes the string into Unicode.
- :param encoding: The name of an encoding.
- """
- return str(data, encoding, errors)
@property
def declared_html_encoding(self):
    """If the markup is an HTML document, returns the encoding declared _within_
    the document.

    :return: The detector's declared encoding, or None when the markup
        is not being treated as HTML.
    """
    if not self.is_html:
        return None
    return self.detector.declared_encoding
def find_codec(self, charset):
    """Convert the name of a character set to a codec name.

    Candidates are tried in order: the alias from CHARSET_ALIASES (or
    the name itself), the name with hyphens removed, the name with
    hyphens turned into underscores, and finally the name unchanged.

    :param charset: The name of a character set.
    :return: The name of a codec, lowercased, or None if `charset`
        is empty or None.
    """
    value = self._codec(self.CHARSET_ALIASES.get(charset, charset))
    if not value and charset:
        value = (
            self._codec(charset.replace("-", ""))
            or self._codec(charset.replace("-", "_"))
            or charset.lower()
        )
    if not value:
        value = charset
    if value:
        return value.lower()
    return None
- def _codec(self, charset):
- if not charset:
- return charset
- codec = None
- try:
- codecs.lookup(charset)
- codec = charset
- except (LookupError, ValueError):
- pass
- return codec
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
#
# Tuple values are (HTML entity name, hexadecimal Unicode code point);
# plain string values are used as literal replacements for bytes that
# have no character assigned.
MS_CHARS = {
    b'\x80': ('euro', '20AC'),
    b'\x81': ' ',
    b'\x82': ('sbquo', '201A'),
    b'\x83': ('fnof', '192'),
    b'\x84': ('bdquo', '201E'),
    b'\x85': ('hellip', '2026'),
    b'\x86': ('dagger', '2020'),
    b'\x87': ('Dagger', '2021'),
    b'\x88': ('circ', '2C6'),
    b'\x89': ('permil', '2030'),
    b'\x8A': ('Scaron', '160'),
    b'\x8B': ('lsaquo', '2039'),
    b'\x8C': ('OElig', '152'),
    b'\x8D': '?',
    b'\x8E': ('#x17D', '17D'),
    b'\x8F': '?',
    b'\x90': '?',
    b'\x91': ('lsquo', '2018'),
    b'\x92': ('rsquo', '2019'),
    b'\x93': ('ldquo', '201C'),
    b'\x94': ('rdquo', '201D'),
    b'\x95': ('bull', '2022'),
    b'\x96': ('ndash', '2013'),
    b'\x97': ('mdash', '2014'),
    b'\x98': ('tilde', '2DC'),
    b'\x99': ('trade', '2122'),
    b'\x9a': ('scaron', '161'),
    b'\x9b': ('rsaquo', '203A'),
    b'\x9c': ('oelig', '153'),
    b'\x9d': '?',
    b'\x9e': ('#x17E', '17E'),
    # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty code point
    # here would produce the invalid reference "&#x;".
    b'\x9f': ('Yuml', '178'),
}
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
# horrors like stripping diacritical marks to turn á into a, but also
# contains non-horrors like turning “ into ".
#
# Every value is a plain string.
MS_CHARS_TO_ASCII = {
    b'\x80' : 'EUR',
    b'\x81' : ' ',
    b'\x82' : ',',
    b'\x83' : 'f',
    b'\x84' : ',,',
    b'\x85' : '...',
    b'\x86' : '+',
    b'\x87' : '++',
    b'\x88' : '^',
    b'\x89' : '%',
    b'\x8a' : 'S',
    b'\x8b' : '<',
    b'\x8c' : 'OE',
    b'\x8d' : '?',
    b'\x8e' : 'Z',
    b'\x8f' : '?',
    b'\x90' : '?',
    b'\x91' : "'",
    b'\x92' : "'",
    b'\x93' : '"',
    b'\x94' : '"',
    b'\x95' : '*',
    b'\x96' : '-',
    b'\x97' : '--',
    b'\x98' : '~',
    b'\x99' : '(TM)',
    b'\x9a' : 's',
    b'\x9b' : '>',
    b'\x9c' : 'oe',
    b'\x9d' : '?',
    b'\x9e' : 'z',
    b'\x9f' : 'Y',
    b'\xa0' : ' ',
    b'\xa1' : '!',
    b'\xa2' : 'c',
    b'\xa3' : 'GBP',
    b'\xa4' : '$', # This approximation is especially parochial--this is
                   # the generic currency symbol.
    b'\xa5' : 'YEN',
    b'\xa6' : '|',
    b'\xa7' : 'S',
    b'\xa8' : '..',
    b'\xa9' : '',
    b'\xaa' : '(th)',
    b'\xab' : '<<',
    b'\xac' : '!',
    b'\xad' : ' ',
    b'\xae' : '(R)',
    b'\xaf' : '-',
    b'\xb0' : 'o',
    b'\xb1' : '+-',
    b'\xb2' : '2',
    b'\xb3' : '3',
    b'\xb4' : "'",
    b'\xb5' : 'u',
    b'\xb6' : 'P',
    b'\xb7' : '*',
    b'\xb8' : ',',
    b'\xb9' : '1',
    b'\xba' : '(th)',
    b'\xbb' : '>>',
    b'\xbc' : '1/4',
    b'\xbd' : '1/2',
    b'\xbe' : '3/4',
    b'\xbf' : '?',
    b'\xc0' : 'A',
    b'\xc1' : 'A',
    b'\xc2' : 'A',
    b'\xc3' : 'A',
    b'\xc4' : 'A',
    b'\xc5' : 'A',
    b'\xc6' : 'AE',
    b'\xc7' : 'C',
    b'\xc8' : 'E',
    b'\xc9' : 'E',
    b'\xca' : 'E',
    b'\xcb' : 'E',
    b'\xcc' : 'I',
    b'\xcd' : 'I',
    b'\xce' : 'I',
    b'\xcf' : 'I',
    b'\xd0' : 'D',
    b'\xd1' : 'N',
    b'\xd2' : 'O',
    b'\xd3' : 'O',
    b'\xd4' : 'O',
    b'\xd5' : 'O',
    b'\xd6' : 'O',
    b'\xd7' : '*',
    b'\xd8' : 'O',
    b'\xd9' : 'U',
    b'\xda' : 'U',
    b'\xdb' : 'U',
    b'\xdc' : 'U',
    b'\xdd' : 'Y',
    b'\xde' : 'b',
    b'\xdf' : 'B',
    b'\xe0' : 'a',
    b'\xe1' : 'a',
    b'\xe2' : 'a',
    b'\xe3' : 'a',
    b'\xe4' : 'a',
    b'\xe5' : 'a',
    b'\xe6' : 'ae',
    b'\xe7' : 'c',
    b'\xe8' : 'e',
    b'\xe9' : 'e',
    b'\xea' : 'e',
    b'\xeb' : 'e',
    b'\xec' : 'i',
    b'\xed' : 'i',
    b'\xee' : 'i',
    b'\xef' : 'i',
    b'\xf0' : 'o',
    b'\xf1' : 'n',
    b'\xf2' : 'o',
    b'\xf3' : 'o',
    b'\xf4' : 'o',
    b'\xf5' : 'o',
    b'\xf6' : 'o',
    b'\xf7' : '/',
    b'\xf8' : 'o',
    b'\xf9' : 'u',
    b'\xfa' : 'u',
    b'\xfb' : 'u',
    b'\xfc' : 'u',
    b'\xfd' : 'y',
    b'\xfe' : 'b',
    b'\xff' : 'y',
}
# A map used when removing rogue Windows-1252/ISO-8859-1
# characters in otherwise UTF-8 documents.
#
# Keys are single Windows-1252 byte values (as integers); values are the
# UTF-8 encoding of the character that byte represents.
#
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
# Windows-1252, so they are deliberately absent.
WINDOWS_1252_TO_UTF8 = {
    0x80: b'\xe2\x82\xac',  # €
    0x82: b'\xe2\x80\x9a',  # ‚
    0x83: b'\xc6\x92',      # ƒ
    0x84: b'\xe2\x80\x9e',  # „
    0x85: b'\xe2\x80\xa6',  # …
    0x86: b'\xe2\x80\xa0',  # †
    0x87: b'\xe2\x80\xa1',  # ‡
    0x88: b'\xcb\x86',      # ˆ
    0x89: b'\xe2\x80\xb0',  # ‰
    0x8a: b'\xc5\xa0',      # Š
    0x8b: b'\xe2\x80\xb9',  # ‹
    0x8c: b'\xc5\x92',      # Œ
    0x8e: b'\xc5\xbd',      # Ž
    0x91: b'\xe2\x80\x98',  # ‘
    0x92: b'\xe2\x80\x99',  # ’
    0x93: b'\xe2\x80\x9c',  # “
    0x94: b'\xe2\x80\x9d',  # ”
    0x95: b'\xe2\x80\xa2',  # •
    0x96: b'\xe2\x80\x93',  # –
    0x97: b'\xe2\x80\x94',  # —
    0x98: b'\xcb\x9c',      # ˜
    0x99: b'\xe2\x84\xa2',  # ™
    0x9a: b'\xc5\xa1',      # š
    0x9b: b'\xe2\x80\xba',  # ›
    0x9c: b'\xc5\x93',      # œ
    0x9e: b'\xc5\xbe',      # ž
    0x9f: b'\xc5\xb8',      # Ÿ
    0xa0: b'\xc2\xa0',      # (non-breaking space)
    0xa1: b'\xc2\xa1',      # ¡
    0xa2: b'\xc2\xa2',      # ¢
    0xa3: b'\xc2\xa3',      # £
    0xa4: b'\xc2\xa4',      # ¤
    0xa5: b'\xc2\xa5',      # ¥
    0xa6: b'\xc2\xa6',      # ¦
    0xa7: b'\xc2\xa7',      # §
    0xa8: b'\xc2\xa8',      # ¨
    0xa9: b'\xc2\xa9',      # ©
    0xaa: b'\xc2\xaa',      # ª
    0xab: b'\xc2\xab',      # «
    0xac: b'\xc2\xac',      # ¬
    0xad: b'\xc2\xad',      # (soft hyphen)
    0xae: b'\xc2\xae',      # ®
    0xaf: b'\xc2\xaf',      # ¯
    0xb0: b'\xc2\xb0',      # °
    0xb1: b'\xc2\xb1',      # ±
    0xb2: b'\xc2\xb2',      # ²
    0xb3: b'\xc2\xb3',      # ³
    0xb4: b'\xc2\xb4',      # ´
    0xb5: b'\xc2\xb5',      # µ
    0xb6: b'\xc2\xb6',      # ¶
    0xb7: b'\xc2\xb7',      # ·
    0xb8: b'\xc2\xb8',      # ¸
    0xb9: b'\xc2\xb9',      # ¹
    0xba: b'\xc2\xba',      # º
    0xbb: b'\xc2\xbb',      # »
    0xbc: b'\xc2\xbc',      # ¼
    0xbd: b'\xc2\xbd',      # ½
    0xbe: b'\xc2\xbe',      # ¾
    0xbf: b'\xc2\xbf',      # ¿
    0xc0: b'\xc3\x80',      # À
    0xc1: b'\xc3\x81',      # Á
    0xc2: b'\xc3\x82',      # Â
    0xc3: b'\xc3\x83',      # Ã
    0xc4: b'\xc3\x84',      # Ä
    0xc5: b'\xc3\x85',      # Å
    0xc6: b'\xc3\x86',      # Æ
    0xc7: b'\xc3\x87',      # Ç
    0xc8: b'\xc3\x88',      # È
    0xc9: b'\xc3\x89',      # É
    0xca: b'\xc3\x8a',      # Ê
    0xcb: b'\xc3\x8b',      # Ë
    0xcc: b'\xc3\x8c',      # Ì
    0xcd: b'\xc3\x8d',      # Í
    0xce: b'\xc3\x8e',      # Î
    0xcf: b'\xc3\x8f',      # Ï
    0xd0: b'\xc3\x90',      # Ð
    0xd1: b'\xc3\x91',      # Ñ
    0xd2: b'\xc3\x92',      # Ò
    0xd3: b'\xc3\x93',      # Ó
    0xd4: b'\xc3\x94',      # Ô
    0xd5: b'\xc3\x95',      # Õ
    0xd6: b'\xc3\x96',      # Ö
    0xd7: b'\xc3\x97',      # ×
    0xd8: b'\xc3\x98',      # Ø
    0xd9: b'\xc3\x99',      # Ù
    0xda: b'\xc3\x9a',      # Ú
    0xdb: b'\xc3\x9b',      # Û
    0xdc: b'\xc3\x9c',      # Ü
    0xdd: b'\xc3\x9d',      # Ý
    0xde: b'\xc3\x9e',      # Þ
    0xdf: b'\xc3\x9f',      # ß
    0xe0: b'\xc3\xa0',      # à
    0xe1: b'\xc3\xa1',      # á
    0xe2: b'\xc3\xa2',      # â
    0xe3: b'\xc3\xa3',      # ã
    0xe4: b'\xc3\xa4',      # ä
    0xe5: b'\xc3\xa5',      # å
    0xe6: b'\xc3\xa6',      # æ
    0xe7: b'\xc3\xa7',      # ç
    0xe8: b'\xc3\xa8',      # è
    0xe9: b'\xc3\xa9',      # é
    0xea: b'\xc3\xaa',      # ê
    0xeb: b'\xc3\xab',      # ë
    0xec: b'\xc3\xac',      # ì
    0xed: b'\xc3\xad',      # í
    0xee: b'\xc3\xae',      # î
    0xef: b'\xc3\xaf',      # ï
    0xf0: b'\xc3\xb0',      # ð
    0xf1: b'\xc3\xb1',      # ñ
    0xf2: b'\xc3\xb2',      # ò
    0xf3: b'\xc3\xb3',      # ó
    0xf4: b'\xc3\xb4',      # ô
    0xf5: b'\xc3\xb5',      # õ
    0xf6: b'\xc3\xb6',      # ö
    0xf7: b'\xc3\xb7',      # ÷
    0xf8: b'\xc3\xb8',      # ø
    0xf9: b'\xc3\xb9',      # ù
    0xfa: b'\xc3\xba',      # ú
    0xfb: b'\xc3\xbb',      # û
    0xfc: b'\xc3\xbc',      # ü
    0xfd: b'\xc3\xbd',      # ý
    0xfe: b'\xc3\xbe',      # þ
    0xff: b'\xc3\xbf',      # ÿ
}
# Each entry is (lowest lead byte, highest lead byte, sequence length):
# a byte within the inclusive range announces a UTF-8 character that
# occupies that many bytes in total.
MULTIBYTE_MARKERS_AND_SIZES = [
    (0xc2, 0xdf, 2),  # C2-DF open a 2-byte character
    (0xe0, 0xef, 3),  # E0-EF open a 3-byte character
    (0xf0, 0xf4, 4),  # F0-F4 open a 4-byte character
]

# The overall span of lead bytes: the low end of the first range and the
# high end of the last one.
FIRST_MULTIBYTE_MARKER, LAST_MULTIBYTE_MARKER = (
    MULTIBYTE_MARKERS_AND_SIZES[0][0],
    MULTIBYTE_MARKERS_AND_SIZES[-1][1],
)
@classmethod
def detwingle(cls, in_bytes, main_encoding="utf8",
              embedded_encoding="windows-1252"):
    """Fix characters from one encoding embedded in some other encoding.

    Currently the only situation supported is Windows-1252 (or its
    subset ISO-8859-1), embedded in UTF-8.

    A byte in the UTF-8 lead-byte range is treated as the start of a
    multibyte character only when it is followed by the required number
    of continuation bytes (0x80-0xBF). Otherwise it cannot be UTF-8, so
    it is handled as a stray Windows-1252 byte instead of silently
    swallowing the bytes that follow it.

    :param in_bytes: A bytestring that you suspect contains
        characters from multiple encodings. Note that this _must_
        be a bytestring. If you've already converted the document
        to Unicode, you're too late.
    :param main_encoding: The primary encoding of `in_bytes`.
    :param embedded_encoding: The encoding that was used to embed characters
        in the main document.
    :return: A bytestring in which `embedded_encoding`
        characters have been converted to their `main_encoding`
        equivalents. If nothing needed converting, `in_bytes` itself
        is returned.
    :raise NotImplementedError: If either encoding is not one of the
        supported ones.
    """
    # Normalise underscores so that e.g. "windows_1252" and "ISO_8859_1"
    # are recognised too.
    if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'iso-8859-1'):
        raise NotImplementedError(
            "Windows-1252 and ISO-8859-1 are the only currently supported "
            "embedded encodings.")

    if main_encoding.replace('_', '-').lower() not in ('utf8', 'utf-8'):
        raise NotImplementedError(
            "UTF-8 is the only currently supported main encoding.")

    translation = cls.WINDOWS_1252_TO_UTF8
    marker_ranges = cls.MULTIBYTE_MARKERS_AND_SIZES
    first_marker = cls.FIRST_MULTIBYTE_MARKER
    last_marker = cls.LAST_MULTIBYTE_MARKER

    def byte_at(index):
        """Return the integer value of the byte at `index`."""
        value = in_bytes[index]
        if not isinstance(value, int):
            # Python 2.x
            value = ord(value)
        return value

    total = len(in_bytes)
    byte_chunks = []
    chunk_start = 0
    pos = 0
    while pos < total:
        byte = byte_at(pos)

        # Work out how long a UTF-8 character starting with this byte
        # would be (0 means "not a lead byte").
        sequence_length = 0
        if first_marker <= byte <= last_marker:
            for start, end, size in marker_ranges:
                if start <= byte <= end:
                    sequence_length = size
                    break

        sequence_end = pos + sequence_length
        if (sequence_length
                and sequence_end <= total
                and all(0x80 <= byte_at(i) <= 0xbf
                        for i in range(pos + 1, sequence_end))):
            # A complete UTF-8 multibyte character. Skip to the end.
            pos = sequence_end
        elif byte >= 0x80 and byte in translation:
            # We found a Windows-1252 character!
            # Save the string up to this point as a chunk.
            byte_chunks.append(in_bytes[chunk_start:pos])

            # Now translate the Windows-1252 character into UTF-8
            # and add it as another chunk.
            byte_chunks.append(translation[byte])
            pos += 1
            chunk_start = pos
        else:
            # Go on to the next character.
            pos += 1

    if not byte_chunks:
        # The string is unchanged.
        return in_bytes

    # Store the final chunk.
    byte_chunks.append(in_bytes[chunk_start:])
    return b''.join(byte_chunks)
|