| # coding: utf-8 |
| """ |
| |
| webencodings |
| ~~~~~~~~~~~~ |
| |
| This is a Python implementation of the `WHATWG Encoding standard |
| <http://encoding.spec.whatwg.org/>`_. See README for details. |
| |
| :copyright: Copyright 2012 by Simon Sapin |
| :license: BSD, see LICENSE for details. |
| |
| """ |
| |
| from __future__ import unicode_literals |
| |
| import codecs |
| |
| from .labels import LABELS |
| |
| |
# Version string of this package.
VERSION = '0.5.1'


# Canonical names from the Encoding standard that are not valid Python codec
# aliases, mapped to the name that is handed to codecs.lookup() instead.
PYTHON_NAMES = {
    'iso-8859-8-i': 'iso-8859-8',
    'x-mac-cyrillic': 'mac-cyrillic',
    'macintosh': 'mac-roman',
    'windows-874': 'cp874',
}

# Encoding objects that lookup() has already built, keyed by canonical name.
CACHE = {}
| |
| |
def ascii_lower(string):
    r"""Lower-case the ASCII letters of a string: A-Z is mapped to a-z.

    Every other character, including non-ASCII letters, is left untouched.

    :param string: An Unicode string.
    :returns: A new Unicode string.

    Encoding labels are matched `ASCII case-insensitively
    <http://encoding.spec.whatwg.org/#ascii-case-insensitive>`_,
    and this function implements the folding step of that matching.
    The same matching is also used, among other things,
    for `CSS keywords <http://dev.w3.org/csswg/css-values/#keywords>`_.

    The :meth:`~py:str.lower` method of Unicode strings is not suitable
    because it also affects non-ASCII characters,
    sometimes mapping them into the ASCII range:

        >>> keyword = u'Bac\N{KELVIN SIGN}ground'
        >>> assert keyword.lower() == u'background'
        >>> assert ascii_lower(keyword) != keyword.lower()
        >>> assert ascii_lower(keyword) == u'bac\N{KELVIN SIGN}ground'

    """
    # Round-trip through UTF-8 bytes and lower-case there.
    # This turns out to be faster than unicode.translate()
    utf8_bytes = string.encode('utf8')
    return utf8_bytes.lower().decode('utf8')
| |
| |
def lookup(label):
    """
    Find the encoding that a label refers to.

    This is the spec's `get an encoding
    <http://encoding.spec.whatwg.org/#concept-encoding-get>`_ algorithm.
    Supported labels are listed there.

    :param label: A string.
    :returns:
        An :class:`Encoding` object, or :obj:`None` for an unknown label.

    """
    # Only strip ASCII whitespace: U+0009, U+000A, U+000C, U+000D, and U+0020.
    stripped = label.strip('\t\n\f\r ')
    canonical = LABELS.get(ascii_lower(stripped))
    if canonical is None:
        return None

    cached = CACHE.get(canonical)
    if cached is not None:
        return cached

    # First request for this encoding: resolve its codec and remember it.
    if canonical == 'x-user-defined':
        # Not a stdlib codec; the implementation ships with this package.
        from .x_user_defined import codec_info
    else:
        # Any name that gets to here should be valid for codecs.lookup()
        # once the few non-Python aliases have been remapped.
        codec_info = codecs.lookup(PYTHON_NAMES.get(canonical, canonical))
    created = Encoding(canonical, codec_info)
    CACHE[canonical] = created
    return created
| |
| |
def _get_encoding(encoding_or_label):
    """
    Normalize an argument that may be an encoding object or a label.

    :param encoding_or_label:
        An :class:`Encoding` object or a label string.
        Anything that has a ``codec_info`` attribute is treated as
        an encoding object and returned unchanged.
    :returns: An :class:`Encoding` object.
    :raises: :exc:`~exceptions.LookupError` for an unknown label.

    """
    if not hasattr(encoding_or_label, 'codec_info'):
        # Not an encoding object: treat the argument as a label.
        found = lookup(encoding_or_label)
        if found is None:
            raise LookupError(
                'Unknown encoding label: %r' % encoding_or_label)
        return found
    return encoding_or_label
| |
| |
class Encoding(object):
    """Represents a character encoding such as UTF-8,
    that can be used for decoding or encoding.

    Instances simply pair a name with a codec implementation;
    both constructor arguments are stored as given.

    .. attribute:: name

        Canonical name of the encoding

    .. attribute:: codec_info

        The actual implementation of the encoding,
        a stdlib :class:`~codecs.CodecInfo` object.
        See :func:`codecs.register`.

    """
    def __init__(self, name, codec_info):
        self.codec_info = codec_info
        self.name = name

    def __repr__(self):
        canonical_name = self.name
        return '<Encoding %s>' % canonical_name
| |
| |
#: The UTF-8 encoding. Should be used for new content and formats.
UTF8 = lookup('utf-8')

# Private handles on the two UTF-16 byte orders; together with UTF8 these are
# the encodings that _detect_bom() can report.
_UTF16LE, _UTF16BE = lookup('utf-16le'), lookup('utf-16be')
| |
| |
def decode(input, fallback_encoding, errors='replace'):
    """
    Decode a single byte string, honouring a leading BOM if there is one.

    :param input: A byte string
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return:
        A ``(output, encoding)`` tuple of an Unicode string
        and an :obj:`Encoding`.

    """
    # Resolve the fallback before anything else so that an invalid label
    # fails early, even when a BOM would have overridden it.
    fallback = _get_encoding(fallback_encoding)
    sniffed, payload = _detect_bom(input)
    chosen = sniffed or fallback
    decoded = chosen.codec_info.decode(payload, errors)
    # The codec returns a tuple; only its first item, the text, is wanted.
    return decoded[0], chosen
| |
| |
def _detect_bom(input):
    """Return (bom_encoding, input), with any BOM removed from the input.

    :obj:`bom_encoding` is :obj:`None` and the input is returned unchanged
    when it does not start with a UTF-16LE, UTF-16BE or UTF-8 BOM.

    """
    # Checked in this order: UTF-16LE, UTF-16BE, then UTF-8.
    known_boms = (
        (b'\xFF\xFE', _UTF16LE),
        (b'\xFE\xFF', _UTF16BE),
        (b'\xEF\xBB\xBF', UTF8),
    )
    for bom, bom_encoding in known_boms:
        if input.startswith(bom):
            return bom_encoding, input[len(bom):]
    return None, input
| |
| |
def encode(input, encoding=UTF8, errors='strict'):
    """
    Encode a single Unicode string to bytes.

    :param input: An Unicode string.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :return: A byte string.

    """
    codec = _get_encoding(encoding).codec_info
    encoded = codec.encode(input, errors)
    # The codec returns a tuple; only its first item, the bytes, is wanted.
    return encoded[0]
| |
| |
def iter_decode(input, fallback_encoding, errors='replace'):
    """
    "Pull"-based decoder.

    :param input:
        An iterable of byte strings.

        The input is first consumed just enough to determine the encoding
        based on the presence of a BOM,
        then consumed on demand when the return value is.
    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns:
        An ``(output, encoding)`` tuple.
        :obj:`output` is an iterable of Unicode strings,
        :obj:`encoding` is the :obj:`Encoding` that is being used.

    """
    chunks = _iter_decode_generator(
        input, IncrementalDecoder(fallback_encoding, errors))
    # The first item the generator produces is the Encoding itself;
    # pull it out here so that only text chunks remain for the caller.
    detected = next(chunks)
    return chunks, detected
| |
| |
def _iter_decode_generator(input, decoder):
    """Return a generator that first yields the :obj:`Encoding`,
    then yields output chunks as Unicode strings.

    :param input: An iterable of byte strings.
    :param decoder:
        An object with a ``decode(input, final=False)`` method and an
        ``encoding`` attribute, such as :class:`IncrementalDecoder`.

    """
    push = decoder.decode
    # A single shared iterator: the second loop below resumes
    # exactly where the first one stopped.
    chunks = iter(input)

    for chunk in chunks:
        first_output = push(chunk)
        if first_output:
            break
    else:
        # Input exhausted without determining the encoding
        leftover = push(b'', final=True)
        assert decoder.encoding is not None
        yield decoder.encoding
        if leftover:
            yield leftover
        return

    # Only reached through the ``break``: some text came out,
    # so the decoder has settled on an encoding.
    assert decoder.encoding is not None
    yield decoder.encoding
    yield first_output

    for chunk in chunks:
        text = push(chunk)
        if text:
            yield text
    # Flush whatever the decoder is still holding back.
    text = push(b'', final=True)
    if text:
        yield text
| |
| |
def iter_encode(input, encoding=UTF8, errors='strict'):
    """
    "Pull"-based encoder.

    :param input: An iterable of Unicode strings.
    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.
    :returns: An iterable of byte strings.

    """
    # Build the encoder here rather than inside the generator, so that
    # an invalid label fails early instead of on first iteration.
    encoder = IncrementalEncoder(encoding, errors)
    return _iter_encode_generator(input, encoder.encode)
| |
| |
def _iter_encode_generator(input, encode):
    """Yield the non-empty byte strings obtained by encoding each chunk.

    :param input: An iterable of Unicode strings.
    :param encode:
        A callable taking ``(input, final=False)`` and returning bytes,
        such as the ``encode`` method of an incremental encoder.

    """
    for text in input:
        data = encode(text)
        if data:
            yield data
    # Signal the end of the input so the encoder can flush any pending state.
    trailer = encode('', final=True)
    if trailer:
        yield trailer
| |
| |
class IncrementalDecoder(object):
    """
    "Push"-based decoder.

    Input is buffered until it is known whether it starts with a BOM;
    from then on every chunk goes straight to the selected codec.

    :param fallback_encoding:
        An :class:`Encoding` object or a label string.
        The encoding to use if :obj:`input` does not have a BOM.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    """
    def __init__(self, fallback_encoding, errors='replace'):
        # Resolve the label right away so that an invalid one fails early.
        self._fallback_encoding = _get_encoding(fallback_encoding)
        self._errors = errors
        # Bytes received so far while the encoding is still undetermined.
        self._buffer = b''
        # Bound ``decode`` of the codec's incremental decoder, once chosen.
        self._decoder = None
        #: The actual :class:`Encoding` that is being used,
        #: or :obj:`None` if that is not determined yet.
        #: (Ie. if there is not enough input yet to determine
        #: if there is a BOM.)
        self.encoding = None  # Not known yet.

    def decode(self, input, final=False):
        """Decode one chunk of the input.

        :param input: A byte string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: An Unicode string.

        """
        if self._decoder is not None:
            # Encoding already settled: hand the chunk straight to the codec.
            return self._decoder(input, final)

        pending = self._buffer + input
        detected, remainder = _detect_bom(pending)
        if detected is None:
            if len(remainder) < 3 and not final:
                # Not enough data yet to rule out a BOM; keep buffering.
                self._buffer = remainder
                return ''
            # No BOM
            detected = self._fallback_encoding
        make_decoder = detected.codec_info.incrementaldecoder
        bound_decode = make_decoder(self._errors).decode
        self._decoder = bound_decode
        self.encoding = detected
        return bound_decode(remainder, final)
| |
| |
class IncrementalEncoder(object):
    """
    "Push"-based encoder.

    :param encoding: An :class:`Encoding` object or a label string.
    :param errors: Type of error handling. See :func:`codecs.register`.
    :raises: :exc:`~exceptions.LookupError` for an unknown encoding label.

    .. method:: encode(input, final=False)

        :param input: An Unicode string.
        :param final:
            Indicate that no more input is available.
            Must be :obj:`True` if this is the last call.
        :returns: A byte string.

    """
    def __init__(self, encoding=UTF8, errors='strict'):
        codec_info = _get_encoding(encoding).codec_info
        incremental = codec_info.incrementalencoder(errors)
        # ``encode`` is exposed as an instance attribute bound directly to
        # the codec's incremental encoder; there is no wrapper method.
        self.encode = incremental.encode