| """ |
| pygments.util |
| ~~~~~~~~~~~~~ |
| |
| Utility functions. |
| |
| :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. |
| :license: BSD, see LICENSE for details. |
| """ |
| |
| import re |
| from io import TextIOWrapper |
| |
| |
# Characters that separate the components of a shebang line: a forward
# slash, a backslash, or a space.
split_path_re = re.compile(r'[/\\ ]')

# Finds a DOCTYPE declaration and captures its leading part: the first name,
# optionally followed by a second name and a double-quoted string.
doctype_lookup_re = re.compile(
    r'''
    <!DOCTYPE\s+(
     [a-zA-Z_][a-zA-Z0-9]*
     (?: \s+      # optional in HTML5
     [a-zA-Z_][a-zA-Z0-9]*\s+
     "[^"]*")?
     )
     [^>]*>
''',
    re.VERBOSE | re.MULTILINE | re.DOTALL,
)

# Finds an opening tag (with optional attributes) that is followed somewhere
# later by any closing tag.
tag_re = re.compile(
    r'<(.+?)(\s.*?)?>.*?</.+?>',
    re.MULTILINE | re.DOTALL | re.IGNORECASE,
)

# Matches an XML declaration, allowing leading whitespace and any letter case.
xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.IGNORECASE)
| |
| |
class ClassNotFound(ValueError):
    """Signals that a lookup function could not find a matching class.

    It derives from ``ValueError``, so handlers for ``ValueError`` also
    catch it.
    """
| |
| |
class OptionError(Exception):
    """Signals that an option has a value of the wrong type or content."""
| |
| |
def get_choice_opt(options, optname, allowed, default=None, normcase=False):
    """Return the value of option *optname*, checked against *allowed*.

    The value is read from the *options* mapping, falling back to *default*
    when the key is absent.  With *normcase* set, string values are
    lower-cased before the membership test (and the lower-cased form is
    what gets returned).

    Raises ``OptionError`` if the resulting value is not in *allowed*.
    """
    value = options.get(optname, default)
    # Only strings can be case-normalised.  Any other value (for instance a
    # ``None`` default) goes straight to the membership test, so it is either
    # accepted or reported as an OptionError instead of failing on ``.lower``.
    if normcase and isinstance(value, str):
        value = value.lower()
    if value not in allowed:
        raise OptionError('Value for option %s must be one of %s' %
                          (optname, ', '.join(map(str, allowed))))
    return value
| |
| |
def get_bool_opt(options, optname, default=None):
    """Interpret option *optname* from *options* as a boolean.

    Booleans are returned unchanged and other integers are converted with
    ``bool()``.  Strings are compared case-insensitively: ``1``, ``yes``,
    ``true`` and ``on`` give ``True``; ``0``, ``no``, ``false`` and ``off``
    give ``False``.  *default* is used when the option is absent.

    Raises ``OptionError`` for any other type or string.
    """
    value = options.get(optname, default)
    if isinstance(value, bool):
        return value
    if isinstance(value, int):
        return bool(value)
    if not isinstance(value, str):
        raise OptionError('Invalid type %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              value, optname))
    lowered = value.lower()
    if lowered in ('1', 'yes', 'true', 'on'):
        return True
    if lowered in ('0', 'no', 'false', 'off'):
        return False
    raise OptionError('Invalid value %r for option %s; use '
                      '1/0, yes/no, true/false, on/off' % (
                          value, optname))
| |
| |
def get_int_opt(options, optname, default=None):
    """Interpret option *optname* from *options* as an integer.

    The value (or *default* when the option is absent) is passed through
    ``int()``.  Raises ``OptionError`` when ``int()`` rejects it, with the
    message distinguishing a bad type from a bad value.
    """
    raw = options.get(optname, default)
    try:
        number = int(raw)
    except TypeError:
        raise OptionError('Invalid type %r for option %s; you '
                          'must give an integer value' % (
                              raw, optname))
    except ValueError:
        raise OptionError('Invalid value %r for option %s; you '
                          'must give an integer value' % (
                              raw, optname))
    else:
        return number
| |
| |
def get_list_opt(options, optname, default=None):
    """Interpret option *optname* from *options* as a list.

    A string is split on whitespace; a list or tuple is copied into a new
    list.  *default* is used when the option is absent.

    Raises ``OptionError`` for any other type.
    """
    value = options.get(optname, default)
    if isinstance(value, (list, tuple)):
        return list(value)
    if not isinstance(value, str):
        raise OptionError('Invalid type %r for option %s; you '
                          'must give a list value' % (
                              value, optname))
    return value.split()
| |
| |
def docstring_headline(obj):
    """Return the first paragraph of *obj*'s docstring as a single line.

    Lines are taken up to the first blank line, stripped of surrounding
    whitespace and joined with single spaces.  An empty string is returned
    when *obj* has no docstring.
    """
    doc = obj.__doc__
    if not doc:
        return ''
    headline = []
    for raw_line in doc.strip().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line ends the headline paragraph.
            break
        headline.append(stripped)
    return ' '.join(headline)
| |
| |
def make_analysator(f):
    """Wrap *f* as a static text-analysis function that always yields a float.

    The wrapper returns ``0.0`` when *f* raises, returns a falsy value, or
    returns something ``float()`` cannot convert; otherwise the result is
    clamped to the range 0.0 to 1.0.  The wrapper carries *f*'s docstring
    and is returned as a ``staticmethod``.
    """
    def text_analyse(text):
        try:
            score = f(text)
        except Exception:
            # Any failure inside the analyser counts as "no match".
            return 0.0
        if score:
            try:
                return min(1.0, max(0.0, float(score)))
            except (ValueError, TypeError):
                pass
        return 0.0

    text_analyse.__doc__ = f.__doc__
    return staticmethod(text_analyse)
| |
| |
def shebang_matches(text, regex):
    r"""Check if the given regular expression matches the last part of the
    shebang if one exists.

        >>> from pygments.util import shebang_matches
        >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/startsomethingwith python',
        ...                 r'python(2\.\d)?')
        True

    It also checks for common windows executable file extensions::

        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
        True

    Parameters (``'-f'`` or ``'--foo'`` are ignored so ``'perl'`` does
    the same as ``'perl -e'``)

    Note that this method automatically searches the whole string (eg:
    the regular expression is wrapped in ``'^$'``).  The expression is
    grouped before being anchored, so a top-level alternation such as
    ``'ruby|python'`` is anchored as a whole.

    Only the first line of *text* is examined, and it is lower-cased before
    matching.  Returns ``False`` when that line is not a shebang or names no
    program.
    """
    first_line = text.split('\n', 1)[0].lower()
    if not first_line.startswith('#!'):
        return False
    # Split the shebang into path components and arguments, dropping empty
    # pieces and anything that looks like a command line switch.
    candidates = [part
                  for part in split_path_re.split(first_line[2:].strip())
                  if part and not part.startswith('-')]
    if not candidates:
        return False
    # The non-capturing group keeps ``^`` and ``$`` bound to the complete
    # caller-supplied expression; without it ``a|b`` would be compiled as
    # ``^a`` or ``b(...)?$``.
    pattern = re.compile(r'^(?:%s)(\.(exe|cmd|bat|bin))?$' % regex,
                         re.IGNORECASE)
    return pattern.search(candidates[-1]) is not None
| |
| |
def doctype_matches(text, regex):
    """Tell whether the DOCTYPE found in *text* matches *regex*.

    Returns ``False`` when *text* contains no DOCTYPE declaration.  The
    expression is matched case-insensitively against the start of the
    captured declaration.

    Note that this method only checks the first part of a DOCTYPE.
    eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
    """
    found = doctype_lookup_re.search(text)
    if found is None:
        return False
    declared = found.group(1).strip()
    matcher = re.compile(regex, re.I)
    return matcher.match(declared) is not None
| |
| |
def html_doctype_matches(text):
    """Tell whether *text* carries a DOCTYPE that starts with ``html``."""
    is_html = doctype_matches(text, r'html')
    return is_html
| |
| |
# Remembers tag-search results, keyed by ``hash(text)``.
_looks_like_xml_cache = {}


def looks_like_xml(text):
    """Tell whether *text* appears to be XML.

    The text qualifies if it starts with an XML declaration, contains a
    DOCTYPE declaration, or has a tag pair within its first 1000
    characters.  Only the outcome of the tag search is stored in the cache.
    """
    if xml_decl_re.match(text):
        return True
    cache_key = hash(text)
    if cache_key in _looks_like_xml_cache:
        return _looks_like_xml_cache[cache_key]
    if doctype_lookup_re.search(text) is not None:
        return True
    has_tags = tag_re.search(text[:1000]) is not None
    _looks_like_xml_cache[cache_key] = has_tags
    return has_tags
| |
| |
def surrogatepair(c):
    """Split a code point wider than 16 bits into its two 16-bit surrogates.

    Returns a ``(high, low)`` tuple of integers.
    """
    # From example D28 of:
    # http://www.unicode.org/book/ch03.pdf
    # 0xd7c0 equals 0xd800 - (0x10000 >> 10), so the 0x10000 offset is
    # already folded into the base of the high surrogate.
    high = 0xd7c0 + (c >> 10)
    low = 0xdc00 + (c & 0x3ff)
    return (high, low)
| |
| |
def format_lines(var_name, seq, raw=False, indent_level=0):
    """Render *seq* as source text assigning a tuple to *var_name*.

    Each item goes on its own line followed by a comma, indented one level
    (four spaces) deeper than the assignment, which itself sits at
    *indent_level*.  With *raw* set the items are emitted verbatim;
    otherwise each item is written as a single-quoted string literal.
    """
    outer = ' ' * indent_level * 4
    inner = ' ' * (indent_level + 1) * 4
    if raw:
        # These should be preformatted reprs of, say, tuples.
        entries = [inner + item + ',' for item in seq]
    else:
        entries = []
        for item in seq:
            # Appending a double quote makes repr() pick single quotes; the
            # extra character is then cut out again just before the closing
            # quote.
            quoted = repr(item + '"')
            entries.append(inner + quoted[:-2] + quoted[-1] + ',')
    return '\n'.join([outer + var_name + ' = (', *entries, outer + ')'])
| |
| |
def duplicates_removed(it, already_seen=()):
    """
    Build a list of the items of `it`, keeping only the first occurrence
    of each one and skipping anything contained in `already_seen`.

    Order is preserved.
    """
    unique = []
    emitted = set()
    for item in it:
        if item not in emitted and item not in already_seen:
            emitted.add(item)
            unique.append(item)
    return unique
| |
| |
class Future:
    """Base class for work that is put off until its result is needed.

    RegexLexerMeta handles instances specially, so that regex strings can
    be constructed at first use.
    """

    def get(self):
        """Produce the deferred value; this base version always raises."""
        raise NotImplementedError
| |
| |
def guess_decode(text):
    """Decode *text* with guessed encoding.

    First try UTF-8; this should fail for non-UTF-8 encodings.
    Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    *text* is a bytes object.  Returns a ``(decoded_text, encoding_name)``
    tuple naming the encoding that was actually used.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        try:
            import locale
            prefencoding = locale.getpreferredencoding()
            # Pass the encoding explicitly: a bare ``decode()`` defaults to
            # UTF-8, which has just failed, so the locale encoding would
            # never actually be tried.
            return text.decode(prefencoding), prefencoding
        except (UnicodeDecodeError, LookupError):
            # LookupError covers a locale encoding Python has no codec for.
            return text.decode('latin1'), 'latin1'
| |
| |
def guess_decode_from_terminal(text, term):
    """Decode *text* that was read from the terminal *term*.

    The terminal's own encoding is tried first when *term* reports one.
    If it reports none, or decoding with it fails, the work is handed to
    ``guess_decode``.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        try:
            decoded = text.decode(encoding)
        except UnicodeDecodeError:
            pass
        else:
            return decoded, encoding
    return guess_decode(text)
| |
| |
def terminal_encoding(term):
    """Pick the most plausible encoding for the terminal *term*.

    The terminal's ``encoding`` attribute is used when it is set to a
    non-empty value; otherwise the preferred locale encoding is returned.
    """
    encoding = getattr(term, 'encoding', None)
    if not encoding:
        import locale
        encoding = locale.getpreferredencoding()
    return encoding
| |
| |
class UnclosingTextIOWrapper(TextIOWrapper):
    """A ``TextIOWrapper`` that does not close its underlying buffer."""

    def close(self):
        """Flush pending output and leave the underlying buffer open."""
        # Don't close underlying buffer on destruction.
        self.flush()