| """ |
| pygments.regexopt |
| ~~~~~~~~~~~~~~~~~ |
| |
| An algorithm that generates optimized regexes for matching long lists of |
| literal strings. |
| |
| :copyright: Copyright 2006-2022 by the Pygments team, see AUTHORS. |
| :license: BSD, see LICENSE for details. |
| """ |
| |
| import re |
| from re import escape |
| from os.path import commonprefix |
| from itertools import groupby |
| from operator import itemgetter |
| |
| CS_ESCAPE = re.compile(r'[\[\^\\\-\]]') |
| FIRST_ELEMENT = itemgetter(0) |
| |
| |
| def make_charset(letters): |
| return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' |
| |
| |
| def regex_opt_inner(strings, open_paren): |
| """Return a regex that matches any string in the sorted list of strings.""" |
| close_paren = open_paren and ')' or '' |
| # print strings, repr(open_paren) |
| if not strings: |
| # print '-> nothing left' |
| return '' |
| first = strings[0] |
| if len(strings) == 1: |
| # print '-> only 1 string' |
| return open_paren + escape(first) + close_paren |
| if not first: |
| # print '-> first string empty' |
| return open_paren + regex_opt_inner(strings[1:], '(?:') \ |
| + '?' + close_paren |
| if len(first) == 1: |
| # multiple one-char strings? make a charset |
| oneletter = [] |
| rest = [] |
| for s in strings: |
| if len(s) == 1: |
| oneletter.append(s) |
| else: |
| rest.append(s) |
| if len(oneletter) > 1: # do we have more than one oneletter string? |
| if rest: |
| # print '-> 1-character + rest' |
| return open_paren + regex_opt_inner(rest, '') + '|' \ |
| + make_charset(oneletter) + close_paren |
| # print '-> only 1-character' |
| return open_paren + make_charset(oneletter) + close_paren |
| prefix = commonprefix(strings) |
| if prefix: |
| plen = len(prefix) |
| # we have a prefix for all strings |
| # print '-> prefix:', prefix |
| return open_paren + escape(prefix) \ |
| + regex_opt_inner([s[plen:] for s in strings], '(?:') \ |
| + close_paren |
| # is there a suffix? |
| strings_rev = [s[::-1] for s in strings] |
| suffix = commonprefix(strings_rev) |
| if suffix: |
| slen = len(suffix) |
| # print '-> suffix:', suffix[::-1] |
| return open_paren \ |
| + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ |
| + escape(suffix[::-1]) + close_paren |
| # recurse on common 1-string prefixes |
| # print '-> last resort' |
| return open_paren + \ |
| '|'.join(regex_opt_inner(list(group[1]), '') |
| for group in groupby(strings, lambda s: s[0] == first[0])) \ |
| + close_paren |
| |
| |
| def regex_opt(strings, prefix='', suffix=''): |
| """Return a compiled regex that matches any string in the given list. |
| |
| The strings to match must be literal strings, not regexes. They will be |
| regex-escaped. |
| |
| *prefix* and *suffix* are pre- and appended to the final regex. |
| """ |
| strings = sorted(strings) |
| return prefix + regex_opt_inner(strings, '(') + suffix |