| # unicode.py |
| |
| import sys |
| from itertools import filterfalse |
| from typing import List, Tuple, Union |
| |
| |
| class _lazyclassproperty: |
| def __init__(self, fn): |
| self.fn = fn |
| self.__doc__ = fn.__doc__ |
| self.__name__ = fn.__name__ |
| |
| def __get__(self, obj, cls): |
| if cls is None: |
| cls = type(obj) |
| if not hasattr(cls, "_intern") or any( |
| cls._intern is getattr(superclass, "_intern", []) |
| for superclass in cls.__mro__[1:] |
| ): |
| cls._intern = {} |
| attrname = self.fn.__name__ |
| if attrname not in cls._intern: |
| cls._intern[attrname] = self.fn(cls) |
| return cls._intern[attrname] |
| |
| |
# Type of a ``_ranges`` table: a list of code-point ranges, each written as a
# ``(first, last)`` 2-tuple or a single-code-point ``(value,)`` 1-tuple.
UnicodeRangeList = List[Union[Tuple[int, int], Tuple[int]]]
| |
| |
class unicode_set:
    """
    Base class for a named collection of Unicode characters, exposing the
    language-specific strings ``alphas``, ``nums``, ``alphanums``,
    ``printables``, ``identchars`` and ``identbodychars``.

    A subclass describes its characters with a ``_ranges`` class attribute:
    a list of code-point ranges, each a 2-tuple or a 1-tuple, such as::

        _ranges = [
            (0x0020, 0x007e),
            (0x00a0, 0x00ff),
            (0x0100,),
        ]

    Both ends of a range are included. A 1-tuple ``(x,)`` means ``(x, x)``.

    Sets can be combined by inheriting from several unicode sets::

        class CJK(Chinese, Japanese, Korean):
            pass
    """

    _ranges: UnicodeRangeList = []

    @_lazyclassproperty
    def _chars_for_ranges(cls) -> List[str]:
        """sorted list of every character covered by this class and its unicode_set bases"""
        code_points = set()
        for klass in cls.__mro__:
            # Everything from unicode_set onward in the MRO contributes nothing.
            if klass is unicode_set:
                break
            for bounds in getattr(klass, "_ranges", ()):
                # bounds[-1] is the upper end for a 2-tuple and the sole
                # value for a 1-tuple.
                code_points.update(range(bounds[0], bounds[-1] + 1))
        return [chr(code_point) for code_point in sorted(code_points)]

    @_lazyclassproperty
    def printables(cls) -> str:
        """every character in this set that is not whitespace"""
        candidates = cls._chars_for_ranges
        return "".join(filterfalse(str.isspace, candidates))

    @_lazyclassproperty
    def alphas(cls) -> str:
        """every character in this set for which ``str.isalpha`` is true"""
        candidates = cls._chars_for_ranges
        return "".join(filter(str.isalpha, candidates))

    @_lazyclassproperty
    def nums(cls) -> str:
        """every character in this set for which ``str.isdigit`` is true"""
        candidates = cls._chars_for_ranges
        return "".join(filter(str.isdigit, candidates))

    @_lazyclassproperty
    def alphanums(cls) -> str:
        """``alphas`` followed by ``nums``"""
        letters, digits = cls.alphas, cls.nums
        return letters + digits

    @_lazyclassproperty
    def identchars(cls) -> str:
        """
        sorted string of the characters in this set that can start an
        identifier, always including the Latin-1 letters and underscore '_'
        """
        starters = set(filter(str.isidentifier, cls._chars_for_ranges))
        starters.update("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzªµº")
        starters.update(
            "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþÿ"
        )
        starters.update("_")
        return "".join(sorted(starters))

    @_lazyclassproperty
    def identbodychars(cls) -> str:
        """
        sorted string of ``identchars``, the digits 0-9, and every character
        in this set that may follow the first character of an identifier
        """
        body = set(cls.identchars)
        body.update("0123456789")
        # A character is a valid body character if it is accepted after a
        # known-good starter.
        body.update(
            ch for ch in cls._chars_for_ranges if ("_" + ch).isidentifier()
        )
        return "".join(sorted(body))
| |
| |
class pyparsing_unicode(unicode_set):
    """
    Namespace of ready-made unicode_sets for common languages and scripts.

    The class is itself a unicode_set spanning 0x0020 through
    ``sys.maxunicode``; each nested class narrows that to one script.
    """

    # fmt: off

    # the namespace's own range: everything from space upward
    _ranges: UnicodeRangeList = [
        (0x0020, sys.maxunicode),
    ]

    class BasicMultilingualPlane(unicode_set):
        """Unicode set for the Basic Multilingual Plane (0x0020-0xFFFF)"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0xFFFF),
        ]

    class Latin1(unicode_set):
        """Unicode set for the Latin-1 character range"""
        _ranges: UnicodeRangeList = [
            (0x0020, 0x007E),
            (0x00A0, 0x00FF),
        ]

    class LatinA(unicode_set):
        """Unicode set for the Latin-A character range"""
        _ranges: UnicodeRangeList = [
            (0x0100, 0x017F),
        ]

    class LatinB(unicode_set):
        """Unicode set for the Latin-B character range"""
        _ranges: UnicodeRangeList = [
            (0x0180, 0x024F),
        ]

    class Greek(unicode_set):
        """Unicode set for the Greek character ranges"""
        _ranges: UnicodeRangeList = [
            (0x0342, 0x0345),
            (0x0370, 0x0377),
            (0x037A, 0x037F),
            (0x0384, 0x038A),
            (0x038C,),
            (0x038E, 0x03A1),
            (0x03A3, 0x03E1),
            (0x03F0, 0x03FF),
            (0x1D26, 0x1D2A),
            (0x1D5E,),
            (0x1D60,),
            (0x1D66, 0x1D6A),
            (0x1F00, 0x1F15),
            (0x1F18, 0x1F1D),
            (0x1F20, 0x1F45),
            (0x1F48, 0x1F4D),
            (0x1F50, 0x1F57),
            (0x1F59,),
            (0x1F5B,),
            (0x1F5D,),
            (0x1F5F, 0x1F7D),
            (0x1F80, 0x1FB4),
            (0x1FB6, 0x1FC4),
            (0x1FC6, 0x1FD3),
            (0x1FD6, 0x1FDB),
            (0x1FDD, 0x1FEF),
            (0x1FF2, 0x1FF4),
            (0x1FF6, 0x1FFE),
            (0x2129,),
            (0x2719, 0x271A),
            (0xAB65,),
            (0x10140, 0x1018D),
            (0x101A0,),
            (0x1D200, 0x1D245),
            (0x1F7A1, 0x1F7A7),
        ]

    class Cyrillic(unicode_set):
        """Unicode set for the Cyrillic character range"""
        _ranges: UnicodeRangeList = [
            (0x0400, 0x052F),
            (0x1C80, 0x1C88),
            (0x1D2B,),
            (0x1D78,),
            (0x2DE0, 0x2DFF),
            (0xA640, 0xA672),
            (0xA674, 0xA69F),
            (0xFE2E, 0xFE2F),
        ]

    class Chinese(unicode_set):
        """Unicode set for the Chinese character range"""
        _ranges: UnicodeRangeList = [
            (0x2E80, 0x2E99),
            (0x2E9B, 0x2EF3),
            (0x31C0, 0x31E3),
            (0x3400, 0x4DB5),
            (0x4E00, 0x9FEF),
            (0xA700, 0xA707),
            (0xF900, 0xFA6D),
            (0xFA70, 0xFAD9),
            (0x16FE2, 0x16FE3),
            (0x1F210, 0x1F212),
            (0x1F214, 0x1F23B),
            (0x1F240, 0x1F248),
            (0x20000, 0x2A6D6),
            (0x2A700, 0x2B734),
            (0x2B740, 0x2B81D),
            (0x2B820, 0x2CEA1),
            (0x2CEB0, 0x2EBE0),
            (0x2F800, 0x2FA1D),
        ]

    class Japanese(unicode_set):
        """Unicode set for the Japanese character range, combining the Kanji, Hiragana, and Katakana ranges"""
        # Left empty here; the module assigns the combined Kanji + Hiragana +
        # Katakana table to this attribute once pyparsing_unicode is defined.
        _ranges: UnicodeRangeList = []

        class Kanji(unicode_set):
            """Unicode set for the Kanji character range"""
            _ranges: UnicodeRangeList = [
                (0x4E00, 0x9FBF),
                (0x3000, 0x303F),
            ]

        class Hiragana(unicode_set):
            """Unicode set for the Hiragana character range"""
            _ranges: UnicodeRangeList = [
                (0x3041, 0x3096),
                (0x3099, 0x30A0),
                (0x30FC,),
                (0xFF70,),
                (0x1B001,),
                (0x1B150, 0x1B152),
                (0x1F200,),
            ]

        class Katakana(unicode_set):
            """Unicode set for the Katakana character range"""
            _ranges: UnicodeRangeList = [
                (0x3099, 0x309C),
                (0x30A0, 0x30FF),
                (0x31F0, 0x31FF),
                (0x32D0, 0x32FE),
                (0xFF65, 0xFF9F),
                (0x1B000,),
                (0x1B164, 0x1B167),
                (0x1F201, 0x1F202),
                (0x1F213,),
            ]

    class Hangul(unicode_set):
        """Unicode set for the Hangul (Korean) character range"""
        _ranges: UnicodeRangeList = [
            (0x1100, 0x11FF),
            (0x302E, 0x302F),
            (0x3131, 0x318E),
            (0x3200, 0x321C),
            (0x3260, 0x327B),
            (0x327E,),
            (0xA960, 0xA97C),
            (0xAC00, 0xD7A3),
            (0xD7B0, 0xD7C6),
            (0xD7CB, 0xD7FB),
            (0xFFA0, 0xFFBE),
            (0xFFC2, 0xFFC7),
            (0xFFCA, 0xFFCF),
            (0xFFD2, 0xFFD7),
            (0xFFDA, 0xFFDC),
        ]

    # Korean is another name for the same class object as Hangul
    Korean = Hangul

    class CJK(Chinese, Japanese, Hangul):
        """Unicode set for the combined Chinese, Japanese, and Korean (CJK) character range"""

    class Thai(unicode_set):
        """Unicode set for the Thai character range"""
        _ranges: UnicodeRangeList = [
            (0x0E01, 0x0E3A),
            (0x0E3F, 0x0E5B),
        ]

    class Arabic(unicode_set):
        """Unicode set for the Arabic character range"""
        _ranges: UnicodeRangeList = [
            (0x0600, 0x061B),
            (0x061E, 0x06FF),
            (0x0700, 0x077F),
        ]

    class Hebrew(unicode_set):
        """Unicode set for the Hebrew character range"""
        _ranges: UnicodeRangeList = [
            (0x0591, 0x05C7),
            (0x05D0, 0x05EA),
            (0x05EF, 0x05F4),
            (0xFB1D, 0xFB36),
            (0xFB38, 0xFB3C),
            (0xFB3E,),
            (0xFB40, 0xFB41),
            (0xFB43, 0xFB44),
            (0xFB46, 0xFB4F),
        ]

    class Devanagari(unicode_set):
        """Unicode set for the Devanagari character range"""
        _ranges: UnicodeRangeList = [
            (0x0900, 0x097F),
            (0xA8E0, 0xA8FF),
        ]

    # fmt: on
| |
| |
# Japanese's own table is the concatenation of its nested Kanji, Hiragana and
# Katakana tables, in that order.
pyparsing_unicode.Japanese._ranges = [
    *pyparsing_unicode.Japanese.Kanji._ranges,
    *pyparsing_unicode.Japanese.Hiragana._ranges,
    *pyparsing_unicode.Japanese.Katakana._ranges,
]

# short alias for the Basic Multilingual Plane set
pyparsing_unicode.BMP = pyparsing_unicode.BasicMultilingualPlane

# Aliases spelled in each language's own script.  These are written as
# attribute assignments (not setattr with string names) so the names go
# through the same identifier handling as any attribute access in user code.
pyparsing_unicode.العربية = pyparsing_unicode.Arabic
pyparsing_unicode.中文 = pyparsing_unicode.Chinese
pyparsing_unicode.кириллица = pyparsing_unicode.Cyrillic
pyparsing_unicode.Ελληνικά = pyparsing_unicode.Greek
pyparsing_unicode.עִברִית = pyparsing_unicode.Hebrew
pyparsing_unicode.日本語 = pyparsing_unicode.Japanese
pyparsing_unicode.Japanese.漢字 = pyparsing_unicode.Japanese.Kanji
pyparsing_unicode.Japanese.カタカナ = pyparsing_unicode.Japanese.Katakana
pyparsing_unicode.Japanese.ひらがな = pyparsing_unicode.Japanese.Hiragana
pyparsing_unicode.한국어 = pyparsing_unicode.Korean
pyparsing_unicode.ไทย = pyparsing_unicode.Thai
pyparsing_unicode.देवनागरी = pyparsing_unicode.Devanagari