| """ |
| Metadata about languages used by our model training code for our |
| SingleByteCharSetProbers. Could be used for other things in the future. |
| |
| This code is based on the language metadata from the uchardet project. |
| """ |
| |
| from string import ascii_letters |
| |
| # TODO: Add Ukrainian (KOI8-U) |
| |
| |
class Language:
    """Describe one language for the purposes of model training.

    :ivar name: English, human-readable name of the language.
    :type name: str
    :ivar iso_code: The 2-letter ISO 639-1 code when one exists, otherwise a
                    3-letter ISO code, otherwise a code from another catalog.
    :type iso_code: str
    :ivar use_ascii: When true, the ASCII letters are merged into
                     ``alphabet`` automatically.
    :type use_ascii: bool
    :ivar charsets: Names of the charsets to support and generate data for.
    :type charsets: list of str
    :ivar alphabet: Every character of the language's alphabet, stored
                    de-duplicated and sorted by code point. With
                    ``use_ascii`` set, callers only pass the non-ASCII
                    characters.
    :type alphabet: str
    :ivar wiki_start_pages: Wikipedia page titles to start from when crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(
        self,
        name=None,
        iso_code=None,
        use_ascii=True,
        charsets=None,
        alphabet=None,
        wiki_start_pages=None,
    ):
        """Store the metadata and normalize the alphabet.

        :raises ValueError: if ``use_ascii`` is false and no (or an empty)
                            ``alphabet`` is given.
        """
        super().__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets

        # Collect the distinct characters first; ordering is applied once,
        # when the final string is assembled.
        if use_ascii:
            letters = set(ascii_letters)
            if alphabet:
                letters.update(alphabet)
        elif alphabet:
            letters = set(alphabet)
        else:
            raise ValueError("Must supply alphabet if use_ascii is False")

        # Attribute order matters: __repr__ lists attributes in the order
        # they were assigned.
        self.alphabet = "".join(sorted(letters))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        public_fields = [
            f"{attr}={value!r}"
            for attr, value in vars(self).items()
            if not attr.startswith("_")
        ]
        joined = ", ".join(public_fields)
        return f"{type(self).__name__}({joined})"
| |
| |
# Maps each supported language's English name to its training metadata.
# Entries with ``use_ascii=False`` list their complete alphabet explicitly;
# entries with ``use_ascii=True`` list only the non-ASCII letters, because
# ``Language`` merges the ASCII letters in automatically.
LANGUAGES = {
    "Arabic": Language(
        name="Arabic",
        iso_code="ar",
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): "CP864" is Python's codec name for IBM864, so the
        # list below contradicts the sentence above. Left unchanged to avoid
        # altering training output; confirm whether CP864 should be removed.
        charsets=["ISO-8859-6", "WINDOWS-1256", "CP720", "CP864"],
        alphabet="ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ",
        wiki_start_pages=["الصفحة_الرئيسية"],
    ),
    "Belarusian": Language(
        name="Belarusian",
        iso_code="be",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM866", "MacCyrillic"],
        alphabet="АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯабвгдеёжзійклмнопрстуўфхцчшыьэюяʼ",
        wiki_start_pages=["Галоўная_старонка"],
    ),
    "Bulgarian": Language(
        name="Bulgarian",
        iso_code="bg",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "IBM855"],
        alphabet="АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯабвгдежзийклмнопрстуфхцчшщъьюя",
        wiki_start_pages=["Начална_страница"],
    ),
    "Czech": Language(
        name="Czech",
        # ISO 639-1 assigns "cs" to Czech ("cz" is the country code, not a
        # language code).
        iso_code="cs",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ",
        wiki_start_pages=["Hlavní_strana"],
    ),
    "Danish": Language(
        name="Danish",
        iso_code="da",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="æøåÆØÅ",
        wiki_start_pages=["Forside"],
    ),
    "German": Language(
        name="German",
        iso_code="de",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        alphabet="äöüßÄÖÜ",
        wiki_start_pages=["Wikipedia:Hauptseite"],
    ),
    "Greek": Language(
        name="Greek",
        iso_code="el",
        use_ascii=False,
        charsets=["ISO-8859-7", "WINDOWS-1253"],
        alphabet="αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ",
        wiki_start_pages=["Πύλη:Κύρια"],
    ),
    "English": Language(
        name="English",
        iso_code="en",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Main_Page"],
    ),
    "Esperanto": Language(
        name="Esperanto",
        iso_code="eo",
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=["ISO-8859-3"],
        alphabet="abcĉdefgĝhĥijĵklmnoprsŝtuŭvzABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ",
        wiki_start_pages=["Vikipedio:Ĉefpaĝo"],
    ),
    "Spanish": Language(
        name="Spanish",
        iso_code="es",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ñáéíóúüÑÁÉÍÓÚÜ",
        wiki_start_pages=["Wikipedia:Portada"],
    ),
    "Estonian": Language(
        name="Estonian",
        iso_code="et",
        use_ascii=False,
        charsets=["ISO-8859-4", "ISO-8859-13", "WINDOWS-1257"],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet="ABDEGHIJKLMNOPRSTUVÕÄÖÜabdeghijklmnoprstuvõäöü",
        wiki_start_pages=["Esileht"],
    ),
    "Finnish": Language(
        name="Finnish",
        iso_code="fi",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÅÄÖŠŽåäöšž",
        wiki_start_pages=["Wikipedia:Etusivu"],
    ),
    "French": Language(
        name="French",
        iso_code="fr",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ",
        wiki_start_pages=["Wikipédia:Accueil_principal", "Bœuf (animal)"],
    ),
    "Hebrew": Language(
        name="Hebrew",
        iso_code="he",
        use_ascii=False,
        charsets=["ISO-8859-8", "WINDOWS-1255"],
        alphabet="אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ",
        wiki_start_pages=["עמוד_ראשי"],
    ),
    "Croatian": Language(
        name="Croatian",
        iso_code="hr",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčćdđefghijklmnoprsštuvzžABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stranica"],
    ),
    "Hungarian": Language(
        name="Hungarian",
        iso_code="hu",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcdefghijklmnoprstuvzáéíóöőúüűABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ",
        wiki_start_pages=["Kezdőlap"],
    ),
    "Italian": Language(
        name="Italian",
        iso_code="it",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÀÈÉÌÒÓÙàèéìòóù",
        wiki_start_pages=["Pagina_principale"],
    ),
    "Lithuanian": Language(
        name="Lithuanian",
        iso_code="lt",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, and X not used at all
        alphabet="AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽaąbcčdeęėfghiįyjklmnoprsštuųūvzž",
        wiki_start_pages=["Pagrindinis_puslapis"],
    ),
    "Latvian": Language(
        name="Latvian",
        iso_code="lv",
        use_ascii=False,
        charsets=["ISO-8859-13", "WINDOWS-1257", "ISO-8859-4"],
        # Q, W, X, Y are only for loanwords
        alphabet="AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽaābcčdeēfgģhiījkķlļmnņoprsštuūvzž",
        wiki_start_pages=["Sākumlapa"],
    ),
    "Macedonian": Language(
        name="Macedonian",
        iso_code="mk",
        use_ascii=False,
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        alphabet="АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШабвгдѓежзѕијклљмнњопрстќуфхцчџш",
        wiki_start_pages=["Главна_страница"],
    ),
    "Dutch": Language(
        name="Dutch",
        iso_code="nl",
        use_ascii=True,
        charsets=["ISO-8859-1", "WINDOWS-1252"],
        wiki_start_pages=["Hoofdpagina"],
    ),
    "Polish": Language(
        name="Polish",
        iso_code="pl",
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻaąbcćdeęfghijklłmnńoóprsśtuwyzźż",
        wiki_start_pages=["Wikipedia:Strona_główna"],
    ),
    "Portuguese": Language(
        name="Portuguese",
        iso_code="pt",
        use_ascii=True,
        charsets=["ISO-8859-1", "ISO-8859-15", "WINDOWS-1252"],
        alphabet="ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú",
        wiki_start_pages=["Wikipédia:Página_principal"],
    ),
    "Romanian": Language(
        name="Romanian",
        iso_code="ro",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="ăâîșțĂÂÎȘȚ",
        wiki_start_pages=["Pagina_principală"],
    ),
    "Russian": Language(
        name="Russian",
        iso_code="ru",
        use_ascii=False,
        charsets=[
            "ISO-8859-5",
            "WINDOWS-1251",
            "KOI8-R",
            "MacCyrillic",
            "IBM866",
            "IBM855",
        ],
        alphabet="абвгдеёжзийклмнопрстуфхцчшщъыьэюяАБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ",
        wiki_start_pages=["Заглавная_страница"],
    ),
    "Slovak": Language(
        name="Slovak",
        iso_code="sk",
        use_ascii=True,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ",
        wiki_start_pages=["Hlavná_stránka"],
    ),
    "Slovene": Language(
        name="Slovene",
        iso_code="sl",
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=["ISO-8859-2", "WINDOWS-1250"],
        alphabet="abcčdefghijklmnoprsštuvzžABCČDEFGHIJKLMNOPRSŠTUVZŽ",
        wiki_start_pages=["Glavna_stran"],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    "Serbian": Language(
        name="Serbian",
        iso_code="sr",
        # Cyrillic only (see above), so the ASCII Latin letters must not be
        # merged into the alphabet; this matches the other Cyrillic entries.
        use_ascii=False,
        alphabet="АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШабвгдђежзијклљмнњопрстћуфхцчџш",
        charsets=["ISO-8859-5", "WINDOWS-1251", "MacCyrillic", "IBM855"],
        wiki_start_pages=["Главна_страна"],
    ),
    "Thai": Language(
        name="Thai",
        iso_code="th",
        use_ascii=False,
        charsets=["ISO-8859-11", "TIS-620", "CP874"],
        alphabet="กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛",
        wiki_start_pages=["หน้าหลัก"],
    ),
    "Turkish": Language(
        name="Turkish",
        iso_code="tr",
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=["ISO-8859-3", "ISO-8859-9", "WINDOWS-1254"],
        alphabet="abcçdefgğhıijklmnoöprsştuüvyzâîûABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ",
        wiki_start_pages=["Ana_Sayfa"],
    ),
    "Vietnamese": Language(
        name="Vietnamese",
        iso_code="vi",
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=["WINDOWS-1258"],
        alphabet="aăâbcdđeêghiklmnoôơpqrstuưvxyAĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY",
        wiki_start_pages=["Chữ_Quốc_ngữ"],
    ),
}