sign_language_translator.languages.text.hindi module

class sign_language_translator.languages.text.hindi.Hindi[source][source]

Bases: TextLanguage

NLP class for Hindi text. Extends the slt.languages.text.TextLanguage class.

Hindi is an Indo-Aryan language spoken mostly in India. Hindi uses the Devanagari script, which consists of 11 vowels and 33 consonants and is written from left to right. See Unicode details at: https://unicode.org/charts/PDF/U0900.pdf

ACRONYM_PERIODS: List[str] = ['॰'][source]

ALLOWED_CHARACTERS: Set[str] = {'\n', ' ', '!', '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', '{', '}', 'ऀ', 'ँ', 'ं', 'ः', 'ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ऱ', 'ल', 'ळ', 'ऴ', 'व', 'श', 'ष', 'स', 'ह', 'ऺ', 'ऻ', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ', '्', 'ॎ', 'ॏ', 'ॐ', '॑', '॒', '॓', '॔', 'ॕ', 'ॖ', 'ॗ', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़', 'ॠ', 'ॡ', 'ॢ', 'ॣ', '।', '॥', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '॰', 'ॱ', 'ॲ', 'ॳ', 'ॴ', 'ॵ', 'ॶ', 'ॷ', 'ॸ', 'ॹ', 'ॺ', 'ॻ', 'ॼ', 'ॽ', 'ॾ', 'ॿ'}[source]

BRACKETS: List[str] = ['(', ')', '[', ']', '{', '}'][source]

CHARACTERS: List[str] = ['ऀ', 'ँ', 'ं', 'ः', 'ऄ', 'अ', 'आ', 'इ', 'ई', 'उ', 'ऊ', 'ऋ', 'ऌ', 'ऍ', 'ऎ', 'ए', 'ऐ', 'ऑ', 'ऒ', 'ओ', 'औ', 'क', 'ख', 'ग', 'घ', 'ङ', 'च', 'छ', 'ज', 'झ', 'ञ', 'ट', 'ठ', 'ड', 'ढ', 'ण', 'त', 'थ', 'द', 'ध', 'न', 'ऩ', 'प', 'फ', 'ब', 'भ', 'म', 'य', 'र', 'ऱ', 'ल', 'ळ', 'ऴ', 'व', 'श', 'ष', 'स', 'ह', 'ऺ', 'ऻ', '़', 'ऽ', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'ॄ', 'ॅ', 'ॆ', 'े', 'ै', 'ॉ', 'ॊ', 'ो', 'ौ', '्', 'ॎ', 'ॏ', 'ॐ', '॑', '॒', '॓', '॔', 'ॕ', 'ॖ', 'ॗ', 'क़', 'ख़', 'ग़', 'ज़', 'ड़', 'ढ़', 'फ़', 'य़', 'ॠ', 'ॡ', 'ॢ', 'ॣ', '।', '॥', '०', '१', '२', '३', '४', '५', '६', '७', '८', '९', '॰', 'ॱ', 'ॲ', 'ॳ', 'ॴ', 'ॵ', 'ॶ', 'ॷ', 'ॸ', 'ॹ', 'ॺ', 'ॻ', 'ॼ', 'ॽ', 'ॾ', 'ॿ'][source]

CHARACTER_TO_DECOMPOSED: Dict[str, str] = {'क़': 'क़', 'ख़': 'ख़', 'ग़': 'ग़', 'ज़': 'ज़', 'ड़': 'ड़', 'ढ़': 'ढ़', 'फ़': 'फ़', 'य़': 'य़'}[source]

CHARACTER_TRANSLATOR = {2392: 'क़', 2393: 'ख़', 2394: 'ग़', 2395: 'ज़', 2396: 'ड़', 2397: 'ढ़', 2398: 'फ़', 2399: 'य़'}[source]

DIACRITICS = ['ऀ', 'ँ', 'ं', 'ः', 'ॄ', 'ॅ', '़', 'ा', 'ि', 'ी', 'ु', 'ू', 'ृ', 'े', 'ै', 'ॉ', 'ो', 'ौ', '्'][source]

END_OF_SENTENCE_MARKS: List[str] = ['.', '।', '॥', '?', '!'][source]

FULL_STOPS: List[str] = ['.', '।', '॥'][source]

NGRAM_ROMANIZATION_MAP = {'(?<=.क|.ख|.ग|.घ|घ़|.ङ|.च|.छ|.ज|.झ|.ञ|.ट|ट़|.ठ|.ड|.ढ|.ण|.त|.थ|.द|.ध|.न|.क़|.ख़|.ग़|.ज़|.ड़|.ढ़)ँ': 'n', '(?<=प|फ|फ़|ब|भ|म)ं': 'm', 'घ़': 'g̲̲h̲̲', 'ट़': 't̤', 'स़': 's̤', 'ह़': 'h̤'}[source]

NUMBER_REGEX = '\\d+(?:[\\.:]\\d+)*'[source]

PUNCTUATION: List[str] = ['.', '।', '॥', '?', '!', '॰', ',', ';', ':'][source]

QUESTION_MARKS: List[str] = ['?'][source]

ROMANIZATION_CHARACTER_TRANSLATOR = {2305: 'm̐', 2306: 'n', 2307: 'ḥ', 2308: 'ĕ', 2309: 'a', 2310: 'ā', 2311: 'i', 2312: 'ī', 2313: 'u', 2314: 'ū', 2315: 'r', 2316: 'l', 2318: 'ĕ', 2319: 'e', 2320: 'ai', 2321: 'ô', 2322: 'ŏ', 2323: 'o', 2324: 'au', 2325: 'k', 2326: 'kh', 2327: 'g', 2328: 'gh', 2329: 'ngh', 2330: 'ch', 2331: 'chh', 2332: 'j', 2333: 'jh', 2334: 'ñ', 2335: 'ṭ', 2336: 'ṭh', 2337: 'ḍ', 2338: 'ḍh', 2339: 'ṇ', 2340: 't', 2341: 'th', 2342: 'd', 2343: 'dh', 2344: 'n', 2346: 'p', 2347: 'ph', 2348: 'b', 2349: 'bh', 2350: 'm', 2351: 'y', 2352: 'r', 2354: 'l', 2357: 'v', 2358: 'sh', 2359: 's', 2360: 's', 2361: 'h', 2365: "'", 2366: 'a', 2367: 'i', 2368: 'ī', 2369: 'u', 2370: 'ū', 2371: 'r', 2372: 'r̄', 2373: 'ê', 2374: 'ĕ', 2375: 'e', 2376: 'ai', 2377: 'ô', 2378: 'ŏ', 2379: 'o', 2380: 'au', 2381: '', 2392: 'q', 2393: 'k̲h̲', 2394: 'g̲h̲', 2395: 'z', 2396: 'ṛ', 2397: 'ṛh', 2398: 'f', 2400: 'r̄', 2404: '.', 2405: '.', 2406: '0', 2407: '1', 2408: '2', 2409: '3', 2410: '4', 2411: '5', 2412: '6', 2413: '7', 2414: '8', 2415: '9', 2416: '.', 2418: 'ê'}[source]

ROMANIZATION_MAP = {'ँ': 'm̐', 'ं': 'n', 'ः': 'ḥ', 'ऄ': 'ĕ', 'अ': 'a', 'आ': 'ā', 'इ': 'i', 'ई': 'ī', 'उ': 'u', 'ऊ': 'ū', 'ऋ': 'r', 'ऌ': 'l', 'ऎ': 'ĕ', 'ए': 'e', 'ऐ': 'ai', 'ऑ': 'ô', 'ऒ': 'ŏ', 'ओ': 'o', 'औ': 'au', 'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'घ़': 'g̲̲h̲̲', 'ङ': 'ngh', 'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ñ', 'ट': 'ṭ', 'ट़': 't̤', 'ठ': 'ṭh', 'ड': 'ḍ', 'ढ': 'ḍh', 'ण': 'ṇ', 'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n', 'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm', 'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v', 'श': 'sh', 'ष': 's', 'स': 's', 'स़': 's̤', 'ह': 'h', 'ह़': 'h̤', 'ऽ': "'", 'ा': 'a', 'ि': 'i', 'ी': 'ī', 'ु': 'u', 'ू': 'ū', 'ृ': 'r', 'ॄ': 'r̄', 'ॅ': 'ê', 'ॆ': 'ĕ', 'े': 'e', 'ै': 'ai', 'ॉ': 'ô', 'ॊ': 'ŏ', 'ो': 'o', 'ौ': 'au', '्': '', 'क़': 'q', 'ख़': 'k̲h̲', 'ग़': 'g̲h̲', 'ज़': 'z', 'ड़': 'ṛ', 'ढ़': 'ṛh', 'फ़': 'f', 'ॠ': 'r̄', '।': '.', '॥': '.', '०': '0', '१': '1', '२': '2', '३': '3', '४': '4', '५': '5', '६': '6', '७': '7', '८': '8', '९': '9', '॰': '.', 'ॲ': 'ê'}[source]

ROMANIZATION_MAP_CONSONANTS_ASPIRATE = {'ह': 'h', 'ह़': 'h̤'}[source]

ROMANIZATION_MAP_CONSONANTS_CEREBRALS = {'ट': 'ṭ', 'ट़': 't̤', 'ठ': 'ṭh', 'ड': 'ḍ', 'ढ': 'ḍh', 'ण': 'ṇ', 'ड़': 'ṛ', 'ढ़': 'ṛh'}[source]

ROMANIZATION_MAP_CONSONANTS_DENTALS = {'त': 't', 'थ': 'th', 'द': 'd', 'ध': 'dh', 'न': 'n'}[source]

ROMANIZATION_MAP_CONSONANTS_GUTTURALS = {'क': 'k', 'ख': 'kh', 'ग': 'g', 'घ': 'gh', 'घ़': 'g̲̲h̲̲', 'ङ': 'ngh', 'क़': 'q', 'ख़': 'k̲h̲', 'ग़': 'g̲h̲'}[source]

ROMANIZATION_MAP_CONSONANTS_LABIALS = {'प': 'p', 'फ': 'ph', 'ब': 'b', 'भ': 'bh', 'म': 'm', 'फ़': 'f'}[source]

ROMANIZATION_MAP_CONSONANTS_PALATAS = {'च': 'ch', 'छ': 'chh', 'ज': 'j', 'झ': 'jh', 'ञ': 'ñ', 'ज़': 'z'}[source]

ROMANIZATION_MAP_CONSONANTS_SEMIVOWELS = {'य': 'y', 'र': 'r', 'ल': 'l', 'व': 'v'}[source]

ROMANIZATION_MAP_CONSONANTS_SIBILANTS = {'श': 'sh', 'ष': 's', 'स': 's', 'स़': 's̤'}[source]

ROMANIZATION_MAP_VOWELS_AND_DIPHTHONGS = {'ऄ': 'ĕ', 'अ': 'a', 'आ': 'ā', 'इ': 'i', 'ई': 'ī', 'उ': 'u', 'ऊ': 'ū', 'ऋ': 'r', 'ऌ': 'l', 'ऎ': 'ĕ', 'ए': 'e', 'ऐ': 'ai', 'ऑ': 'ô', 'ऒ': 'ŏ', 'ओ': 'o', 'औ': 'au', 'ा': 'a', 'ि': 'i', 'ी': 'ī', 'ु': 'u', 'ू': 'ū', 'ृ': 'r', 'ॄ': 'r̄', 'ॅ': 'ê', 'ॆ': 'ĕ', 'े': 'e', 'ै': 'ai', 'ॉ': 'ô', 'ॊ': 'ŏ', 'ो': 'o', 'ौ': 'au', 'ॠ': 'r̄', 'ॲ': 'ê'}[source]

SYMBOLS: List[str] = ['.', '।', '॥', '?', '!', '॰', ',', ';', ':', '(', ')', '[', ']', '{', '}', '-', '_', '/'][source]

UNALLOWED_CHARACTERS_REGEX = '[^खड़भऒ8॥O_ॱEॻॢऽऎॎ॔;फदॼॸा:डब॰इ3ऺऻरओऴI9षळग़ढफ़औZठॗीॅऊॉङॿFँ१धॹजॽG०श\\]ंT6़ै॓ऩख़M>2अवॡ५य़<A३ऋूृुUBकॏझॳ\\\nॾ!४DVतेKRYन्ईॆढ़Nॊौ८S\\}\\(ॲ,ए\\{क़छJ0णऍ\\[Cऑऌॣ।ॶॖगH\\?ॵP९लॕऐॠॺ/4आ15\\-६ऀQघ॒ॴसटमॷ7Lःॐञ२य॑चज़\\ ऱ७ोWॄउथपहऄि\\)\\.X]'[source]

UNICODE_RANGE: Tuple[int, int] = (2304, 2431)[source]

WORD_REGEX = '[^\\W_\\d]([^\\W_\\d]|[ऀँंःॄॅ़ािीुूृेैॉोौ्])*'[source]

classmethod allowed_characters() → Set[str][source][source]: Returns a set of all allowed characters in the language.

delete_unallowed_characters(text: str) → str[source][source]

detokenize(tokens: Iterable[str]) → str[source][source]: Joins tokens back into text.

get_tags(tokens: str | Iterable[str]) → List[Any][source][source]: Get the classifications of all tokens in the form of a sequence of tags.

get_word_senses(tokens: str | Iterable[str]) → List[List[str]][source][source]: Get all known meanings of the ambiguous words.

static name() → str[source][source]: Returns the name of the language as it is used throughout the datasets.

normalize_characters(text: str) → str[source][source]

preprocess(text: str) → str[source][source]: Preprocesses text before tokenization. Ensures that the same word is never written with different Unicode characters, and removes unnecessary symbols, spaces, etc.

romanize(text: str, *args, add_diacritics=True, **kwargs) → str[source][source]

Map Hindi characters to phonetically similar characters of the English language. Transliteration is useful for readability.

ALA-LC Romanization Table: https://www.loc.gov/catdir/cpso/romanization/hindi.pdf

Parameters:

text (str) – Hindi text to be mapped to Latin script.
add_diacritics (bool, optional) – Whether to use diacritics over English characters to help pronunciation. Defaults to True.

Examples:

import sign_language_translator as slt

nlp = slt.languages.text.Hindi()

text = "मैंने किताब खरीदी है।"
romanized_text = nlp.romanize(text)
print(romanized_text)
# 'mainne kitab khrīdī hai.'

text = "ईशांत शर्मा को उनकी शानदार गेंदबाजी के लिए १ प्लेयर ऑफ द मैच का अवॉर्ड दिया गया।"
text = nlp.preprocess(text)
romanized_text = nlp.romanize(text)
print(romanized_text)
# 'īshant shrma ko unkī shandar gendbajī ke lie 1 pleyr ôph d maich ka avôrḍ diya gya.'

sentence_tokenize(text: str) → List[str][source][source]: Break text into sentences.

tag(tokens: str | Iterable[str]) → List[Tuple[str, Any]][source][source]: Classify the tokens and mark them with appropriate tags.

classmethod token_regex() → str[source][source]: Returns a regular expression that matches words in this language.

tokenize(text: str) → List[str][source][source]: Break apart text into words or phrases.