sign_language_translator.languages.text.urdu module

class sign_language_translator.languages.text.urdu.Urdu[source][source]

Bases: TextLanguage

NLP class for Urdu text. Extends slt.languages.text.TextLanguage class.

Urdu is an Indo-Aryan language spoken mostly in Pakistan. It is written from right to left in the Perso-Arabic script, which consists of 46 letters, 10 digits, 6 punctuation marks and 6 diacritics. See the Unicode details at: https://unicode.org/charts/PDF/U0600.pdf

ALLOWED_CHARACTERS = {'\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '<', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '{', '}', '،', 'ؐ', 'ؑ', 'ؒ', 'ؓ', '؛', '؟', 'ء', 'آ', 'أ', 'ؤ', 'ئ', 'ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ل', 'م', 'ن', 'و', 'ً', 'ٍ', 'َ', 'ُ', 'ِ', 'ّ', '٫', 'ٰ', 'ٹ', 'پ', 'چ', 'ڈ', 'ڑ', 'ژ', 'ک', 'گ', 'ں', 'ھ', 'ہ', 'ۂ', 'ۃ', 'ی', 'ے', 'ۓ', '۔', '۰', '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹', '‘', '’', '“', '”'}[source]

BRACKETS: List[str] = ['(', ')'][source]

CHARACTER_TO_WORD = {'–': '-', '—': '-', '−': '-', '⋯': '...', 'ﷲ': 'اللہ', 'ﷺ': ' صلی اللہ علیہ وسلم', '﷽': 'بسم اللہ الرحمن الرحیم'}[source]

CHARACTER_TRANSLATOR = {1577: 'ۃ', 1603: 'ک', 1607: 'ہ', 1609: 'ی', 1610: 'ی', 1632: '۰', 1633: '۱', 1634: '۲', 1635: '۳', 1636: '۴', 1637: '۵', 1638: '۶', 1639: '۷', 1640: '۸', 1641: '۹', 8211: '-', 8212: '-', 8722: '-', 8943: '...', 64342: 'پ', 64344: 'پ', 64345: 'پ', 64358: 'ٹ', 64359: 'ٹ', 64360: 'ٹ', 64361: 'ٹ', 64378: 'چ', 64379: 'چ', 64380: 'چ', 64381: 'چ', 64392: 'ڈ', 64393: 'ڈ', 64395: 'ژ', 64396: 'ڑ', 64397: 'ڑ', 64398: 'ک', 64399: 'ک', 64400: 'ک', 64401: 'ک', 64402: 'گ', 64403: 'گ', 64404: 'گ', 64405: 'گ', 64414: 'ں', 64415: 'ں', 64422: 'ہ', 64423: 'ہ', 64424: 'ہ', 64425: 'ہ', 64426: 'ھ', 64427: 'ھ', 64428: 'ھ', 64429: 'ھ', 64430: 'ے', 64431: 'ے', 64508: 'ی', 64509: 'ی', 64510: 'ی', 64511: 'ی', 65010: 'اللہ', 65018: ' صلی اللہ علیہ وسلم', 65021: 'بسم اللہ الرحمن الرحیم', 65152: 'ء', 65153: 'آ', 65154: 'آ', 65155: 'أ', 65157: 'ؤ', 65163: 'ئ', 65164: 'ئ', 65165: 'ا', 65166: 'ا', 65167: 'ب', 65168: 'ب', 65169: 'ب', 65170: 'ب', 65173: 'ت', 65174: 'ت', 65175: 'ت', 65176: 'ت', 65178: 'ث', 65179: 'ث', 65180: 'ث', 65181: 'ج', 65182: 'ج', 65183: 'ج', 65184: 'ج', 65185: 'ح', 65186: 'ح', 65187: 'ح', 65188: 'ح', 65190: 'خ', 65191: 'خ', 65192: 'خ', 65193: 'د', 65194: 'د', 65195: 'ذ', 65196: 'ذ', 65197: 'ر', 65198: 'ر', 65199: 'ز', 65200: 'ز', 65201: 'س', 65202: 'س', 65203: 'س', 65204: 'س', 65205: 'ش', 65206: 'ش', 65207: 'ش', 65208: 'ش', 65209: 'ص', 65210: 'ص', 65211: 'ص', 65212: 'ص', 65213: 'ض', 65214: 'ض', 65215: 'ض', 65216: 'ض', 65219: 'ط', 65220: 'ط', 65221: 'ظ', 65223: 'ظ', 65224: 'ظ', 65225: 'ع', 65226: 'ع', 65227: 'ع', 65228: 'ع', 65229: 'غ', 65231: 'غ', 65232: 'غ', 65233: 'ف', 65234: 'ف', 65235: 'ف', 65236: 'ف', 65237: 'ق', 65238: 'ق', 65239: 'ق', 65240: 'ق', 65243: 'ک', 65245: 'ل', 65246: 'ل', 65247: 'ل', 65248: 'ل', 65249: 'م', 65250: 'م', 65251: 'م', 65252: 'م', 65253: 'ن', 65254: 'ن', 65255: 'ن', 65256: 'ن', 65257: 'ہ', 65258: 'ہ', 65259: 'ھ', 65260: 'ھ', 65261: 'و', 65262: 'و', 65264: 'ی', 65265: 'ی', 65266: 'ی', 65267: 'ے', 65268: 'ے', 65275: 'لا', 
65276: 'لا'}[source]

COMBINE_CHARACTERS_REGEX = 'آ|أ|ؤ|ۂ|یٔ|ۓ|ََ|ِِ'[source]

CORRECT_URDU_CHARACTERS_TO_INCORRECT: Dict[str, List[str]] = {'،': [], '؟': [], 'ء': ['ﺀ'], 'آ': ['ﺁ', 'ﺂ'], 'أ': ['ﺃ'], 'ؤ': ['ﺅ'], 'ئ': ['ﺋ', 'ﺌ'], 'ا': ['ﺍ', 'ﺎ'], 'ب': ['ﺏ', 'ﺐ', 'ﺑ', 'ﺒ'], 'ت': ['ﺕ', 'ﺖ', 'ﺗ', 'ﺘ'], 'ث': ['ﺛ', 'ﺜ', 'ﺚ'], 'ج': ['ﺝ', 'ﺞ', 'ﺟ', 'ﺠ'], 'ح': ['ﺡ', 'ﺣ', 'ﺤ', 'ﺢ'], 'خ': ['ﺧ', 'ﺨ', 'ﺦ'], 'د': ['ﺩ', 'ﺪ'], 'ذ': ['ﺬ', 'ﺫ'], 'ر': ['ﺭ', 'ﺮ'], 'ز': ['ﺯ', 'ﺰ'], 'س': ['ﺱ', 'ﺲ', 'ﺳ', 'ﺴ'], 'ش': ['ﺵ', 'ﺶ', 'ﺷ', 'ﺸ'], 'ص': ['ﺹ', 'ﺺ', 'ﺻ', 'ﺼ'], 'ض': ['ﺽ', 'ﺾ', 'ﺿ', 'ﻀ'], 'ط': ['ﻃ', 'ﻄ'], 'ظ': ['ﻅ', 'ﻇ', 'ﻈ'], 'ع': ['ﻉ', 'ﻊ', 'ﻋ', 'ﻌ'], 'غ': ['ﻍ', 'ﻏ', 'ﻐ'], 'ف': ['ﻑ', 'ﻒ', 'ﻓ', 'ﻔ'], 'ق': ['ﻕ', 'ﻖ', 'ﻗ', 'ﻘ'], 'ل': ['ﻝ', 'ﻞ', 'ﻟ', 'ﻠ'], 'لا': ['ﻻ', 'ﻼ'], 'م': ['ﻡ', 'ﻢ', 'ﻣ', 'ﻤ'], 'ن': ['ﻥ', 'ﻦ', 'ﻧ', 'ﻨ'], 'و': ['ﻮ', 'ﻭ', 'ﻮ'], '٫': [], 'ٹ': ['ﭦ', 'ﭧ', 'ﭨ', 'ﭩ'], 'پ': ['ﭖ', 'ﭘ', 'ﭙ'], 'چ': ['ﭺ', 'ﭻ', 'ﭼ', 'ﭽ'], 'ڈ': ['ﮈ', 'ﮉ'], 'ڑ': ['ﮍ', 'ﮌ'], 'ژ': ['ﮋ'], 'ک': ['ﮎ', 'ﮏ', 'ﮐ', 'ﮑ', 'ﻛ', 'ك'], 'گ': ['ﮒ', 'ﮓ', 'ﮔ', 'ﮕ'], 'ں': ['ﮞ', 'ﮟ'], 'ھ': ['ﮪ', 'ﮬ', 'ﮭ', 'ﻬ', 'ﻫ', 'ﮫ'], 'ہ': ['ﻩ', 'ﮦ', 'ﻪ', 'ﮧ', 'ﮩ', 'ﮨ', 'ه'], 'ۂ': [], 'ۃ': ['ة'], 'ی': ['ﯼ', 'ى', 'ﯽ', 'ﻰ', 'ﻱ', 'ﻲ', 'ﯾ', 'ﯿ', 'ي'], 'ے': ['ﮮ', 'ﮯ', 'ﻳ', 'ﻴ'], 'ۓ': [], '۔': [], '۰': ['٠'], '۱': ['١'], '۲': ['٢'], '۳': ['٣'], '۴': ['٤'], '۵': ['٥'], '۶': ['٦'], '۷': ['٧'], '۸': ['٨'], '۹': ['٩']}[source]

DIACRITICS = ['ٍ', 'ً', 'ٰ', 'َ', 'ُ', 'ِ', 'ّ'][source]

DIACRITICS_REGEX = 'ٍ|ً|ٰ|َ|ُ|ِ|ّ'[source]

END_OF_SENTENCE_MARKS: List[str] = ['.', '۔', '?', '؟', '!'][source]

FULL_STOPS: List[str] = ['.', '۔'][source]

HONORIFICS = ['ؐ', 'ؑ', 'ؒ', 'ؓ'][source]

NGRAM_ROMANIZATION_MAP = {'(?<=[ا])و(?![ں])': 'v', '(?<=[ُ])و(?![ا])': '', '(?<=\\d)\\s*ء': 'CE', '(?<=\\w)آ': "'ā", '(?<=ا)ی': 'y', '(?<=ل)ئ(?=ے)': 'ie', '(?<=ہ)ی(?!\\b)': 'e', '\\bع(?=ی)': 'ei', '\\bو': 'v', '\\bی': 'y', 'اً': 'an', 'اَ': 'a', 'اُ': 'u', 'اِ': 'i', 'و(?=[ؤ])': 'u', 'و(?=[اَےی])': 'v', 'ًا': 'an', 'ِ(?!\\w)': '-e', 'ی(?=[وای])': 'y', 'ی(?=ں)': 'ei', 'یٰ': 'a'}[source]

NUMBER_REGEX = '\\d+(?:[٫\\.:]\\d+)*'[source]

PUNCTUATION: List[str] = ['.', '۔', '?', '؟', '!', ',', '،', '؛'][source]

PUNCTUATION_REGEX = '[\\.۔\\?؟!,،؛]'[source]

QUESTION_MARKS: List[str] = ['?', '؟'][source]

QUOTATION_MARKS = ["'", '"', '”', '“', '’', '‘'][source]

ROMANIZATION_CHARACTER_TRANSLATOR = {1548: ',', 1552: '(PBUH)', 1553: '(AS)', 1554: '(RH)', 1555: '(RA)', 1563: ';', 1567: '?', 1569: "'", 1570: 'aa', 1571: 'a', 1572: 'ow', 1574: 'e', 1575: 'a', 1576: 'b', 1578: 't', 1579: 's', 1580: 'j', 1581: 'h', 1582: 'k̲h̲', 1583: 'd', 1584: 'z', 1585: 'r', 1586: 'z', 1587: 's', 1588: 'sh', 1589: 's', 1590: 'z', 1591: 't', 1592: 'z', 1593: "a'", 1594: 'g̲h̲', 1601: 'f', 1602: 'q', 1604: 'l', 1605: 'm', 1606: 'n', 1608: 'o', 1611: 'an', 1613: 'in', 1614: 'a', 1615: 'u', 1616: 'i', 1643: '.', 1648: 'a', 1657: 'ṭ', 1662: 'p', 1670: 'ch', 1672: 'ḍ', 1681: 'ṛ', 1688: 'zh', 1705: 'k', 1711: 'g', 1722: 'n̲', 1726: 'h', 1729: 'h', 1730: 'h-e', 1731: 't', 1740: 'i', 1746: 'y', 1747: 'ey', 1748: '.', 1776: '0', 1777: '1', 1778: '2', 1779: '3', 1780: '4', 1781: '5', 1782: '6', 1783: '7', 1784: '8', 1785: '9'}[source]

ROMANIZATION_MAP = {'،': ',', 'ؐ': '(PBUH)', 'ؑ': '(AS)', 'ؒ': '(RH)', 'ؓ': '(RA)', '؛': ';', '؟': '?', 'ء': "'", 'آ': 'aa', 'أ': 'a', 'ؤ': 'ow', 'ئ': 'e', 'ا': 'a', 'ب': 'b', 'ت': 't', 'ث': 's', 'ج': 'j', 'ح': 'h', 'خ': 'k̲h̲', 'د': 'd', 'ذ': 'z', 'ر': 'r', 'ز': 'z', 'س': 's', 'ش': 'sh', 'ص': 's', 'ض': 'z', 'ط': 't', 'ظ': 'z', 'ع': "a'", 'غ': 'g̲h̲', 'ف': 'f', 'ق': 'q', 'ل': 'l', 'م': 'm', 'ن': 'n', 'و': 'o', 'ً': 'an', 'ٍ': 'in', 'َ': 'a', 'ُ': 'u', 'ِ': 'i', '٫': '.', 'ٰ': 'a', 'ٹ': 'ṭ', 'پ': 'p', 'چ': 'ch', 'ڈ': 'ḍ', 'ڑ': 'ṛ', 'ژ': 'zh', 'ک': 'k', 'گ': 'g', 'ں': 'n̲', 'ھ': 'h', 'ہ': 'h', 'ۂ': 'h-e', 'ۃ': 't', 'ی': 'i', 'ے': 'y', 'ۓ': 'ey', '۔': '.', '۰': '0', '۱': '1', '۲': '2', '۳': '3', '۴': '4', '۵': '5', '۶': '6', '۷': '7', '۸': '8', '۹': '9'}[source]

SPLIT_TO_COMBINED_CHARACTERS: Dict[str, str] = {'آ': 'آ', 'أ': 'أ', 'ؤ': 'ؤ', 'ََ': 'ً', 'ِِ': 'ٍ', 'ۂ': 'ۂ', 'یٔ': 'ئ', 'ۓ': 'ۓ'}[source]

SYMBOLS: List[str] = ['.', '۔', '?', '؟', '!', ',', '،', '؛', "'", '"', '”', '“', '’', '‘', '(', ')', ' ', '-'][source]

UNALLOWED_CHARACTERS_REGEX = '[^ۓگ\\\nؐ۶84!DOخV؟EاۂKؤRYNزأ۳۱ٍS\\}\\(,\\{ٰژ۹ؑJ0قC\\[دؓہےH؛ڑ’3م\\?نPئُحصجںIشعUل\\-51۷ZطQسؒ۸ظ/ّF‘و۔7۰L9َGِ\\]بغآT6ً۲،”ف“پذتھی\'Mڈ>ٹثۃ\\ W2۵Bء<ضAر"چ٫\\)\\.ک۴X]'[source]

UNICODE_RANGE: Tuple[int, int] = (1536, 1791)[source]

WORD_REGEX = '[\\wًٍَُِّٰ]+'[source]

classmethod allowed_characters() → Set[str][source][source]: Returns a set of all allowed characters in the language.

static character_normalize(text: str) → str[source][source]

Replace characters that render the same as Urdu characters in common fonts, but actually belong to other Unicode character ranges, with the corresponding Urdu characters.

Parameters: text (str) – a piece of Urdu text that may contain foreign symbols
Returns: normalized Urdu text
Return type: str

delete_unallowed_characters(text: str) → str[source][source]

detokenize(tokens: Iterable[str]) → str[source][source]: Joins tokens back into text.

get_tags(tokens: str | Iterable[str]) → List[Any][source][source]: Get the classifications of all tokens in the form of a sequence of tags.

get_word_senses(tokens: str | Iterable[str]) → List[List[str]][source][source]: Get all known meanings of the ambiguous words.

static name() → str[source][source]: Returns the name of the language used everywhere else in datasets.

static passage_preprocessor(text: str) → str[source][source]

static poetry_preprocessor(text: str) → str[source][source]

preprocess(text: str) → str[source][source]: Preprocesses text before tokenization. Ensures that the same word is not written with different Unicode characters, and removes unnecessary symbols, spaces, etc.

static remove_diacritics(text: str) → str[source][source]

romanize(text: str, *args, add_diacritics=True, **kwargs) → str[source][source]

Map Urdu characters to phonetically similar characters of the English language. Transliteration is useful for readability.

ALA-LC Romanization Table: https://www.loc.gov/catdir/cpso/romanization/urdu.pdf

Parameters:

text (str) – Urdu text to be mapped to Latin script.
add_diacritics (bool, optional) – Whether to use diacritics over English characters to ease pronunciation. (Rules: 1. The under-dot ‘ ̣’ indicates alternate soft/hard pronunciation of the letter. 2. The over-bar/macron ‘ ̄’ means long pronunciation. 3. The consecutive underline ‘ ̲ ̲’ means the characters come from a single source letter). Defaults to True.

Examples:

import sign_language_translator as slt

nlp = slt.languages.text.Urdu()

text = "میں نے ۴۷ کتابیں خریدی ہیں۔"
romanized_text = nlp.romanize(text)
print(romanized_text)
# 'mein̲ ny 47 ktabein̲ k̲h̲ridi hen̲.'

text = "مکّهی کا زکریّاؒ کی قابلِ تعریف قوّت سے منہ کهٹّا ہو گیا ہے۔۔۔"
text = nlp.preprocess(text)
romanized_text = nlp.romanize(text, add_diacritics=False)
print(romanized_text)
# "mkkhi ka zkryya(RH) ki qabl-e ta'rif qoot sy mnh khtta ho gya hy..."

sentence_tokenize(text: str) → List[str][source][source]: Break text into sentences.

tag(tokens: str | Iterable[str]) → List[Any][source]: Classify the tokens and mark them with appropriate tags.

classmethod token_regex() → str[source]: Returns a regular expression that matches words in this language.

tokenize(text: str) → List[str][source]: Break apart text into words or phrases.

static wikipedia_preprocessor(text: str) → str[source]