# Source code for sign_language_translator.languages.sign.pakistan_sign_language

"""Defines a class for constructing Pakistan Sign Language from text using rules."""

import random
import re
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from sign_language_translator.config.assets import Assets
from sign_language_translator.config.enums import SignLanguages
from sign_language_translator.languages.sign.mapping_rules import (
    CharacterByCharacterMappingRule,
    DirectMappingRule,
    LambdaMappingRule,
    MappingRule,
)
from sign_language_translator.languages.sign.sign_language import SignLanguage
from sign_language_translator.languages.vocab import Vocab
from sign_language_translator.text import Tags


class PakistanSignLanguage(SignLanguage):
    """Rule-based construction of Pakistan Sign Language from text tokens.

    The class loads a vocabulary, turns it into a word -> sign-dict lookup
    and wraps that lookup in a list of prioritised mapping rules. Calling an
    instance first restructures the sentence (drops stop-words, spaces and
    punctuation; normalises numbers and names) and then maps every remaining
    token to its sign dictionaries.

    Attributes:
        STOPWORDS (set): Lower-cased tokens that ``restructure_sentence``
            removes from the sentence.
    """

    STOPWORDS = {"the", "so"}

    @staticmethod
    def name() -> str:
        """Return ``SignLanguages.PAKISTAN_SIGN_LANGUAGE.value``."""
        return SignLanguages.PAKISTAN_SIGN_LANGUAGE.value

    def __init__(self) -> None:
        """Load the vocabulary and build the mapping rules."""
        # Load word maps and info from dataset files. Every argument is a
        # regex (arg_is_regex=True); only the country is pinned, to "pk".
        self.vocab = Vocab(
            language=r".+",
            country=r"^pk$",
            organization=r".+",
            part_number=r"[0-9]+",
            data_root_dir=Assets.ROOT_DIR,
            arg_is_regex=True,
        )

        # Restructure dict values: labels -> equally weighted sign dict.
        self.word_to_sign_dict = {
            word: self._make_equal_weight_sign_dict(labels)
            for word, labels in self.vocab.word_to_labels.items()
        }

        # Define mapping rules. A lower priority value wins in _apply_rules,
        # so the direct word lookup (4) beats every spelling/number rule (5).
        self._direct_rule = self.__get_direct_mapping_rule(4)
        self._double_handed_spelling_rule = self.__get_spelling_rule(
            5, "double-handed"
        )
        self._single_handed_spelling_rule = self.__get_spelling_rule(
            5, "single-handed"
        )
        self._urdu_character_rule = self.__get_urdu_spelling_rule(5)
        self._number_rule = self.__get_number_rule(5)

        # TODO: add a time rule (x:y:z -> x hours y minutes z seconds)
        # and a date rule.
        self.mapping_rules: List[MappingRule] = [
            self._direct_rule,
            self._double_handed_spelling_rule,
            self._single_handed_spelling_rule,
            self._urdu_character_rule,
            self._number_rule,
        ]

    def tokens_to_sign_dicts(
        self,
        tokens: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        """Map every token to its sign dictionaries and concatenate them.

        Args:
            tokens: The tokens to translate. A single ``str`` is treated as
                one token, not as a sequence of characters.
            tags: One tag per token. When missing or empty, ``None`` is used
                for every token.
            contexts: One context per token. When missing or empty, ``None``
                is used for every token.

        Returns:
            The sign dictionaries of all tokens, in token order.

        Raises:
            ValueError: Propagated from ``_apply_rules`` when no rule can map
                a token.
        """
        # Materialise once: the defaults below need the token count, and a
        # one-shot iterable (e.g. a generator) must not be exhausted before
        # it reaches zip().
        tokens = [tokens] if isinstance(tokens, str) else list(tokens)
        if not tags:
            tags = [None] * len(tokens)
        if not contexts:
            contexts = [None] * len(tokens)

        return [
            sign_dict
            for token, tag, context in zip(tokens, tags, contexts)
            for sign_dict in self._apply_rules(token, tag, context)
        ]

    def _apply_rules(
        self, token: str, tag=None, context=None
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        """Apply the mapping rules to a token and return the winning result.

        Rules are visited in ``self.mapping_rules`` order. An applicable rule
        with a strictly lower priority value replaces the current result. An
        applicable rule with an equal priority replaces it on a coin flip
        (``random.random() < 0.5``).

        Args:
            token (str): The token to map; rules receive it lower-cased.
            tag (Any, optional): The tag associated with the token.
                Defaults to None.
            context (Any, optional): The context associated with the token.
                Defaults to None.

        Returns:
            List[Dict[str, List[List[str]] | List[float]]]: The sign
            dictionaries produced by the selected rule.

        Raises:
            ValueError: If no rule is applicable. The message lists the
                unambiguous alternatives when the tag is ``Tags.AMBIGUOUS``.
        """
        lowered = token.lower()
        sign = None
        priority = float("inf")

        for rule in self.mapping_rules:
            if not rule.is_applicable(lowered, tag, context):
                continue
            if rule.priority < priority or (
                rule.priority == priority and random.random() < 0.5
            ):
                sign = rule.apply(lowered)
                priority = rule.priority

        if sign is None:
            if tag == Tags.AMBIGUOUS:
                raise ValueError(
                    f"Token '{token}' is ambiguous. "
                    f"Try from {self.vocab.ambiguous_to_unambiguous.get(token, [])}."
                )
            raise ValueError(f"No PakistanSL sign could be inferred for {token = }.")

        return sign

    def restructure_sentence(
        self,
        sentence: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> Tuple[Iterable[str], Iterable[Any], Iterable[Any]]:
        """Drop and normalise tokens before they are mapped to signs.

        Stop-words, spaces and punctuation are removed; thousands separators
        are stripped from numbers; the word-sense suffix is stripped from
        names. Tokens are NOT reordered here (the original intent, mapping to
        Urdu grammar such as "he goes to school" -> ["he", "school", "go"],
        is not implemented by this method).

        Args:
            sentence: The tokens of the sentence.
            tags: One tag per token. Defaults to ``Tags.DEFAULT`` for every
                token when None.
            contexts: One context per token. Defaults to ``None`` for every
                token when None.

        Returns:
            Three parallel lists: the kept tokens, their tags and their
            contexts.
        """
        # Materialise once so that building the defaults cannot exhaust a
        # one-shot iterable before the loop below reads it.
        sentence = list(sentence)
        if tags is None:
            tags = [Tags.DEFAULT] * len(sentence)
        if contexts is None:
            contexts = [None] * len(sentence)

        restructured_sentence = []
        restructured_tags = []
        restructured_contexts = []

        for token, tag, context in zip(sentence, tags, contexts):
            # drop stop-words
            if token.lower() in self.STOPWORDS:
                continue

            # drop space and punctuation
            if tag in {Tags.SPACE, Tags.PUNCTUATION}:
                continue

            # make numbers "numeric": "1,000" -> "1000"
            if tag == Tags.NUMBER and "," in token:
                token = token.replace(",", "")

            # drop word-sense: "(name)" from the NAME token
            if tag == Tags.NAME:
                token = re.sub(self.vocab.word_sense_regex, "", token)

            restructured_sentence.append(token)
            restructured_tags.append(tag)
            restructured_contexts.append(context)

        return restructured_sentence, restructured_tags, restructured_contexts

    def __call__(
        self,
        tokens: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        """Restructure the sentence, then map its tokens to sign dicts."""
        tokens, tags, contexts = self.restructure_sentence(
            tokens, tags=tags, contexts=contexts
        )
        return self.tokens_to_sign_dicts(tokens, tags=tags, contexts=contexts)

    def __get_direct_mapping_rule(self, priority=5):
        """Build the rule that looks a whole token up in the vocabulary."""
        # Each sign dict is wrapped in a list because a rule result is a
        # list of sign dicts (see the return type of _apply_rules).
        return DirectMappingRule(
            {word: [sign_dict] for word, sign_dict in self.word_to_sign_dict.items()},
            priority,
        )

    def __get_spelling_rule(
        self,
        priority: int,
        word_sense_filter="handed-letter",
    ):
        """Build a character-by-character rule for names and acronyms.

        Only vocabulary keys containing ``word_sense_filter`` are used. Their
        word sense is removed and both the lower- and upper-case forms are
        registered.
        """
        letter_to_sign = {}
        for key, sign_dict in self.word_to_sign_dict.items():
            # Filter first so remove_word_sense only runs on relevant keys.
            if word_sense_filter not in key:
                continue
            letter = self.vocab.remove_word_sense(key)
            letter_to_sign[letter.lower()] = sign_dict
            letter_to_sign[letter.upper()] = sign_dict

        return CharacterByCharacterMappingRule(
            letter_to_sign,
            {Tags.NAME, Tags.ACRONYM},
            priority,
        )

    def __get_urdu_spelling_rule(self, priority=5):
        """Build a character-by-character rule from single-character keys.

        Uses every vocabulary key of length one that is not numeric and
        applies to tokens tagged ``Tags.NAME``.
        """
        single_characters = {
            key: sign_dict
            for key, sign_dict in self.word_to_sign_dict.items()
            if len(key) == 1 and not key.isnumeric()
        }
        return CharacterByCharacterMappingRule(
            single_characters,
            {Tags.NAME},
            priority,
        )

    def __get_number_rule(self, priority=5):
        """Build the rule that signs a number chunk by chunk."""

        def is_applicable(token, tag, context):
            # Applicable only when every chunk of the number has a sign.
            return tag == Tags.NUMBER and all(
                chunk in self.word_to_sign_dict
                for chunk in self.__chunk_a_number(token)
            )

        def apply(token):
            return [
                self.word_to_sign_dict[chunk]
                for chunk in self.__chunk_a_number(token)
            ]

        return LambdaMappingRule(
            is_applicable_function=is_applicable,
            apply_function=apply,
            priority=priority,
        )

    def __chunk_a_number(self, num):
        """Split a number token into the largest chunks that have a key.

        ``num`` is first passed through ``self.vocab.words_to_numbers`` (left
        unchanged when absent) and converted to ``str``. The text is then
        split greedily: the longest entries of ``self.vocab.numeric_keys``
        are tried first, then any single character.

        Returns:
            List[str]: The chunks in their original order.
        """
        # Escape the keys so they match literally, and skip empty keys: an
        # empty alternative would match the empty string at every position.
        alternatives = [
            re.escape(key)
            for key in sorted(self.vocab.numeric_keys, key=len, reverse=True)
            if key
        ]
        alternatives += [r"\d", r"\.", r"."]
        pattern = r"(" + r"|".join(alternatives) + r")"

        return re.findall(pattern, str(self.vocab.words_to_numbers.get(num, num)))