Source code for sign_language_translator.languages.sign.pakistan_sign_language

"""Defines a class for constructing Pakistan Sign Language from text using rules."""

import random
import re
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from sign_language_translator.config.assets import Assets
from sign_language_translator.config.enums import SignLanguages
from sign_language_translator.languages.sign.mapping_rules import (
    CharacterByCharacterMappingRule,
    DirectMappingRule,
    LambdaMappingRule,
    MappingRule,
)
from sign_language_translator.languages.sign.sign_language import SignLanguage
from sign_language_translator.languages.vocab import Vocab
from sign_language_translator.text import Tags



[docs]
class PakistanSignLanguage(SignLanguage):
    """Rule-based mapper from text tokens to Pakistan Sign Language signs.

    The instance loads a vocabulary, turns it into a word -> sign-dict lookup
    and registers a list of prioritized mapping rules. Calling the instance
    first filters the sentence (``restructure_sentence``) and then maps every
    remaining token to its sign dictionaries (``tokens_to_sign_dicts``).

    Attributes:
        STOPWORDS (set): Tokens that ``restructure_sentence`` drops from the
            sentence. The comparison is done on the lower-cased token.
    """

    # tokens removed from a sentence before it is mapped to signs
    STOPWORDS = {"the", "so"}


[docs]
    @staticmethod
    def name() -> str:
        """Return the identifier string of this sign language.

        Returns:
            str: The value of ``SignLanguages.PAKISTAN_SIGN_LANGUAGE``.
        """
        language = SignLanguages.PAKISTAN_SIGN_LANGUAGE
        return language.value


    def __init__(self) -> None:
        """Load the vocabulary and register the token-to-sign mapping rules."""
        # vocabulary selected through regex filters; the country must be exactly "pk"
        self.vocab = Vocab(
            language=r".+",
            country=r"^pk$",
            organization=r".+",
            part_number=r"[0-9]+",
            data_root_dir=Assets.ROOT_DIR,
            arg_is_regex=True,
        )

        # word -> sign dict, built from the labels known for each word
        word_to_sign_dict = {}
        for word, labels in self.vocab.word_to_labels.items():
            word_to_sign_dict[word] = self._make_equal_weight_sign_dict(labels)
        self.word_to_sign_dict = word_to_sign_dict

        # the direct word lookup gets a lower priority value (4) than the
        # spelling and number rules (5), so it wins whenever it is applicable
        self._direct_rule = self.__get_direct_mapping_rule(priority=4)
        self._double_handed_spelling_rule = self.__get_spelling_rule(
            priority=5, word_sense_filter="double-handed"
        )
        self._single_handed_spelling_rule = self.__get_spelling_rule(
            priority=5, word_sense_filter="single-handed"
        )
        self._urdu_character_rule = self.__get_urdu_spelling_rule(priority=5)
        self._number_rule = self.__get_number_rule(priority=5)

        # not implemented yet:
        #   time rule (x:y:z -> x hours y minutes z seconds)
        #   date rule
        self.mapping_rules: List[MappingRule] = [
            self._direct_rule,
            self._double_handed_spelling_rule,
            self._single_handed_spelling_rule,
            self._urdu_character_rule,
            self._number_rule,
        ]


[docs]
    def tokens_to_sign_dicts(
        self,
        tokens: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        """Map every token to its sign dictionaries and concatenate the results.

        Args:
            tokens (Iterable[str]): The tokens to map. A bare string is treated
                as a single token. Any iterable, including a generator, is accepted.
            tags (Iterable[Any], optional): One tag per token. When missing or
                empty, every token gets the tag ``None``.
            contexts (Iterable[Any], optional): One context per token. When
                missing or empty, every token gets the context ``None``.

        Returns:
            List[Dict[str, List[List[str]] | List[float]]]: The sign
                dictionaries of all tokens, in token order.

        Raises:
            ValueError: Propagated from ``_apply_rules`` when no sign can be
                inferred for a token.
        """
        # Materialise the tokens once: a one-shot iterable (e.g. a generator)
        # would otherwise be exhausted while building the default tags below,
        # leaving nothing for zip() and silently producing an empty result.
        tokens = [tokens] if isinstance(tokens, str) else list(tokens)
        if not tags:
            tags = [None] * len(tokens)
        if not contexts:
            contexts = [None] * len(tokens)

        # one token may expand into several sign dicts (e.g. spelled-out words)
        signs = []
        for token, tag, context in zip(tokens, tags, contexts):
            signs.extend(self._apply_rules(token, tag, context))

        return signs


    def _apply_rules(
        self, token: str, tag=None, context=None
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        """Applies all the mapping rules to a token.
        Rules with lower value of priority overwrite the result.
        If multiple rules of same priority are applicable, one is selected at random.

        Args:
            token (str): The token to apply the rules to.
            tag (Any, optional): The tag associated with the token. Defaults to None.
            context (Any, optional): The context associated with the token. Defaults to None.

        Returns:
            List[Dict[str, List[List[str]] | List[float]]]:
                A list of sign dictionaries for the token.
        """

        sign = None
        priority = float("inf")

        for rule in self.mapping_rules:
            if rule.is_applicable(token.lower(), tag, context):
                if rule.priority < priority or (
                    rule.priority == priority and random.random() < 0.5
                ):
                    sign = rule.apply(token.lower())
                    priority = rule.priority

        if sign is None and tag == Tags.AMBIGUOUS:
            raise ValueError(
                f"Token '{token}' is ambiguous."
                + f"Try from {self.vocab.ambiguous_to_unambiguous.get(token,[])}."
            )
        if sign is None:
            raise ValueError(f"No PakistanSL sign could be inferred for {token = }.")

        return sign


[docs]
    def restructure_sentence(
        self,
        sentence: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> Tuple[Iterable[str], Iterable[Any], Iterable[Any]]:
        """Filter and normalise a tokenized sentence before it is mapped to signs.

        The following steps are applied to every token, in order:
            1. tokens whose lower-cased form is in ``STOPWORDS`` are dropped,
            2. tokens tagged as space or punctuation are dropped,
            3. commas are removed from tokens tagged as numbers,
            4. the word-sense marker is removed from tokens tagged as names.

        The order of the remaining tokens is kept; no grammatical reordering
        is performed here.

        Args:
            sentence (Iterable[str]): The tokens of the sentence. Any iterable,
                including a generator, is accepted.
            tags (Iterable[Any], optional): One tag per token. Defaults to
                ``Tags.DEFAULT`` for every token.
            contexts (Iterable[Any], optional): One context per token. Defaults
                to ``None`` for every token.

        Returns:
            Tuple[Iterable[str], Iterable[Any], Iterable[Any]]: Three lists of
                equal length: the kept tokens, their tags and their contexts.
        """
        # Materialise the sentence once: a one-shot iterable (e.g. a generator)
        # would otherwise be exhausted while building the defaults below and
        # the loop would see no tokens at all.
        tokens = list(sentence)
        if tags is None:
            tags = [Tags.DEFAULT] * len(tokens)
        if contexts is None:
            contexts = [None] * len(tokens)

        restructured_sentence = []
        restructured_tags = []
        restructured_contexts = []

        for token, tag, context in zip(tokens, tags, contexts):
            # drop stop-words
            if token.lower() in self.STOPWORDS:
                continue

            # drop space and punctuation
            if tag in {Tags.SPACE, Tags.PUNCTUATION}:
                continue

            # make numbers "numeric": strip the thousands separators
            if tag == Tags.NUMBER and "," in token:
                token = token.replace(",", "")

            # drop word-sense: "(name)" from the NAME token
            if tag == Tags.NAME:
                token = re.sub(self.vocab.word_sense_regex, "", token)

            restructured_sentence.append(token)
            restructured_tags.append(tag)
            restructured_contexts.append(context)

        return restructured_sentence, restructured_tags, restructured_contexts


    def __call__(
        self,
        tokens: Iterable[str],
        tags: Optional[Iterable[Any]] = None,
        contexts: Optional[Iterable[Any]] = None,
    ) -> List[Dict[str, Union[List[List[str]], List[float]]]]:
        tokens, tags, contexts = self.restructure_sentence(
            tokens, tags=tags, contexts=contexts
        )
        signs = self.tokens_to_sign_dicts(tokens, tags=tags, contexts=contexts)

        return signs

    def __get_direct_mapping_rule(self, priority=5):
        """Build the rule that looks a whole token up in ``word_to_sign_dict``.

        Args:
            priority (int, optional): The priority given to the rule. Defaults to 5.

        Returns:
            DirectMappingRule: A rule built from a mapping of each known word
                to a one-element list holding its sign dict.
        """
        word_to_sign_sequence = {}
        for word, sign_dict in self.word_to_sign_dict.items():
            # the rule receives a list of sign dicts per word
            word_to_sign_sequence[word] = [sign_dict]
        return DirectMappingRule(word_to_sign_sequence, priority)

    def __get_spelling_rule(
        self,
        priority: int,
        word_sense_filter="handed-letter",
    ):
        """Build a character-by-character rule from the letter signs of one kind.

        Only vocabulary keys that contain ``word_sense_filter`` are used. Each
        of them is registered under both the lower-case and the upper-case form
        of the key with its word sense removed.

        Args:
            priority (int): The priority given to the rule.
            word_sense_filter (str, optional): Substring a vocabulary key must
                contain to be used. Defaults to "handed-letter".

        Returns:
            CharacterByCharacterMappingRule: The rule, constructed with the
                tag set {NAME, ACRONYM} (presumably the tags it is allowed
                for; confirm against the rule class).
        """
        letter_to_sign = {}
        for key, sign_dict in self.word_to_sign_dict.items():
            bare_key = self.vocab.remove_word_sense(key)
            lower_key, upper_key = bare_key.lower(), bare_key.upper()
            if word_sense_filter not in key:
                continue
            # lower case is inserted first, then upper case; later keys overwrite
            letter_to_sign[lower_key] = sign_dict
            letter_to_sign[upper_key] = sign_dict

        allowed_tags = {Tags.NAME, Tags.ACRONYM}
        return CharacterByCharacterMappingRule(letter_to_sign, allowed_tags, priority)

    def __get_urdu_spelling_rule(self, priority=5):
        """Build a character-by-character rule from the single-character words.

        Every vocabulary key that is exactly one character long and is not
        numeric is used as a spellable character.

        Args:
            priority (int, optional): The priority given to the rule. Defaults to 5.

        Returns:
            CharacterByCharacterMappingRule: The rule, constructed with the
                tag set {NAME} (presumably the tags it is allowed for;
                confirm against the rule class).
        """
        character_to_sign = {}
        for key, sign_dict in self.word_to_sign_dict.items():
            if len(key) != 1 or key.isnumeric():
                continue
            character_to_sign[key] = sign_dict

        return CharacterByCharacterMappingRule(character_to_sign, {Tags.NAME}, priority)

    def __get_number_rule(self, priority=5):
        """Build the rule that signs a number chunk by chunk.

        The rule is applicable to tokens tagged as numbers whose chunks (see
        ``__chunk_a_number``) are all present in ``word_to_sign_dict``.
        Applying it returns the sign dict of every chunk, in order.

        Args:
            priority (int, optional): The priority given to the rule. Defaults to 5.

        Returns:
            LambdaMappingRule: The rule wrapping the two functions below.
        """

        def every_chunk_is_known(token, tag, context):
            # only number tokens qualify, and every chunk must have a sign
            if not tag == Tags.NUMBER:
                return False
            return all(
                chunk in self.word_to_sign_dict
                for chunk in self.__chunk_a_number(token)
            )

        def chunks_to_signs(token):
            chunks = self.__chunk_a_number(token)
            return [self.word_to_sign_dict[chunk] for chunk in chunks]

        return LambdaMappingRule(
            is_applicable_function=every_chunk_is_known,
            apply_function=chunks_to_signs,
            priority=priority,
        )

    def __chunk_a_number(self, num):
        return re.findall(
            r"("
            + r"|".join(sorted(self.vocab.numeric_keys, key=len, reverse=True))
            + r"|\d|\.|.)",
            str(self.vocab.words_to_numbers.get(num, num)),
        )