# Source code for sign_language_translator.text.synonyms

"""
This module provides a SynonymFinder class that can find synonyms of a given text
by utilizing translation and back-translation or similarity in embedding vectors.

Dependencies:
- deep_translator (optional; only needed for translation-based synonyms)
- urllib3

Classes:
    - SynonymFinder: A class for finding synonyms using translation and similarity methods.
"""

from collections import Counter
from typing import Dict, List, Optional
from warnings import warn

from urllib3.exceptions import HTTPError

try:
    from deep_translator import GoogleTranslator
    from deep_translator.exceptions import (
        BaseError as DeepTranslatorError,
        TooManyRequests,
    )
except ImportError:
    GoogleTranslator = None
    DeepTranslatorError = None
    TooManyRequests = None

from sign_language_translator.utils.parallel import threaded_map


class SynonymFinder:
    """
    Find synonyms of a given text using two different approaches:

    1. Translation and back-translation through the 'synonyms_by_translation'
       method (requires internet).
    2. Embedding-based similarity search through the 'synonyms_by_similarity'
       method.

    Attributes:
        language (str): The target language for translation.
            Use 2-letter codes (ISO 639-1).
        translator (GoogleTranslator): The lazily created translator object
            (source "auto", target `language`).
        intermediate_languages (List[str]): Language codes supported by the
            translator. The current language is not filtered out of this list.
        embedding_model: The lazily loaded embedding model used for
            similarity-based synonym finding.

    Methods:
        synonyms_by_translation: Finds synonyms by translating text into an
            intermediate language and then back-translation.
        synonyms_by_similarity: Finds synonyms based on embedding vector
            similarity.
        translate: Translates text to the specified target language.

    Example:

    .. code-block:: python

        # Instantiate SynonymFinder with the target language
        synonym_finder = SynonymFinder("en")

        # Find synonyms using translation and back-translation
        text = "happy"
        synonyms = synonym_finder.synonyms_by_translation(text)
        print(f"Synonyms by Translation: {synonyms}")

        # Find synonyms using similarity based on embedding vectors
        text = "joyful"
        synonyms = synonym_finder.synonyms_by_similarity(text)
        print(f"Synonyms by Similarity: {synonyms}")
    """

    def __init__(self, language: str = "en") -> None:
        """
        Initialize a SynonymFinder object.

        Args:
            language (str): The target language for translation based synonyms.
                Use 2-letter codes (ISO 639-1). Defaults to "en".
        """
        self._language = language
        # Everything below is created lazily on first access.
        self._translator = None
        self._intermediate_languages = None
        self._embedding_model = None

    @property
    def language(self) -> str:
        """The target language for translation.
        Use 2-letter codes (ISO 639-1)."""
        return self._language

    @language.setter
    def language(self, language: str) -> None:
        self._language = language
        # Both cached objects are tied to the previous language: the embedding
        # model through its name, the translator through its target.
        self._embedding_model = None
        self._translator = None

    def _new_translator(self, target_language: str):
        """
        Create a GoogleTranslator (source "auto") aimed at `target_language`.

        The object is first constructed with the current `language` as target
        and then re-targeted, which is the same sequence of operations that
        was previously applied to the shared translator.

        Raises:
            ImportError: If the 'deep_translator' package is not installed.
        """
        if GoogleTranslator is None:
            raise ImportError(
                "The 'deep_translator' package is required for translation-based synonym finding. "
                "Install it using `pip install sign-language-translator[synonyms]`."
            )
        translator = GoogleTranslator(source="auto", target=self.language)
        if target_language != self.language:
            translator.target = target_language
        return translator

    @property
    def translator(self):
        """
        The deep_translator.GoogleTranslator object with the source language as
        "auto" and the target language set to the current `language`.

        Raises:
            ImportError: If the 'deep_translator' package is not installed.
        """
        if self._translator is None:
            self._translator = self._new_translator(self.language)
        return self._translator

    @property
    def intermediate_languages(self) -> List[str]:
        """
        Returns the list of language codes supported by the translator.
        They are used to find synonyms by translation and back-translation.
        The list is fetched once and cached; the current language is not
        removed from it.
        """
        if not self._intermediate_languages:
            self._intermediate_languages = list(
                self.translator.get_supported_languages(as_dict=True).values()  # type: ignore
            )
        return self._intermediate_languages

    def synonyms_by_translation(
        self,
        text: str,
        intermediate_languages: Optional[List[str]] = None,
        min_frequency: int = 1,
        time_delay: float = 1e-2,
        timeout: Optional[float] = 10,
        max_n_threads: int = 132,
        lower_case: bool = True,
        progress_bar: bool = True,
        leave: bool = False,
        cache: Optional[Dict[str, Dict[str, str]]] = None,
    ) -> List[str]:
        """
        Translates the given text into intermediate languages and performs
        back-translation to obtain synonyms. Translation is done via the
        internet by the deep_translator library.

        Args:
            text (str): The text to be translated.
            intermediate_languages (Optional[List[str]]): List of intermediate
                languages to translate the text into. Use 2-letter codes
                (ISO 639-1). If None, all supported languages of the translator
                will be used. Defaults to None.
            min_frequency (int): Minimum occurrence count for synonyms to get
                considered. Value is inclusive. Defaults to 1.
            time_delay (float): Time delay between translation requests
                (in seconds). Defaults to 1e-2.
            timeout (float | None): The maximum amount of time (in seconds) to
                wait for a thread to finish. None means wait indefinitely.
                Defaults to 10.
            max_n_threads (int): Maximum number of threads to use for parallel
                translation. Defaults to 132.
            lower_case (bool): Whether to convert the synonyms to lowercase.
                Defaults to True.
            progress_bar (bool): Whether to display a progress bar during
                translation. Defaults to True.
            leave (bool): Whether to leave the progress bar after translation.
                Defaults to False.
            cache (Optional[Dict[str, Dict[str, str]]]): A dictionary to save or
                retrieve the intermediate translations of the `text`. Structure
                is `{"text": {"language": "translation", ...}, ...}` where each
                input maps to a dict mapping language code to the text's
                translation. Only non-empty translations are stored.
                Defaults to None.

        Returns:
            List[str]: A list of synonyms obtained through back-translation from
                other languages, most frequent first.
        """
        # setup
        if intermediate_languages is None:
            intermediate_languages = self.intermediate_languages

        use_cache = isinstance(cache, dict)

        def translation_function(
            source_text: str, target_lang: str, results: List[str]
        ) -> None:
            # Worker executed by threaded_map: appends one translation to `results`.
            if (
                use_cache
                and source_text in cache  # type: ignore
                and target_lang in cache[source_text]  # type: ignore
            ):
                results.append(cache[source_text][target_lang])  # type: ignore
                return

            translation = self.translate(source_text, target_lang)
            if not translation:
                # Failed or empty translations are neither collected nor cached.
                return
            results.append(translation)
            if use_cache:
                cache.setdefault(source_text, {})[target_lang] = translation  # type: ignore

        # translation into intermediate languages
        translations: List[str] = []
        threaded_map(
            translation_function,
            [(text, lang, translations) for lang in intermediate_languages],
            time_delay=time_delay,
            timeout=timeout,
            max_n_threads=max_n_threads,
            progress_bar=progress_bar,
            leave=leave,
        )

        # back-translation into source language; the original text is included
        # so that its own round trip contributes to the candidates as well
        synonyms: List[str] = []
        threaded_map(
            translation_function,
            [
                (translation.strip(), self.language, synonyms)
                for translation in set(translations + [text])
                if translation.strip()
            ],
            time_delay=time_delay,
            timeout=timeout,
            max_n_threads=max_n_threads,
            progress_bar=progress_bar,
            leave=leave,
        )

        # preprocess
        if lower_case:
            synonyms = [str(syn).lower() for syn in synonyms]
        synonyms = [stripped for syn in synonyms if (stripped := str(syn).strip())]

        # sort by frequency
        return [
            txt
            for txt, freq in Counter(synonyms).most_common()
            if freq >= min_frequency
        ]

    def translate(self, text: str, target_language: str) -> str:
        """
        Translates the given text to the specified target language.

        A private translator object is created for every call. This method is
        run concurrently by `synonyms_by_translation`, and re-targeting one
        shared translator from several threads is not atomic with the request
        that follows, so a call could end up translating into the language
        requested by another thread.

        Args:
            text (str): The text to be translated.
            target_language (str): The target language for translation.
                Use 2-letter codes (ISO 639-1).

        Returns:
            str: The stripped translated text, or an empty string if the
                translation failed (a warning is issued in that case).

        Raises:
            ImportError: If the 'deep_translator' package is not installed.
        """
        try:
            translator = self._new_translator(target_language)
            return str(translator.translate(text)).strip()
        except (
            HTTPError,
            # These names are None when deep_translator is missing; fall back to
            # HTTPError so that the tuple only ever contains exception classes.
            DeepTranslatorError or HTTPError,
            TooManyRequests or HTTPError,
        ) as exc:
            warn(f"Translation failed for '{text}' to '{target_language}'.Error: {exc}")
            return ""

    @property
    def embedding_model(self):
        """
        The embedding model used by `synonyms_by_similarity`.

        It is loaded on first access through `get_model` using the name
        "lookup-{language}-fasttext-cc.pt" and cached until the language is
        changed.
        """
        if self._embedding_model is None:
            # Imported lazily so the models package is only needed for
            # similarity-based search.
            from sign_language_translator.models._utils import get_model

            self._embedding_model = get_model(f"lookup-{self.language}-fasttext-cc.pt")
        return self._embedding_model

    def synonyms_by_similarity(
        self, text: str, top_k: int = 10, min_similarity: float = 0.5
    ) -> List[str]:
        """Looks into a vector database and returns the closest matches to the input text.

        Args:
            text (str): The input text to find synonyms for.
            top_k (int, optional): The maximum number of synonyms to return.
                Defaults to 10.
            min_similarity (float, optional): Cut off value for similarity
                between embedding vectors. Only words with a similarity score
                strictly greater than this value are returned as synonyms.
                Defaults to 0.5.

        Returns:
            List[str]: A list of synonyms for the input text.

        Example:

        .. code-block:: python

            # Instantiate SynonymFinder with the target language
            synonym_finder = SynonymFinder("ur")

            # Find synonyms using similarity based on embedding vectors
            text = "تعلیم"
            synonyms = synonym_finder.synonyms_by_similarity(text, 3)
            print(synonyms)
            # ["تعلیم", "تربیت", "تعلیمی"]
        """
        # TODO: search with a different language or by vector
        model = self.embedding_model
        vector = model.embed(text)  # type: ignore
        candidates, scores = model.similar(vector, k=top_k)  # type: ignore
        return [
            candidate
            for candidate, score in zip(candidates, scores)
            if score > min_similarity
        ]