Source code for sign_language_translator.config.assets

"""Module for managing assets required by the sign language translator package,
such as datasets and models.
"""

# Public API of this module: the only name exported by a star-import.
__all__ = [
    "Assets",
]

import json
import os
import re
from datetime import datetime
from os.path import abspath, dirname, exists, isdir, isfile, join, sep
from typing import Dict, List, Optional, Tuple, Union

from tqdm.auto import tqdm

from sign_language_translator.config.settings import Settings
from sign_language_translator.config.utils import read_urls
from sign_language_translator.utils import (
    Archive,
    ProgressStatusCallback,
    download,
    is_regex,
)


class Assets:
    """
    Static class for managing assets related to the sign language translator.
    It wraps around utility functions to automatically handle downloading,
    extracting, deleting and loading assets.

    Attributes:
        ROOT_DIR (str): The root directory path where the sign language datasets & models are stored.
        FILE_TO_URL (Dict[str, str]): A dictionary mapping asset filenames to their corresponding URLs.
        asset_regex_to_urls_file (List[Tuple[str, str]]): A list of tuples mapping regular expressions
            matching asset names to URLs files. The URLs file must be a JSON containing a key
            "file_to_url" which maps to an object mapping relative paths of assets to their URLs.

    Methods:
        set_root_dir(path: str) -> None:
            Set the SLT resources directory path.
        get_ids(filename_or_regex: str) -> List[str]:
            Get the relative paths of assets matching the given filename_or_regex.
        get_url(filename_or_regex: str) -> List[str]:
            Get the URLs of assets matching the given filename_or_regex.
        get_path(filename_or_regex: str) -> List[str]:
            Get the local file paths of assets matching the given filename_or_regex.
        download(filename_or_regex: str, overwrite=False, timeout: float = 20.0, chunk_size=2**18, progress_bar: Optional[bool] = None, leave=True) -> List[str]:
            Download assets matching the given filename regex and save them to the
            appropriate file paths inside the assets root directory.
        extract(filename_or_regex: str, archive_name_or_regex: Optional[str] = None, overwrite=False, progress_bar=True, leave=True, download_archive=True) -> List[str]:
            Extract the files matching the argument from an archived dataset into the appropriate location.
        delete(filename_or_regex: str) -> None:
            Remove the matching assets from storage and their records from the checksum file.

    Example:

    .. code-block:: python

        import sign_language_translator as slt

        # slt.Assets.set_root_dir("~/centralized-slt-assets")

        # Archived datasets
        ids = slt.Assets.get_ids(r"datasets/.*\\.zip")
        paths = slt.Assets.download("datasets/pk-hfad-1.landmarks-mediapipe-world-csv.zip")
        files = slt.utils.Archive.extract(paths[0], "*.csv")

        # Specific file from archive
        path = slt.Assets.extract("pk-hfad-1_airplane.landmarks-mediapipe-world.csv", download_archive=True)

        # all dictionary videos for numbers
        urls = slt.Assets.get_url(r"videos/[a-z-]+_\\d+\\.mp4")

        # download a model
        paths = slt.Assets.download(r"models/names-stat-lm-w\\d\\.json")

        # add your own dataset
        slt.Assets.FILE_TO_URL.update({"relative/path/to/asset.mp4": "https://...",})
    """

    # =============== #
    #    CONFIGURE    #
    # =============== #

    # Default location is an "assets" folder two directory levels above this file.
    ROOT_DIR: str = join(dirname(dirname(abspath(__file__))), "assets")
    """The root directory path where the sign language datasets & models are stored. Defaults is 'install_directory/assets'."""

    primary_urls_file: str = "urls.json"
    """The name of the first URLs file that is loaded by default and must contain links to other url files. Defaults is 'urls.json'. Note: all url filenames must end with 'urls.json'."""

    # URL files live next to this module, not under ROOT_DIR.
    urls_file_dir: str = dirname(abspath(__file__))
    """The directory path where the URLs files are stored. Defaults is 'install_directory/sign_language_translator/config'."""

    # Evaluated once at class-definition time, i.e. the primary URLs file is read on import.
    FILE_TO_URL: Dict[str, str] = read_urls(join(urls_file_dir, primary_urls_file))
    """A dictionary mapping asset filenames to their corresponding URLs."""

    asset_regex_to_urls_file: List[Tuple[str, str]] = [
        # regex_for_assets, urls_file
        (r"^videos/pk-.*mp4$", "pk-dictionary-urls.json"),
        (r".*zip$", "archive-urls.json"),
        (r".*", "extra-urls.json"),
    ]
    """A list of tuples that map regular expressions matching asset names to URLs files containing URLs to that asset group. The regex are tried sequentially so make sure list order is right."""

    # Name of the JSON bookkeeping file kept directly inside ROOT_DIR.
    _checksum_filename = "checksum.json"
    # IDs of the secondary URL files that have already been merged into FILE_TO_URL.
    _loaded_url_files = set()
@classmethod
def set_root_dir(cls, path: str) -> None:
    """Set the SLT resources directory path.

    Helpful when using custom datasets, or when data must be stored outside
    the install directory at a centralized location (e.g. hosted on cloud and
    mounted on disk). By default, resources are downloaded to
    'install_directory/assets'.

    A leading ``~`` is expanded to the user's home directory before the path
    is made absolute. The directory is created if it does not exist yet.

    Args:
        path (str): The path to the assets, datasets or models directory.

    Raises:
        ValueError: If the provided path exists but is not a directory.
    """
    # abspath() alone would turn "~/x" into "<cwd>/~/x", so expand the home
    # directory shortcut explicitly first.
    resolved = abspath(os.path.expanduser(path))

    if exists(resolved) and not isdir(resolved):
        raise ValueError(f"The provided path is not a directory. Path: {resolved}")

    os.makedirs(resolved, exist_ok=True)
    cls.ROOT_DIR = resolved
# ============= # # Getters # # ============= #
@classmethod
def get_ids(cls, filename_or_regex: str) -> List[str]:
    """
    Return the asset IDs (relative paths) that match ``filename_or_regex``.

    The lookup runs against the currently loaded ``Assets.FILE_TO_URL``
    mapping. When nothing matches, the first not-yet-loaded URLs file from
    ``Assets.asset_regex_to_urls_file`` whose pattern applies to the argument
    is loaded and the lookup is retried recursively.

    Args:
        filename_or_regex (str): The filename or regex to match against asset IDs.

    Returns:
        List[str]: List of matching asset IDs (empty when nothing matches).
    """
    known_assets = cls.FILE_TO_URL

    # 1) the argument is itself a known asset id
    if filename_or_regex in known_assets:
        return [filename_or_regex]

    # 2) the argument is a pattern: test it against everything loaded so far
    if is_regex(filename_or_regex):
        anchored = re.compile(f"^{filename_or_regex}$")
        matched = [asset for asset in known_assets if anchored.match(asset)]
        if matched:
            return matched

    # 3) pull in the next applicable URLs file and start over
    for group_pattern, urls_file in cls.asset_regex_to_urls_file:
        if urls_file in cls._loaded_url_files:
            continue

        applies = group_pattern == filename_or_regex or re.match(
            group_pattern, filename_or_regex
        )
        if applies:
            cls.load_urls(urls_file)
            return cls.get_ids(filename_or_regex)

    # 4) every applicable URLs file is loaded and still nothing matched
    return []
@classmethod
def get_url(cls, filename_or_regex: str) -> List[str]:
    """
    Return the URLs of every asset whose ID matches ``filename_or_regex``.

    The matching is delegated to ``get_ids``; each resulting ID is then
    looked up in ``Assets.FILE_TO_URL``.

    Args:
        filename_or_regex (str): The filename or regex to match against asset IDs.

    Returns:
        List[str]: List of matching URLs, in the order ``get_ids`` returned the IDs.
    """
    matching_ids = cls.get_ids(filename_or_regex)

    urls: List[str] = []
    for asset_id in matching_ids:
        urls.append(cls.FILE_TO_URL[asset_id])
    return urls
@classmethod
def get_path(cls, filename_or_regex: str) -> List[str]:
    """
    Return the local file paths of every asset whose ID matches ``filename_or_regex``.

    The matching is delegated to ``get_ids``; each resulting ID is converted
    to a path with ``_abs_path``. The files are not checked for existence.

    Args:
        filename_or_regex (str): The filename or regex to match against asset IDs.

    Returns:
        List[str]: List of matching file paths, in the order ``get_ids`` returned the IDs.
    """
    matching_ids = cls.get_ids(filename_or_regex)

    paths: List[str] = []
    for asset_id in matching_ids:
        paths.append(cls._abs_path(asset_id))
    return paths
# ============== # # Fetching # # ============== #
@classmethod
def extract(
    cls,
    filename_or_regex: str,
    archive_name_or_regex: Optional[str] = None,
    overwrite=False,
    progress_bar=True,
    leave=True,
    download_archive=True,
) -> List[str]:
    """
    Extracts assets matching the given filename_or_regex from the archived datasets.

    The target location is inferred from the archive & asset name and resides
    inside the assets root directory. If the archive name is not provided, it
    is inferred from ``filename_or_regex`` via ``infer_archive_name``, so avoid
    vague regex and follow the filename structure. Unless ``download_archive``
    is False, the matching archive(s) are downloaded first when missing.

    Every extracted asset is recorded in the checksum file together with the
    ID and URL of the archive it was extracted from.

    Note:
        Please use the `slt.utils.Archive.extract()` function directly if you
        want deterministic behavior and avoid false guesses.

    Args:
        filename_or_regex (str): The filename or regex pattern to match the archive contents.
            Only the part after the last "/" is used for matching inside the archive.
        archive_name_or_regex (str, optional): The name or regex pattern of the archive(s)
            that contain the assets. If None, it is guessed from the content name. Defaults to None.
        overwrite (bool, optional): Whether to overwrite existing assets. Defaults to False.
        progress_bar (bool, optional): Whether to display a progress bar during extraction. Defaults to True.
        leave (bool, optional): Whether to leave the progress bar displayed after extraction. Defaults to True.
        download_archive (bool, optional): Whether to download the archive if it is not
            already downloaded. Defaults to True.

    Returns:
        List[str]: A list of paths to the extracted assets (from all matching archives).
    """
    cls.delete_out_of_date_assets()

    arch = archive_name_or_regex or cls.infer_archive_name(filename_or_regex)
    if download_archive:
        cls.download(arch, overwrite=False, progress_bar=progress_bar, leave=leave)

    content_name = filename_or_regex.split("/")[-1]

    extracted_assets: List[str] = []
    checksum_asset_ids: List[str] = []
    checksum_infos: List[Dict[str, str]] = []

    for archive_id in cls.get_ids(arch):
        # "datasets/<collection>.<category>-<model>-<ext>.zip" -> "<category>"
        asset_dir = archive_id.split("/")[-1].split(".")[-2].split("-")[0]

        from_this_archive = list(
            Archive.extract(
                archive_path=cls._abs_path(archive_id),
                regex=content_name,
                output_dir=cls._abs_path(asset_dir),
                overwrite=overwrite,
                progress_bar=progress_bar,
                leave=leave,
                verbose=False,
            )
        )
        extracted_assets.extend(from_this_archive)
        if not from_this_archive:
            continue

        # Record only what came out of *this* archive against this archive, so
        # that assets of previously processed archives keep their own source.
        archive_url = cls.get_url(archive_id)[0]
        for asset_id in from_this_archive:
            # todo: skip those which were not extracted/overwritten but existed already
            checksum_asset_ids.append(asset_id)
            checksum_infos.append(
                {"archive_id": archive_id, "archive_url": archive_url}
            )

    cls._update_checksum(checksum_asset_ids, checksum_infos)
    return extracted_assets
@classmethod
def download(
    cls,
    filename_or_regex: str,
    overwrite=False,
    timeout: float = 20.0,
    chunk_size=2**18,
    progress_bar: Optional[bool] = None,
    leave=True,
) -> List[str]:
    """
    Downloads package assets matching the given filename regex and saves them
    to the appropriate file paths.

    Out-of-date assets are purged first. Assets that already exist on disk are
    skipped unless ``overwrite`` is True. Each successful download is recorded
    in the checksum file together with its URL.

    Args:
        filename_or_regex (str): Relative path or regular expression to match the desired asset names.
        overwrite (bool, optional): If False, skips downloading if the resource file already exists.
            Defaults to False.
        timeout (float, optional): The maximum number of seconds to wait for a server response.
            Defaults to 20.0.
        chunk_size (int, optional): The number of bytes to fetch in each step. Defaults to 256*1024.
        progress_bar (bool, optional): If True, displays a progress bar during the download
            (one overall bar when several files are fetched, a per-file bar when exactly one is).
            If None, uses the value in slt.Settings.SHOW_DOWNLOAD_PROGRESS and the bar is
            not left behind. Defaults to None.
        leave (bool, optional): Whether to leave the progress bar behind after the download.
            Defaults to True.

    Returns:
        List[str]: Paths of the matching files that were downloaded successfully,
        followed by the paths of those that existed already.
    """
    cls.delete_out_of_date_assets()

    # Split the matching assets into "already there" and "to be fetched".
    existing_paths: List[str] = []
    pending: List[Tuple[str, str, str]] = []  # (asset_id, file_path, url)
    for asset_id in cls.get_ids(filename_or_regex):
        path = cls._abs_path(asset_id)
        if exists(path) and not overwrite:
            existing_paths.append(path)
        else:
            pending.append((asset_id, path, cls.FILE_TO_URL[asset_id]))

    # Configure the progress display.
    if progress_bar is None:
        progress_bar = Settings.SHOW_DOWNLOAD_PROGRESS
        leave = False
    show_overall_bar = bool(progress_bar) and len(pending) > 1
    show_file_bar = progress_bar and len(pending) == 1

    work_items = tqdm(pending, leave=leave) if show_overall_bar else pending
    callback = ProgressStatusCallback(work_items) if show_overall_bar else None

    # Fetch the assets one by one.
    downloaded_paths: List[str] = []
    for asset_id, file_path, url in work_items:
        if show_overall_bar:
            work_items.set_description(f"Downloading {asset_id}")

        # Make sure that the destination directory exists.
        os.makedirs(dirname(file_path), exist_ok=True)

        success_in_download = download(
            file_path,
            url,
            progress_bar=show_file_bar,
            timeout=timeout,
            overwrite=overwrite,
            chunk_size=chunk_size,
            status_callback=callback,
        )

        # Only a download that reported success is recorded and returned, so
        # the result never points at a file that could not be fetched.
        if success_in_download:
            cls._update_checksum(asset_id, {"url": url})
            downloaded_paths.append(file_path)

    return downloaded_paths + existing_paths
# ============ # # Delete # # ============ #
@classmethod
def delete_out_of_date_assets(cls) -> None:
    """
    Delete every recorded asset that is out of date and clean up the checksum file.

    An asset counts as out of date when the URL stored for it in the checksum
    file (its own download URL, or the URL of the archive it was extracted
    from) differs from the currently loaded URL in ``Assets.FILE_TO_URL``.
    URLs that are not loaded yet are not compared. Records whose file no
    longer exists are dropped as well. Files that are not present in the
    checksum file are never touched.
    """
    checksum = cls._read_checksum()

    # Iterate over a snapshot because records are removed while looping.
    for asset_id, info in list(checksum.items()):
        path = cls._abs_path(asset_id)

        # false record: the file is already gone
        if not exists(path):
            checksum.pop(asset_id, None)
            continue

        url_outdated = (
            "url" in info
            and asset_id in cls.FILE_TO_URL  # is loaded
            and info["url"] != cls.FILE_TO_URL[asset_id]
        )
        archive_outdated = (
            "archive_id" in info
            and "archive_url" in info
            and info["archive_id"] in cls.FILE_TO_URL
            and info["archive_url"] != cls.FILE_TO_URL[info["archive_id"]]
        )

        # A record may satisfy both conditions; the file must still be
        # removed exactly once, otherwise os.remove() fails on the second call.
        if url_outdated or archive_outdated:
            os.remove(path)
            checksum.pop(asset_id, None)

    cls._write_checksum(checksum)
@classmethod
def delete(cls, filename_or_regex: str) -> None:
    """
    Remove the assets matching ``filename_or_regex`` from storage and drop
    their records from the checksum file.

    Matching assets whose file does not exist are not an error; their
    checksum record (if any) is removed all the same.

    Args:
        filename_or_regex (str): The filename or regex to match against asset IDs.
    """
    # Resolve the IDs first: get_ids() may lazily download URL files, which
    # itself updates the checksum file. Reading the checksum afterwards keeps
    # those fresh records from being overwritten by a stale copy.
    matching_ids = cls.get_ids(filename_or_regex)
    checksum = cls._read_checksum()

    for asset_id in matching_ids:
        file_path = cls._abs_path(asset_id)
        if exists(file_path):
            os.remove(file_path)
        checksum.pop(asset_id, None)

    cls._write_checksum(checksum)
# ================= # # URL Loading # # ================= #
@classmethod
def reload(cls) -> None:
    """
    Reset the URL cache to its initial state.

    ``Assets.FILE_TO_URL`` is replaced by the contents of the primary URLs
    file and the set of already-loaded secondary URL files is emptied.
    """
    primary_file_path = join(cls.urls_file_dir, cls.primary_urls_file)
    cls.FILE_TO_URL = read_urls(primary_file_path)
    cls._loaded_url_files = set()
@classmethod
def load_urls(cls, filename: str) -> None:
    """
    Merge the URLs listed in the given URLs file(s) into ``Assets.FILE_TO_URL``.

    The file is downloaded first (via ``Assets.download``). Every asset ID
    matching ``filename`` is then read and remembered as loaded.

    Args:
        filename (str): The name (or regex) of the URLs file to load.
    """
    cls.download(filename)

    for file_id in cls.get_ids(filename):
        extra_urls = read_urls(cls._abs_path(file_id))
        cls.FILE_TO_URL.update(extra_urls)
        cls._loaded_url_files.add(file_id)
@classmethod
def load_all_urls(cls) -> None:
    """
    Load every known URLs file into ``Assets.FILE_TO_URL``.

    First each file listed in ``Assets.asset_regex_to_urls_file`` is loaded in
    list order, then every asset matching the generic URLs-file name pattern.
    """
    listed_urls_files = (
        urls_file for _pattern, urls_file in cls.asset_regex_to_urls_file
    )
    for urls_file in listed_urls_files:
        cls.load_urls(urls_file)

    # catch-all for any remaining "<prefix>urls.json" file
    cls.load_urls(r"([a-z-]+[-_])?urls.json")
# ============= # # helpers # # ============= #
@classmethod
def is_dictionary_video(cls, filename: str) -> bool:
    """Check whether the given asset ID is a dictionary video.

    The ID must have the shape ``videos/<label>.mp4`` where ``<label>`` splits
    into exactly two chunks on ``Settings.FILENAME_SEPARATOR``, and it must
    resolve to exactly one known asset. IDs of any other shape (no folder,
    nested folders, no extension) are simply not dictionary videos.

    Args:
        filename (str): The asset ID to check. (e.g. 'videos/pk-hfad-1_airplane.mp4')

    Returns:
        bool: True if the filename represents a dictionary video, False otherwise.
    """
    # exactly "<folder>/<basename>" with the folder being "videos"
    parts = filename.split("/")
    if len(parts) != 2 or parts[0] != "videos":
        return False

    # rpartition() never raises: a basename without "." yields an empty separator
    label, dot, extension = parts[1].rpartition(".")
    if not dot or extension != "mp4":
        return False

    chunks = label.split(Settings.FILENAME_SEPARATOR)
    return len(chunks) == 2 and len(cls.get_ids(filename)) == 1
@classmethod
def _abs_path(cls, asset_id: str) -> str:
    """Map an asset ID (relative, "/"-separated) to its location on disk.

    IDs ending with "urls.json" live in ``urls_file_dir``; everything else
    lives under ``ROOT_DIR``.
    """
    base_dir = cls.urls_file_dir if asset_id.endswith("urls.json") else cls.ROOT_DIR
    return join(base_dir, asset_id.replace("/", sep))


@classmethod
def _read_checksum(cls) -> Dict[str, Dict[str, str]]:
    """Load the checksum records from ``ROOT_DIR``.

    ``ROOT_DIR`` is created when missing. If the checksum file does not exist
    yet, an empty one is written and an empty mapping is returned.
    """
    os.makedirs(cls.ROOT_DIR, exist_ok=True)
    checksum_file_path = join(cls.ROOT_DIR, cls._checksum_filename)

    if isfile(checksum_file_path):
        with open(checksum_file_path, "r", encoding="utf-8") as f:
            checksum: Dict[str, Dict[str, str]] = json.load(f)
    else:
        checksum = {}
        cls._write_checksum(checksum)

    return checksum


@classmethod
def _write_checksum(cls, checksum: Dict[str, Dict[str, str]]) -> None:
    """Overwrite the checksum file in ``ROOT_DIR`` with the given records."""
    checksum_file_path = join(cls.ROOT_DIR, cls._checksum_filename)
    with open(checksum_file_path, "w", encoding="utf-8") as f:
        json.dump(checksum, f, indent=2, ensure_ascii=False, sort_keys=True)


@classmethod
def _update_checksum(
    cls, asset_id: Union[str, List[str]], info: Union[Dict, List[Dict]]
):
    """Merge new information into the checksum records and persist them.

    Args:
        asset_id: One asset ID or a list of them.
        info: One info mapping or a list of them, paired with ``asset_id``
            position by position (surplus items on either side are ignored).
            Each record additionally receives a "date" entry unless the info
            itself provides one.
    """
    # function-scope import: only `datetime` is imported at module level
    from datetime import timezone

    if not isinstance(asset_id, list):
        asset_id = [asset_id]
    if not isinstance(info, list):
        info = [info]

    # The trailing "Z" designates UTC, so the timestamp must be taken in UTC
    # rather than in naive local time.
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    checksum = cls._read_checksum()
    for _asset_id, _info in zip(asset_id, info):
        checksum.setdefault(_asset_id, {}).update({"date": timestamp, **_info})
    cls._write_checksum(checksum)
@classmethod
def infer_archive_name(cls, filename_or_regex: str) -> str:
    """
    Infers the archive name/regex that should contain the given asset based on
    the provided filename_or_regex argument.

    Please follow the naming convention to avoid false guesses i.e. use all
    allowed special symbols `["/", "-", "_", ".", r"\\."]` in the right places.
    An argument that already ends with "zip" is returned unchanged. Parts that
    cannot be inferred are replaced by wildcards (``.*``); contradictory
    combinations (e.g. landmarks with an mp4 extension) produce a pattern
    that can never match (``.^``).

    Args:
        filename_or_regex (str): The asset filename or regex from which its
            containing archive name must be inferred.

    Returns:
        str: A regex pattern that matches the archive name which should contain the given asset.
    """
    if filename_or_regex.endswith("zip"):
        return filename_or_regex

    base = filename_or_regex.split("/")[-1]
    collection = base.split("_", 1)[0] if "_" in base else r".*"
    category = r".^"
    model = r".^"
    extension = ext if (ext := base[-3:]) in ("csv", "npz", "npy", "mp4") else r".*"

    if "landmarks" in filename_or_regex:
        category = "landmarks" if extension != "mp4" else r".^"  # landmarks/x.mp4

        # Assuming the filename structure follows the convention, the model is
        # named in the part between the last two '.'s. The split keeps the
        # separators, hence that part sits third from the end.
        parts = re.split(r"(\\\.|\.(?![\*\+\{\?]))", filename_or_regex)
        # no '.' separator at all -> nothing to read the model from
        sub_extension = parts[-3] if len(parts) >= 3 else ""
        model = sub_extension.split("-", maxsplit=1)[-1]
        if not model.startswith(("mediapipe", "testmodel")):  # validation
            model = r".*"

    elif "video" in filename_or_regex or extension == "mp4":
        category = "videos" if extension in ("mp4", r".*") else r".^"  # video/x.csv
        model = "?"  # todo: remove this hack
        # todo: make it work for dictionary & replications
        if extension == r".*":
            extension = "mp4"

    return f"datasets/{collection}\\.{category}-{model}-{extension}\\.zip$"