Skip to content

Utilities

nplinker.utils

calculate_md5

calculate_md5(
    fpath: str | PathLike, chunk_size: int = 1024 * 1024
) -> str

Calculate the MD5 checksum of a file.

Parameters:

  • fpath (str | PathLike) –

    Path to the file.

  • chunk_size (int, default: 1024 * 1024 ) –

    Chunk size for reading the file. Defaults to 1024*1024.

Returns:

  • str

    MD5 checksum of the file.

Source code in src/nplinker/utils.py
def calculate_md5(fpath: str | PathLike, chunk_size: int = 1024 * 1024) -> str:
    """Calculate the MD5 checksum of a file.

    Args:
        fpath: Path to the file.
        chunk_size: Chunk size for reading the file. Defaults to 1024*1024.

    Returns:
        MD5 checksum of the file.
    """
    if sys.version_info >= (3, 9):
        md5 = hashlib.md5(usedforsecurity=False)
    else:
        md5 = hashlib.md5()
    with open(fpath, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return md5.hexdigest()

check_disk_space

check_disk_space(func)

A decorator to check available disk space.

If the available disk space is less than 50GB, a warning is logged and emitted.

Warns:

  • UserWarning

    If the available disk space is less than 50GB.

Source code in src/nplinker/utils.py
def check_disk_space(func):
    """A decorator to check available disk space.

    Every time the decorated function is called, the free space reported by
    `shutil.disk_usage("/")` is checked first. If it is less than 50GB, a
    warning is logged and emitted. The decorated function is called in either
    case and its return value is passed through unchanged.

    Args:
        func: The function to decorate.

    Returns:
        The wrapped function.

    Warnings:
        UserWarning: If the available disk space is less than 50GB.
    """

    @functools.wraps(func)
    def wrapper_check_disk_space(*args, **kwargs):
        free = shutil.disk_usage("/").free
        # Convert bytes to whole gibibytes (2**30 bytes each).
        free_gb = free // (2**30)
        if free_gb < 50:
            warning_message = f"Available disk space is {free_gb}GB. Is it enough for your project?"
            logger.warning(warning_message)
            # stacklevel=2 attributes the warning to the code calling the
            # decorated function rather than to this wrapper.
            warnings.warn(warning_message, UserWarning, stacklevel=2)
        return func(*args, **kwargs)

    return wrapper_check_disk_space

check_md5

check_md5(fpath: str | PathLike, md5: str) -> bool

Verify the MD5 checksum of a file.

Parameters:

  • fpath (str | PathLike) –

    Path to the file.

  • md5 (str) –

    MD5 checksum to verify.

Returns:

  • bool

    True if the MD5 checksum matches, False otherwise.

Source code in src/nplinker/utils.py
def check_md5(fpath: str | PathLike, md5: str) -> bool:
    """Verify the MD5 checksum of a file.

    The checksum of the file is computed with `calculate_md5` and compared
    with `md5`. The comparison ignores letter case, because hexadecimal
    digests are commonly published in either upper or lower case.

    Args:
        fpath: Path to the file.
        md5: Expected MD5 checksum as a hexadecimal string.

    Returns:
        True if the MD5 checksum matches, False otherwise.
    """
    return md5.lower() == calculate_md5(fpath).lower()

download_and_extract_archive

download_and_extract_archive(
    url: str,
    download_root: str | PathLike,
    extract_root: str | Path | None = None,
    filename: str | None = None,
    md5: str | None = None,
    remove_finished: bool = False,
) -> None

Download an archive file and then extract it.

This method is a wrapper of download_url and extract_archive functions.

Parameters:

  • url (str) –

    URL to download file from

  • download_root (str | PathLike) –

    Path to the directory to place downloaded file in. If it doesn't exist, it will be created.

  • extract_root (str | Path | None, default: None ) –

    Path to the directory the file will be extracted to. The directory will be created if it does not exist. If omitted, the download_root is used.

  • filename (str | None, default: None ) –

    Name to save the downloaded file under. If None, use the basename of the URL

  • md5 (str | None, default: None ) –

    MD5 checksum of the download. If None, do not check

  • remove_finished (bool, default: False ) –

    If True, remove the downloaded file after the extraction. Defaults to False.

Source code in src/nplinker/utils.py
def download_and_extract_archive(
    url: str,
    download_root: str | PathLike,
    extract_root: str | Path | None = None,
    filename: str | None = None,
    md5: str | None = None,
    remove_finished: bool = False,
) -> None:
    """Download an archive file and then extract it.

    This method is a wrapper of [`download_url`][nplinker.utils.download_url] and
    [`extract_archive`][nplinker.utils.extract_archive] functions.

    Both directories are normalised with `transform_to_full_path`, the same
    helper `download_url` applies to its `root`, so that the archive is looked
    up in exactly the directory it was downloaded to (this matters for paths
    containing `~`).

    Args:
        url: URL to download file from
        download_root: Path to the directory to place downloaded
            file in. If it doesn't exist, it will be created.
        extract_root: Path to the directory the file
            will be extracted to. The given directory will be created if it
            does not exist. If omitted, the `download_root` is used.
        filename: Name to save the downloaded file under.
            If None, use the basename of the URL
        md5: MD5 checksum of the download. If None, do not check
        remove_finished: If `True`, remove the downloaded file
             after the extraction. Defaults to False.
    """
    download_root = transform_to_full_path(download_root)
    if extract_root is None:
        extract_root = download_root
    else:
        extract_root = transform_to_full_path(extract_root)
    if not filename:
        filename = Path(url).name

    download_url(url, download_root, filename, md5)

    # `download_url` stores the file as `<root>/<filename>`.
    archive = download_root / filename
    extract_archive(archive, extract_root, remove_finished=remove_finished)

download_url

download_url(
    url: str,
    root: str | PathLike,
    filename: str | None = None,
    md5: str | None = None,
    http_method: str = "GET",
    allow_http_redirect: bool = True,
) -> None

Download a file from a url and place it in root.

Parameters:

  • url (str) –

    URL to download file from

  • root (str | PathLike) –

    Directory to place downloaded file in. If it doesn't exist, it will be created.

  • filename (str | None, default: None ) –

    Name to save the file under. If None, use the basename of the URL.

  • md5 (str | None, default: None ) –

    MD5 checksum of the download. If None, do not check.

  • http_method (str, default: 'GET' ) –

    HTTP request method, e.g. "GET", "POST". Defaults to "GET".

  • allow_http_redirect (bool, default: True ) –

    If true, enable following redirects for all HTTP ("http:") methods.

Source code in src/nplinker/utils.py
@check_disk_space
def download_url(
    url: str,
    root: str | PathLike,
    filename: str | None = None,
    md5: str | None = None,
    http_method: str = "GET",
    allow_http_redirect: bool = True,
) -> None:
    """Download a file from a url and place it in root.

    If a file with the target name already exists in `root` and `md5` is given
    and matches, the download is skipped. Otherwise the file is streamed to
    disk while a progress bar is displayed, and, if `md5` is given, the
    checksum of the downloaded file is verified.

    Args:
        url: URL to download file from
        root: Directory to place downloaded file in. If it doesn't exist, it will be
            created together with any missing parent directories.
        filename: Name to save the file under. If None, use the
            basename of the URL.
        md5: MD5 checksum of the download. If None, do not check.
        http_method: HTTP request method, e.g. "GET", "POST".
            Defaults to "GET".
        allow_http_redirect: If true, enable following redirects for all HTTP ("http:") methods.

    Raises:
        RuntimeError: If the response status is not a success, or if `md5` is
            given and the downloaded file does not match it.
    """
    root = transform_to_full_path(root)
    # create the download directory (and missing parents) if not exist
    root.mkdir(parents=True, exist_ok=True)
    if not filename:
        filename = Path(url).name
    fpath = root / filename

    # check if file is already present locally
    if fpath.is_file() and md5 is not None and check_md5(fpath, md5):
        logger.info("Using downloaded and verified file: " + str(fpath))
        return

    # download the file
    logger.info(f"Downloading {filename} to {root}")
    with httpx.stream(http_method, url, follow_redirects=allow_http_redirect) as response:
        # Check the status before opening the target file, so a failed request
        # never has to remove a file that is still held open.
        if not response.is_success:
            # Keep the previous post-condition: no file is left at `fpath`
            # after a failed request.
            fpath.unlink(missing_ok=True)
            raise RuntimeError(
                f"Failed to download url {url} with status code {response.status_code}"
            )
        # A missing Content-Length header is treated as a total of 0.
        total = int(response.headers.get("Content-Length", 0))

        with open(fpath, "wb") as fh:
            with Progress(
                TextColumn("[progress.description]{task.description}"),
                BarColumn(bar_width=None),
                "[progress.percentage]{task.percentage:>3.1f}%",
                "•",
                DownloadColumn(),
                "•",
                TransferSpeedColumn(),
                "•",
                TimeRemainingColumn(),
                "•",
                TimeElapsedColumn(),
            ) as progress:
                task = progress.add_task(f"[hot_pink]Downloading {fpath.name}", total=total)
                for chunk in response.iter_bytes():
                    fh.write(chunk)
                    progress.update(task, advance=len(chunk))

    # check integrity of downloaded file
    if md5 is not None and not check_md5(fpath, md5):
        raise RuntimeError("MD5 validation failed.")

extract_archive

extract_archive(
    from_path: str | PathLike,
    extract_root: str | PathLike | None = None,
    members: list | None = None,
    remove_finished: bool = False,
) -> str

Extract an archive.

The archive type and a possible compression is automatically detected from the file name.

If the file is compressed but not an archive, the call is dispatched to _decompress function.

Parameters:

  • from_path (str | PathLike) –

    Path to the file to be extracted.

  • extract_root (str | PathLike | None, default: None ) –

    Path to the directory the file will be extracted to. The directory will be created if it does not exist. If omitted, the directory of the archive file is used.

  • members (list | None, default: None ) –

    Optional selection of members to extract. If not specified, all members are extracted. For a zip file, members must be a subset of the list returned by zipfile.ZipFile.namelist() (or a list of strings); for a tar file, members must be a subset of the list returned by tarfile.TarFile.getmembers().

  • remove_finished (bool, default: False ) –

    If True, remove the file after the extraction.

Returns:

  • str

    Path to the directory the file was extracted to.

Source code in src/nplinker/utils.py
def extract_archive(
    from_path: str | PathLike,
    extract_root: str | PathLike | None = None,
    members: list | None = None,
    remove_finished: bool = False,
) -> str:
    """Extract an archive.

    The archive type and a possible compression is automatically detected from
    the file name.

    If the file is compressed but not an archive, the call is dispatched to `_decompress` function.

    Args:
        from_path: Path to the file to be extracted.
        extract_root: Path to the directory the file will be extracted to.
            The given directory will be created, together with any missing
            parent directories, if it does not exist.
            If omitted, the directory of the archive file is used.
        members: Optional selection of members to extract. If not specified,
            all members are extracted.
            Members must be a subset of the list returned by
            - `zipfile.ZipFile.namelist()` or a list of strings for zip file
            - `tarfile.TarFile.getmembers()` for tar file
        remove_finished: If `True`, remove the file after the extraction.

    Returns:
        Path to the directory the file was extracted to. If the file is
        compressed but not an archive, the return value of `_decompress` is
        returned instead (presumably the path of the decompressed file;
        confirm against `_decompress`).
    """
    from_path = Path(from_path)

    if extract_root is None:
        extract_root = from_path.parent
    else:
        extract_root = Path(extract_root)

    # create the extract directory (and missing parents) if not exist
    extract_root.mkdir(parents=True, exist_ok=True)

    logger.info(f"Extracting {from_path} to {extract_root}")
    suffix, archive_type, compression = _detect_file_type(from_path)
    if not archive_type:
        # Not an archive: only decompress, writing the result into
        # `extract_root` under the file name with `suffix` removed.
        # NOTE(review): `str.replace` removes every occurrence of `suffix` in
        # the name, not only the trailing one - confirm this is intended.
        return _decompress(
            from_path,
            extract_root / from_path.name.replace(suffix, ""),
            remove_finished=remove_finished,
        )

    extractor = _ARCHIVE_EXTRACTORS[archive_type]

    extractor(str(from_path), str(extract_root), members, compression)
    if remove_finished:
        from_path.unlink()

    return str(extract_root)

is_file_format

is_file_format(
    file: str | PathLike, format: str = "tsv"
) -> bool

Check if the file is in the given format.

Parameters:

  • file (str | PathLike) –

    Path to the file to check.

  • format (str, default: 'tsv' ) –

    The format to check for, either "tsv" or "csv".

Returns:

  • bool

    True if the file is in the given format, False otherwise.

Source code in src/nplinker/utils.py
def is_file_format(file: str | PathLike, format: str = "tsv") -> bool:
    """Check if the file is in the given format.

    Args:
        file: Path to the file to check.
        format: The format to check for, either "tsv" or "csv".

    Returns:
        True if the file is in the given format, False otherwise.
    """
    try:
        with open(file, "rt") as f:
            if format == "tsv":
                reader = csv.reader(f, delimiter="\t")
            elif format == "csv":
                reader = csv.reader(f, delimiter=",")
            else:
                raise ValueError(f"Unknown format '{format}'.")
            for _ in reader:
                pass
        return True
    except csv.Error:
        return False

list_dirs

list_dirs(
    root: str | PathLike, keep_parent: bool = True
) -> list[str]

List all directories at a given root.

Parameters:

  • root (str | PathLike) –

    Path to directory whose folders need to be listed

  • keep_parent (bool, default: True ) –

    If true, prepends the path to each result, otherwise only returns the name of the directories found

Source code in src/nplinker/utils.py
def list_dirs(root: str | PathLike, keep_parent: bool = True) -> list[str]:
    """List the directories located directly under a given root.

    Args:
        root: Path to the directory whose sub-directories are listed.
        keep_parent: If true, each result is the full path of the directory;
            otherwise only the directory name is returned.

    Returns:
        The matching directories as strings.
    """
    base = transform_to_full_path(root)
    subdir_paths = [str(entry) for entry in base.iterdir() if entry.is_dir()]
    if keep_parent:
        return subdir_paths
    # Strip the parent part, leaving only the final path component.
    return [os.path.basename(path) for path in subdir_paths]

list_files

list_files(
    root: str | PathLike,
    prefix: str | tuple[str, ...] = "",
    suffix: str | tuple[str, ...] = "",
    keep_parent: bool = True,
) -> list[str]

List all files at a given root.

Parameters:

  • root (str | PathLike) –

    Path to directory whose files need to be listed

  • prefix (str | tuple[str, ...], default: '' ) –

    Prefix of the file names to match, Defaults to empty string '""'.

  • suffix (str | tuple[str, ...], default: '' ) –

    Suffix of the files to match, e.g. ".png" or (".jpg", ".png"). Defaults to empty string '""'.

  • keep_parent (bool, default: True ) –

    If true, prepends the parent path to each result, otherwise only returns the name of the files found. Defaults to True.

Source code in src/nplinker/utils.py
def list_files(
    root: str | PathLike,
    prefix: str | tuple[str, ...] = "",
    suffix: str | tuple[str, ...] = "",
    keep_parent: bool = True,
) -> list[str]:
    """List all files at a given root.

    Args:
        root: Path to directory whose files need to be listed
        prefix: Prefix of the file names to match,
            Defaults to empty string '""'.
        suffix: Suffix of the files to match, e.g. ".png" or
            (".jpg", ".png").
            Defaults to empty string '""'.
        keep_parent: If true, prepends the parent path to each
            result, otherwise only returns the name of the files found.
            Defaults to False.
    """
    root = Path(root)
    files = [
        str(p)
        for p in root.iterdir()
        if p.is_file() and p.name.startswith(prefix) and p.name.endswith(suffix)
    ]

    if not keep_parent:
        files = [os.path.basename(f) for f in files]

    return files

transform_to_full_path

transform_to_full_path(p: str | PathLike) -> Path

Transform a path to a full path.

The path is expanded (i.e. the ~ will be replaced with actual path) and converted to an absolute path (i.e. . or .. will be replaced with actual path).

Parameters:

  • p (str | PathLike) –

    The path to transform.

Returns:

  • Path

    The transformed full path.

Source code in src/nplinker/utils.py
def transform_to_full_path(p: str | PathLike) -> Path:
    """Transform a path to a full path.

    The path is expanded (i.e. the `~` will be replaced with actual path) and converted to an
    absolute path (i.e. `.` or `..` will be replaced with actual path).

    Args:
        p: The path to transform.

    Returns:
        The transformed full path.
    """
    # Multiple calls to `Path` are used to ensure static typing compatibility.
    p = Path(p).expanduser()
    p = Path(p).resolve()
    return Path(p)