Skip to content

AntiSMASH

nplinker.genomics.antismash

AntismashBGCLoader

AntismashBGCLoader(data_dir: str | PathLike)

Bases: BGCLoaderBase

Data loader for AntiSMASH BGC genbank (.gbk) files.

Parameters:

  • data_dir (str | PathLike) –

    Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.

Notes

The input data_dir must follow the structure defined in the Working Directory Structure for AntiSMASH data, e.g.:

antismash
    ├── genome_id_1                  # one AntiSMASH output, e.g. GCF_000514775.1
    │  ├── NZ_AZWO01000004.region001.gbk
    │  └── ...
    ├── genome_id_2
    │  ├── ...
    └── ...

Source code in src/nplinker/genomics/antismash/antismash_loader.py
def __init__(self, data_dir: str | PathLike) -> None:
    """Create the loader and eagerly load everything found under `data_dir`.

    The directory is indexed with `_parse_data_dir` and the resulting file
    mapping is handed to `_parse_bgcs`; both results are cached on the instance.

    Args:
        data_dir: Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.

    Notes:
        The input `data_dir` must follow the structure defined in the
        [Working Directory Structure][working-directory-structure] for AntiSMASH data, e.g.:
        ```shell
        antismash
            ├── genome_id_1                  # one AntiSMASH output, e.g. GCF_000514775.1
            │  ├── NZ_AZWO01000004.region001.gbk
            │  └── ...
            ├── genome_id_2
            │  ├── ...
            └── ...
        ```
    """
    # Normalise to a plain string once so every later step sees the same value.
    root = str(data_dir)
    self.data_dir = root

    gbk_files = self._parse_data_dir(root)
    self._file_dict = gbk_files
    self._bgcs = self._parse_bgcs(gbk_files)

data_dir instance-attribute

data_dir = str(data_dir)

get_bgc_genome_mapping

get_bgc_genome_mapping() -> dict[str, str]

Get the mapping from BGC to genome.

Info

The directory name of the gbk files is treated as genome id.

Returns:

  • dict[str, str]

    The key is BGC name (gbk file name) and value is genome id (the directory name of the gbk file).

Source code in src/nplinker/genomics/antismash/antismash_loader.py
def get_bgc_genome_mapping(self) -> dict[str, str]:
    """Map every BGC name to the genome it belongs to.

    !!! info
        The genome id is taken from the name of the directory holding the gbk file.

    Returns:
        A dict whose keys are BGC names (gbk file names) and whose values are genome ids
        (the name of the directory that contains the gbk file).
    """
    bgc_to_genome = {}
    for bgc_name, gbk_path in self._file_dict.items():
        # The genome id is the last component of the gbk file's parent directory.
        parent_dir = os.path.dirname(gbk_path)
        bgc_to_genome[bgc_name] = os.path.basename(parent_dir)
    return bgc_to_genome

get_files

get_files() -> dict[str, str]

Get BGC gbk files.

Returns:

  • dict[str, str]

    The key is BGC name (gbk file name) and value is path to the gbk file.

Source code in src/nplinker/genomics/antismash/antismash_loader.py
def get_files(self) -> dict[str, str]:
    """Return the BGC gbk files known to this loader.

    Returns:
        A dict mapping BGC name (gbk file name) to the path of the gbk file. This is the
        loader's own mapping object, not a copy.
    """
    gbk_files = self._file_dict
    return gbk_files

get_bgcs

get_bgcs() -> list[BGC]

Get all BGC objects.

Returns:

  • list[BGC]

    A list of BGC objects

Source code in src/nplinker/genomics/antismash/antismash_loader.py
def get_bgcs(self) -> list[BGC]:
    """Return every BGC object held by this loader.

    Returns:
        The list of BGC objects. This is the loader's own list, not a copy.
    """
    loaded_bgcs = self._bgcs
    return loaded_bgcs

GenomeStatus

GenomeStatus(
    original_id: str,
    resolved_refseq_id: str = "",
    resolve_attempted: bool = False,
    bgc_path: str = "",
)

Class to represent the status of a single genome.

The status of genomes is tracked in the file GENOME_STATUS_FILENAME.

Parameters:

  • original_id (str) –

    The original ID of the genome.

  • resolved_refseq_id (str, default: '' ) –

    The resolved RefSeq ID of the genome. Defaults to "".

  • resolve_attempted (bool, default: False ) –

    A flag indicating whether an attempt to resolve the RefSeq ID has been made. Defaults to False.

  • bgc_path (str, default: '' ) –

    The path to the downloaded BGC file for the genome. Defaults to "".

Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
def __init__(
    self,
    original_id: str,
    resolved_refseq_id: str = "",
    resolve_attempted: bool = False,
    bgc_path: str = "",
):
    """Create the status record of a single genome.

    Args:
        original_id: The original ID of the genome.
        resolved_refseq_id: The resolved RefSeq ID of the genome. The literal
            string "None" is stored as "". Defaults to "".
        resolve_attempted: A flag indicating whether an attempt to resolve
            the RefSeq ID has been made. Defaults to False.
        bgc_path: The path to the downloaded BGC file for the genome.
            Defaults to "".
    """
    # The string "None" is normalised to the empty string, i.e. "no RefSeq ID".
    if resolved_refseq_id == "None":
        resolved_refseq_id = ""

    self.original_id = original_id
    self.resolved_refseq_id = resolved_refseq_id
    self.resolve_attempted = resolve_attempted
    self.bgc_path = bgc_path

original_id instance-attribute

original_id = original_id

resolved_refseq_id instance-attribute

resolved_refseq_id = (
    ""
    if resolved_refseq_id == "None"
    else resolved_refseq_id
)

resolve_attempted instance-attribute

resolve_attempted = resolve_attempted

bgc_path instance-attribute

bgc_path = bgc_path

read_json staticmethod

read_json(
    file: str | PathLike,
) -> dict[str, "GenomeStatus"]

Get a dict of GenomeStatus objects by loading given genome status file.

Note that an empty dict is returned if the given file doesn't exist.

Parameters:

  • file (str | PathLike) –

    Path to genome status file.

Returns:

  • dict[str, 'GenomeStatus']

    Dict keys are genome original id and values are GenomeStatus objects. An empty dict is returned if the given file doesn't exist.

Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
@staticmethod
def read_json(file: str | PathLike) -> dict[str, "GenomeStatus"]:
    """Load a genome status file into a dict of GenomeStatus objects.

    A missing file is not an error: an empty dict is returned in that case.

    Args:
        file: Path to genome status file.

    Returns:
        Dict keys are genome original id and values are GenomeStatus
            objects. An empty dict is returned if the given file doesn't exist.
    """
    # Nothing recorded yet: return an empty mapping instead of failing.
    if not Path(file).exists():
        return {}

    with open(file, "r") as fh:
        payload = json.load(fh)

    # validate json data before using it
    validate(payload, schema=GENOME_STATUS_SCHEMA)

    # Each entry's fields are passed as keyword arguments to GenomeStatus.
    statuses = {}
    for entry in payload["genome_status"]:
        statuses[entry["original_id"]] = GenomeStatus(**entry)
    return statuses

to_json staticmethod

to_json(
    genome_status_dict: Mapping[str, "GenomeStatus"],
    file: str | PathLike | None = None,
) -> str | None

Convert the genome status dictionary to a JSON string.

If a file path is provided, the JSON string is written to the file. If the file already exists, it is overwritten.

Parameters:

  • genome_status_dict (Mapping[str, 'GenomeStatus']) –

    A dictionary of genome status objects. The keys are the original genome IDs and the values are GenomeStatus objects.

  • file (str | PathLike | None, default: None ) –

    The path to the output JSON file. If None, the JSON string is returned but not written to a file.

Returns:

  • str | None

    The JSON string if file is None, otherwise None.

Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
@staticmethod
def to_json(
    genome_status_dict: Mapping[str, "GenomeStatus"], file: str | PathLike | None = None
) -> str | None:
    """Serialise a genome status dictionary to JSON.

    The JSON document is validated against the genome status schema first.
    When `file` is given the document is written there, overwriting any
    existing file; otherwise it is returned as a string.

    Args:
        genome_status_dict: A dictionary of genome
            status objects. The keys are the original genome IDs and the values
            are GenomeStatus objects.
        file: The path to the output JSON file.
            If None, the JSON string is returned but not written to a file.

    Returns:
        The JSON string if `file` is None, otherwise None.
    """
    payload = {
        "genome_status": [status._to_dict() for status in genome_status_dict.values()],
        "version": "1.0",
    }

    # validate json object before dumping
    validate(payload, schema=GENOME_STATUS_SCHEMA)

    # No target file: hand the serialised document back to the caller.
    if file is None:
        return json.dumps(payload)

    with open(file, "w") as fh:
        json.dump(payload, fh)
    return None

download_and_extract_antismash_data

download_and_extract_antismash_data(
    antismash_id: str,
    download_root: str | PathLike,
    extract_root: str | PathLike,
) -> None

Download and extract antiSMASH BGC archive for a specified genome.

The antiSMASH database (https://antismash-db.secondarymetabolites.org/) is used to download the BGC archive. antiSMASH uses the RefSeq assembly ID of a genome as the ID of the archive.

Parameters:

  • antismash_id (str) –

    The id used to download BGC archive from antiSMASH database. If the id is versioned (e.g., "GCF_004339725.1") please be sure to specify the version as well.

  • download_root (str | PathLike) –

    Path to the directory to place downloaded archive in.

  • extract_root (str | PathLike) –

    Path to the directory data files will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.

Raises:

  • ValueError

    if <extract_root>/antismash/<antismash_id> dir is not empty.

Examples:

>>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
Source code in src/nplinker/genomics/antismash/antismash_downloader.py
def download_and_extract_antismash_data(
    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
) -> None:
    """Download and extract antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
    is used to download the BGC archive. antiSMASH uses the RefSeq assembly ID
    of a genome as the ID of the archive.

    The known download URLs are tried in order; the next URL is only used when
    downloading/extracting from the previous one raised an error. After a
    successful extraction, all sub-directories and every file that is not a
    `.json` or `.gbk` file are removed from the extraction directory.

    Args:
        antismash_id: The id used to download BGC archive from antiSMASH database.
            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
            specify the version as well.
        download_root: Path to the directory to place downloaded archive in.
        extract_root: Path to the directory data files will be extracted to.
            Note that an `antismash` directory will be created in the specified `extract_root` if
            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

    Raises:
        ValueError: if `<extract_root>/antismash/<antismash_id>` dir is not empty.
        Exception: any other error raised while downloading, extracting or cleaning up
            is logged and re-raised after the extraction directory has been removed.

    Examples:
        >>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
    """
    download_root = Path(download_root)
    extract_root = Path(extract_root)
    extract_path = extract_root / "antismash" / antismash_id
    archive_name = antismash_id + ".zip"

    try:
        if extract_path.exists():
            _check_extract_path(extract_path)
        else:
            extract_path.mkdir(parents=True, exist_ok=True)

        # Try every known source in order. Only the failure of the last source
        # is propagated; earlier failures fall through to the next URL.
        base_urls = (ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL)
        for attempt, base_url in enumerate(base_urls, start=1):
            url = base_url.format(antismash_id, archive_name)
            try:
                download_and_extract_archive(url, download_root, extract_path, archive_name)
            except Exception as download_error:
                if attempt == len(base_urls):
                    raise
                logger.info(
                    "Failed to get antiSMASH data of %s from %s (%s), trying next source.",
                    antismash_id,
                    url,
                    download_error,
                )
            else:
                break

        # delete subdirs
        for subdir_path in list_dirs(extract_path):
            shutil.rmtree(subdir_path)

        # delete unnecessary files
        files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
        for file in list_files(extract_path):
            if file not in files_to_keep:
                os.remove(file)

        logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id)

    except Exception as e:
        # NOTE(review): this also removes a pre-existing, non-empty extraction
        # directory when `_check_extract_path` rejects it -- confirm that this
        # is intended.
        # `ignore_errors=True` keeps a cleanup failure (e.g. the directory was
        # never created) from masking the original exception.
        shutil.rmtree(extract_path, ignore_errors=True)
        logger.warning(e)
        raise

parse_bgc_genbank

parse_bgc_genbank(file: str | PathLike) -> BGC

Parse a single BGC gbk file to BGC object.

Parameters:

  • file (str | PathLike) –

    Path to BGC gbk file

Returns:

  • BGC

    BGC object

Examples:

>>> bgc = parse_bgc_genbank(
...    "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
Source code in src/nplinker/genomics/antismash/antismash_loader.py
def parse_bgc_genbank(file: str | PathLike) -> BGC:
    """Build a BGC object from a single antiSMASH BGC GenBank (.gbk) file.

    The BGC is named after the file name without its final suffix, and a
    Strain object with the same name is attached to it.

    Args:
        file: Path to BGC gbk file

    Returns:
        BGC object

    Raises:
        ValueError: if no product prediction is found in the gbk file.

    Examples:
        >>> bgc = parse_bgc_genbank(
        ...    "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
    """
    gbk_path = Path(file)
    bgc_name = gbk_path.stem  # file name without its final suffix

    gbk_record = SeqIO.read(gbk_path, format="genbank")
    definition = gbk_record.description  # "DEFINITION" in gbk file
    version_id = gbk_record.id  # "VERSION" in gbk file
    parsed = _parse_antismash_genbank(gbk_record)

    products = parsed.get("product")
    if products is None:
        raise ValueError(f"Not found product prediction in antiSMASH Genbank file {gbk_path}")

    # init BGC
    new_bgc = BGC(bgc_name, *products)
    new_bgc.description = definition
    new_bgc.antismash_id = version_id
    new_bgc.antismash_file = str(gbk_path)
    new_bgc.antismash_region = parsed.get("region_number")
    new_bgc.smiles = parsed.get("smiles")
    new_bgc.strain = Strain(bgc_name)
    return new_bgc

get_best_available_genome_id

get_best_available_genome_id(
    genome_id_data: Mapping[str, str]
) -> str | None

Get the best available ID from genome_id_data dict.

Parameters:

  • genome_id_data (Mapping[str, str]) –

    dictionary containing information for each genome record present.

Returns:

  • str | None

    ID for the genome, if present, otherwise None.

Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None:
    """Get the best available ID from genome_id_data dict.

    The accession keys are checked in order of preference: `RefSeq_accession`,
    `GenBank_accession`, then `JGI_Genome_ID`. The first key holding a
    non-empty value wins, so an empty higher-priority entry does not hide a
    usable lower-priority one.

    Args:
        genome_id_data: dictionary containing information for each genome record present.

    Returns:
        ID for the genome, if present, otherwise None. A warning is logged when
        no usable ID is found.
    """
    for key in ("RefSeq_accession", "GenBank_accession", "JGI_Genome_ID"):
        candidate = genome_id_data.get(key)
        # Skip keys that are missing or map to an empty value.
        if candidate:
            return candidate

    logger.warning(f"Failed to get valid genome ID in genome data: {genome_id_data}")
    return None

podp_download_and_extract_antismash_data

podp_download_and_extract_antismash_data(
    genome_records: Sequence[
        Mapping[str, Mapping[str, str]]
    ],
    project_download_root: str | PathLike,
    project_extract_root: str | PathLike,
)

Download and extract antiSMASH BGC archive for the given genome records.

Parameters:

  • genome_records (Sequence[Mapping[str, Mapping[str, str]]]) –

    list of dicts representing genome records.

    The dict of each genome record contains a key of genome ID with a value of another dict containing information about genome type, label and accession ids (RefSeq, GenBank, and/or JGI).

  • project_download_root (str | PathLike) –

    Path to the directory to place downloaded archive in.

  • project_extract_root (str | PathLike) –

    Path to the directory downloaded archive will be extracted to.

    Note that an antismash directory will be created in the specified project_extract_root if it doesn't exist. The files will be extracted to <project_extract_root>/antismash/<antismash_id> directory.

Warns:

  • UserWarning

    when no antiSMASH data is found for some genomes.

Source code in src/nplinker/genomics/antismash/podp_antismash_downloader.py
def podp_download_and_extract_antismash_data(
    genome_records: Sequence[Mapping[str, Mapping[str, str]]],
    project_download_root: str | PathLike,
    project_extract_root: str | PathLike,
):
    """Download and extract antiSMASH BGC archive for the given genome records.

    Progress is tracked per genome in the genome status file
    (`GENOME_STATUS_FILENAME`) inside `project_download_root`: genomes whose
    archive already exists, or for which a lookup was already attempted, are
    skipped. The status file is rewritten at the end of the run.

    Args:
        genome_records: list of dicts representing genome records.

            The dict of each genome record contains a key of genome ID with a value
            of another dict containing information about genome type, label and
            accession ids (RefSeq, GenBank, and/or JGI).
        project_download_root: Path to the directory to place
            downloaded archive in.
        project_extract_root: Path to the directory downloaded archive will be extracted to.

            Note that an `antismash` directory will be created in the specified
            `project_extract_root` if it doesn't exist. The files will be extracted to
            `<project_extract_root>/antismash/<antismash_id>` directory.

    Raises:
        ValueError: if the number of genomes without antiSMASH data equals the
            number of given genome records.

    Warnings:
        UserWarning: when no antiSMASH data is found for some genomes.
    """
    if not Path(project_download_root).exists():
        # otherwise in case of failed first download, the folder doesn't exist and
        # genome_status_file can't be written
        Path(project_download_root).mkdir(parents=True, exist_ok=True)

    gs_file = Path(project_download_root, GENOME_STATUS_FILENAME)
    gs_dict = GenomeStatus.read_json(gs_file)

    for i, genome_record in enumerate(genome_records):
        # get the best available ID from the dict
        genome_id_data = genome_record["genome_ID"]
        raw_genome_id = get_best_available_genome_id(genome_id_data)
        if not raw_genome_id:
            logger.warning(f'Invalid input genome record "{genome_record}"')
            continue

        # check if genome ID exist in the genome status file
        if raw_genome_id not in gs_dict:
            gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id)

        gs_obj = gs_dict[raw_genome_id]

        logger.info(
            f"Checking for antismash data {i + 1}/{len(genome_records)}, "
            f"current genome ID={raw_genome_id}"
        )
        # first, check if BGC data is downloaded
        if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists():
            logger.info(f"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}")
            continue
        # second, check if lookup attempted previously
        if gs_obj.resolve_attempted:
            logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt")
            continue

        # if not downloaded or lookup attempted, then try to resolve the ID
        # and download
        logger.info(f"Start lookup process for genome ID {raw_genome_id}")
        gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data)
        gs_obj.resolve_attempted = True

        if gs_obj.resolved_refseq_id == "":
            # give up on this one
            logger.warning(f"Failed lookup for genome ID {raw_genome_id}")
            continue

        # if resolved id is valid, try to download and extract antismash data
        try:
            download_and_extract_antismash_data(
                gs_obj.resolved_refseq_id, project_download_root, project_extract_root
            )

            # bgc_path records the downloaded archive, not the extraction directory
            gs_obj.bgc_path = str(
                Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute()
            )

            # mark the extraction directory as complete with an empty marker file
            output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id)
            if output_path.exists():
                (output_path / "completed").touch(exist_ok=True)

        except Exception as e:
            # Best effort per genome: one failure must not stop the remaining
            # genomes, but the reason is logged instead of silently dropped.
            logger.warning(
                f"Failed to download antiSMASH data for genome ID {raw_genome_id}: {e}"
            )
            gs_obj.bgc_path = ""

    # raise and log warning for failed downloads
    failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
    if failed_ids:
        warning_message = (
            f"Failed to download antiSMASH data for the following genome IDs: {failed_ids}"
        )
        logger.warning(warning_message)
        warnings.warn(warning_message, UserWarning)

    # save updated genome status to json file
    GenomeStatus.to_json(gs_dict, gs_file)

    # NOTE(review): `failed_ids` is computed over every entry in the status
    # dict, which can include genomes loaded from an existing status file and
    # excludes records skipped above as invalid, so this comparison only
    # approximates "every given record failed" -- confirm the intended check.
    if len(failed_ids) == len(genome_records):
        raise ValueError("No antiSMASH data found for any genome")