Skip to content




AntismashBGCLoader(data_dir: str | PathLike)

Bases: BGCLoaderBase

Data loader for AntiSMASH BGC genbank (.gbk) files.


  • data_dir (str | PathLike) –

    Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.


The input data_dir must follow the structure defined in the Working Directory Structure for AntiSMASH data, e.g.:

    ├── genome_id_1                  # one AntiSMASH output, e.g. GCF_000514775.1
      ├── NZ_AZWO01000004.region001.gbk
      └── ...
    ├── genome_id_2
      ├── ...
    └── ...

Source code in src/nplinker/genomics/antismash/
def __init__(self, data_dir: str | PathLike) -> None:
    """Initialize the AntiSMASH BGC loader.

    Args:
        data_dir: Path to AntiSMASH directory that contains a collection of AntiSMASH outputs.

    Notes:
        The input `data_dir` must follow the structure defined in the
        [Working Directory Structure][working-directory-structure] for AntiSMASH data, e.g.:
        ```shell
        ├── genome_id_1                  # one AntiSMASH output, e.g. GCF_000514775.1
        │  ├── NZ_AZWO01000004.region001.gbk
        │  └── ...
        ├── genome_id_2
        │  ├── ...
        └── ...
        ```
    """
    # Stored as a plain string so both `str` and `PathLike` inputs end up uniform.
    self.data_dir = str(data_dir)
    # Both the file index and the BGC objects are built eagerly at construction time.
    self._file_dict = self._parse_data_dir(self.data_dir)
    self._bgcs = self._parse_bgcs(self._file_dict)

data_dir instance-attribute

data_dir = str(data_dir)


get_bgc_genome_mapping() -> dict[str, str]

Get the mapping from BGC to genome.


The directory name of the gbk files is treated as genome id.


  • dict[str, str]

    The key is BGC name (gbk file name) and value is genome id (the directory name of the gbk file).

Source code in src/nplinker/genomics/antismash/
def get_bgc_genome_mapping(self) -> dict[str, str]:
    """Get the mapping from BGC to genome.

    !!! info
        The directory name of the gbk files is treated as genome id.

    Returns:
        The key is BGC name (gbk file name) and value is genome id (the directory name of the
        gbk file).
    """
    # The genome id is the name of the directory that directly contains the gbk file.
    return {
        bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
    }


get_files() -> dict[str, str]

Get BGC gbk files.


  • dict[str, str]

    The key is BGC name (gbk file name) and value is path to the gbk file.

Source code in src/nplinker/genomics/antismash/
def get_files(self) -> dict[str, str]:
    """Get BGC gbk files.

    Returns:
        The key is BGC name (gbk file name) and value is path to the gbk file.
    """
    # Returns the internal dict itself (not a copy), so caller mutations are visible here.
    return self._file_dict


get_bgcs() -> list[BGC]

Get all BGC objects.


  • list[BGC]

    A list of BGC objects

Source code in src/nplinker/genomics/antismash/
def get_bgcs(self) -> list[BGC]:
    """Get all BGC objects.

    Returns:
        A list of BGC objects
    """
    # Returns the list built at construction time; no re-parsing happens here.
    return self._bgcs


GenomeStatus(
    original_id: str,
    resolved_refseq_id: str = "",
    resolve_attempted: bool = False,
    bgc_path: str = "",
)

Class to represent the status of a single genome.

The status of genomes is tracked in the file GENOME_STATUS_FILENAME.


  • original_id (str) –

    The original ID of the genome.

  • resolved_refseq_id (str, default: '' ) –

    The resolved RefSeq ID of the genome. Defaults to "".

  • resolve_attempted (bool, default: False ) –

    A flag indicating whether an attempt to resolve the RefSeq ID has been made. Defaults to False.

  • bgc_path (str, default: '' ) –

    The path to the downloaded BGC file for the genome. Defaults to "".

Source code in src/nplinker/genomics/antismash/
def __init__(
    self,
    original_id: str,
    resolved_refseq_id: str = "",
    resolve_attempted: bool = False,
    bgc_path: str = "",
) -> None:
    """Initialize a GenomeStatus object for the given genome.

    Args:
        original_id: The original ID of the genome.
        resolved_refseq_id: The resolved RefSeq ID of the
            genome. Defaults to "".
        resolve_attempted: A flag indicating whether an
            attempt to resolve the RefSeq ID has been made. Defaults to False.
        bgc_path: The path to the downloaded BGC file for
            the genome. Defaults to "".
    """
    self.original_id = original_id
    # The literal string "None" is normalised to "" so it reads as "not resolved".
    # NOTE(review): presumably guards against a stringified None in older status files — confirm.
    self.resolved_refseq_id = "" if resolved_refseq_id == "None" else resolved_refseq_id
    self.resolve_attempted = resolve_attempted
    self.bgc_path = bgc_path

original_id instance-attribute

original_id = original_id

resolved_refseq_id instance-attribute

resolved_refseq_id = (
    ""
    if resolved_refseq_id == "None"
    else resolved_refseq_id
)

resolve_attempted instance-attribute

resolve_attempted = resolve_attempted

bgc_path instance-attribute

bgc_path = bgc_path

read_json staticmethod

read_json(
    file: str | PathLike,
) -> dict[str, "GenomeStatus"]

Get a dict of GenomeStatus objects by loading given genome status file.

Note that an empty dict is returned if the given file doesn't exist.


  • file (str | PathLike) –

    Path to genome status file.


  • dict[str, 'GenomeStatus']

    Dict keys are genome original id and values are GenomeStatus objects. An empty dict is returned if the given file doesn't exist.

Source code in src/nplinker/genomics/antismash/
def read_json(file: str | PathLike) -> dict[str, "GenomeStatus"]:
    """Get a dict of GenomeStatus objects by loading given genome status file.

    Note that an empty dict is returned if the given file doesn't exist.

        file: Path to genome status file.

        Dict keys are genome original id and values are GenomeStatus
            objects. An empty dict is returned if the given file doesn't exist.
    genome_status_dict = {}
    if Path(file).exists():
        with open(file, "r") as f:
            data = json.load(f)

        # validate json data before using it
        validate(data, schema=GENOME_STATUS_SCHEMA)

        genome_status_dict = {
            gs["original_id"]: GenomeStatus(**gs) for gs in data["genome_status"]
    return genome_status_dict

to_json staticmethod

to_json(
    genome_status_dict: Mapping[str, "GenomeStatus"],
    file: str | PathLike | None = None,
) -> str | None

Convert the genome status dictionary to a JSON string.

If a file path is provided, the JSON string is written to the file. If the file already exists, it is overwritten.


  • genome_status_dict (Mapping[str, 'GenomeStatus']) –

    A dictionary of genome status objects. The keys are the original genome IDs and the values are GenomeStatus objects.

  • file (str | PathLike | None, default: None ) –

    The path to the output JSON file. If None, the JSON string is returned but not written to a file.


  • str | None

    The JSON string if file is None, otherwise None.

Source code in src/nplinker/genomics/antismash/
def to_json(
    genome_status_dict: Mapping[str, "GenomeStatus"], file: str | PathLike | None = None
) -> str | None:
    """Convert the genome status dictionary to a JSON string.

    If a file path is provided, the JSON string is written to the file. If
    the file already exists, it is overwritten.

    Args:
        genome_status_dict: A dictionary of genome
            status objects. The keys are the original genome IDs and the values
            are GenomeStatus objects.
        file: The path to the output JSON file.
            If None, the JSON string is returned but not written to a file.

    Returns:
        The JSON string if `file` is None, otherwise None.
    """
    # Only the values are serialised; the dict keys are not written out separately.
    gs_list = [gs._to_dict() for gs in genome_status_dict.values()]
    json_data = {"genome_status": gs_list, "version": "1.0"}

    # validate json object before dumping
    validate(json_data, schema=GENOME_STATUS_SCHEMA)

    if file is not None:
        with open(file, "w") as f:
            json.dump(json_data, f)
        return None
    return json.dumps(json_data)


download_and_extract_antismash_data(
    antismash_id: str,
    download_root: str | PathLike,
    extract_root: str | PathLike,
) -> None

Download and extract antiSMASH BGC archive for a specified genome.

The antiSMASH database (https://antismash-db.secondarymetabolites.org/) is used to download the BGC archive. antiSMASH uses the RefSeq assembly id of a genome as the id of the archive.


  • antismash_id (str) –

    The id used to download BGC archive from antiSMASH database. If the id is versioned (e.g., "GCF_004339725.1") please be sure to specify the version as well.

  • download_root (str | PathLike) –

    Path to the directory to place downloaded archive in.

  • extract_root (str | PathLike) –

    Path to the directory data files will be extracted to. Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.


  • ValueError

    if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.


>>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
Source code in src/nplinker/genomics/antismash/
def download_and_extract_antismash_data(
    antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike
) -> None:
    """Download and extract antiSMASH BGC archive for a specified genome.

    The antiSMASH database (https://antismash-db.secondarymetabolites.org/)
    is used to download the BGC archive. And antiSMASH use RefSeq assembly id
    of a genome as the id of the archive.

        antismash_id: The id used to download BGC archive from antiSMASH database.
            If the id is versioned (e.g., "GCF_004339725.1") please be sure to
            specify the version as well.
        download_root: Path to the directory to place downloaded archive in.
        extract_root: Path to the directory data files will be extracted to.
            Note that an `antismash` directory will be created in the specified `extract_root` if
            it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

        ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.

        >>> download_and_extract_antismash_data("GCF_004339725.1", "/data/download", "/data/extracted")
    download_root = Path(download_root)
    extract_root = Path(extract_root)
    extract_path = extract_root / "antismash" / antismash_id

        if extract_path.exists():
            extract_path.mkdir(parents=True, exist_ok=True)

            url = base_url.format(antismash_id, antismash_id + ".zip")
            download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip")

        # delete subdirs
        for subdir_path in list_dirs(extract_path):

        # delete unnecessary files
        files_to_keep = list_files(extract_path, suffix=(".json", ".gbk"))
        for file in list_files(extract_path):
            if file not in files_to_keep:
                os.remove(file)"antiSMASH BGC data of %s is downloaded and extracted.", antismash_id)

    except Exception as e:
        raise e


parse_bgc_genbank(file: str | PathLike) -> BGC

Parse a single BGC gbk file to BGC object.


  • file (str | PathLike) –

    Path to BGC gbk file


  • BGC

    BGC object


>>> bgc = parse_bgc_genbank(
...    "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
Source code in src/nplinker/genomics/antismash/
def parse_bgc_genbank(file: str | PathLike) -> BGC:
    """Parse a single BGC gbk file to BGC object.

    Args:
        file: Path to BGC gbk file

    Returns:
        BGC object

    Raises:
        ValueError: if no product prediction is found in the gbk file.

    Examples:
        >>> bgc = parse_bgc_genbank(
        ...    "/data/antismash/GCF_000016425.1/NC_009380.1.region001.gbk")
    """
    file = Path(file)
    # The file name without its extension is used as the BGC id and the strain id below.
    fname = file.stem

    # NOTE(review): `SeqIO` is assumed to be Biopython's `Bio.SeqIO`, imported at module
    # level — confirm against the file's import block.
    record = SeqIO.read(file, format="genbank")
    description = record.description  # "DEFINITION" in gbk file
    antismash_id = record.id  # "VERSION" in gbk file
    features = _parse_antismash_genbank(record)
    product_prediction = features.get("product")
    if product_prediction is None:
        raise ValueError(f"Not found product prediction in antiSMASH Genbank file {file}")

    # init BGC
    bgc = BGC(fname, *product_prediction)
    bgc.description = description
    bgc.antismash_id = antismash_id
    bgc.antismash_file = str(file)
    bgc.antismash_region = features.get("region_number")
    bgc.strain = Strain(fname)
    return bgc


get_best_available_genome_id(
    genome_id_data: Mapping[str, str]
) -> str | None

Get the best available ID from genome_id_data dict.


  • genome_id_data (Mapping[str, str]) –

    dictionary containing information for each genome record present.


  • str | None

    ID for the genome, if present, otherwise None.

Source code in src/nplinker/genomics/antismash/
def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | None:
    """Get the best available ID from genome_id_data dict.

        genome_id_data: dictionary containing information for each genome record present.

        ID for the genome, if present, otherwise None.
    if "RefSeq_accession" in genome_id_data:
        best_id = genome_id_data["RefSeq_accession"]
    elif "GenBank_accession" in genome_id_data:
        best_id = genome_id_data["GenBank_accession"]
    elif "JGI_Genome_ID" in genome_id_data:
        best_id = genome_id_data["JGI_Genome_ID"]
        best_id = None

    if best_id is None or len(best_id) == 0:
        logger.warning(f"Failed to get valid genome ID in genome data: {genome_id_data}")
        return None
    return best_id


podp_download_and_extract_antismash_data(
    genome_records: Sequence[
        Mapping[str, Mapping[str, str]]
    ],
    project_download_root: str | PathLike,
    project_extract_root: str | PathLike,
) -> None

Download and extract antiSMASH BGC archive for the given genome records.


  • genome_records (Sequence[Mapping[str, Mapping[str, str]]]) –

    list of dicts representing genome records.

    The dict of each genome record contains a key of genome ID with a value of another dict containing information about genome type, label and accession ids (RefSeq, GenBank, and/or JGI).

  • project_download_root (str | PathLike) –

    Path to the directory to place downloaded archive in.

  • project_extract_root (str | PathLike) –

    Path to the directory downloaded archive will be extracted to.

    Note that an antismash directory will be created in the specified extract_root if it doesn't exist. The files will be extracted to <extract_root>/antismash/<antismash_id> directory.


  • UserWarning

    when no antiSMASH data is found for some genomes.

Source code in src/nplinker/genomics/antismash/
def podp_download_and_extract_antismash_data(
    genome_records: Sequence[Mapping[str, Mapping[str, str]]],
    project_download_root: str | PathLike,
    project_extract_root: str | PathLike,
    """Download and extract antiSMASH BGC archive for the given genome records.

        genome_records: list of dicts representing genome records.

            The dict of each genome record contains a key of genome ID with a value
            of another dict containing information about genome type, label and
            accession ids (RefSeq, GenBank, and/or JGI).
        project_download_root: Path to the directory to place
            downloaded archive in.
        project_extract_root: Path to the directory downloaded archive will be extracted to.

            Note that an `antismash` directory will be created in the specified
            `extract_root` if it doesn't exist. The files will be extracted to
            `<extract_root>/antismash/<antismash_id>` directory.

        UserWarning: when no antiSMASH data is found for some genomes.
    if not Path(project_download_root).exists():
        # otherwise in case of failed first download, the folder doesn't exist and
        # genome_status_file can't be written
        Path(project_download_root).mkdir(parents=True, exist_ok=True)

    gs_file = Path(project_download_root, GENOME_STATUS_FILENAME)
    gs_dict = GenomeStatus.read_json(gs_file)

    for i, genome_record in enumerate(genome_records):
        # get the best available ID from the dict
        genome_id_data = genome_record["genome_ID"]
        raw_genome_id = get_best_available_genome_id(genome_id_data)
        if raw_genome_id is None or len(raw_genome_id) == 0:
            logger.warning(f'Invalid input genome record "{genome_record}"')

        # check if genome ID exist in the genome status file
        if raw_genome_id not in gs_dict:
            gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id)

        gs_obj = gs_dict[raw_genome_id]
            f"Checking for antismash data {i + 1}/{len(genome_records)}, "
            f"current genome ID={raw_genome_id}"
        # first, check if BGC data is downloaded
        if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists():
  "Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}")
        # second, check if lookup attempted previously
        if gs_obj.resolve_attempted:
  "Genome ID {raw_genome_id} skipped due to previous failed attempt")

        # if not downloaded or lookup attempted, then try to resolve the ID
        # and download"Start lookup process for genome ID {raw_genome_id}")
        gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data)
        gs_obj.resolve_attempted = True

        if gs_obj.resolved_refseq_id == "":
            # give up on this one
            logger.warning(f"Failed lookup for genome ID {raw_genome_id}")

        # if resolved id is valid, try to download and extract antismash data
                gs_obj.resolved_refseq_id, project_download_root, project_extract_root

            gs_obj.bgc_path = str(
                Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute()

            output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id)
            if output_path.exists():
                Path.touch(output_path / "completed", exist_ok=True)

        except Exception:
            gs_obj.bgc_path = ""

    # raise and log warning for failed downloads
    failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path]
    if failed_ids:
        warning_message = (
            f"Failed to download antiSMASH data for the following genome IDs: {failed_ids}"
        warnings.warn(warning_message, UserWarning)

    # save updated genome status to json file
    GenomeStatus.to_json(gs_dict, gs_file)

    if len(failed_ids) == len(genome_records):
        raise ValueError("No antiSMASH data found for any genome")