Skip to content

BigScape

nplinker.genomics.bigscape

BigscapeGCFLoader

BigscapeGCFLoader(cluster_file: str | PathLike)

Bases: GCFLoaderBase

Data loader for BiG-SCAPE GCF cluster file.

Attributes:

  • cluster_file

    Path to the BiG-SCAPE cluster file.

Parameters:

  • cluster_file (str | PathLike) –

    Path to the BiG-SCAPE cluster file, the filename has a pattern of <class>_clustering_c0.xx.tsv.

Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
def __init__(self, cluster_file: str | PathLike, /) -> None:
    """Create the loader and parse the BiG-SCAPE cluster file right away.

    The given path is converted to `str` and stored as `cluster_file`;
    it is then handed to `_parse_gcf` and the parsed result is cached on
    the instance.

    Args:
        cluster_file: Path to the BiG-SCAPE cluster file,
            the filename has a pattern of `<class>_clustering_c0.xx.tsv`.
    """
    self.cluster_file: str = str(cluster_file)
    # Parsing is eager: the result is stored once, at construction time.
    parsed_gcfs = self._parse_gcf(self.cluster_file)
    self._gcf_list = parsed_gcfs

cluster_file instance-attribute

cluster_file: str = str(cluster_file)

get_gcfs

get_gcfs(
    keep_mibig_only: bool = False,
    keep_singleton: bool = False,
) -> list[GCF]

Get all GCF objects.

Parameters:

  • keep_mibig_only (bool, default: False ) –

    True to keep GCFs that contain only MIBiG BGCs.

  • keep_singleton (bool, default: False ) –

    True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC.

Returns:

  • list[GCF]

    A list of GCF objects.

Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
    """Return the parsed GCF objects, optionally filtered.

    Args:
        keep_mibig_only: True to keep GCFs that contain only MIBiG
            BGCs. When False (default), every GCF whose
            `has_mibig_only()` is true is dropped.
        keep_singleton: True to keep singleton GCFs (a GCF that
            contains only one BGC). When False (default), every GCF
            whose `is_singleton()` is true is dropped.

    Returns:
        A list of GCF objects. If both flags are True, no filtering is
        applied and the loader's internal list is returned as-is.
    """
    # Collect the exclusion tests that are switched on, in the order they
    # must be applied: MIBiG-only first, then singletons.
    exclusions = []
    if not keep_mibig_only:
        exclusions.append(lambda gcf: gcf.has_mibig_only())
    if not keep_singleton:
        exclusions.append(lambda gcf: gcf.is_singleton())

    selected = self._gcf_list
    for is_excluded in exclusions:
        selected = [gcf for gcf in selected if not is_excluded(gcf)]
    return selected

BigscapeV2GCFLoader

BigscapeV2GCFLoader(db_file: str | PathLike)

Bases: GCFLoaderBase

Data loader for BiG-SCAPE v2 database file.

Attributes:

  • db_file

    Path to the BiG-SCAPE database file.

Parameters:

  • db_file (str | PathLike) –

    Path to the BiG-SCAPE v2 database file.

Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
def __init__(self, db_file: str | PathLike, /) -> None:
    """Create the loader and parse the BiG-SCAPE v2 database right away.

    The given path is converted to `str` and stored as `db_file`; it is
    then handed to `_parse_gcf` and the parsed result is cached on the
    instance.

    Args:
        db_file: Path to the BiG-SCAPE v2 database file.
    """
    self.db_file: str = str(db_file)
    # Parsing is eager: the result is stored once, at construction time.
    parsed_gcfs = self._parse_gcf(self.db_file)
    self._gcf_list = parsed_gcfs

db_file instance-attribute

db_file = str(db_file)

get_gcfs

get_gcfs(
    keep_mibig_only: bool = False,
    keep_singleton: bool = False,
) -> list[GCF]

Get all GCF objects.

Parameters:

  • keep_mibig_only (bool, default: False ) –

    True to keep GCFs that contain only MIBiG BGCs.

  • keep_singleton (bool, default: False ) –

    True to keep singleton GCFs. A singleton GCF is a GCF that contains only one BGC.

Returns:

  • list[GCF]

    A list of GCF objects.

Source code in src/nplinker/genomics/bigscape/bigscape_loader.py
def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
    """Return the parsed GCF objects, optionally filtered.

    Args:
        keep_mibig_only: True to keep GCFs that contain only MIBiG BGCs.
            When False (default), GCFs whose `has_mibig_only()` is true
            are removed.
        keep_singleton: True to keep singleton GCFs.
            A singleton GCF is a GCF that contains only one BGC. When
            False (default), GCFs whose `is_singleton()` is true are
            removed.

    Returns:
        A list of GCF objects. If both flags are True, the loader's
        internal list is returned without filtering.
    """
    kept = self._gcf_list

    # First pass: drop MIBiG-only GCFs unless the caller wants them.
    if not keep_mibig_only:
        without_mibig_only = []
        for gcf in kept:
            if gcf.has_mibig_only():
                continue
            without_mibig_only.append(gcf)
        kept = without_mibig_only

    # Second pass (on the survivors): drop singletons unless requested.
    if not keep_singleton:
        without_singletons = []
        for gcf in kept:
            if gcf.is_singleton():
                continue
            without_singletons.append(gcf)
        kept = without_singletons

    return kept

run_bigscape

run_bigscape(
    antismash_path: str | PathLike,
    output_path: str | PathLike,
    extra_params: str,
    version: Literal[1, 2] = 1,
) -> bool

Runs BiG-SCAPE to cluster BGCs.

The behavior of this function is slightly different depending on the version of BiG-SCAPE that is set to run using the configuration file. Mostly this means a different set of parameters is used between the two versions.

The AntiSMASH output directory should be a directory that contains GBK files. The directory can contain subdirectories, in which case BiG-SCAPE will search recursively for GBK files. E.g.:

example_folder
    ├── organism_1
    │  ├── organism_1.region001.gbk
    │  ├── organism_1.region002.gbk
    │  ├── organism_1.region003.gbk
    │  ├── organism_1.final.gbk          <- skipped!
    │  └── ...
    ├── organism_2
    │  ├── ...
    └── ...

By default, only GBK files with "cluster" or "region" in the filename are accepted. GBK files with "final" in the filename are excluded.

Parameters:

  • antismash_path (str | PathLike) –

    Path to the antismash output directory.

  • output_path (str | PathLike) –

    Path to the output directory where BiG-SCAPE will write its results.

  • extra_params (str) –

    Additional parameters to pass to BiG-SCAPE.

  • version (Literal[1, 2], default: 1 ) –

    The version of BiG-SCAPE to run. Must be 1 or 2.

Returns:

  • bool

    True if BiG-SCAPE ran successfully. A failed run is reported by raising RuntimeError (see Raises) rather than by returning False.

Raises:

  • ValueError

    If an unexpected BiG-SCAPE version number is specified.

  • FileNotFoundError

    If the antismash_path does not exist or if the BiG-SCAPE python script could not be found.

  • RuntimeError

    If BiG-SCAPE fails to run.

Examples:

>>> from nplinker.genomics.bigscape import run_bigscape
>>> run_bigscape(antismash_path="./antismash", output_path="./output",
... extra_params="--help", version=1)
Source code in src/nplinker/genomics/bigscape/runbigscape.py
def run_bigscape(
    antismash_path: str | PathLike,
    output_path: str | PathLike,
    extra_params: str,
    version: Literal[1, 2] = 1,
) -> bool:
    """Runs BiG-SCAPE to cluster BGCs.

    The behavior of this function is slightly different depending on the
    requested version of BiG-SCAPE. Mostly this means a different set of
    parameters is used between the two versions.

    The AntiSMASH output directory should be a directory that contains GBK files.
    The directory can contain subdirectories, in which case BiG-SCAPE will search
    recursively for GBK files. E.g.:

    ```
    example_folder
        ├── organism_1
        │  ├── organism_1.region001.gbk
        │  ├── organism_1.region002.gbk
        │  ├── organism_1.region003.gbk
        │  ├── organism_1.final.gbk          <- skipped!
        │  └── ...
        ├── organism_2
        │  ├── ...
        └── ...
    ```

    By default, only GBK files with "cluster" or "region" in the filename are
    accepted. GBK files with "final" in the filename are excluded.

    Args:
        antismash_path: Path to the antismash output directory.
        output_path: Path to the output directory where BiG-SCAPE will write its results.
        extra_params: Additional parameters to pass to BiG-SCAPE, separated by
            whitespace. An empty or whitespace-only string adds no parameters.
        version: The version of BiG-SCAPE to run. Must be 1 or 2.

    Returns:
        True if BiG-SCAPE ran successfully. A failed run is reported by raising
        RuntimeError, not by returning False.

    Raises:
        ValueError: If an unexpected BiG-SCAPE version number is specified.
        FileNotFoundError: If the antismash_path does not exist or if the BiG-SCAPE python
            script could not be found.
        RuntimeError: If BiG-SCAPE fails to run.

    Examples:
        >>> from nplinker.genomics.bigscape import run_bigscape
        >>> run_bigscape(antismash_path="./antismash", output_path="./output",
        ... extra_params="--help", version=1)
    """
    # switch to correct version of BiG-SCAPE
    if version == 1:
        bigscape_py_path = "bigscape.py"
    elif version == 2:
        bigscape_py_path = "bigscape-v2.py"
    else:
        raise ValueError("Invalid BiG-SCAPE version number. Expected: 1 or 2.")

    # Probe the executable with "-h" before doing any real work. Any failure
    # here (missing program, non-zero exit, ...) is deliberately reported as
    # FileNotFoundError, which is the documented contract for callers.
    try:
        subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True)
    except Exception as e:
        raise FileNotFoundError(
            f"Failed to find/run BiG-SCAPE executable program (path={bigscape_py_path}, err={e})"
        ) from e

    if not os.path.exists(antismash_path):
        raise FileNotFoundError(f'antismash_path "{antismash_path}" does not exist!')

    logger.info(f"Running BiG-SCAPE version {version}")
    logger.info(
        f'run_bigscape: input="{antismash_path}", output="{output_path}", '
        f'extra_params="{extra_params}"'
    )

    # assemble arguments. first argument is the python file
    args = [bigscape_py_path]

    # version 2 points to specific Pfam file, version 1 points to directory
    # version 2 also requires the cluster subcommand
    if version == 1:
        args.extend(["--pfam_dir", PFAM_PATH])
    elif version == 2:
        args.extend(["cluster", "--pfam_path", os.path.join(PFAM_PATH, "Pfam-A.hmm")])

    # add input and output paths. these are unchanged
    args.extend(["-i", str(antismash_path), "-o", str(output_path)])

    # Append the user supplied params, if any. Splitting on arbitrary
    # whitespace (rather than on a single " ") avoids passing empty-string
    # arguments when the string has repeated, leading or trailing spaces.
    args.extend(extra_params.split())

    logger.info(f"BiG-SCAPE command: {args}")
    # Output is streamed straight to this process' stdout/stderr, so nothing
    # is captured on `result` (its stdout/stderr attributes stay None).
    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr)

    # a zero return code means success
    if result.returncode == 0:
        logger.info(f"BiG-SCAPE completed with return code {result.returncode}")
        return True

    # otherwise log the return code and raise a runtime error; the tool's own
    # output has already been written to stdout/stderr above
    logger.error(f"BiG-SCAPE failed with return code {result.returncode}")

    raise RuntimeError(f"Failed to run BiG-SCAPE with error code {result.returncode}")