Skip to content

Scoring Methods

nplinker.scoring

ScoringMethod

Bases: Enum

Enum class for scoring methods.

METCALF class-attribute instance-attribute

METCALF = 'metcalf'

ROSETTA class-attribute instance-attribute

ROSETTA = 'rosetta'

NPLCLASS class-attribute instance-attribute

NPLCLASS = 'nplclass'

has_value classmethod

has_value(value: str) -> bool

Check if the enum has a value.

Source code in src/nplinker/scoring/scoring_method.py
@classmethod
def has_value(cls, value: str) -> bool:
    """Tell whether `value` equals the value of any member of this enum.

    Args:
        value: The candidate value to look up among the enum members.

    Returns:
        True if a member whose `.value` equals `value` exists, False otherwise.
    """
    # walk the members and stop at the first match
    for member in cls:
        if value == member.value:
            return True
    return False

MetcalfScoring

Bases: ScoringBase

Metcalf scoring method.

Attributes:

  • name

    The name of this scoring method, set to a fixed value metcalf.

  • npl (NPLinker | None) –

    The NPLinker object.

  • CACHE (str) –

    The name of the cache file to use for storing the MetcalfScoring.

  • presence_gcf_strain (DataFrame) –

    A DataFrame to store presence of GCFs with respect to strains. The index of the DataFrame consists of the GCF objects and the columns are Strain objects. The values are 1 where the GCF occurs in the strain, 0 otherwise.

  • presence_spec_strain (DataFrame) –

    A DataFrame to store presence of spectra with respect to strains. The index of the DataFrame consists of the Spectrum objects and the columns are Strain objects. The values are 1 where the spectrum occurs in the strain, 0 otherwise.

  • presence_mf_strain (DataFrame) –

    A DataFrame to store presence of molecular families with respect to strains. The index of the DataFrame consists of the MolecularFamily objects and the columns are Strain objects. The values are 1 where the molecular family occurs in the strain, 0 otherwise.

  • raw_score_spec_gcf (DataFrame) –

    A DataFrame to store the raw Metcalf scores for spectrum-gcf links. The columns are "spec", "gcf" and "score":

    • The "spec" and "gcf" columns contain the Spectrum and GCF objects respectively,
    • The "score" column contains the raw Metcalf scores.
  • raw_score_mf_gcf (DataFrame) –

    A DataFrame to store the raw Metcalf scores for molecular family-gcf links. The columns are "mf", "gcf" and "score":

    • The "mf" and "gcf" columns contain the MolecularFamily and GCF objects respectively,
    • The "score" column contains the raw Metcalf scores.
  • metcalf_mean (ndarray | None) –

    A numpy array to store the mean value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

  • metcalf_std (ndarray | None) –

    A numpy array to store the standard deviation value used for standardising Metcalf scores. The array has shape (n_strains+1, n_strains+1), where n_strains is the number of strains.

name class-attribute instance-attribute

name = METCALF.value

npl class-attribute instance-attribute

npl: NPLinker | None = None

CACHE class-attribute instance-attribute

CACHE: str = 'cache_metcalf_scoring.pckl'

metcalf_weights class-attribute instance-attribute

metcalf_weights: tuple[int, int, int, int] = (10, -10, 0, 1)

presence_gcf_strain class-attribute instance-attribute

presence_gcf_strain: DataFrame = DataFrame()

presence_spec_strain class-attribute instance-attribute

presence_spec_strain: DataFrame = DataFrame()

presence_mf_strain class-attribute instance-attribute

presence_mf_strain: DataFrame = DataFrame()

raw_score_spec_gcf class-attribute instance-attribute

raw_score_spec_gcf: DataFrame = DataFrame(
    columns=["spec", "gcf", "score"]
)

raw_score_mf_gcf class-attribute instance-attribute

raw_score_mf_gcf: DataFrame = DataFrame(
    columns=["mf", "gcf", "score"]
)

metcalf_mean class-attribute instance-attribute

metcalf_mean: ndarray | None = None

metcalf_std class-attribute instance-attribute

metcalf_std: ndarray | None = None

setup classmethod

setup(npl: NPLinker) -> None

Set up the MetcalfScoring object.

This method only needs to be called once to set up the MetcalfScoring object; later calls are skipped.

Parameters:

  • npl (NPLinker) –

    The NPLinker object.

Source code in src/nplinker/scoring/metcalf_scoring.py
@classmethod
def setup(cls, npl: NPLinker) -> None:
    """Set up the MetcalfScoring class from an NPLinker object.

    The first call stores `npl` on the class and fills the class-level presence tables,
    the raw score tables and the mean/std arrays used for standardising scores. Once
    `cls.npl` is set, further calls only log a message and return without recomputing.

    Args:
        npl: The NPLinker object.
    """
    if cls.npl is not None:
        logger.info("MetcalfScoring.setup already called, skipping.")
        return

    # every "#..." entry in this message is a count, including the number of strains
    logger.info(
        f"MetcalfScoring.setup starts: #bgcs={len(npl.bgcs)}, #gcfs={len(npl.gcfs)}, "
        f"#spectra={len(npl.spectra)}, #mfs={len(npl.mfs)}, #strains={len(npl.strains)}"
    )
    cls.npl = npl

    # calculate presence of gcfs/spectra/mfs with respect to strains
    cls.presence_gcf_strain = get_presence_gcf_strain(npl.gcfs, npl.strains)
    cls.presence_spec_strain = get_presence_spec_strain(npl.spectra, npl.strains)
    cls.presence_mf_strain = get_presence_mf_strain(npl.mfs, npl.strains)

    # calculate raw Metcalf scores for spec-gcf links
    raw_score_spec_gcf = cls._calc_raw_score(
        cls.presence_spec_strain, cls.presence_gcf_strain, cls.metcalf_weights
    )
    # reshape the score table to long format: one row per (spec, gcf, score)
    cls.raw_score_spec_gcf = raw_score_spec_gcf.reset_index().melt(id_vars="index")
    cls.raw_score_spec_gcf.columns = ["spec", "gcf", "score"]  # type: ignore

    # calculate raw Metcalf scores for mf-gcf links
    raw_score_mf_gcf = cls._calc_raw_score(
        cls.presence_mf_strain, cls.presence_gcf_strain, cls.metcalf_weights
    )
    # reshape the score table to long format: one row per (mf, gcf, score)
    cls.raw_score_mf_gcf = raw_score_mf_gcf.reset_index().melt(id_vars="index")
    cls.raw_score_mf_gcf.columns = ["mf", "gcf", "score"]  # type: ignore

    # calculate mean and std for standardising Metcalf scores
    cls.metcalf_mean, cls.metcalf_std = cls._calc_mean_std(
        len(npl.strains), cls.metcalf_weights
    )

    logger.info("MetcalfScoring.setup completed")
get_links(*objects, **parameters)

Get links for the given objects.

Parameters:

  • objects

    The objects to get links for. All objects must be of the same type, i.e. GCF, Spectrum or MolecularFamily type. If no objects are provided, all GCF objects of the NPLinker object (npl.gcfs) will be used.

  • parameters

    The scoring parameters to use for the links. The parameters are:

    • cutoff: The minimum score to consider a link (≥cutoff). Default is 0.
    • standardised: Whether to use standardised scores. Default is False.

Returns:

  • The LinkGraph object containing the links involving the input objects with the Metcalf scores.

Raises:

  • TypeError

    If the input objects are not of the same type or the object type is invalid.

Source code in src/nplinker/scoring/metcalf_scoring.py
def get_links(self, *objects, **parameters):
    """Get links for the given objects.

    Args:
        objects: The objects to get links for. All objects must be of the same type, i.e. `GCF`,
            `Spectrum` or `MolecularFamily` type.
            If no objects are provided, all GCF objects (`npl.gcfs`) will be used.
        parameters: The scoring parameters to use for the links.
            The parameters are:

            - `cutoff`: The minimum score to consider a link (≥cutoff). Default is 0.
            - `standardised`: Whether to use standardised scores. Default is False.

    Returns:
        The [`LinkGraph`][nplinker.scoring.LinkGraph] object containing the links involving the
            input objects with the Metcalf scores. The graph is empty when no objects are
            given and `npl.gcfs` is empty as well.

    Raises:
        TypeError: If the input objects are not of the same type or the object type is invalid.
        ValueError: If no objects are given before `MetcalfScoring.setup` was run, or if
            standardised scores are requested while `metcalf_mean`/`metcalf_std` are not set.
    """
    # validate input objects
    if len(objects) == 0:
        # falling back to all GCFs needs the NPLinker object stored by `setup`
        if self.npl is None:
            raise ValueError("MetcalfScoring.npl is not set. Run MetcalfScoring.setup first.")
        objects = self.npl.gcfs
    # check if all objects are of the same type
    types = {type(i) for i in objects}
    if len(types) > 1:
        raise TypeError("Input objects must be of the same type.")
    # nothing to link: return an empty graph rather than letting `next` below
    # raise a bare StopIteration on the empty set of types
    if not types:
        logger.info("MetcalfScoring: no objects to get links for, returning an empty LinkGraph.")
        return LinkGraph()
    # check if the object type is valid
    obj_type = next(iter(types))
    if obj_type not in (GCF, Spectrum, MolecularFamily):
        raise TypeError(
            f"Invalid type {obj_type}. Input objects must be GCF, Spectrum or MolecularFamily objects."
        )

    # validate scoring parameters; the defaults are written back into `parameters`
    # so that every Score created below records the values that were actually used
    self._cutoff: float = parameters.get("cutoff", 0)
    self._standardised: bool = parameters.get("standardised", False)
    parameters.update({"cutoff": self._cutoff, "standardised": self._standardised})

    logger.info(
        f"MetcalfScoring: #objects={len(objects)}, type={obj_type}, cutoff={self._cutoff}, "
        f"standardised={self._standardised}"
    )
    if not self._standardised:
        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=self._cutoff)
    else:
        if self.metcalf_mean is None or self.metcalf_std is None:
            raise ValueError(
                "MetcalfScoring.metcalf_mean and metcalf_std are not set. Run MetcalfScoring.setup first."
            )
        # use negative infinity as the score cutoff to ensure we get all links
        # NOTE(review): presumably `_calc_standardised_score` applies `self._cutoff` to the
        # standardised scores - confirm against its implementation
        scores_list = self._get_links(*objects, obj_type=obj_type, score_cutoff=-np.inf)
        scores_list = self._calc_standardised_score(scores_list)

    links = LinkGraph()
    for score_df in scores_list:
        for row in score_df.itertuples(index=False):  # row has attributes: spec/mf, gcf, score
            # the `name` of each score table tells which metabolomics column it carries
            met = row.spec if score_df.name == LinkType.SPEC_GCF else row.mf
            links.add_link(
                row.gcf,
                met,
                metcalf=Score(self.name, row.score, parameters),
            )

    logger.info(f"MetcalfScoring: completed! Found {len(links.links)} links in total.")
    return links

format_data

format_data(data)

Format the data for display.

Source code in src/nplinker/scoring/metcalf_scoring.py
def format_data(self, data):
    """Render `data` as a string for display.

    For Metcalf scoring the data is a single numeric score, which is shown in
    fixed-point notation with four digits after the decimal point.
    """
    return format(data, ".4f")

sort

sort(objects, reverse=True)

Sort the objects based on the score.

Source code in src/nplinker/scoring/metcalf_scoring.py
def sort(self, objects, reverse=True):
    """Return a new list of `objects` ordered by their score for this method.

    Each object is indexed with this scoring method (`obj[self]`) to obtain its
    sort key. The highest scores come first unless `reverse` is False.
    """

    def _score_of(link):
        # the object is looked up by this scoring method to get its score
        return link[self]

    ranked = list(objects)
    ranked.sort(key=_score_of, reverse=reverse)
    return ranked