# Source code for qsprpred.extra.models.pcm

"""Specialized models for proteochemometric models (PCM).

"""

from abc import ABC
from typing import Callable

import numpy as np
from rdkit import Chem
from rdkit.Chem import Mol

from qsprpred.extra.data.tables.pcm import PCMDataSet
from ..data.descriptors.sets import ProteinDescriptorSet
from ...data.tables.mol import MoleculeTable
from ...models.model import QSPRModel
from ...models.scikit_learn import SklearnModel


class PCMModel(QSPRModel, ABC):
    """Base class for PCM models.

    Extension of `QSPRModel` for proteochemometric models (PCM). It modifies
    the `predictMols` method to handle PCM descriptors and specification of
    protein ids.
    """

    def createPredictionDatasetFromMols(
        self,
        mols: list[str | Mol],
        protein_id: str,  # FIXME: this changes the signature from the base class
        smiles_standardizer: str | Callable = "chembl",
        n_jobs: int = 1,
        fill_value: float = np.nan,
    ) -> tuple[PCMDataSet, np.ndarray]:
        """
        Create a prediction data set of compounds using a PCM model
        given as a list of SMILES strings and a protein identifier.
        The protein identifier is used to calculate the protein descriptors.

        Args:
            mols (list[str | Mol]):
                List of SMILES strings. RDKit `Mol` objects are accepted as
                well and are converted to SMILES element by element.
            protein_id (str):
                Protein identifier. It is stored for every molecule in the
                "protein_id" property of the created data set.
            smiles_standardizer (str | Callable, optional):
                Smiles standardizer. Defaults to "chembl".
            n_jobs (int, optional):
                Number of parallel jobs. Defaults to 1.
            fill_value (float, optional):
                Value to fill missing features with. Defaults to np.nan.
        Returns:
            tuple[PCMDataSet, np.ndarray]:
                The prepared data set with the features calculated for the
                molecules, and the `.values` of the result of
                `dataset.dropInvalids()` (presumably a mask marking the
                molecules that were dropped as invalid -- confirm against
                `PCMDataSet.dropInvalids`).
        """
        # convert per element instead of inspecting only `mols[0]`, so that
        # empty and mixed (SMILES + Mol) inputs do not fail at this point
        smiles = [
            Chem.MolToSmiles(mol) if isinstance(mol, Mol) else mol for mol in mols
        ]
        # make a molecule table first and add the target properties
        dataset = MoleculeTable.fromSMILES(
            f"{self.__class__.__name__}_{hash(self)}",
            smiles,
            drop_invalids=False,
            n_jobs=n_jobs,
        )
        for target_property in self.targetProperties:
            # NOTE: this modifies the objects held in `self.targetProperties`
            # in place, i.e. the imputer stays unset on the model afterwards
            target_property.imputer = None
            # target values are unknown for molecules to be predicted
            dataset.addProperty(target_property.name, np.nan)
        dataset.addProperty("protein_id", protein_id)
        # convert to PCMDataSet
        dataset = PCMDataSet.fromMolTable(
            dataset,
            "protein_id",
            target_props=self.targetProperties,
            drop_empty=False,
            drop_invalids=False,
            n_jobs=n_jobs,
        )
        # standardize smiles
        dataset.standardizeSmiles(smiles_standardizer, drop_invalid=False)
        failed_mask = dataset.dropInvalids().values
        # calculate features and prepare dataset
        dataset.prepareDataset(
            smiles_standardizer=smiles_standardizer,
            feature_calculators=self.featureCalculators,
            feature_standardizer=self.featureStandardizer,
            feature_fill_value=fill_value,
            shuffle=False,
        )
        return dataset, failed_mask

    def predictMols(
        self,
        mols: list[str | Mol],
        protein_id: str,  # FIXME: this changes the signature from the base class
        use_probas: bool = False,
        smiles_standardizer: str | Callable = "chembl",
        n_jobs: int = 1,
        fill_value: float = np.nan,
    ) -> np.ndarray:
        """
        Predict the target properties of a list of molecules using a PCM model.
        The protein identifier is used to calculate the protein descriptors for
        a target of interest.

        Args:
            mols (list[str | Mol]):
                List of SMILES strings (RDKit `Mol` objects are accepted too).
            protein_id (str):
                Protein identifier.
            use_probas (bool, optional):
                Whether to return class probabilities. Defaults to False.
            smiles_standardizer (str | Callable, optional):
                Smiles standardizer. Defaults to "chembl".
            n_jobs (int, optional):
                Number of parallel jobs. Defaults to 1.
            fill_value (float, optional):
                Value to fill missing features with. Defaults to np.nan.

        Returns:
            np.ndarray:
                Array of predictions.

        Raises:
            ValueError:
                If no feature calculators are set, if none of them is a
                `ProteinDescriptorSet`, or if `protein_id` is not among the
                keys of the current MSA of a protein descriptor set.
            AssertionError:
                If the protein descriptor sets do not all share the same
                protein ids in their current MSA.
        """
        # check if the model contains a feature calculator
        if not self.featureCalculators:
            raise ValueError("No feature calculator set on this instance.")
        # run PCM checks to validate the protein ids and descriptors
        is_pcm = False
        protein_ids = set()
        for calc in self.featureCalculators:
            if not isinstance(calc, ProteinDescriptorSet):
                continue
            is_pcm = True
            # descriptor sets without an MSA provider (attribute missing or
            # set to None) have no protein ids to validate against
            provider = getattr(calc, "msaProvider", None)
            if provider is None:
                continue
            available = set(provider.current.keys())
            if not protein_ids:
                # first provider with ids defines the reference set
                protein_ids = available
            elif protein_ids != available:
                # raised explicitly (instead of via `assert`) so the check is
                # not stripped under `python -O`; the exception type is kept
                raise AssertionError(
                    "All protein descriptor calculators "
                    "must have the same protein ids."
                )
            if provider and protein_id not in available:
                raise ValueError(
                    f"Protein id {protein_id} not found in the available MSA, "
                    f"cannot calculate PCM descriptors. Options are: {protein_ids}."
                )
        if not is_pcm:
            raise ValueError(
                "No protein descriptors found on this instance. "
                "Are you sure this is a PCM model?"
            )
        # create data set from mols
        dataset, failed_mask = self.createPredictionDatasetFromMols(
            mols, protein_id, smiles_standardizer, n_jobs, fill_value
        )
        # make predictions for the dataset
        predictions = self.predictDataset(dataset, use_probas)
        # handle invalids
        predictions = self.handleInvalidsInPredictions(mols, predictions, failed_mask)
        return predictions


class SklearnPCMModel(SklearnModel, PCMModel):
    """Wrapper for sklearn models for PCM.

    Combines `SklearnModel` with `PCMModel` through multiple inheritance and
    adds no members of its own.

    NOTE(review): `SklearnModel` is listed first among the bases, so a method
    defined by `SklearnModel` itself takes precedence over the `PCMModel`
    version. The PCM variants of `predictMols` and
    `createPredictionDatasetFromMols` are therefore only used as long as
    `SklearnModel` inherits these methods rather than overriding them --
    confirm against `SklearnModel`.
    """