# Source code for qsprpred.extra.models.pcm

"""Specialized models for proteochemometric models (PCM).

"""

from abc import ABC
from typing import Callable

import numpy as np
from rdkit import Chem
from rdkit.Chem import Mol

from qsprpred.extra.data.tables.pcm import PCMDataSet
from ..data.descriptors.sets import ProteinDescriptorSet
from ...data.tables.mol import MoleculeTable
from ...models.model import QSPRModel
from ...models.scikit_learn import SklearnModel


class PCMModel(QSPRModel, ABC):
    """Base class for PCM models.

    Extension of `QSPRModel` for proteochemometric models (PCM). It overrides
    `createPredictionDatasetFromMols` and `predictMols` so that predictions are
    made for an explicitly specified protein identifier, which is required to
    handle the PCM (protein) descriptors.
    """

    def createPredictionDatasetFromMols(
        self,
        mols: list[str | Mol],
        protein_id: str,  # FIXME: this changes the signature from the base class
        smiles_standardizer: str | Callable = "chembl",
        n_jobs: int = 1,
        fill_value: float = np.nan,
    ) -> tuple[PCMDataSet, np.ndarray]:
        """Create a prediction data set for a list of compounds and one protein.

        The compounds are loaded into a `MoleculeTable`, every target property
        of the model is added as an all-NaN column, and the protein identifier
        is stored in a ``"protein_id"`` column. The table is then converted to
        a `PCMDataSet`, its SMILES are standardized and the features are
        calculated with the feature calculators of this model.

        Note:
            The ``imputer`` attribute of each item in ``self.targetProperties``
            is set to ``None`` in place as part of this call.

        Args:
            mols (list[str | Mol]):
                SMILES strings; RDKit `Mol` objects are also accepted and are
                converted to SMILES with `Chem.MolToSmiles`.
            protein_id (str):
                Protein identifier written to the ``"protein_id"`` column.
            smiles_standardizer (str | Callable, optional):
                Smiles standardizer. Defaults to "chembl".
            n_jobs (int, optional):
                Number of parallel jobs. Defaults to 1.
            fill_value (float, optional):
                Value to fill missing features with. Defaults to np.nan.

        Returns:
            tuple[PCMDataSet, np.ndarray]:
                The prepared data set and ``failed_mask``, i.e. the ``.values``
                of the object returned by ``dataset.dropInvalids()``
                (presumably a per-molecule mask of invalid entries -- TODO
                confirm against `PCMDataSet.dropInvalids`).
        """
        # Convert element-wise instead of inspecting only ``mols[0]``: an empty
        # list no longer raises IndexError here and mixed SMILES/Mol input is
        # handled. How an empty table behaves downstream is not visible here.
        smiles = [
            Chem.MolToSmiles(mol) if isinstance(mol, Mol) else mol for mol in mols
        ]
        # make a molecule table first and add the target properties
        dataset = MoleculeTable.fromSMILES(
            f"{self.__class__.__name__}_{hash(self)}",
            smiles,
            drop_invalids=False,
            n_jobs=n_jobs,
        )
        for target_property in self.targetProperties:
            # mutates the model's own target property objects (see Note above)
            target_property.imputer = None
            dataset.addProperty(target_property.name, np.nan)
        dataset.addProperty("protein_id", protein_id)
        # convert to PCMDataSet
        dataset = PCMDataSet.fromMolTable(
            dataset,
            "protein_id",
            target_props=self.targetProperties,
            drop_empty=False,
            drop_invalids=False,
            n_jobs=n_jobs,
        )
        # standardize smiles
        dataset.standardizeSmiles(smiles_standardizer, drop_invalid=False)
        failed_mask = dataset.dropInvalids().values
        # calculate features and prepare dataset
        dataset.prepareDataset(
            smiles_standardizer=smiles_standardizer,
            feature_calculators=self.featureCalculators,
            feature_standardizer=self.featureStandardizer,
            feature_fill_value=fill_value,
            shuffle=False,
        )
        return dataset, failed_mask

    def predictMols(
        self,
        mols: list[str | Mol],
        protein_id: str,  # FIXME: this changes the signature from the base class
        use_probas: bool = False,
        smiles_standardizer: str | Callable = "chembl",
        n_jobs: int = 1,
        fill_value: float = np.nan,
    ) -> np.ndarray:
        """Predict the target properties of a list of molecules for one protein.

        Before predicting, the feature calculators of the model are validated:
        at least one of them must be a `ProteinDescriptorSet`, all MSA
        providers found on such calculators must expose the same protein ids,
        and ``protein_id`` must be among them.

        Args:
            mols (list[str | Mol]):
                SMILES strings (RDKit `Mol` objects are converted by
                `createPredictionDatasetFromMols`).
            protein_id (str):
                Protein identifier of the target of interest.
            use_probas (bool, optional):
                Whether to return class probabilities. Defaults to False.
            smiles_standardizer (str | Callable, optional):
                Smiles standardizer. Defaults to "chembl".
            n_jobs (int, optional):
                Number of parallel jobs. Defaults to 1.
            fill_value (float, optional):
                Value to fill missing features with. Defaults to np.nan.

        Returns:
            np.ndarray: Predictions as returned by
                ``self.handleInvalidsInPredictions``.

        Raises:
            ValueError: If no feature calculator is set, if no protein
                descriptor set is among the calculators, or if ``protein_id``
                is not available in an MSA provider.
            AssertionError: If the protein descriptor calculators do not all
                expose the same set of protein ids.
        """
        # check if the model contains a feature calculator
        if not self.featureCalculators:
            raise ValueError("No feature calculator set on this instance.")
        # run PCM checks to validate the protein ids and descriptors
        is_pcm = False
        protein_ids = set()
        for calc in self.featureCalculators:
            if not isinstance(calc, ProteinDescriptorSet):
                continue
            is_pcm = True
            # A missing or ``None`` provider means there is nothing to validate
            # for this calculator; dereferencing ``None.current`` used to raise
            # AttributeError in the first two checks.
            provider = getattr(calc, "msaProvider", None)
            if provider is None:
                continue
            available_ids = set(provider.current.keys())
            if not protein_ids:
                protein_ids = available_ids
            if protein_ids != available_ids:
                # Raised explicitly (instead of ``assert``) so the check is not
                # stripped under ``python -O``; the exception type is kept.
                raise AssertionError(
                    "All protein descriptor calculators "
                    "must have the same protein ids."
                )
            if provider and protein_id not in available_ids:
                raise ValueError(
                    f"Protein id {protein_id} not found in the available MSA, "
                    f"cannot calculate PCM descriptors. Options are: {protein_ids}."
                )
        if not is_pcm:
            raise ValueError(
                "No protein descriptors found on this instance. "
                "Are you sure this is a PCM model?"
            )
        # create data set from mols
        dataset, failed_mask = self.createPredictionDatasetFromMols(
            mols, protein_id, smiles_standardizer, n_jobs, fill_value
        )
        # make predictions for the dataset
        predictions = self.predictDataset(dataset, use_probas)
        # handle invalids
        predictions = self.handleInvalidsInPredictions(mols, predictions, failed_mask)
        return predictions
class SklearnPCMModel(SklearnModel, PCMModel):
    """Wrapper for sklearn models for PCM.

    Combines `SklearnModel` and `PCMModel` through multiple inheritance; the
    class body itself adds nothing. The intent is that the PCM-specific
    methods of `PCMModel` replace the corresponding generic ones.

    NOTE(review): `SklearnModel` is listed first in the bases, so the
    `PCMModel` versions only take effect for methods that `SklearnModel` does
    not define itself -- confirm against `SklearnModel`.
    """