Source code for qsprpred.extra.data.tables.pcm

from typing import Callable

import pandas as pd

from qsprpred.data.descriptors.sets import DescriptorSet
from qsprpred.data.tables.mol import MoleculeTable
from qsprpred.data.tables.qspr import QSPRDataset
from qsprpred.extra.data.descriptors.sets import ProteinDescriptorSet
from qsprpred.logs import logger
from qsprpred.tasks import TargetProperty
from qsprpred.utils.serialization import function_as_string, function_from_string


class PCMDataSet(QSPRDataset):
    """Extension of `QSPRDataset` for PCM modelling.

    It allows specification of a column with protein identifiers and the
    calculation of protein descriptors.

    Attributes:
        proteinCol (str):
            name of column in df containing the protein target identifier
            (usually a UniProt ID) to use for protein descriptors for PCM
            modelling and other protein related tasks.
        proteinSeqProvider (Callable):
            function that takes a list of protein identifiers and returns the
            sequence data for them. `addDescriptors` unpacks its result as a
            `(sequences, info)` pair: a mapping of identifiers to sequences
            plus a `dict` of additional metadata. Defaults to `None`.
    """

    def __init__(
        self,
        name: str,
        protein_col: str,
        target_props: list[TargetProperty | dict],
        df: pd.DataFrame | None = None,
        smiles_col: str = "SMILES",
        protein_seq_provider: Callable | None = None,
        add_rdkit: bool = False,
        store_dir: str = ".",
        overwrite: bool = False,
        n_jobs: int | None = 1,
        chunk_size: int | None = None,
        drop_invalids: bool = True,
        drop_empty: bool = True,
        index_cols: list[str] | None = None,
        autoindex_name: str = "QSPRID",
        random_state: int | None = None,
        store_format: str = "pkl",
    ):
        """Construct a data set to handle PCM data.

        Args:
            name (str):
                data name, used in saving the data
            protein_col (str):
                name of column in df containing the protein target identifier
                (usually a UniProt ID) to use for protein descriptors for PCM
                modelling and other protein related tasks.
            target_props (list[TargetProperty | dict]):
                target properties, names should correspond with target column
                name in `df`
            df (pd.DataFrame, optional):
                input dataframe containing smiles and target property.
                Defaults to `None`.
            smiles_col (str, optional):
                name of column in `df` containing SMILES.
                Defaults to "SMILES".
            protein_seq_provider (Callable, optional):
                function that takes a list of protein identifiers and returns
                the sequence data for them (see the class documentation for
                the shape expected by `addDescriptors`). Defaults to `None`.
            add_rdkit (bool, optional):
                if `True`, column with rdkit molecules will be added to `df`.
                Defaults to `False`.
            store_dir (str, optional):
                directory for saving the output data. Defaults to '.'.
            overwrite (bool, optional):
                if `True`, existing data will be overwritten.
                Defaults to `False`.
            n_jobs (int, optional):
                number of parallel jobs. If <= 0, all available cores will be
                used. Defaults to 1.
            chunk_size (int, optional):
                chunk size for parallel processing. Defaults to `None`; the
                value is forwarded unchanged to the parent constructor.
            drop_invalids (bool, optional):
                If `True`, invalid SMILES will be dropped.
                Defaults to `True`.
            drop_empty (bool, optional):
                If `True`, rows with empty SMILES will be dropped.
                Defaults to `True`.
            index_cols (list[str], optional):
                columns to be used as index in the dataframe. Defaults to
                `None` in which case a custom ID will be generated.
            autoindex_name (str, optional):
                Column name to use for automatically generated IDs.
            random_state (int, optional):
                random state for reproducibility. Defaults to `None`.
            store_format (str, optional):
                format to use for storing the data ('pkl' or 'csv').

        Raises:
            ValueError:
                Propagated from the parent constructor; documented there as
                raised if a threshold is given with a non-classification task.
        """
        super().__init__(
            name,
            df=df,
            smiles_col=smiles_col,
            add_rdkit=add_rdkit,
            store_dir=store_dir,
            overwrite=overwrite,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
            drop_invalids=drop_invalids,
            index_cols=index_cols,
            target_props=target_props,
            drop_empty=drop_empty,
            autoindex_name=autoindex_name,
            random_state=random_state,
            store_format=store_format,
        )
        self.proteinCol = protein_col
        self.proteinSeqProvider = protein_seq_provider

    def getProteinKeys(self) -> list[str]:
        """Return a list of keys identifying the proteins in the data frame.

        Returns:
            keys (list): List of unique values found in the protein column.
        """
        return self.df[self.proteinCol].unique().tolist()

    def getProteinSequences(self) -> tuple[dict[str, str], dict]:
        """Return the protein sequence data for the proteins in the data frame.

        The result is whatever `proteinSeqProvider` returns for the keys from
        `getProteinKeys`. `addDescriptors` unpacks this result as a
        `(sequences, info)` pair, so providers are expected to return both.

        Returns:
            sequences (tuple):
                the provider output; expected to be a mapping of protein keys
                to sequences followed by a `dict` of extra metadata.

        Raises:
            ValueError: If no protein sequence provider is set.
        """
        if not self.proteinSeqProvider:
            raise ValueError(
                "Protein sequence provider not set. Cannot get protein sequences."
            )
        return self.proteinSeqProvider(self.getProteinKeys())

    def addDescriptors(
        self,
        descriptors: list[DescriptorSet | ProteinDescriptorSet],
        recalculate: bool = False,
        featurize: bool = True,
        *args,
        **kwargs,
    ):
        """Add descriptors to the data set, supplying protein information.

        The protein identifiers are copied to the `acc_keys` column and the
        sequences together with any provider metadata are handed to the parent
        implementation as keyword arguments.

        Args:
            descriptors (list[DescriptorSet | ProteinDescriptorSet]):
                descriptor sets to calculate.
            recalculate (bool, optional):
                forwarded unchanged to the parent implementation.
            featurize (bool, optional):
                forwarded unchanged to the parent implementation.
            *args: additional positional arguments for the parent method.
            **kwargs:
                additional keyword arguments for the parent method. The
                `sequences` key and every key of the provider metadata are
                set (and thereby overridden) by this method.

        Returns:
            Whatever the parent `addDescriptors` returns.
        """
        # make sure the acc_keys property is set for ProteinDescriptorSets
        self.df["acc_keys"] = self.df[self.proteinCol]
        # get protein sequences and metadata (nothing if no provider is set)
        if self.proteinSeqProvider:
            sequences, info = self.getProteinSequences()
        else:
            sequences, info = None, {}
        # append sequences and metadata to kwargs
        kwargs["sequences"] = sequences
        kwargs.update(info)
        # pass everything to the descriptor calculation
        return super().addDescriptors(
            descriptors, recalculate, featurize, *args, **kwargs
        )

    def __getstate__(self):
        """Return the serializable state of the data set.

        A configured sequence provider is replaced by its string form (via
        `function_as_string`), presumably so the state can be stored without
        holding a live function object.
        """
        o_dict = super().__getstate__()
        if self.proteinSeqProvider:
            o_dict["proteinSeqProvider"] = function_as_string(self.proteinSeqProvider)
        return o_dict

    def __setstate__(self, state):
        """Restore the data set and try to rebuild the sequence provider.

        If the provider cannot be recreated from its stored string form, a
        warning is logged and the provider is reset to `None` instead of
        failing the whole deserialization.
        """
        super().__setstate__(state)
        if self.proteinSeqProvider:
            try:
                self.proteinSeqProvider = function_from_string(self.proteinSeqProvider)
            except Exception as e:
                # deliberate best-effort: the data itself is still usable
                logger.warning(
                    "Failed to load protein sequence provider from metadata. "
                    "The function object could not be recreated from the code. "
                    f"\nError: {e}"
                    f"\nDeserialized Code: {self.proteinSeqProvider}"
                    "\nSetting protein sequence provider to `None` for now."
                )
                self.proteinSeqProvider = None

    @staticmethod
    def fromSDF(name, filename, smiles_prop, *args, **kwargs):
        """Not supported for PCM data sets.

        Raises:
            NotImplementedError: Always; use `PCMDataSet.fromMolTable` on a
                `MoleculeTable` read from the SDF file instead.
        """
        raise NotImplementedError(
            f"SDF loading not implemented for {PCMDataSet.__name__}, yet. "
            "Use `PCMDataSet.fromMolTable` to convert a `MoleculeTable` "
            "read from an SDF instead."
        )

    @staticmethod
    def fromMolTable(
        mol_table: MoleculeTable,
        protein_col: str,
        target_props: list[TargetProperty | dict] | None = None,
        name: str | None = None,
        **kwargs,
    ) -> "PCMDataSet":
        """Construct a data set to handle PCM data from a `MoleculeTable`.

        Args:
            mol_table (MoleculeTable):
                `MoleculeTable` instance containing the PCM data.
            protein_col (str):
                name of column in df containing the protein target identifier
                (usually a UniProt ID) to use for protein descriptors for PCM
                modelling and other protein related tasks.
            target_props (list[TargetProperty | dict], optional):
                target properties, names should correspond with target column
                name in `df`
            name (str, optional):
                data name, used in saving the data. Defaults to `None`.
            **kwargs:
                keyword arguments passed on to `QSPRDataset.fromMolTable`.
                A `protein_seq_provider` entry is consumed here and set as
                the sequence provider of the result.

        Returns:
            PCMDataSet: `PCMDataSet` instance containing the PCM data.
        """
        # the parent factory does not know this argument, so take it out first
        protein_seq_provider = kwargs.pop("protein_seq_provider", None)
        ret = QSPRDataset.fromMolTable(mol_table, target_props, name, **kwargs)
        ret.proteinCol = protein_col
        ret.proteinSeqProvider = protein_seq_provider
        # the parent factory builds a `QSPRDataset`; re-tag it as a PCM data set
        ret.__class__ = PCMDataSet
        return ret

    def searchWithIndex(
        self, index: pd.Index, name: str | None = None
    ) -> "MoleculeTable":
        """Create a `PCMDataSet` subset from the given index.

        The subset produced by the parent implementation is converted back to
        a `PCMDataSet` that keeps the protein column, target properties,
        sequence provider and feature standardizer of this data set.

        Args:
            index (pd.Index): index values of the rows to select.
            name (str, optional): name of the subset, forwarded to the parent.

        Returns:
            PCMDataSet: the featurized subset.
        """
        ret = super().searchWithIndex(index, name)
        ret = PCMDataSet.fromMolTable(
            ret,
            self.proteinCol,
            self.targetProperties,
            name=ret.name,
            # keep the provider, otherwise the subset cannot fetch sequences
            protein_seq_provider=self.proteinSeqProvider,
        )
        ret.featureStandardizer = self.featureStandardizer
        ret.featurize()
        return ret