Source code for qsprpred.extra.data.tables.pcm

from typing import Callable

import pandas as pd

from qsprpred.data.descriptors.sets import DescriptorSet
from qsprpred.data.tables.mol import MoleculeTable
from qsprpred.data.tables.qspr import QSPRDataset
from qsprpred.extra.data.descriptors.sets import ProteinDescriptorSet
from qsprpred.logs import logger
from qsprpred.tasks import TargetProperty
from qsprpred.utils.serialization import function_as_string, function_from_string


[docs]class PCMDataSet(QSPRDataset):
    """Extension of `QSARDataset` for PCM modelling.

    It allows specification of a column with protein identifiers
    and the calculation of protein descriptors.

    Attributes:
        proteinCol (str):
            name of column in df containing the protein target identifier (usually a
            UniProt ID) to use for protein descriptors for PCM modelling
            and other protein related tasks.
        proteinSeqProvider (Callable):
            function that takes a list of protein identifiers and returns a `dict`
            mapping those identifiers to their sequences. Defaults to `None`.
    """

    def __init__(
        self,
        name: str,
        protein_col: str,
        target_props: list[TargetProperty | dict],
        df: pd.DataFrame | None = None,
        smiles_col: str = "SMILES",
        protein_seq_provider: Callable | None = None,
        add_rdkit: bool = False,
        store_dir: str = ".",
        overwrite: bool = False,
        n_jobs: int | None = 1,
        chunk_size: int | None = None,
        drop_invalids: bool = True,
        drop_empty: bool = True,
        index_cols: list[str] | None = None,
        autoindex_name: str = "QSPRID",
        random_state: int | None = None,
        store_format: str = "pkl",
    ):
        """Construct a data set to handle PCM data.

        Args:
            name (str): data name, used in saving the data
            protein_col (str): name of column in df containing the protein target
                identifier (usually a UniProt ID) to use for protein descriptors for PCM
                modelling and other protein related tasks.
            protein_seq_provider: Callable = None, optional):
                function that takes a list of protein identifiers and returns a `dict`
                mapping those identifiers to their sequences. Defaults to `None`.
            target_props (list[TargetProperty | dict]):
                target properties,
                names should correspond with target column name in `df`
            df (pd.DataFrame, optional):
                input dataframe containing smiles and target property.
                Defaults to `None`.
            smiles_col (str, optional):
                name of column in `df` containing SMILES. Defaults to "SMILES".
            add_rdkit (bool, optional):
                if `True`, column with rdkit molecules will be added to `df`.
                Defaults to `False`.
            store_dir (str, optional):
                directory for saving the output data. Defaults to '.'.
            overwrite (bool, optional):
                if `True`, existing data will be overwritten. Defaults to `False`.
            n_jobs (int, optional):
                number of parallel jobs. If <= 0, all available cores will be used.
                Defaults to 1.
            chunk_size (int, optional):
                chunk size for parallel processing. Defaults to 50.
            drop_invalids (bool, optional):
                If `True`, invalid SMILES will be dropped. Defaults to `True`.
            drop_empty (bool, optional):
                If `True`, rows with empty SMILES will be dropped. Defaults to `True`.
            index_cols (List[str], optional):
                columns to be used as index in the dataframe.
                Defaults to `None` in which case a custom ID will be generated.
            autoindex_name (str, optional):
                Column name to use for automatically generated IDs.
            random_state (int, optional):
                random state for reproducibility. Defaults to `None`.
            store_format
                format to use for storing the data ('pkl' or 'csv').

        Raises:
            `ValueError`:
                Raised if threshold given with non-classification task.
        """
        super().__init__(
            name,
            df=df,
            smiles_col=smiles_col,
            add_rdkit=add_rdkit,
            store_dir=store_dir,
            overwrite=overwrite,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
            drop_invalids=drop_invalids,
            index_cols=index_cols,
            target_props=target_props,
            drop_empty=drop_empty,
            autoindex_name=autoindex_name,
            random_state=random_state,
            store_format=store_format,
        )
        self.proteinCol = protein_col
        self.proteinSeqProvider = protein_seq_provider

[docs]    def getProteinKeys(self) -> list[str]:
        """Return a list of keys identifying the proteins in the data frame.

        Returns:
            keys (list): List of protein keys.
        """
        return self.df[self.proteinCol].unique().tolist()

[docs]    def getProteinSequences(self) -> dict[str, str]:
        """Return a dictionary of protein sequences for the proteins in the data frame.

        Returns:
            sequences (dict): Dictionary of protein sequences.
        """
        if not self.proteinSeqProvider:
            raise ValueError(
                "Protein sequence provider not set. Cannot get protein sequences."
            )
        return self.proteinSeqProvider(self.getProteinKeys())

[docs]    def addDescriptors(
        self,
        descriptors: list[DescriptorSet | ProteinDescriptorSet],
        recalculate: bool = False,
        featurize: bool = True,
        *args,
        **kwargs,
    ):
        # make sure the acc_keys property is set for ProteinDescriptorSets
        self.df["acc_keys"] = self.df[self.proteinCol]
        # get protein sequences and metadata
        sequences, info = (
            self.getProteinSequences() if self.proteinSeqProvider else (None, {})
        )
        # append sequences and metadata to kwargs
        kwargs["sequences"] = sequences
        for key in info:
            kwargs[key] = info[key]
        # pass everything to the descriptor calculation
        return super().addDescriptors(
            descriptors, recalculate, featurize, *args, **kwargs
        )

    def __getstate__(self):
        o_dict = super().__getstate__()
        if self.proteinSeqProvider:
            o_dict["proteinSeqProvider"] = function_as_string(self.proteinSeqProvider)
        return o_dict

    def __setstate__(self, state):
        super().__setstate__(state)
        if self.proteinSeqProvider:
            try:
                self.proteinSeqProvider = function_from_string(self.proteinSeqProvider)
            except Exception as e:
                logger.warning(
                    "Failed to load protein sequence provider from metadata. "
                    f"The function object could not be recreated from the code. "
                    f"\nError: {e}"
                    f"\nDeserialized Code: {self.proteinSeqProvider}"
                    f"\nSetting protein sequence provider to `None` for now."
                )
                self.proteinSeqProvider = None

[docs]    @staticmethod
    def fromSDF(name, filename, smiles_prop, *args, **kwargs):
        raise NotImplementedError(
            f"SDF loading not implemented for {PCMDataSet.__name__}, yet. "
            f"Use `PCMDataSet.fromMolTable` to convert a `MoleculeTable`"
            f"read from an SDF instead."
        )

[docs]    @staticmethod
    def fromMolTable(
        mol_table: MoleculeTable,
        protein_col: str,
        target_props: list[TargetProperty | dict] | None = None,
        name: str | None = None,
        **kwargs,
    ) -> "PCMDataSet":
        """Construct a data set to handle PCM data from a `MoleculeTable`.

        Args:
            mol_table (MoleculeTable):
                `MoleculeTable` instance containing the PCM data.
            protein_col (str):
                name of column in df containing the protein target identifier (usually a
                UniProt ID) to use for protein descriptors for PCM modelling
                and other protein related tasks.
            target_props (list[TargetProperty | dict], optional):
                target properties,
                names should correspond with target column name in `df`
            name (str, optional):
                data name, used in saving the data. Defaults to `None`.
            **kwargs:
                keyword arguments to be passed to the `PCMDataset` constructor.

        Returns:
            PCMDataSet:
                `PCMDataset` instance containing the PCM data.
        """
        protein_seq_provider = None
        if "protein_seq_provider" in kwargs:
            protein_seq_provider = kwargs.pop("protein_seq_provider")
        ret = QSPRDataset.fromMolTable(mol_table, target_props, name, **kwargs)
        ret.proteinCol = protein_col
        ret.proteinSeqProvider = protein_seq_provider
        ret.__class__ = PCMDataSet
        return ret

[docs]    def searchWithIndex(
        self, index: pd.Index, name: str | None = None
    ) -> "MoleculeTable":
        ret = super().searchWithIndex(index, name)
        ret = PCMDataSet.fromMolTable(
            ret,
            self.proteinCol,
            self.targetProperties,
            name=ret.name,
        )
        ret.featureStandardizer = self.featureStandardizer
        ret.featurize()
        return ret