Source code for qsprpred.extra.data.storage.protein.tabular_pcm

from typing import Any, Callable, Iterable, Optional

import pandas as pd

from qsprpred.data.tables.pnds import PandasDataTable
from qsprpred.extra.data.storage.protein.interfaces.protein_storage import (
    ProteinStorage,
)
from qsprpred.extra.data.storage.protein.interfaces.storedprotein import StoredProtein
from qsprpred.logs import logger
from qsprpred.utils.parallel import ParallelGenerator
from qsprpred.utils.serialization import function_as_string, function_from_string



[docs]
class TabularProtein(StoredProtein):
    """A protein object that is stored in a tabular format.

    Attributes:
        id (str): id of the protein
        sequence (str | None): sequence of the protein, `None` if not set
        props (dict[str, Any] | None): properties of the protein,
            `None` if not set
        representations (Iterable[TabularProtein] | None): representations
            of the protein, `None` if not set
        parent (TabularProtein | None): parent protein, `None` if not set
    """

    def __init__(
            self,
            protein_id: str,
            sequence: str | None = None,
            parent: Optional["TabularProtein"] = None,
            props: dict[str, Any] | None = None,
            representations: Iterable["TabularProtein"] | None = None,
    ) -> None:
        """Create a new protein instance.

        All arguments are stored as given; no copies are made and
        no validation is performed.

        Args:
            protein_id (str): identifier of the protein
            sequence (str): sequence of the protein
            parent (TabularProtein): parent protein
            props (dict[str, Any]): properties of the protein
            representations (Iterable[TabularProtein]): representations of the protein
        """
        self._parent = parent
        self._id = protein_id
        self._sequence = sequence
        self._props = props
        self._representations = representations

    def _get_prop(self, name: str) -> Any:
        """Return the property stored under ``name`` or `None` if unavailable.

        Unlike a direct lookup this is safe when the protein was created
        without any properties (``props=None``).
        """
        props = self._props
        if props is None or name not in props:
            return None
        return props[name]

    @property
    def id(self) -> str:
        """Get the id of the protein."""
        return self._id

    @property
    def sequence(self) -> str | None:
        """Get the sequence of the protein."""
        return self._sequence

    @property
    def props(self) -> dict[str, Any] | None:
        """Get the properties of the protein."""
        return self._props

    def as_pdb(self) -> str | None:
        """Return the protein as a PDB file.

        Returns:
            str | None: value of the ``"pdb"`` property, or `None` if the
                protein has no properties or no ``"pdb"`` property
        """
        return self._get_prop("pdb")

    def as_fasta(self) -> str | None:
        """Return the protein as a FASTA file.

        Returns:
            str | None: value of the ``"fasta"`` property, or `None` if the
                protein has no properties or no ``"fasta"`` property
        """
        return self._get_prop("fasta")

    @property
    def representations(self) -> Iterable["TabularProtein"] | None:
        """Get all representations of the protein (`None` if not set)."""
        return self._representations

    @property
    def parent(self) -> Optional["TabularProtein"]:
        """Get the parent protein (`None` if not set)."""
        return self._parent




[docs]
class TabularProteinStorage(ProteinStorage, PandasDataTable):
    """A storage class for proteins stored in a tabular format.

    Attributes:
        proteinSeqProvider (Callable | None): function that is called with
            the set of protein ids in the store and returns a tuple of
            ``(sequences, props)``, where ``sequences`` maps protein ids to
            sequences and ``props`` maps protein ids to dictionaries of
            additional properties (may be empty)
        sequenceProp (str): name of the property that contains all protein sequences
        proteins (list[TabularProtein]): all proteins in the store
    """

    def __init__(
            self,
            name: str,
            df: pd.DataFrame | None = None,
            sequence_col: str = "Sequence",
            sequence_provider: Optional[Callable] = None,
            store_dir: str = ".",
            overwrite: bool = False,
            index_cols: list[str] | None = None,
            n_jobs: int = 1,
            chunk_size: int | None = None,
            protein_col: str = "accession",
            random_state: int | None = None,
            store_format: str = "pkl",
            parallel_generator: ParallelGenerator | None = None,
    ):
        """Create a new protein storage instance.

        If a sequence provider is given, it is queried right away (see
        `getPCMInfo`) and its results are written to the table. Otherwise,
        the sequence column must already be present in the data.

        Args:
            name (str): name of the storage
            df (pd.DataFrame): data frame containing the proteins, an empty
                frame with the sequence and index/protein id columns
                is created if `None`
            sequence_col (str): name of the column that contains all protein sequences
            sequence_provider (Callable): function that provides protein
                sequences and metadata for a set of protein ids
            store_dir (str): directory to store the data
            overwrite (bool): overwrite the existing data
            index_cols (list[str]): columns to use as index,
                defaults to ``[protein_col]``
            n_jobs (int): number of parallel jobs
            chunk_size (int): size of the chunks
            protein_col (str): name of the column that contains the protein ids
            random_state (int): random state
            store_format (str): format to store the data
            parallel_generator (ParallelGenerator): parallel generator

        Raises:
            AssertionError: if no sequence provider is given and
                the sequence column is missing from the data
        """
        if df is None:
            # no data supplied -> start from an empty table with the needed columns
            df = pd.DataFrame(
                columns=(
                    [sequence_col, *index_cols]
                    if index_cols else [sequence_col, protein_col]
                )
            )
        super().__init__(
            name,
            df,
            store_dir,
            overwrite,
            index_cols or [protein_col],
            n_jobs,
            chunk_size,
            protein_col,
            random_state,
            store_format,
            parallel_generator,
        )
        self._sequenceCol = sequence_col
        self.proteinSeqProvider = sequence_provider
        if self.proteinSeqProvider is not None:
            # fetch sequences (and metadata) and write them to the table
            self.getPCMInfo()
        else:
            assert self.sequenceProp in self.getProperties(), (
                f"Sequence column '{self.sequenceProp}' was not found in the data "
                "and no sequence provider was specified."
            )

    def __getstate__(self):
        """Get the state for serialization.

        The sequence provider (if set) is replaced by its string
        form as produced by `function_as_string`.
        """
        o_dict = super().__getstate__()
        if self.proteinSeqProvider:
            o_dict["proteinSeqProvider"] = function_as_string(self.proteinSeqProvider)
        return o_dict

    def __setstate__(self, state):
        """Restore the state and try to recreate the sequence provider.

        If the provider was serialized as a string and cannot be recreated
        with `function_from_string`, a warning is logged and the provider
        is set to `None` instead of failing.
        """
        super().__setstate__(state)
        if self.proteinSeqProvider and isinstance(self.proteinSeqProvider, str):
            try:
                self.proteinSeqProvider = function_from_string(self.proteinSeqProvider)
            except Exception as e:
                # best effort: the storage stays usable without the provider
                logger.warning(
                    "Failed to load protein sequence provider from metadata. "
                    f"The function object could not be recreated from the code. "
                    f"\nError: {e}"
                    f"\nDeserialized Code: {self.proteinSeqProvider}"
                    f"\nSetting protein sequence provider to `None` for now."
                )
                self.proteinSeqProvider = None

    def getPCMInfo(self) -> tuple[dict[str, str], dict]:
        """Return a dictionary of protein sequences for the proteins
        in the data frame and the additional metadata separately.

        If a sequence provider is set, it is called with the set of protein
        ids in the store and the returned sequences and properties
        are also written to the table. Otherwise, the information
        is read from the table itself.

        Returns:
            tuple[dict[str, str], dict]:
                - dictionary mapping protein ids to sequences
                - additional metadata: as returned by the provider if one
                  is set, otherwise a dictionary mapping each remaining
                  property name to its values in the table

        Raises:
            AssertionError: if the provider did not return sequences for
                exactly the proteins in the store
        """
        if self.proteinSeqProvider is None:
            return dict(
                zip(self.getProperty(self.idProp), self.getProperty(self.sequenceProp))
            ), {  # return all remaining props as metadata
                prop: self.getProperty(prop)
                for prop in self.getProperties()
                if prop not in [self.idProp, self.sequenceProp]
            }
        # read the ids once; hand the provider its own copy of the set
        protein_ids = set(self.getProperty(self.idProp))
        mapping, props = self.proteinSeqProvider(set(protein_ids))
        assert set(mapping.keys()) == protein_ids, (
            "Protein sequence provider did not return sequences "
            "for all proteins. Could"
            " not get sequences for the following proteins: "
            f"{protein_ids - set(mapping.keys())}"
        )
        for protein_id in mapping:
            self.addProperty(self.sequenceProp, [mapping[protein_id]], [protein_id])
            if props:
                # tolerate providers that return metadata only for some proteins
                protein_props = props.get(protein_id) or {}
                for prop in protein_props:
                    self.addProperty(prop, [protein_props[prop]], [protein_id])
        return mapping, props

    @property
    def sequenceProp(self) -> str:
        """Get the name of the property that contains all protein sequences."""
        return self._sequenceCol

    def add_protein(self, protein: TabularProtein, raise_on_existing=True):
        """Add a protein to the store.

        The properties of the protein are added as columns. The sequence
        of the protein (if set) is stored under `sequenceProp`, unless the
        properties already contain a value for it.

        Args:
            protein (TabularProtein): protein to add
            raise_on_existing (bool):
                raise an exception if the protein already exists in the store
        """
        # a protein created without properties has `props` set to `None`
        entry = {prop: [val] for prop, val in (protein.props or {}).items()}
        if protein.sequence is not None:
            # explicit value in props wins; otherwise keep the sequence
            entry.setdefault(self.sequenceProp, [protein.sequence])
        self.addEntries([protein.id], entry, raise_on_existing)

    def _make_proteins_from_chunk(self, df: pd.DataFrame) -> list[TabularProtein]:
        """Create a list of proteins from a chunk of the data frame.

        Every column other than the id and sequence columns
        ends up in the `props` of the created proteins.

        Args:
            df (pd.DataFrame): chunk of the data frame

        Returns:
            list[TabularProtein]: list of proteins, one per row
        """
        ids = df[self.idProp].values
        sequences = df[self.sequenceProp].values
        # extract each property column once instead of once per row
        prop_columns = {
            prop: df[prop].values
            for prop in df.columns.difference([self.idProp, self.sequenceProp])
        }
        return [
            TabularProtein(
                protein_id=protein_id,
                sequence=sequence,
                props={prop: values[i] for prop, values in prop_columns.items()},
            )
            for i, (protein_id, sequence) in enumerate(zip(ids, sequences))
        ]

    @property
    def proteins(self) -> list[TabularProtein]:
        """Get all proteins in the store.

        Returns:
            list[TabularProtein]: list of proteins
        """
        ret = []
        # chunk size equals the table length
        for chunk in self.iterChunks(len(self)):
            ret.extend(self._make_proteins_from_chunk(chunk))
        return ret

    def getProtein(self, protein_id: str) -> TabularProtein:
        """Get a protein from the store using its name.

        Args:
            protein_id (str): name of the protein to search

        Returns:
            TabularProtein: the first protein with a matching id

        Raises:
            ValueError: if the protein is not found
        """
        df = self.getDF()
        protein = df[df[self.idProp] == protein_id]
        if protein.empty:
            raise ValueError(f"Protein {protein_id} not found.")
        return self._make_proteins_from_chunk(protein)[0]