# Source code for qsprpred.data.tables.descriptor

import pandas as pd

from qsprpred.data.descriptors.sets import DescriptorSet
from qsprpred.data.tables.pnds import PandasDataTable
from qsprpred.utils.parallel import ParallelGenerator


class DescriptorTable(PandasDataTable):
    """Pandas table that holds descriptor data for modelling and other analyses.

    Attributes:
        calculator (DescriptorSet):
            `DescriptorSet` used for descriptor calculation.
    """

    def __init__(
        self,
        calculator: DescriptorSet,
        name: str,
        df: pd.DataFrame | None = None,
        store_dir: str = ".",
        overwrite: bool = False,
        index_cols: list[str] | None = None,
        n_jobs: int = 1,
        chunk_size: int | None = None,
        autoindex_name: str | None = None,
        random_state: int | None = None,
        store_format: str = "pkl",
        parallel_generator: ParallelGenerator | None = None,
    ):
        """Initialize a `DescriptorTable` object.

        Args:
            calculator (DescriptorSet):
                `DescriptorSet` used for descriptor calculation.
            name (str):
                Name of the new descriptor table.
            df (pd.DataFrame):
                Data frame containing the descriptors. If you provide a
                data frame for a dataset that already exists on disk, the
                data frame from disk will override the supplied data frame.
                Set `overwrite` to `True` to override the data frame on disk.
            store_dir (str):
                Directory to store the dataset files. Defaults to the
                current directory. If it already contains files with the
                same name, the existing data will be loaded.
            overwrite (bool):
                Overwrite existing dataset.
            index_cols (list):
                List of columns to use as index. If `None`, the index will
                be a custom generated ID.
            n_jobs (int):
                Number of jobs to use for parallel processing. If <= 0,
                all available cores will be used.
            chunk_size (int):
                Size of chunks to use per job in parallel processing.
            autoindex_name (str):
                Column name to use for automatically generated IDs.
            random_state (int):
                Random state to use for shuffling and other random ops.
            store_format (str):
                Format to use for storing the data ('pkl' or 'csv').
            parallel_generator (ParallelGenerator):
                Generator to use for parallel processing. If `None`, a new
                generator will be created.
        """
        super().__init__(
            name,
            df,
            store_dir,
            overwrite,
            index_cols,
            n_jobs,
            chunk_size,
            autoindex_name,
            random_state,
            store_format,
            parallel_generator,
        )
        self.calculator = calculator

    def getSubset(
        self,
        properties: list[str],
        ids: list[str] | None = None,
        name: str | None = None,
        path: str | None = None,
        ignore_missing: bool = False,
    ) -> "DescriptorTable":
        """Get a subset of the descriptor table.

        Args:
            properties (list):
                List of properties to include in the subset.
            ids (list, optional):
                List of IDs to include in the subset.
            name (str, optional):
                Name of the new descriptor table.
            path (str, optional):
                Path to store the new descriptor table.
            ignore_missing (bool, optional):
                Whether to ignore missing IDs.

        Returns:
            DescriptorTable: The subset of the descriptor table.
        """
        subset = super().getSubset(properties, ids, name, path, ignore_missing)
        # The parent implementation does not know about the calculator, so it
        # is attached here and the returned object is re-tagged as a
        # `DescriptorTable` (presumably the parent builds a plain
        # `PandasDataTable` -- confirm against the base class).
        subset.calculator = self.calculator
        subset.__class__ = DescriptorTable
        return subset

    def getDescriptors(self, active_only: bool = True) -> pd.DataFrame:
        """Get the descriptors stored in this table.

        Args:
            active_only (bool):
                Whether to return only active descriptors.

        Returns:
            pd.DataFrame: The descriptors.
        """
        return self.df[self.getDescriptorNames(active_only=active_only)]

    def getDescriptorNames(self, active_only: bool = True) -> list[str]:
        """Get the names of the descriptors represented by this table.

        By default, only active descriptors are returned. Use
        `active_only=False` to get all descriptors saved in the table.

        Args:
            active_only (bool):
                Whether to return only descriptors that are active in the
                current descriptor set. Defaults to `True`.

        Returns:
            list[str]: List of descriptor names.
        """
        if active_only:
            return self.calculator.transformToFeatureNames()
        # Every column that is not an index column counts as a descriptor.
        columns = self.df.columns
        return columns[~columns.isin(self.indexCols)].tolist()

    def fillMissing(self, fill_value: float, names: list[str] | None = None):
        """Fill missing values in the descriptor table.

        Args:
            fill_value (float):
                Value to fill missing values with.
            names (list):
                List of descriptor names to fill. If `None` or empty, all
                active descriptors are filled.
        """
        columns = names if names else self.getDescriptorNames()
        self.df[columns] = self.df[columns].fillna(fill_value)

    def keepDescriptors(self, descriptors: list[str]) -> list[str]:
        """Mark only the given descriptors as active in this set.

        Only descriptors that are currently active can be kept. Names in
        `descriptors` that are not columns of this table are silently
        ignored; no exception is raised for them.

        Args:
            descriptors (list):
                List of descriptor names to keep.

        Returns:
            list[str]: List of descriptor names that are active after the
            update.
        """
        all_descs = self.getDescriptorNames(active_only=False)
        to_keep = set(all_descs) & set(descriptors)
        prefix = f"{self.calculator}_"
        # The calculator stores bare descriptor names, so the leading
        # "<calculator>_" prefix is stripped. `removeprefix` only touches a
        # leading match, whereas `replace(prefix, "", 1)` could cut the
        # prefix text out of the middle of a name.
        self.calculator.descriptors = [
            feature.removeprefix(prefix)
            for feature in self.calculator.transformToFeatureNames()
            if feature in to_keep
        ]
        return self.getDescriptorNames()

    def restoreDescriptors(self) -> list[str]:
        """Restore all descriptors to active in this set.

        Every non-index column of the table becomes an active descriptor
        of the calculator again.

        Returns:
            list[str]: List of all active descriptor names.
        """
        all_descs = self.getDescriptorNames(active_only=False)
        prefix = f"{self.calculator}_"
        # Strip only a leading "<calculator>_" prefix (see `keepDescriptors`).
        self.calculator.descriptors = [
            column.removeprefix(prefix) for column in all_descs
        ]
        return self.getDescriptorNames()