Source code for qsprpred.data.tables.mol

import os
import pickle
from multiprocessing import Pool
from typing import Optional, ClassVar, Generator, Literal, Callable, Any

import numpy as np
import pandas as pd
from rdkit import Chem

from qsprpred.data.tables.searchable import SearchableMolTable
from .pandas import PandasDataTable
from ..chem.matching import match_mol_to_smarts
from ..descriptors.sets import DescriptorSet
from ..processing.mol_processor import MolProcessor
from ...data.chem.scaffolds import Scaffold
from ...data.chem.standardization import (
    CheckSmilesValid,
    chembl_smi_standardizer,
    old_standardize_sanitize,
)
from ...logs import logger
from ...utils.interfaces.summarizable import Summarizable


[docs]class DescriptorTable(PandasDataTable):
    """Pandas table that holds descriptor data for modelling and other analyses.

    Attributes:
        calculator (DescriptorSet):
            `DescriptorSet` used for descriptor calculation.
    """

    def __init__(
            self,
            calculator: DescriptorSet,
            name: str,
            df: Optional[pd.DataFrame] = None,
            store_dir: str = ".",
            overwrite: bool = False,
            key_cols: list | None = None,
            n_jobs: int = 1,
            chunk_size: int = 1000,
            autoindex_name: str = "QSPRID",
            random_state: int | None = None,
            store_format: str = "pkl",
    ):
        """Initialize a `DescriptorTable` object.

        Args:
            calculator (DescriptorSet):
                `DescriptorSet` used for descriptor calculation.
            name (str):
                Name of the  new  descriptor table.
            df (pd.DataFrame):
                data frame containing the descriptors. If you provide a
                dataframe for a dataset that already exists on disk,
                the dataframe from disk will override the supplied data
                frame. Set 'overwrite' to `True` to override
                the data frame on disk.
            store_dir (str):
                Directory to store the dataset files. Defaults to the
                current directory. If it already contains files with the same name,
                the existing data will be loaded.
            overwrite (bool):
                Overwrite existing dataset.
            key_cols (list):
                list of columns to use as index. If None, the index
                will be a custom generated ID.
            n_jobs (int):
                Number of jobs to use for parallel processing. If <= 0,
                all available cores will be used.
            chunk_size (int):
                Size of chunks to use per job in parallel processing.
            autoindex_name (str):
                Column name to use for automatically generated IDs.
            random_state (int):
                Random state to use for shuffling and other random ops.
            store_format (str):
                Format to use for storing the data ('pkl' or 'csv').
        """
        super().__init__(
            name,
            df,
            store_dir,
            overwrite,
            key_cols,
            n_jobs,
            chunk_size,
            autoindex_name,
            random_state,
            store_format,
        )
        self.calculator = calculator

[docs]    def getDescriptors(self, active_only=True):
        """Get the descriptors stored in this table."""
        return self.df[self.getDescriptorNames(active_only=active_only)]

[docs]    def getDescriptorNames(self, active_only=True):
        """Get the names of the descriptors in this represented by this table.
        By default, only active descriptors are returned. You can use active_only=False
        to get all descriptors saved in the table.

        Args:
            active_only (bool): Whether to return only descriptors that are active in
                the current descriptor set. Defaults to `True`.

        """
        if active_only:
            return self.calculator.transformToFeatureNames()
        else:
            return self.df.columns[~self.df.columns.isin(self.indexCols)].tolist()

[docs]    def fillMissing(self, fill_value, names):
        """Fill missing values in the descriptor table.

        Args:
            fill_value (float): Value to fill missing values with.
            names (list): List of descriptor names to fill. If `None`, all descriptors
                are filled.
        """
        columns = names if names else self.getDescriptorNames()
        self.df[columns] = self.df[columns].fillna(fill_value)

[docs]    def keepDescriptors(self, descriptors: list[str]) -> list[str]:
        """Mark only the given descriptors as active in this set.

        Args:
            descriptors (list): list of descriptor names to keep

        Returns:
            list[str]: list of descriptor names that were kept

        Raises:
            ValueError: If any of the descriptors are not present in the table.
        """
        all_descs = self.getDescriptorNames(active_only=False)
        to_keep = set(all_descs) & set(descriptors)
        prefix = str(self.calculator) + "_"
        self.calculator.descriptors = [
            x.replace(prefix, "", 1)  # remove prefix
            for x in self.calculator.transformToFeatureNames()
            if x in to_keep
        ]
        return self.getDescriptorNames()

[docs]    def restoreDescriptors(self) -> list[str]:
        """Restore all descriptors to active in this set."""
        all_descs = self.getDescriptorNames(active_only=False)
        prefix = str(self.calculator) + "_"
        self.calculator.descriptors = [
            x.replace(prefix, "", 1) for x in all_descs  # remove prefix
        ]
        return self.getDescriptorNames()


[docs]class MoleculeTable(PandasDataTable, SearchableMolTable, Summarizable):
    """Class that holds and prepares molecule data for modelling and other analyses.

    Attributes:
        smilesCol (str):
            Name of the column containing the SMILES sequences
            of molecules.
        includesRdkit (bool):
            Whether the data frame contains RDKit molecules as one of
            the properties.
        descriptors (list[DescriptorTable]):
            List of `DescriptorTable` objects containing the descriptors
            calculated for this table.
    """

    _notJSON: ClassVar = PandasDataTable._notJSON + ["descriptors"]

    def __init__(
            self,
            name: str,
            df: Optional[pd.DataFrame] = None,
            smiles_col: str = "SMILES",
            add_rdkit: bool = False,
            store_dir: str = ".",
            overwrite: bool = False,
            n_jobs: int | None = 1,
            chunk_size: int | None = None,
            drop_invalids: bool = True,
            index_cols: Optional[list[str]] = None,
            autoindex_name: str = "QSPRID",
            random_state: int | None = None,
            store_format: str = "pkl",
    ):
        """Initialize a `MoleculeTable` object.

        This object wraps a pandas dataframe and provides short-hand methods to prepare
        molecule data for modelling and analysis.

        Args:
            name (str): Name of the dataset. You can use this name to load the dataset
                from disk anytime and create a new instance.
            df (pd.DataFrame): Pandas dataframe containing the data. If you provide a
                dataframe for a dataset that already exists on disk,
            the dataframe from disk will override the supplied data frame. Set
                'overwrite' to `True` to override the data frame on disk.
            smiles_col (str): Name of the column containing the SMILES sequences
                of molecules.
            add_rdkit (bool): Add RDKit molecule instances to the dataframe.
                WARNING: This can take a lot of memory.
            store_dir (str): Directory to store the dataset files. Defaults to the
                current directory. If it already contains files with the same name,
                the existing data will be loaded.
            overwrite (bool): Overwrite existing dataset.
            n_jobs (int): Number of jobs to use for parallel processing. If <= 0, all
                available cores will be used.
            chunk_size (int): Size of chunks to use per job in parallel processing.
            drop_invalids (bool): Drop invalid molecules from the data frame.
            index_cols (list[str]): list of columns to use as index. If None, the index
                will be a custom generated ID.
            autoindex_name (str): Column name to use for automatically generated IDs.
            random_state (int): Random state to use for shuffling and other random ops.
            store_format (str): Format to use for storing the data ('pkl' or 'csv').
        """
        super().__init__(
            name,
            df,
            store_dir,
            overwrite,
            index_cols,
            n_jobs,
            chunk_size,
            autoindex_name,
            random_state,
            store_format,
        )
        # the descriptors
        self.descriptors = []
        # settings
        self.smilesCol = smiles_col
        self.includesRdkit = add_rdkit
        # drop invalid columns
        self.invalidsRemoved = False
        if drop_invalids:
            self.dropInvalids()
            # update chunk size if count changed
            self.chunkSize = chunk_size
            # label invalids removed
            self.invalidsRemoved = True
        # add rdkit molecules if requested
        if self.includesRdkit and "RDMol" not in self.df.columns:
            from rdkit.Chem import PandasTools
            PandasTools.AddMoleculeColumnToFrame(
                self.df,
                smilesCol=self.smilesCol,
                molCol="RDMol",
                includeFingerprints=False,
            )
            self.includesRdkit = True

[docs]    def searchWithIndex(
            self, index: pd.Index, name: str | None = None
    ) -> "MoleculeTable":
        """Search in this table using a pandas index. The return values
        is a new table with the molecules from the old table with the given indices.

        Args:
            index(pd.Index):
                Indices to search for in this table.
            name(str):
                Name of the new table. Defaults to the name of the old table,
                plus the `_searched` suffix.

        Returns:
            MoleculeTable:
                A new table with the molecules from the
                old table with the given indices.
        """
        name = f"{self.name}_searched" if name is None else name
        ret = MoleculeTable(
            name=name,
            df=self.df.loc[index, :],
            smiles_col=self.smilesCol,
            add_rdkit=False,
            store_dir=self.storeDir,
            overwrite=True,
            n_jobs=self.nJobs,
            chunk_size=self.chunkSize,
            drop_invalids=False,
            index_cols=self.indexCols,
            random_state=self.randomState,
            store_format=self.storeFormat,
        )
        for table in self.descriptors:
            ret.descriptors.append(
                DescriptorTable(
                    table.calculator,
                    name=ret.generateDescriptorDataSetName(table.calculator),
                    df=table.getDF().loc[index, :],
                    store_dir=table.storeDir,
                    overwrite=True,
                    key_cols=table.indexCols,
                    n_jobs=table.nJobs,
                    chunk_size=table.chunkSize,
                    store_format=table.storeFormat,
                    random_state=table.randomState,
                )
            )
        return ret

[docs]    def searchOnProperty(
            self, prop_name: str, values: list[str], name: str | None = None,
            exact=False
    ) -> "MoleculeTable":
        """Search in this table using a property name and a list of values. It is
        assumed that the property is searchable with string matching. Either an
        exact match or a partial match can be used. If 'exact' is `False`, the
        search will be performed with partial matching, i.e. all molecules that
        contain any of the given values in the property will be returned. If
        'exact' is `True`, only molecules that have the exact property value for
        any of the given values will be returned.

        Args:
            prop_name (str):
                Name of the property to search on.
            values (list[str]):
                List of values to search for. If any of the values is found in the
                property, the molecule will be considered a match.
            name (str | None, optional):
                Name of the new table. Defaults to the name of
                the old table, plus the `_searched` suffix.
            exact (bool, optional):
                Whether to use exact matching, i.e. whether to
                search for exact strings or just substrings. Defaults to False.

        Returns:
            MoleculeTable:
                A new table with the molecules from the
                old table with the given property values.
        """
        mask = [False] * len(self.df)
        for value in values:
            mask = (
                mask | (self.df[prop_name].str.contains(value))
                if not exact
                else mask | (self.df[prop_name] == value)
            )
        matches = self.df.index[mask]
        return self.searchWithIndex(matches, name)

[docs]    def searchWithSMARTS(
            self,
            patterns: list[str],
            operator: Literal["or", "and"] = "or",
            use_chirality: bool = False,
            name: str | None = None,
            match_function: Callable = match_mol_to_smarts,
    ) -> "MoleculeTable":
        """Search the molecules in the table with a SMARTS pattern.

        Args:
            patterns:
                List of SMARTS patterns to search with.
            operator (object):
                Whether to use an "or" or "and" operator on patterns. Defaults to "or".
            use_chirality:
                Whether to use chirality in the search.
            name:
                Name of the new table. Defaults to the name of the old table,
                plus the `smarts_searched` suffix.
            match_function:
                Function to use for matching the molecules to the SMARTS patterns.
                Defaults to `match_mol_to_smarts`.

        Returns:
            (MolTable): A dataframe with the molecules that match the pattern.
        """
        matches = self.df.index[
            self.df[self.smilesCol].apply(
                lambda x: match_function(
                    x, patterns, operator=operator, use_chirality=use_chirality
                )
            )
        ]
        return self.searchWithIndex(
            matches, name=f"{self.name}_smarts_searched" if name is None else name
        )

[docs]    def getSummary(self):
        """
        Make a summary with some statistics about the molecules in this table.
        The summary contains the number of molecules per target and the number of
        unique molecules per target.

        Requires this data set to be imported from Papyrus for now.

        Returns:
            (pd.DataFrame): A dataframe with the summary statistics.

        """
        summary = {
            "mols_per_target": self.df.groupby("accession")
            .count()["InChIKey"]
            .to_dict(),
            "mols_per_target_unique": self.df.groupby("accession")
            .aggregate(lambda x: len(set(x)))["InChIKey"]
            .to_dict(),
        }
        return pd.DataFrame(summary)

[docs]    def sample(
            self, n: int, name: str | None = None, random_state: int | None = None
    ) -> "MoleculeTable":
        """
        Sample n molecules from the table.

        Args:
            n (int):
                Number of molecules to sample.
            name (str):
                Name of the new table. Defaults to the name of the old
                table, plus the `_sampled` suffix.
            random_state (int):
                Random state to use for shuffling and other random ops.

        Returns:
            (MoleculeTable): A dataframe with the sampled molecules.
        """
        random_state = random_state or self.randomState
        name = f"{self.name}_sampled" if name is None else name
        index = self.df.sample(n=n, random_state=random_state).index
        return self.searchWithIndex(index, name=name)

    def __getstate__(self):
        o_dict = super().__getstate__()
        o_dict["descriptors"] = []
        for desc in self.descriptors:
            o_dict["descriptors"].append(os.path.basename(desc.storeDir))
        return o_dict

    def __setstate__(self, state):
        super().__setstate__(state)
        self.descriptors = []
        for desc in state["descriptors"]:
            desc = os.path.join(self.storeDir, desc, f"{desc}_meta.json")
            self.descriptors.append(DescriptorTable.fromFile(desc))

[docs]    def toFile(self, filename: str):
        ret = super().toFile(filename)
        for desc in self.descriptors:
            desc.save()
        return ret

    @property
    def descriptorSets(self):
        """Get the descriptor calculators for this table."""
        return [x.calculator for x in self.descriptors]

[docs]    @staticmethod
    def fromSMILES(name: str, smiles: list, *args, **kwargs):
        """Create a `MoleculeTable` instance from a list of SMILES sequences.

        Args:
            name (str): Name of the data set.
            smiles (list): list of SMILES sequences.
            *args: Additional arguments to pass to the `MoleculeTable` constructor.
            **kwargs: Additional keyword arguments to pass to the `MoleculeTable`
                constructor.
        """
        smilescol = "SMILES"
        df = pd.DataFrame({smilescol: smiles})
        return MoleculeTable(name, df, *args, smiles_col=smilescol, **kwargs)

[docs]    @staticmethod
    def fromTableFile(name: str, filename: str, sep="\t", *args, **kwargs):
        """Create a `MoleculeTable` instance from a file containing a table of molecules
        (i.e. a CSV file).

        Args:
            name (str): Name of the data set.
            filename (str): Path to the file containing the table.
            sep (str): Separator used in the file for different columns.
            *args: Additional arguments to pass to the `MoleculeTable` constructor.
            **kwargs: Additional keyword arguments to pass to the `MoleculeTable`
                constructor.
        """
        return MoleculeTable(name, pd.read_table(filename, sep=sep), *args, **kwargs)

[docs]    @staticmethod
    def fromSDF(name, filename, smiles_prop, *args, **kwargs):
        """Create a `MoleculeTable` instance from an SDF file.

        Args:
            name (str): Name of the data set.
            filename (str): Path to the SDF file.
            smiles_prop (str): Name of the property in the SDF file containing the
                SMILES sequence.
            *args: Additional arguments to pass to the `MoleculeTable` constructor.
            **kwargs: Additional keyword arguments to pass to the `MoleculeTable`
                constructor.
        """
        # FIXME: the RDKit mols are always added here, which might be unnecessary
        return MoleculeTable(
            name,
            PandasTools.LoadSDF(filename, molColName="RDMol"),
            smiles_col=smiles_prop,
            *args,  # noqa: B026 # FIXME: this is a bug in flake8...
            **kwargs,
        )

[docs]    @classmethod
    def runMolProcess(
            cls,
            props: dict[str, list] | pd.DataFrame,
            func: MolProcessor,
            add_rdkit: bool,
            smiles_col: str,
            *args,
            **kwargs,
    ):
        """A helper method to run a `MolProcessor` on a list of molecules via `apply`.
        It converts the SMILES to RDKit molecules if required and then applies the
        function to the `MolProcessor` object.

        Args:
            props (dict):
                Dictionary of properties that will be passed in addition to the
                molecule structure.
            func (MolProcessor):
                `MolProcessor` object to use for processing.
            add_rdkit (bool):
                Whether to convert the SMILES to RDKit molecules before
                applying the function.
            smiles_col (str):
                Name of the property containing the SMILES sequences.
            *args:
                Additional positional arguments to pass to the function.
            **kwargs:
                Additional keyword arguments to pass to the function.
        """
        if add_rdkit:
            mols = (
                props["RDMol"]
                if "RDMol" in props
                else [Chem.MolFromSmiles(x) for x in props[smiles_col]]
            )
        else:
            mols = props[smiles_col]
        return func(mols, props, *args, **kwargs)

[docs]    def processMols(
            self,
            processor: MolProcessor,
            proc_args: tuple[Any] | None = None,
            proc_kwargs: dict[str, Any] | None = None,
            add_props: list[str] | None = None,
            as_rdkit: bool = False,
            chunk_size: int | None = None,
            n_jobs: int | None = None,
    ) -> Generator:
        """Apply a function to the molecules in the data frame.
        The SMILES  or an RDKit molecule will be supplied as the first
        positional argument to the function. Additional properties
        to provide from the data set can be specified with 'add_props', which will be
        a dictionary supplied as an additional positional argument to the function.

        IMPORTANT: For successful parallel processing, the processor must be picklable.
        Also note that
        the returned generator will produce results as soon as they are ready,
        which means that the chunks of data will
        not be in the same order as the original data frame. However, you can pass the
        value of `idProp` in `add_props` to identify the processed molecules.
        See `CheckSmilesValid` for an example.

        Args:
            processor (MolProcessor):
                `MolProcessor` object to use for processing.
            proc_args (list, optional):
                Any additional positional arguments to pass to the processor.
            proc_kwargs (dict, optional):
                Any additional keyword arguments to pass to the processor.
            add_props (list, optional):
                List of data set properties to send to the processor. If `None`, all
                properties will be sent.
            as_rdkit (bool, optional):
                Whether to convert the molecules to RDKit
                molecules before applying the processor.
            chunk_size (int, optional):
                Size of chunks to use per job in parallel.
                If not specified, `self.chunkSize` is used.
            n_jobs (int, optional):
                Number of jobs to use for parallel processing.
                If not specified, `self.nJobs` is used.

        Returns:
            Generator:
                A generator that yields the results of the supplied processor on
                the chunked molecules from the data set.
        """
        chunk_size = chunk_size or self.chunkSize
        n_jobs = n_jobs or self.nJobs
        proc_args = proc_args or ()
        proc_kwargs = proc_kwargs or {}
        add_props = add_props or self.df.columns.tolist()
        if add_props is not None and self.smilesCol not in add_props:
            add_props.append(self.smilesCol)
            add_props.append(self.idProp)
        for prop in processor.requiredProps:
            if prop not in self.df.columns:
                raise ValueError(
                    f"Cannot apply function '{processor}' to {self.name} because "
                    f"it requires the property '{prop}', which is not present in the "
                    "data set."
                )
            if prop not in add_props:
                add_props.append(prop)
        if self.nJobs > 1 and processor.supportsParallel:
            logger.debug(
                f"Applying processor '{processor}' to '{self.name}' in parallel."
            )
            for result in self.apply(
                    self.runMolProcess,
                    func_args=[processor, as_rdkit, self.smilesCol, *proc_args],
                    func_kwargs=proc_kwargs,
                    on_props=add_props,
                    as_df=False,
                    chunk_size=chunk_size,
                    n_jobs=n_jobs,
            ):
                yield result
        else:
            logger.debug(
                f"Applying processor '{processor}' to '{self.name}' in serial."
            )
            for result in self.iterChunks(
                    include_props=add_props, as_dict=True, chunk_size=len(self)
            ):
                yield self.runMolProcess(
                    result,
                    processor,
                    as_rdkit,
                    self.smilesCol,
                    *proc_args,
                    **proc_kwargs,
                )

[docs]    def checkMols(self, throw: bool = True):
        """
        Returns a boolean array indicating whether each molecule is valid or not.
        If `throw` is `True`, an exception is thrown if any molecule is invalid.

        Args:
            throw (bool): Whether to throw an exception if any molecule is invalid.

        Returns:
            mask (pd.Series): Boolean series indicating whether each molecule is valid.
        """
        mask = pd.Series([False] * len(self), index=self.df.index, dtype=bool)
        for result in self.processMols(
                CheckSmilesValid(id_prop=self.idProp), proc_kwargs={"throw": throw}
        ):
            mask.loc[result.index] = result.values
        return mask

[docs]    def generateDescriptorDataSetName(self, ds_set: str | DescriptorSet):
        """Generate a descriptor set name from a descriptor set."""
        return f"Descriptors_{self.name}_{ds_set}"

[docs]    def dropDescriptors(self, descriptors: list[str]):
        """Drop descriptors by name. Performs a simple feature selection by removing
        the given descriptor names from the data set.

        Args:
            descriptors (list[str]): List of descriptor names to drop.
        """
        for ds in self.descriptors:
            calc = ds.calculator
            ds_names = calc.transformToFeatureNames()
            to_keep = [x for x in ds_names if x not in descriptors]
            ds.keepDescriptors(to_keep)

[docs]    def dropDescriptorSets(
            self,
            descriptors: list[DescriptorSet | str],
            full_removal: bool = False,
    ):
        """
        Drop descriptors from the given sets from the data frame.

        Args:
            descriptors (list[DescriptorSet | str]):
                List of `DescriptorSet` objects or their names. Name of a descriptor
                set corresponds to the result returned by its `__str__` method.
            full_removal (bool):
                Whether to remove the descriptor data (will perform full removal).
                By default, a soft removal is performed by just rendering the
                descriptors inactive. A full removal will remove the descriptorSet from the
                dataset, including the saved files. It is not possible to restore a
                descriptorSet after a full removal.
        """
        # sanity check
        assert (
                len(self.descriptors) != 0
        ), "Cannot drop descriptors because the data set does not contain any."
        if len(descriptors) == 0:
            logger.warning(
                "No descriptors specified to drop. All descriptors will be retained."
            )
            return
        if not isinstance(descriptors[0], str):
            descriptors = [str(x) for x in descriptors]
        # remove the descriptors
        to_remove = []
        to_drop = []
        for name in descriptors:
            for idx, ds in enumerate(self.descriptors):
                calc = ds.calculator
                if name == str(calc):
                    to_drop.extend(ds.getDescriptorNames())
                    if full_removal:
                        to_remove.append(idx)
        self.dropDescriptors(to_drop)
        for idx in reversed(to_remove):
            self.descriptors[idx].clearFiles()
            self.descriptors.pop(idx)

[docs]    def restoreDescriptorSets(self, descriptors: list[DescriptorSet | str]):
        """Restore descriptors that were previously removed.

        Args:
            descriptors (list[DescriptorSet | str]):
                List of `DescriptorSet` objects or their names. Name of a descriptor
                set corresponds to the result returned by its `__str__` method.
        """
        if not isinstance(descriptors[0], str):
            descriptors = [str(x) for x in descriptors]
        for name in descriptors:
            for ds in self.descriptors:
                calc = ds.calculator
                if name == str(calc):
                    ds.restoreDescriptors()

[docs]    def dropEmptySmiles(self):
        """Drop rows with empty SMILES from the data set."""
        self.df.dropna(subset=[self.smilesCol], inplace=True)

[docs]    def attachDescriptors(
            self,
            calculator: DescriptorSet,
            descriptors: pd.DataFrame,
            index_cols: list,
    ):
        """Attach descriptors to the data frame.

        Args:
            calculator (DescriptorsCalculator): DescriptorsCalculator object to use for
                descriptor calculation.
            descriptors (pd.DataFrame): DataFrame containing the descriptors to attach.
            index_cols (list): List of column names to use as index.
        """
        self.descriptors.append(
            DescriptorTable(
                calculator,
                self.generateDescriptorDataSetName(calculator),
                descriptors,
                store_dir=self.storeDir,
                n_jobs=self.nJobs,
                overwrite=True,
                key_cols=index_cols,
                chunk_size=self.chunkSize,
                random_state=self.randomState,
                store_format=self.storeFormat,
            )
        )

[docs]    def addDescriptors(
            self,
            descriptors: list[DescriptorSet],
            recalculate: bool = False,
            fail_on_invalid: bool = True,
            *args,
            **kwargs,
    ):
        """Add descriptors to the data frame with the given descriptor calculators.

        Args:
            descriptors (list[DescriptorSet]):
                List of `DescriptorSet` objects to use for descriptor
                calculation.
            recalculate (bool):
                Whether to recalculate descriptors even if they are
                already present in the data frame. If `False`, existing descriptors are
                kept and no calculation takes place.
            fail_on_invalid (bool):
                Whether to throw an exception if any molecule
                is invalid.
            *args:
                Additional positional arguments to pass to each descriptor set.
            **kwargs:
                Additional keyword arguments to pass to each descriptor set.
        """
        if recalculate and self.hasDescriptors():
            self.dropDescriptorSets(descriptors, full_removal=True)
        to_calculate = []
        for desc_set, exists in zip(descriptors, self.hasDescriptors(descriptors)):
            if exists:
                logger.warning(
                    f"Molecular descriptors already exist in {self.name}. "
                    "Calculation will be skipped. "
                    "Use `recalculate=True` to overwrite them."
                )
            else:
                to_calculate.append(desc_set)
        # check for invalid molecules if required
        if fail_on_invalid:
            try:
                self.checkMols(throw=True)
            except Exception as exp:
                logger.error(
                    f"Cannot add descriptors to {self.name} because it contains one or "
                    "more invalid molecules. Remove the invalid molecules from your "
                    "data or try to standardize the data set first with "
                    "'standardizeSmiles()'. You can also pass 'fail_on_invalid=False' "
                    "to remove this exception, but the calculation might not be "
                    "successful or correct. See the following list of invalid molecule "
                    "SMILES for more information:"
                )
                logger.error(
                    self.df[~self.checkMols(throw=False)][self.smilesCol].to_numpy()
                )
                raise exp
        # get the data frame with the descriptors
        # and attach it to this table as descriptors
        for calculator in to_calculate:
            df_descriptors = []
            for result in self.processMols(
                    calculator, proc_args=args, proc_kwargs=kwargs
            ):
                df_descriptors.append(result)
            df_descriptors = pd.concat(df_descriptors, axis=0)
            df_descriptors[self.indexCols] = None
            df_descriptors.loc[self.df.index, self.indexCols] = self.df[self.indexCols]
            self.attachDescriptors(calculator, df_descriptors, [self.idProp])

[docs]    def getDescriptors(self, active_only=False):
        """Get the calculated descriptors as a pandas data frame.

        Returns:
            pd.DataFrame: Data frame containing only descriptors.
        """
        ret = pd.DataFrame(index=pd.Index(self.df.index.values, name=self.idProp))
        for descriptors in self.descriptors:
            df_descriptors = descriptors.getDescriptors(active_only=active_only)
            ret = ret.join(df_descriptors, how="left")
        return ret

[docs]    def getDescriptorNames(self):
        """Get the names of the descriptors present for  molecules  in  this data  set.

        Returns:
            list: list of descriptor names.
        """
        names = []
        for ds in self.descriptors:
            names.extend(ds.getDescriptorNames())
        return names

[docs]    def hasDescriptors(
            self, descriptors: list[DescriptorSet | str] | None = None
    ) -> bool | list[bool]:
        """Check whether the data frame contains given descriptors.

        Args:
            descriptors (list): list of `DescriptorSet` objects or prefixes of
                descriptors to check for. If `None`,
                all descriptors are checked for and
                a single boolean is returned if any descriptors are found.

        Returns:
            list: list of booleans indicating whether each descriptor is present or not.
        """
        if not descriptors:
            return len(self.getDescriptorNames()) > 0
        else:
            descriptors = [self.generateDescriptorDataSetName(x) for x in descriptors]
            descriptors_in = [x.name for x in self.descriptors]
            ret = []
            for name in descriptors:
                if name in descriptors_in:
                    ret.append(True)
                else:
                    ret.append(False)
            return ret

    @property
    def smiles(self) -> Generator[str, None, None]:
        """Get the SMILES strings of the molecules in the data frame.

        Returns:
            Generator[str, None, None]: Generator of SMILES strings.
        """
        return iter(self.df[self.smilesCol].values)

[docs]    def addScaffolds(
            self,
            scaffolds: list[Scaffold],
            add_rdkit_scaffold: bool = False,
            recalculate: bool = False,
    ):
        """Add scaffolds to the data frame.

        A new column is created that contains the SMILES of the corresponding scaffold.
        If `add_rdkit_scaffold` is set to `True`, a new column is created that contains
        the RDKit scaffold of the corresponding molecule.

        Args:
            scaffolds (list): list of `Scaffold` calculators.
            add_rdkit_scaffold (bool): Whether to add the RDKit scaffold of the molecule
                as a new column.
            recalculate (bool): Whether to recalculate scaffolds even if they are
                already present in the data frame.
        """
        for scaffold in scaffolds:
            if not recalculate and f"Scaffold_{scaffold}" in self.df.columns:
                continue
            for scaffolds in self.processMols(scaffold):
                self.df.loc[scaffolds.index, f"Scaffold_{scaffold}"] = scaffolds.values
            if add_rdkit_scaffold:
                from rdkit.Chem import PandasTools
                PandasTools.AddMoleculeColumnToFrame(
                    self.df,
                    smilesCol=f"Scaffold_{scaffold}",
                    molCol=f"Scaffold_{scaffold}_RDMol",
                )

[docs]    def getScaffoldNames(
            self, scaffolds: list[Scaffold] | None = None, include_mols: bool = False
    ):
        """Get the names of the scaffolds in the data frame.

        Args:
            include_mols (bool): Whether to include the RDKit scaffold columns as well.


        Returns:
            list: List of scaffold names.
        """
        all_names = [
            col
            for col in self.df.columns
            if col.startswith("Scaffold_")
               and (include_mols or not col.endswith("_RDMol"))
        ]
        if scaffolds:
            wanted = [str(x) for x in scaffolds]
            return [x for x in all_names if x.split("_", 1)[1] in wanted]
        return all_names

[docs]    def getScaffolds(
            self, scaffolds: list[Scaffold] | None = None, include_mols: bool = False
    ):
        """Get the subset of the data frame that contains only scaffolds.

        Args:
            include_mols (bool): Whether to include the RDKit scaffold columns as well.

        Returns:
            pd.DataFrame: Data frame containing only scaffolds.
        """
        names = self.getScaffoldNames(scaffolds, include_mols=include_mols)
        return self.df[names]

    @property
    def hasScaffolds(self):
        """Check whether the data frame contains scaffolds.

        Returns:
            bool: Whether the data frame contains scaffolds.
        """
        return len(self.getScaffoldNames()) > 0

[docs]    def createScaffoldGroups(self, mols_per_group: int = 10):
        """Create scaffold groups.

        A scaffold group is a list of molecules that share the same scaffold. New
        columns are created that contain the scaffold group ID and the scaffold group
        size.

        Args:
            mols_per_group (int): number of molecules per scaffold group.
        """
        scaffolds = self.getScaffolds(include_mols=False)
        for scaffold in scaffolds.columns:
            counts = pd.value_counts(self.df[scaffold])
            mask = counts.lt(mols_per_group)
            name = f"ScaffoldGroup_{scaffold}_{mols_per_group}"
            if name not in self.df.columns:
                self.df[name] = np.where(
                    self.df[scaffold].isin(counts[mask].index),
                    "Other",
                    self.df[scaffold],
                )

[docs]    def getScaffoldGroups(self, scaffold_name: str, mol_per_group: int = 10):
        """Get the scaffold groups for a given combination of scaffold and number of
        molecules per scaffold group.

        Args:
            scaffold_name (str): Name of the scaffold.
            mol_per_group (int): Number of molecules per scaffold group.

        Returns:
            list: list of scaffold groups.
        """
        return self.df[
            self.df.columns[
                self.df.columns.str.startswith(
                    f"ScaffoldGroup_{scaffold_name}_{mol_per_group}"
                )
            ][0]
        ]

    @property
    def hasScaffoldGroups(self):
        """Check whether the data frame contains scaffold groups.

        Returns:
            bool: Whether the data frame contains scaffold groups.
        """
        return (
                len([col for col in self.df.columns if
                     col.startswith("ScaffoldGroup_")])
                > 0
        )

[docs]    def addClusters(
        self,
        clusters: list["MoleculeClusters"],
        recalculate: bool = False,
    ):
        """Add clusters to the data frame.

        A new column is created that contains the identifier of the corresponding
        cluster calculator.

        Args:
            clusters (list): list of `MoleculeClusters` calculators.
            recalculate (bool): Whether to recalculate clusters even if they are
                already present in the data frame.
        """
        for cluster in clusters:
            if not recalculate and f"Cluster_{cluster}" in self.df.columns:
                continue
            for clusters in self.processMols(cluster):
                self.df.loc[clusters.index, f"Cluster_{cluster}"] = clusters.values


[docs]    def getClusterNames(
        self, clusters: list["MoleculeClusters"] | None = None
    ):
        """Get the names of the clusters in the data frame.

        Returns:
            list: List of cluster names.
        """
        all_names = [
            col
            for col in self.df.columns
            if col.startswith("Cluster_")
        ]
        if clusters:
            wanted = [str(x) for x in clusters]
            return [x for x in all_names if x.split("_", 1)[1] in wanted]
        return all_names

[docs]    def getClusters(
        self, clusters: list["MoleculeClusters"] | None = None
    ):
        """Get the subset of the data frame that contains only clusters.

        Returns:
            pd.DataFrame: Data frame containing only clusters.
        """
        names = self.getClusterNames(clusters)
        return self.df[names]

    @property
    def hasClusters(self):
        """Check whether the data frame contains clusters.

        Returns:
            bool: Whether the data frame contains clusters.
        """
        return len(self.getClusterNames()) > 0

[docs]    def standardizeSmiles(self, smiles_standardizer, drop_invalid=True):
        """Apply smiles_standardizer to the compounds in parallel

        Args:
            smiles_standardizer (): either `None` to skip the
                standardization, `chembl`, `old`, or a partial function that reads
                and standardizes smiles.
            drop_invalid (bool): whether to drop invalid SMILES from the data set.
                Defaults to `True`. If `False`, invalid SMILES will be retained in
                their original form. If `self.invalidsRemoved` is `True`, there will be
                no effect even if `drop_invalid` is `True`. Set `self.invalidsRemoved`
                to `False` on this instance to force the removal of invalid SMILES.

        Raises:
            ValueError: when smiles_standardizer is not a callable or one of the
                predefined strings.
        """
        std_jobs = self.nJobs
        if smiles_standardizer is None:
            return
        if callable(smiles_standardizer):
            try:  # Prevents weird error if the user inputs a lambda function
                pickle.dumps(smiles_standardizer)
            except pickle.PicklingError:
                logger.warning("Standardizer is not pickleable. Will set n_jobs to 1")
                std_jobs = 1
            std_func = smiles_standardizer
        elif smiles_standardizer.lower() == "chembl":
            std_func = chembl_smi_standardizer
        elif smiles_standardizer.lower() == "old":
            std_func = old_standardize_sanitize
        else:
            raise ValueError("Standardizer must be either 'chembl', or a callable")
        if std_jobs == 1:
            std_smi = [std_func(smi) for smi in self.df[self.smilesCol].values]
        else:
            with Pool(std_jobs) as pool:
                std_smi = pool.map(std_func, self.df[self.smilesCol].values)
        self.df[self.smilesCol] = std_smi
        if drop_invalid and not self.invalidsRemoved:
            self.dropInvalids()

[docs]    def dropInvalids(self):
        """
        Drops invalid molecules from the data set.

        Returns:
            mask (pd.Series): Boolean mask of invalid molecules in the original
                data set.
        """
        invalid_mask = self.checkMols(throw=False)
        self.df.drop(self.df.index[~invalid_mask], inplace=True)
        invalids = (~invalid_mask).sum()
        if invalids > 0:
            logger.warning(f"Dropped {invalids} invalid molecules from the data set.")
        return ~invalid_mask