Source code for qsprpred.data.sampling.splits

"""Different splitters to create train and tests for evalutating QSPR model performance.

To add a new data splitter:
* Add a DataSplit subclass for your new splitter
"""
import platform
from abc import ABC, abstractmethod
from typing import Iterable

import numpy as np
import pandas as pd
from gbmtsplits import GloballyBalancedSplit
from sklearn.model_selection import ShuffleSplit

from ...data.chem.clustering import (
    FPSimilarityMaxMinClusters,
    MoleculeClusters,
    RandomClusters,
    ScaffoldClusters,
)
from ...data.chem.scaffolds import BemisMurckoRDKit, Scaffold
from ...data.tables.base import MoleculeDataTable, DataSetDependant
from ...data.tables.qspr import QSPRDataset
from ...logs import logger
from ...utils.interfaces.randomized import Randomized


[docs]class DataSplit(DataSetDependant, ABC): """ Defines a function split a dataframe into train and test set. Attributes: dataset (MoleculeDataTable): The dataset to split. """ def __init__(self, dataset: MoleculeDataTable | None = None) -> None: super().__init__(dataset)
[docs] @abstractmethod def split( self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame | pd.Series ) -> Iterable[tuple[list[int], list[int]]]: """Split the given data into one or multiple train/test subsets. These classes handle partitioning of a feature matrix by returning an generator of train and test indices. It is compatible with the approach taken in the `sklearn` package (see `sklearn.model_selection._BaseKFold`). This can be used for both cross-validation or a one time train/test split. Args: X (np.ndarray | pd.DataFrame): the input data matrix y (np.ndarray | pd.DataFrame | pd.Series): the target variable(s) Returns: an generator over the generated subsets represented as a tuple of (train_indices, test_indices) where the indices are the row indices of the input data matrix X (note that these are integer indices, rather than a pandas index!) """
[docs] def splitDataset(self, dataset: "QSPRDataset"): return self.split( dataset.getFeatures(concat=True), dataset.getTargetPropertiesValues(concat=True), )
[docs]class RandomSplit(DataSplit, Randomized): """Splits dataset in random train and test subsets. Attributes: testFraction (float): fraction of total dataset to testset seed (int): Random state to use for shuffling and other random operations. """ def __init__( self, test_fraction=0.1, dataset: QSPRDataset | None = None, seed: int | None = None, ) -> None: DataSplit.__init__(self, dataset) Randomized.__init__(self, seed) self.testFraction = test_fraction self.setSeed(seed or (dataset.randomState if self.hasDataSet else None))
[docs] def split(self, X, y): if self.seed is None: self.seed = self.setSeed( self.getDataSet().randomState if self.hasDataSet else None ) if self.seed is None: logger.info( "No random state supplied, " "and could not find random state on the dataset." "Random seed will be set randomly." ) return ShuffleSplit( 1, test_size=self.testFraction, random_state=self.seed ).split(X, y)
[docs]class BootstrapSplit(DataSplit, Randomized): """Splits dataset in random train and test subsets (bootstraps). Unlike cross-validation, bootstrapping allows for repeated samples in the test set. Attributes: nBootstraps (int): number of bootstraps to perform seed (int): Random state to use for shuffling and other random operations. """ def __init__(self, split: DataSplit, n_bootstraps=5, seed=None): """Initialize a BootstrapSplit object. Args: split (DataSplit): the splitter to use for the bootstraps n_bootstraps (int): number of bootstraps to perform seed (int): random seed to use for random operations """ Randomized.__init__(self, seed) self._split = split self._original_split_seed = split.seed if hasattr(split, "seed") else None self.nBootstraps = n_bootstraps self._current = 0
[docs] def split( self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame | pd.Series ) -> Iterable[tuple[list[int], list[int]]]: """Split the given data into `nBootstraps` training and test sets. Args: X (np.ndarray | pd.DataFrame): the input data matrix y (np.ndarray | pd.DataFrame | pd.Series): the target variable(s) Returns: an generator over `nBootstraps` tuples generated by the underlying splitter """ if hasattr(self._split, "setDataSet") and self.hasDataSet: self._split.setDataSet(self.getDataSet()) for i in range(self.nBootstraps): if hasattr(self._split, "setSeed") and self.seed is not None: self._split.setSeed(self.seed + self._current) yield from self._split.split(X, y) self._current += 1 if hasattr(self._split, "setSeed"): self._split.setSeed(self._original_split_seed) self._current = 0
[docs]class ManualSplit(DataSplit): """Splits dataset in train and test subsets based on a column in the dataframe. Attributes: splitCol (pd.Series): pandas series with split information trainVal (str): value in splitcol that will be used for training testVal (str): value in splitcol that will be used for testing Raises: ValueError: if there are more values in splitcol than trainval and testval """ def __init__(self, splitcol: pd.Series, trainval: str, testval: str) -> None: """Initialize the ManualSplit object with the splitcol, trainval and testval attributes. Args: splitCol (pd.Series): pandas series with split information trainVal (str): value in splitcol that will be used for training testVal (str): value in splitcol that will be used for testing Raises: ValueError: if there are more values in splitcol than trainval and testval """ super().__init__() self.splitCol = splitcol.reset_index(drop=True) self.trainVal = trainval self.testVal = testval # check if only trainval and testval are present in splitcol if not set(splitcol.unique()).issubset({trainval, testval}): raise ValueError( "There are more values in splitcol than trainval and testval" )
[docs] def split(self, X, y): """ Split the given data into one or multiple train/test subsets based on the predefined splitcol. Args: X (np.ndarray | pd.DataFrame): the input data matrix y (np.ndarray | pd.DataFrame | pd.Series): the target variable(s) Returns: an generator over the generated subsets represented as a tuple of (train_indices, test_indices) where the indices are the row indices of the input data matrix """ train = self.splitCol[self.splitCol == self.trainVal].index.values test = self.splitCol[self.splitCol == self.testVal].index.values return iter([(train, test)])
[docs]class TemporalSplit(DataSplit): """ Splits dataset train and test subsets based on a threshold in time. Attributes: dataset (QSPRDataset): dataset that this splitter will be acting on timeSplit(float): time point after which sample to test set timeCol (str): name of the column within the dataframe with timepoints """ def __init__( self, timesplit: float | list[float], timeprop: str, dataset: QSPRDataset | None = None, ): """Initialize a TemporalSplit object. Args: dataset (QSPRDataset): dataset that this splitter will be acting on timesplit (float | list[float]): time point after which sample is moved to test set. If a list is provided, the splitter will split the dataset into multiple subsets based on the timepoints in the list. timeprop (str): name of the column within the dataframe with timepoints """ super().__init__(dataset=dataset) self.timeSplit = timesplit self.timeCol = timeprop
[docs] def split(self, X, y): """ Split single-task dataset based on a time threshold. Returns: an generator over the generated subsets represented as a tuple of (train_indices, test_indices) where the indices are the row indices of the input data matrix """ timesplits = ( self.timeSplit if isinstance(self.timeSplit, list) else [ self.timeSplit, ] ) for timesplit in timesplits: # Get dataset, dataframe and tasks ds = self.getDataSet() df = ds.getDF().loc[X.index, :].copy() task_names = [TargetProperty.name for TargetProperty in ds.targetProperties] assert len(task_names) > 0, "No target properties found." assert len(X) == len( df ), "X and the current data in the dataset must have same length" if len(task_names) > 1: logger.warning( "The TemporalSplit is not recommended for multitask " "or PCM datasets might lead to very unbalanced subsets " "for some tasks" ) indices = np.array(list(range(len(df)))) mask = df[self.timeCol] > timesplit mask = mask.values test = indices[mask] # Check if there are any test samples for each task for task in task_names: if len(df[mask][task]) == 0: raise ValueError(f"No test samples found for task {task.name}") elif len(df[~mask][task]) == 0: raise ValueError(f"No train samples found for task {task.name}") train = indices[~mask] yield train, test
[docs]class GBMTDataSplit(DataSplit): """ Splits dataset into balanced train and test subsets based on an initial clustering algorithm. If `nFolds` is specified, the determined clusters will be split into `nFolds` groups of approximately equal size, and the splits will be generated by leaving out one group at a time. Attributes: dataset (QSPRDataset): dataset that this splitter will be acting on clustering (MoleculeClusters): clustering algorithm to use testFraction (float): fraction of total dataset to testset nFolds (int): number of folds to split the dataset into (this overrides `testFraction` and `customTestList`) customTestList (list): list of molecule indexes to force in test set split_kwargs (dict): additional arguments to be passed to the GloballyBalancedSplit """ def __init__( self, dataset: QSPRDataset = None, clustering: MoleculeClusters = FPSimilarityMaxMinClusters(), test_fraction: float = 0.1, n_folds: int = 1, custom_test_list: list[str] | None = None, **split_kwargs, ): super().__init__(dataset) self.testFraction = test_fraction self.customTestList = custom_test_list self.clustering = clustering self.split_kwargs = split_kwargs if split_kwargs else {} self.nFolds = n_folds if self.nFolds > 1: self.testFraction = None self.customTestList = None
[docs] def setDataSet(self, dataset: MoleculeDataTable): super().setDataSet(dataset) if self.nFolds > 1: self.testFraction = (len(dataset) / self.nFolds) / len(dataset)
[docs] def split( self, X: np.ndarray | pd.DataFrame, y: np.ndarray | pd.DataFrame | pd.Series ) -> Iterable[tuple[list[int], list[int]]]: """ Split dataset into balanced train and test subsets based on an initial clustering algorithm. Args: X (np.ndarray | pd.DataFrame): the input data matrix y (np.ndarray | pd.DataFrame | pd.Series): the target variable(s) Returns: an generator over the generated subsets represented as a tuple of (train_indices, test_indices) where the indices are the row indices of the input data matrix """ # if we are on Windows, raise an error if platform.system() == "Windows": logger.warning( "The GBMTDataSplit currently has a problem on Windows:" "https://github.com/coin-or/pulp/issues/671 and might hang up..." ) # Get dataset, dataframe and tasks ds = self.getDataSet() df = ds.getDF().copy() # need numeric index splits df = df.loc[X.index, :] df.reset_index(drop=True, inplace=True) task_names = [TargetProperty.name for TargetProperty in ds.targetProperties] assert len(task_names) > 0, "No target properties found." assert len(X) == len( df ), "X and the current data in the dataset must have same length" # Get clusters clusters = self.clustering.get_clusters(df[ds.smilesCol].tolist()) # Pre-assign smiles of custom_test_list to test set preassigned_smiles = ( { df.loc[df.QSPRID == qspridx][ds.smilesCol].values[0]: 1 for qspridx in self.customTestList } if self.customTestList else None ) logger.debug(f"Split arguments: {self.split_kwargs}") # Split dataset if self.nFolds == 1: sizes = [1 - self.testFraction, self.testFraction] else: sizes = [self.testFraction] * self.nFolds splitter = GloballyBalancedSplit( sizes=sizes, clusters=clusters, clustering_method=None, # As precomputed clusters are provided **self.split_kwargs, ) df_split = splitter( df, ds.smilesCol, task_names, preassigned_smiles=preassigned_smiles, ) # Get indices for split in ( df_split["Split"].unique() if self.nFolds > 1 else [ 1, ] ): split = int(split) test_indices = df_split[df_split["Split"] == split].index.values train_indices = df_split[df_split["Split"] != split].index.values assert len(train_indices) + len(test_indices) == len( df ), "Not all samples were assigned to a split" # Reset index back to QSPRID df.set_index(ds.indexCols, inplace=True, drop=False) yield train_indices, test_indices
[docs]class GBMTRandomSplit(GBMTDataSplit): """ Splits dataset into balanced random train and test subsets. Attributes: dataset (QSPRDataset): dataset that this splitter will be acting on testFraction (float): fraction of total dataset to testset seed (int): Random state to use for shuffling and other random operations. customTestList (list): list of molecule indexes to force in test set split_kwargs (dict): additional arguments to be passed to the GloballyBalancedSplit """ def __init__( self, dataset: QSPRDataset | None = None, test_fraction: float = 0.1, n_folds: int = 1, seed: int | None = None, n_initial_clusters: int | None = None, custom_test_list: list[str] | None = None, **split_kwargs, ) -> None: seed = seed or (dataset.randomState if dataset is not None else None) if seed is None: logger.info( "No random state supplied, " "and could not find random state on the dataset." "Random seed will be set randomly." ) super().__init__( dataset, RandomClusters(seed, n_initial_clusters), test_fraction, n_folds, custom_test_list, **split_kwargs, )
[docs]class ScaffoldSplit(GBMTDataSplit): """ Splits dataset into balanced train and test subsets based on molecular scaffolds. Attributes: dataset (QSPRDataset): dataset that this splitter will be acting on testFraction (float): fraction of total dataset to testset customTestList (list): list of molecule indexes to force in test set split_kwargs (dict): additional arguments to be passed to the GloballyBalancedSplit """ def __init__( self, dataset: QSPRDataset | None = None, scaffold: Scaffold = BemisMurckoRDKit(), test_fraction: float = 0.1, n_folds: int = 1, custom_test_list: list | None = None, **split_kwargs, ) -> None: super().__init__( dataset, ScaffoldClusters(scaffold), test_fraction, n_folds, custom_test_list, **split_kwargs, )
[docs]class ClusterSplit(GBMTDataSplit): """ Splits dataset into balanced train and test subsets based on clusters of similar molecules. Attributes: dataset (QSPRDataset): dataset that this splitter will be acting on testFraction (float): fraction of total dataset to testset customTestList (list): list of molecule indexes to force in test set seed (int): Random state to use for shuffling and other random operations. split_kwargs (dict): additional arguments to be passed to the GloballyBalancedSplit """ def __init__( self, dataset: QSPRDataset = None, test_fraction: float = 0.1, n_folds: int = 1, custom_test_list: list[str] | None = None, seed: int | None = None, clustering: MoleculeClusters | None = None, **split_kwargs, ) -> None: seed = seed or (dataset.randomState if dataset is not None else None) if seed is None: logger.info( "No random state supplied, " "and could not find random state on the dataset." "Random seed will be set randomly." ) clustering = ( clustering if clustering is not None else FPSimilarityMaxMinClusters(seed=seed) ) super().__init__( dataset, clustering, test_fraction, n_folds, custom_test_list, **split_kwargs, )
[docs] def setSeed(self, seed: int | None): """Set the seed for this instance. Args: seed (int): Random state to use for shuffling and other random operations. """ self.seed = seed if hasattr(self.clustering, "seed"): self.clustering.seed = seed
[docs] def getSeed(self): """Get the seed for this instance. Returns: int: the seed for this instance or None if no seed is set. """ if hasattr(self, "seed"): return self.seed else: return None