"""Path mix-ins for tests: qsprpred.utils.testing.path_mixins."""

import copy
import itertools
import os
import shutil
import tempfile
from typing import Iterable

import pandas as pd
from mlchemad.applicability_domains import TopKatApplicabilityDomain
from sklearn.preprocessing import StandardScaler

from ...data import RandomSplit, QSPRDataset
from ...data.descriptors.fingerprints import (
    AtomPairFP,
    AvalonFP,
    LayeredFP,
    MaccsFP,
    MorganFP,
    PatternFP,
    RDKitFP,
    RDKitMACCSFP,
    TopologicalFP,
)
from ...data.descriptors.sets import (
    RDKitDescs,
    DrugExPhyschem,
    PredictorDesc,
    TanimotoDistances,
)
from ...data.processing.data_filters import RepeatsFilter
from ...data.processing.feature_filters import HighCorrelationFilter, LowVarianceFilter
from ...data.processing.feature_standardizers import SKLearnStandardizer
from ...models import SklearnModel
from ...tasks import TargetTasks


class PathMixIn:
    """Mix-in that manages a scratch directory for generated test artifacts.

    The directory is (re)created before each test via :meth:`setUpPaths` and
    deleted afterwards via :meth:`tearDown`.

    Attributes:
        generatedPath (str): path to the directory where generated files are
            stored; created fresh before and removed after each test
    """

    def setUpPaths(self):
        """Create a fresh, empty scratch directory for this test."""
        scratch = tempfile.mkdtemp(prefix="qsprpred_test_")
        self.generatedPath = scratch
        # wipe the freshly made directory so every test starts from scratch
        self.clearGenerated()
        if not os.path.exists(scratch):
            os.makedirs(scratch)

    def tearDown(self):
        """Remove all files and directories that were used for testing."""
        self.clearGenerated()

    def clearGenerated(self):
        """Delete the generated-files directory if it is present on disk."""
        target = self.generatedPath
        if os.path.exists(target):
            shutil.rmtree(target)
class DataSetsPathMixIn(PathMixIn):
    """Mix-in class that provides a small and large testing data set and some
    common preparation settings to use in tests.

    Attributes:
        inputBasePath (str): path to the directory with static test input files
        inputDataPath (str): path to the directory with test data frames
        generatedDataPath (str): path where generated datasets are stored
    """

    def setUpPaths(self):
        """Create the directories that are used for testing."""
        super().setUpPaths()
        self.inputBasePath = f"{os.path.dirname(__file__)}/test_files"
        self.inputDataPath = f"{self.inputBasePath}/data/"
        self.generatedDataPath = f"{self.generatedPath}/datasets"
        if not os.path.exists(self.generatedDataPath):
            os.makedirs(self.generatedDataPath)

    @staticmethod
    def getDefaultPrep():
        """Return a dictionary with default preparation settings."""
        return {
            "feature_calculators": [MorganFP(radius=2, nBits=128)],
            "split": RandomSplit(test_fraction=0.2),
            "feature_standardizer": StandardScaler(),
            "feature_filters": [LowVarianceFilter(0.05), HighCorrelationFilter(0.8)],
        }

    @classmethod
    def getAllDescriptors(cls):
        """Return a list of (ideally) all available descriptor sets. For now
        they need to be added manually to the list below.

        TODO: would be nice to create the list automatically by implementing a
        descriptor set registry that would hold all installed descriptor sets.

        Returns:
            list: `list` of `DescriptorCalculator` objects
        """
        descriptor_sets = [
            RDKitDescs(),
            DrugExPhyschem(),
            PredictorDesc(
                SklearnModel.fromFile(
                    f"{os.path.dirname(__file__)}/test_files/test_predictor/"
                    f"RFC_SINGLECLASS/RFC_SINGLECLASS_meta.json"
                )
            ),
            TanimotoDistances(
                list_of_smiles=["C", "CC", "CCC"],
                fingerprint_type=MorganFP(radius=2, nBits=128),
                radius=2,
                nBits=128,
            ),
            AtomPairFP(nBits=128),
            AvalonFP(nBits=128),
            LayeredFP(nBits=128),
            MaccsFP(),
            MorganFP(radius=2, nBits=128),
            PatternFP(nBits=128),
            RDKitFP(nBits=128),
            RDKitMACCSFP(),
            TopologicalFP(nBits=128),
        ]
        return descriptor_sets

    @classmethod
    def getDefaultCalculatorCombo(cls):
        """
        Makes a list of default descriptor calculators that can be used in
        tests. It creates a calculator with only morgan fingerprints and rdkit
        descriptors, but also one with them both to test behaviour with
        multiple descriptor sets. Override this method if you want to test with
        other descriptor sets and calculator combinations.

        Returns:
            list: `list` of created `DescriptorCalculator` objects
        """
        feature_sets = [
            MorganFP(radius=3, nBits=128),
            RDKitDescs(),
        ]
        # singletons plus the pair, to cover single- and multi-set behaviour
        mol_descriptor_calculators = list(
            itertools.combinations(feature_sets, 1)
        ) + list(itertools.combinations(feature_sets, 2))
        return mol_descriptor_calculators

    @classmethod
    def getDataPrepGrid(cls):
        """Return a list of many possible combinations of descriptor
        calculators, splits, feature standardizers, feature filters and data
        filters. Again, this is not exhaustive, but should cover a lot of
        cases.

        Returns:
            grid: a generator that yields tuples of all possible combinations
                as stated above, each tuple is defined as:
                (descriptor_calculator, split, feature_standardizer,
                feature_filters, data_filters)
        """
        # get the feature calculators
        descriptor_calculators = cls.getDefaultCalculatorCombo()
        # lists with common preparation settings
        splits = [None, RandomSplit(test_fraction=0.1)]
        feature_standardizers = [None, StandardScaler()]
        feature_filters = [None, HighCorrelationFilter(0.9)]
        applicability_domains = [None, TopKatApplicabilityDomain()]
        data_filters = [
            None,
            RepeatsFilter(),
            # CategoryFilter(
            #   FIXME: this needs to be made more general and not specific to one dataset
            #     name="moka_ionState7.4",
            #     values=["cationic"]
            # ),
        ]
        # All combinations of the above preparation settings
        # (passed to prepareDataset)
        return (
            # deep copy to avoid conflicts caused by operating on one instance twice
            copy.deepcopy(combo)
            for combo in itertools.product(
                descriptor_calculators,
                splits,
                feature_standardizers,
                feature_filters,
                data_filters,
                applicability_domains,
            )
        )

    @classmethod
    def getPrepCombos(cls):
        """
        Return a list of all possible preparation combinations as generated by
        `getDataPrepGrid` as well as their names. The generated list can be
        used to parameterize tests with the given named combinations.

        Returns:
            list: `list` of `list`s of all possible combinations of preparation
        """

        def get_name(obj: object):
            """
            Get the name of a data preparation object,
            or its class name if it is not a string.

            Args:
                obj: the object to get name for

            Returns:
                str: the generated name of the object
            """
            return (
                str(None)
                if obj is None
                else obj.__class__.__name__
                if (not isinstance(obj, SKLearnStandardizer))
                else str(obj)
            )

        def get_name_list(obj: Iterable | object):
            """
            Parse a generator of data preparation objects and return a string.
            Note that the method proceeds recursively so nested iterables are
            also parsed.

            Args:
                obj: the object or a generator of objects to get name for

            Returns:
                str: the generated name of the object or a list of objects
            """
            if isinstance(obj, Iterable):
                return "_".join([get_name_list(i) for i in obj])
            else:
                return get_name(obj)

        ret = [2 * [get_name_list(x)] + list(x) for x in cls.getDataPrepGrid()]
        return ret

    def getBigDF(self):
        """Get a large data frame for testing purposes.

        Returns:
            pd.DataFrame: a `pandas.DataFrame` containing the dataset
        """
        return pd.read_csv(f"{self.inputDataPath}/test_data_large.tsv", sep="\t")

    def getSmallDF(self):
        """Get a small data frame for testing purposes.

        Returns:
            pd.DataFrame: a `pandas.DataFrame` containing the dataset
        """
        return pd.read_csv(f"{self.inputDataPath}/test_data.tsv", sep="\t").sample(10)

    def createLargeTestDataSet(
        self,
        name="QSPRDataset_test_large",
        target_props=None,
        preparation_settings=None,
        random_state=42,
        n_jobs=1,
        chunk_size=None,
    ):
        """Create a large dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            preparation_settings (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list, which would be
        # shared (and potentially mutated) across calls
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        return self.createTestDataSetFromFrame(
            self.getBigDF(),
            name=name,
            target_props=target_props,
            prep=preparation_settings,
            random_state=random_state,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
        )

    def createSmallTestDataSet(
        self,
        name="QSPRDataset_test_small",
        target_props=None,
        preparation_settings=None,
        random_state=42,
    ):
        """Create a small dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            preparation_settings (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        return self.createTestDataSetFromFrame(
            self.getSmallDF(),
            name=name,
            target_props=target_props,
            random_state=random_state,
            prep=preparation_settings,
        )

    def createTestDataSetFromFrame(
        self,
        df,
        name="QSPRDataset_test",
        target_props=None,
        random_state=None,
        prep=None,
        n_jobs=1,
        chunk_size=None,
    ):
        """Create a dataset for testing purposes from the given data frame.

        Args:
            df (pd.DataFrame): data frame containing the dataset
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            prep (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        ret = QSPRDataset(
            name,
            target_props=target_props,
            df=df,
            store_dir=self.generatedDataPath,
            random_state=random_state,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
        )
        if prep:
            ret.prepareDataset(**prep)
        return ret

    def createLargeMultitaskDataSet(
        self,
        name="QSPRDataset_multi_test",
        target_props=None,
        preparation_settings=None,
        random_state=42,
    ):
        """Create a large multitask dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to an HBD multiclass target plus a CL
                regression target
            preparation_settings (dict): dictionary containing preparation settings
            random_state (int): random state to use for splitting and shuffling

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [
                {"name": "HBD", "task": TargetTasks.MULTICLASS, "th": [-1, 1, 2, 100]},
                {"name": "CL", "task": TargetTasks.REGRESSION},
            ]
        return self.createTestDataSetFromFrame(
            self.getBigDF(),
            name=name,
            target_props=target_props,
            random_state=random_state,
            prep=preparation_settings,
        )

    def validate_split(self, dataset):
        """Check if the split has the data it should have after splitting."""
        self.assertTrue(dataset.X is not None)
        self.assertTrue(dataset.X_ind is not None)
        self.assertTrue(dataset.y is not None)
        self.assertTrue(dataset.y_ind is not None)
class ModelDataSetsPathMixIn(DataSetsPathMixIn):
    """Mix-in that prepares the dataset directories used by the model tests.

    Attributes:
        generatedModelsPath (str): directory where generated models are stored
    """

    def setUpPaths(self):
        """Set up the test environment, including a models output directory."""
        super().setUpPaths()
        models_dir = f"{self.generatedPath}/models/"
        self.generatedModelsPath = models_dir
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)