"""Path mix-ins for tests: qsprpred.utils.testing.path_mixins."""

import copy
import itertools
import os
import shutil
import tempfile
from typing import Iterable

import pandas as pd
from mlchemad.applicability_domains import TopKatApplicabilityDomain
from sklearn.preprocessing import StandardScaler

from ...data import RandomSplit, QSPRDataset
from ...data.descriptors.fingerprints import (
    AtomPairFP,
    AvalonFP,
    LayeredFP,
    MaccsFP,
    MorganFP,
    PatternFP,
    RDKitFP,
    RDKitMACCSFP,
    TopologicalFP,
)
from ...data.descriptors.sets import (
    RDKitDescs,
    DrugExPhyschem,
    PredictorDesc,
    TanimotoDistances,
)
from ...data.processing.data_filters import RepeatsFilter
from ...data.processing.feature_filters import HighCorrelationFilter, LowVarianceFilter
from ...data.processing.feature_standardizers import SKLearnStandardizer
from ...models import SklearnModel
from ...tasks import TargetTasks


class PathMixIn:
    """Mix-in that manages a scratch directory for generated test artifacts.

    The directory is (re)created before each test via :meth:`setUpPaths` and
    deleted afterwards via :meth:`tearDown`.

    Attributes:
        generatedPath (str): path to the directory where generated files are
            stored; created fresh before and removed after each test
    """

    def setUpPaths(self):
        """Create a fresh, empty scratch directory for this test."""
        scratch = tempfile.mkdtemp(prefix="qsprpred_test_")
        self.generatedPath = scratch
        # wipe the freshly made directory so every test starts from scratch
        self.clearGenerated()
        if not os.path.exists(scratch):
            os.makedirs(scratch)

    def tearDown(self):
        """Remove all files and directories that were used for testing."""
        self.clearGenerated()

    def clearGenerated(self):
        """Delete the generated-files directory if it is present on disk."""
        target = self.generatedPath
        if os.path.exists(target):
            shutil.rmtree(target)
class DataSetsPathMixIn(PathMixIn):
    """Mix-in class that provides a small and large testing data set and some
    common preparation settings to use in tests.

    Attributes:
        inputBasePath (str): path to the directory with static test input files
        inputDataPath (str): path to the directory with test data frames
        generatedDataPath (str): path where generated datasets are stored
    """

    def setUpPaths(self):
        """Create the directories that are used for testing."""
        super().setUpPaths()
        self.inputBasePath = f"{os.path.dirname(__file__)}/test_files"
        self.inputDataPath = f"{self.inputBasePath}/data/"
        self.generatedDataPath = f"{self.generatedPath}/datasets"
        if not os.path.exists(self.generatedDataPath):
            os.makedirs(self.generatedDataPath)

    @staticmethod
    def getDefaultPrep():
        """Return a dictionary with default preparation settings."""
        return {
            "feature_calculators": [MorganFP(radius=2, nBits=128)],
            "split": RandomSplit(test_fraction=0.2),
            "feature_standardizer": StandardScaler(),
            "feature_filters": [LowVarianceFilter(0.05), HighCorrelationFilter(0.8)],
        }

    @classmethod
    def getAllDescriptors(cls):
        """Return a list of (ideally) all available descriptor sets. For now
        they need to be added manually to the list below.

        TODO: would be nice to create the list automatically by implementing a
        descriptor set registry that would hold all installed descriptor sets.

        Returns:
            list: `list` of `DescriptorCalculator` objects
        """
        descriptor_sets = [
            RDKitDescs(),
            DrugExPhyschem(),
            PredictorDesc(
                SklearnModel.fromFile(
                    f"{os.path.dirname(__file__)}/test_files/test_predictor/"
                    f"RFC_SINGLECLASS/RFC_SINGLECLASS_meta.json"
                )
            ),
            TanimotoDistances(
                list_of_smiles=["C", "CC", "CCC"],
                fingerprint_type=MorganFP(radius=2, nBits=128),
                radius=2,
                nBits=128,
            ),
            AtomPairFP(nBits=128),
            AvalonFP(nBits=128),
            LayeredFP(nBits=128),
            MaccsFP(),
            MorganFP(radius=2, nBits=128),
            PatternFP(nBits=128),
            RDKitFP(nBits=128),
            RDKitMACCSFP(),
            TopologicalFP(nBits=128),
        ]
        return descriptor_sets

    @classmethod
    def getDefaultCalculatorCombo(cls):
        """
        Makes a list of default descriptor calculators that can be used in
        tests. It creates a calculator with only morgan fingerprints and rdkit
        descriptors, but also one with them both to test behaviour with
        multiple descriptor sets. Override this method if you want to test with
        other descriptor sets and calculator combinations.

        Returns:
            list: `list` of created `DescriptorCalculator` objects
        """
        feature_sets = [
            MorganFP(radius=3, nBits=128),
            RDKitDescs(),
        ]
        # singletons plus the pair, to cover single- and multi-set behaviour
        mol_descriptor_calculators = list(
            itertools.combinations(feature_sets, 1)
        ) + list(itertools.combinations(feature_sets, 2))
        return mol_descriptor_calculators

    @classmethod
    def getDataPrepGrid(cls):
        """Return a list of many possible combinations of descriptor
        calculators, splits, feature standardizers, feature filters and data
        filters. Again, this is not exhaustive, but should cover a lot of
        cases.

        Returns:
            grid: a generator that yields tuples of all possible combinations
                as stated above, each tuple is defined as:
                (descriptor_calculator, split, feature_standardizer,
                feature_filters, data_filters)
        """
        # get the feature calculators
        descriptor_calculators = cls.getDefaultCalculatorCombo()
        # lists with common preparation settings
        splits = [None, RandomSplit(test_fraction=0.1)]
        feature_standardizers = [None, StandardScaler()]
        feature_filters = [None, HighCorrelationFilter(0.9)]
        applicability_domains = [None, TopKatApplicabilityDomain()]
        data_filters = [
            None,
            RepeatsFilter(),
            # CategoryFilter(
            #   FIXME: this needs to be made more general and not specific to one dataset
            #     name="moka_ionState7.4",
            #     values=["cationic"]
            # ),
        ]
        # All combinations of the above preparation settings
        # (passed to prepareDataset)
        return (
            # deep copy to avoid conflicts caused by operating on one instance twice
            copy.deepcopy(combo)
            for combo in itertools.product(
                descriptor_calculators,
                splits,
                feature_standardizers,
                feature_filters,
                data_filters,
                applicability_domains,
            )
        )

    @classmethod
    def getPrepCombos(cls):
        """
        Return a list of all possible preparation combinations as generated by
        `getDataPrepGrid` as well as their names. The generated list can be
        used to parameterize tests with the given named combinations.

        Returns:
            list: `list` of `list`s of all possible combinations of preparation
        """

        def get_name(obj: object):
            """
            Get the name of a data preparation object,
            or its class name if it is not a string.

            Args:
                obj: the object to get name for

            Returns:
                str: the generated name of the object
            """
            return (
                str(None)
                if obj is None
                else obj.__class__.__name__
                if (not isinstance(obj, SKLearnStandardizer))
                else str(obj)
            )

        def get_name_list(obj: Iterable | object):
            """
            Parse a generator of data preparation objects and return a string.
            Note that the method proceeds recursively so nested iterables are
            also parsed.

            Args:
                obj: the object or a generator of objects to get name for

            Returns:
                str: the generated name of the object or a list of objects
            """
            if isinstance(obj, Iterable):
                return "_".join([get_name_list(i) for i in obj])
            else:
                return get_name(obj)

        ret = [2 * [get_name_list(x)] + list(x) for x in cls.getDataPrepGrid()]
        return ret

    def getBigDF(self):
        """Get a large data frame for testing purposes.

        Returns:
            pd.DataFrame: a `pandas.DataFrame` containing the dataset
        """
        return pd.read_csv(f"{self.inputDataPath}/test_data_large.tsv", sep="\t")

    def getSmallDF(self):
        """Get a small data frame for testing purposes.

        Returns:
            pd.DataFrame: a `pandas.DataFrame` containing the dataset
        """
        return pd.read_csv(f"{self.inputDataPath}/test_data.tsv", sep="\t").sample(10)

    def createLargeTestDataSet(
        self,
        name="QSPRDataset_test_large",
        target_props=None,
        preparation_settings=None,
        random_state=42,
        n_jobs=1,
        chunk_size=None,
    ):
        """Create a large dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            preparation_settings (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list, which would be
        # shared (and potentially mutated) across calls
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        return self.createTestDataSetFromFrame(
            self.getBigDF(),
            name=name,
            target_props=target_props,
            prep=preparation_settings,
            random_state=random_state,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
        )

    def createSmallTestDataSet(
        self,
        name="QSPRDataset_test_small",
        target_props=None,
        preparation_settings=None,
        random_state=42,
    ):
        """Create a small dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            preparation_settings (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        return self.createTestDataSetFromFrame(
            self.getSmallDF(),
            name=name,
            target_props=target_props,
            random_state=random_state,
            prep=preparation_settings,
        )

    def createTestDataSetFromFrame(
        self,
        df,
        name="QSPRDataset_test",
        target_props=None,
        random_state=None,
        prep=None,
        n_jobs=1,
        chunk_size=None,
    ):
        """Create a dataset for testing purposes from the given data frame.

        Args:
            df (pd.DataFrame): data frame containing the dataset
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to a single CL regression target
            random_state (int): random state to use for splitting and shuffling
            prep (dict): dictionary containing preparation settings

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [{"name": "CL", "task": TargetTasks.REGRESSION}]
        ret = QSPRDataset(
            name,
            target_props=target_props,
            df=df,
            store_dir=self.generatedDataPath,
            random_state=random_state,
            n_jobs=n_jobs,
            chunk_size=chunk_size,
        )
        if prep:
            ret.prepareDataset(**prep)
        return ret

    def createLargeMultitaskDataSet(
        self,
        name="QSPRDataset_multi_test",
        target_props=None,
        preparation_settings=None,
        random_state=42,
    ):
        """Create a large multitask dataset for testing purposes.

        Args:
            name (str): name of the dataset
            target_props (List of dicts or TargetProperty): list of target
                properties; defaults to an HBD multiclass target plus a CL
                regression target
            preparation_settings (dict): dictionary containing preparation settings
            random_state (int): random state to use for splitting and shuffling

        Returns:
            QSPRDataset: a `QSPRDataset` object
        """
        # None sentinel instead of a mutable default list (see above)
        if target_props is None:
            target_props = [
                {"name": "HBD", "task": TargetTasks.MULTICLASS, "th": [-1, 1, 2, 100]},
                {"name": "CL", "task": TargetTasks.REGRESSION},
            ]
        return self.createTestDataSetFromFrame(
            self.getBigDF(),
            name=name,
            target_props=target_props,
            random_state=random_state,
            prep=preparation_settings,
        )

    def validate_split(self, dataset):
        """Check if the split has the data it should have after splitting."""
        self.assertTrue(dataset.X is not None)
        self.assertTrue(dataset.X_ind is not None)
        self.assertTrue(dataset.y is not None)
        self.assertTrue(dataset.y_ind is not None)
class ModelDataSetsPathMixIn(DataSetsPathMixIn):
    """Mix-in that prepares the dataset directories used by the model tests.

    Attributes:
        generatedModelsPath (str): directory where generated models are stored
    """

    def setUpPaths(self):
        """Set up the test environment, including a models output directory."""
        super().setUpPaths()
        models_dir = f"{self.generatedPath}/models/"
        self.generatedModelsPath = models_dir
        if not os.path.exists(models_dir):
            os.makedirs(models_dir)