Source code for qsprpred.data.descriptors.tests

import numpy as np
from parameterized import parameterized
from rdkit.Chem import Descriptors

from .fingerprints import MorganFP
from .sets import (
    DrugExPhyschem,
    PredictorDesc,
    TanimotoDistances,
    RDKitDescs,
    SmilesDesc,
)
from ... import TargetTasks
from ...data import RandomSplit
from ...data.processing.feature_filters import LowVarianceFilter, HighCorrelationFilter
from ...models import SklearnModel
from ...utils.testing.base import QSPRTestCase
from ...utils.testing.check_mixins import DescriptorInDataCheckMixIn
from ...utils.testing.path_mixins import DataSetsPathMixIn


class TestDescriptorCalculation(DataSetsPathMixIn, QSPRTestCase):
    """Test the calculation of descriptors."""

    def setUp(self):
        """Run the base set-up and initialise the test paths."""
        super().setUp()
        self.setUpPaths()

    @staticmethod
    def getDescList():
        """Return a fresh list of the descriptor sets used in `testDropping`.

        Returns:
            list: a new `MorganFP` (radius 3, 256 bits) and a new
                `DrugExPhyschem` instance on every call.
        """
        return [MorganFP(radius=3, nBits=256), DrugExPhyschem()]

    def testDropping(self):
        """Test dropping of descriptors from data sets."""
        dataset = self.createLargeTestDataSet("TestDropping")

        def feature_count():
            # number of feature columns currently exposed by the data set
            return dataset.getFeatures(concat=True).shape[1]

        # test dropping of all sets
        dataset.addDescriptors(self.getDescList())
        full_len = sum(len(x) for x in dataset.descriptorSets)
        self.assertEqual(feature_count(), full_len)
        dataset.dropDescriptorSets(dataset.descriptorSets)
        self.assertEqual(feature_count(), 0)
        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
        self.assertEqual(len(dataset.descriptors), 0)
        # same as above, but the sets are identified by their string form
        dataset.addDescriptors(self.getDescList())
        dataset.dropDescriptorSets([str(x) for x in self.getDescList()])
        self.assertEqual(feature_count(), 0)
        dataset.dropDescriptorSets(
            [str(x) for x in self.getDescList()], full_removal=True
        )
        self.assertEqual(len(dataset.descriptors), 0)

        # test dropping of single set
        dataset.addDescriptors(self.getDescList())
        self.assertEqual(feature_count(), full_len)
        dataset.dropDescriptorSets([dataset.descriptorSets[0]])
        self.assertEqual(feature_count(), len(self.getDescList()[1]))
        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=True)
        dataset.addDescriptors(self.getDescList())
        dataset.dropDescriptorSets([str(dataset.descriptorSets[0])], full_removal=True)
        self.assertEqual(feature_count(), len(self.getDescList()[1]))

        # test restoring of dropped sets
        dataset.addDescriptors(self.getDescList())
        self.assertEqual(feature_count(), full_len)
        dataset.dropDescriptorSets(dataset.descriptorSets, full_removal=False)
        self.assertEqual(feature_count(), 0)
        dataset.restoreDescriptorSets(dataset.descriptorSets)
        self.assertEqual(feature_count(), full_len)

    @parameterized.expand([(None, None), (1, None), (2, None), (4, 50)])
    def testSwitching(self, n_cpu, chunk_size):
        """Test if the feature calculator can be switched to a new dataset.

        Args:
            n_cpu: value passed as `n_jobs` when creating the first data set.
            chunk_size: value passed as `chunk_size` when creating the first
                data set.
        """
        dataset = self.createLargeTestDataSet(
            "TestSwitching", n_jobs=n_cpu, chunk_size=chunk_size
        )
        feature_calculators = [
            MorganFP(radius=3, nBits=256),
            DrugExPhyschem(),
        ]
        split = RandomSplit(test_fraction=0.1)
        lv = LowVarianceFilter(0.05)
        hc = HighCorrelationFilter(0.9)
        dataset.prepareDataset(
            split=split,
            feature_calculators=feature_calculators,
            feature_filters=[lv, hc],
            recalculate_features=True,
            feature_fill_value=np.nan,
        )
        # create new dataset with the same calculator
        # (the very same split, calculator and filter objects are reused)
        dataset_next = self.createLargeTestDataSet(self.__class__.__name__)
        dataset_next.prepareDataset(
            split=split,
            feature_calculators=feature_calculators,
            feature_filters=[lv, hc],
            recalculate_features=True,
            feature_fill_value=np.nan,
        )
        self.assertEqual(dataset.X.shape, dataset_next.X.shape)
class TestDescriptorSets(DataSetsPathMixIn, QSPRTestCase):
    """Test the descriptor sets."""

    def setUp(self):
        """Create and shuffle the data set shared by the tests in this class."""
        super().setUp()
        self.setUpPaths()
        self.dataset = self.createLargeTestDataSet(self.__class__.__name__)
        self.dataset.nJobs = self.nCPU
        self.dataset.chunkSize = None
        self.dataset.shuffle()

    def testPredictorDescriptor(self):
        """Test the PredictorDesc descriptor set."""
        # give path to saved model parameters
        meta_path = (
            f"{self.inputBasePath}/test_predictor/"
            f"RFC_SINGLECLASS/RFC_SINGLECLASS_meta.json"
        )
        model = SklearnModel.fromFile(meta_path)
        desc_calc = PredictorDesc(model)
        self.dataset.addDescriptors([desc_calc])
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), 1))
        self.assertTrue(self.dataset.X.any().any())
        # test from file instantiation
        calc_path = f"{self.generatedDataPath}/test_calc.json"
        desc_calc.toFile(calc_path)
        desc_calc_file = desc_calc.fromFile(calc_path)
        self.dataset.addDescriptors([desc_calc_file], recalculate=True)
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), 1))
        self.assertTrue(self.dataset.X.any().any())

    def testFingerprintSet(self):
        """Test the fingerprint set descriptor calculator."""
        desc_calc = MorganFP(radius=3, nBits=128)
        self.dataset.addDescriptors([desc_calc])
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), 128))
        self.assertTrue(self.dataset.X.any().any())
        # more than one column must contain a truthy value
        self.assertGreater(self.dataset.X.any().sum(), 1)

    def testTanimotoDistances(self):
        """Test the Tanimoto distances descriptor calculator, which calculates
        the Tanimoto distances between a list of SMILES."""
        list_of_smiles = ["C", "CC", "CCC", "CCCC", "CCCCC", "CCCCCC", "CCCCCCC"]
        desc_calc = [
            TanimotoDistances(
                list_of_smiles=list_of_smiles,
                fingerprint_type=MorganFP(radius=3, nBits=128),
            )
        ]
        # NOTE(review): smoke test only - nothing is asserted about the
        # resulting features; consider checking the shape of `self.dataset.X`.
        self.dataset.addDescriptors(desc_calc)

    def testDrugExPhyschem(self):
        """Test the DrugExPhyschem descriptor calculator."""
        desc_calc = [DrugExPhyschem()]
        self.dataset.addDescriptors(desc_calc)
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), 19))
        self.assertTrue(self.dataset.X.any().any())
        self.assertGreater(self.dataset.X.any().sum(), 1)

    def testRDKitDescs(self):
        """Test the rdkit descriptors calculator."""
        desc_calc = [RDKitDescs()]
        self.dataset.addDescriptors(desc_calc)
        rdkit_desc_count = len(set(Descriptors._descList))
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), rdkit_desc_count))
        self.assertTrue(self.dataset.X.any().any())
        self.assertGreater(self.dataset.X.any().sum(), 1)
        # with 3D
        desc_calc = [RDKitDescs(include_3d=True)]
        self.dataset.addDescriptors(desc_calc, recalculate=True)
        # presumably 10 is the number of extra 3D descriptors - TODO confirm
        self.assertEqual(
            self.dataset.X.shape, (len(self.dataset), rdkit_desc_count + 10)
        )

    def testSmilesDesc(self):
        """Test the smiles descriptors calculator."""
        desc_calc = [SmilesDesc()]
        self.dataset.addDescriptors(desc_calc)
        self.assertEqual(self.dataset.X.shape, (len(self.dataset), 1))
        self.assertTrue(self.dataset.X.any().any())

    def testConsistency(self):
        """Test if the descriptor calculator is consistent with the dataset."""
        len_prev = len(self.dataset)
        desc_calc = [MorganFP(radius=3, nBits=128)]
        self.dataset.addDescriptors(desc_calc)
        # adding descriptors must not change the number of rows
        self.assertEqual(len_prev, len(self.dataset))
        self.assertEqual(len_prev, len(self.dataset.getDescriptors()))
        self.assertEqual(len_prev, len(self.dataset.X))
        # every view of the features must expose the 128 fingerprint bits
        self.assertEqual(128, self.dataset.getDescriptors().shape[1])
        self.assertEqual(128, self.dataset.X.shape[1])
        self.assertEqual(128, self.dataset.X_ind.shape[1])
        self.assertEqual(128, self.dataset.getFeatures(concat=True).shape[1])
        self.assertEqual(len_prev, self.dataset.getFeatures(concat=True).shape[0])
class TestDescriptorsAll(DataSetsPathMixIn, DescriptorInDataCheckMixIn, QSPRTestCase):
    """Test all descriptor sets in all data sets."""

    def setUp(self):
        """Run the base set-up and initialise the test paths."""
        super().setUp()
        self.setUpPaths()

    @parameterized.expand(
        [
            (
                f"{descriptor}_{TargetTasks.REGRESSION}",
                descriptor,
                [{"name": "CL", "task": TargetTasks.REGRESSION}],
            )
            for descriptor in DataSetsPathMixIn.getAllDescriptors()
        ]
    )
    def testDescriptorsAll(self, _, desc_set, target_props):
        """Tests all available descriptor sets.

        Each descriptor set is only exercised with the default preparation
        settings returned by `DataSetsPathMixIn.getDefaultPrep()`, not with
        every possible configuration. The tested sets are the ones listed by
        `DataSetsPathMixIn.getAllDescriptors()`; add a descriptor there to have
        it covered by this test.
        """
        np.random.seed(42)
        ds_name = self.getDatSetName(desc_set, target_props)
        ds = self.createLargeTestDataSet(
            name=ds_name,
            target_props=target_props,
            n_jobs=self.nCPU,
            chunk_size=None,
        )
        default_prep = self.getDefaultPrep()
        self.checkDataSetContainsDescriptorSet(
            ds, desc_set, default_prep, target_props
        )