Source code for qsprpred.data.storage.tests

import os
import shutil
from abc import ABC, abstractmethod
from unittest import TestCase

import pandas as pd
from rdkit import Chem
from rdkit.Chem.rdDistGeom import EmbedMultipleConfs

from qsprpred.data.chem.identifiers import InchiIdentifier
from qsprpred.data.chem.standardizers.check_smiles import CheckSmilesValid
from qsprpred.data.chem.standardizers.papyrus import PapyrusStandardizer
from qsprpred.data.storage.interfaces.chem_store import ChemStore
from qsprpred.data.storage.tabular.hierarchical import PandasRepresentationStore, \
    RepresentationMol
from qsprpred.data.storage.tabular.simple import PandasChemStore


class StorageTest(ABC):
    """Shared fixture logic for the storage test cases.

    Concrete test cases combine this mixin with ``unittest.TestCase`` and
    implement `getStorage` to supply the store under test.
    """

    def setUp(self):
        """Resolve fixture paths and start from an empty output directory."""
        test_dir = os.path.join(os.path.dirname(__file__), "test_files")
        output_dir = os.path.join(test_dir, "output")
        self.testDir = test_dir
        self.outputPath = output_dir
        # wipe whatever a previous run left behind before recreating the folder
        if os.path.exists(output_dir):
            shutil.rmtree(output_dir)
        os.makedirs(output_dir, exist_ok=True)
        # example tables used as input by the tests
        self.exampleFileBasic = os.path.join(test_dir, "example_table_default.csv")
        self.exampleFileIndex = os.path.join(test_dir, "example_table_index.csv")

    def tearDown(self):
        """Delete the output directory if the test left one behind."""
        if not os.path.exists(self.outputPath):
            return
        shutil.rmtree(self.outputPath)

    @abstractmethod
    def getStorage(self) -> ChemStore:
        """Return the storage instance that the test case operates on."""


class TabularStorageTest(StorageTest, TestCase):
    """Tests for `PandasChemStore`: creation, persistence, adding molecules,
    processing, subsetting and searching."""

    def getStorage(self) -> PandasChemStore:
        """Create a store from the basic example table and attach a second
        library (``<store name>_2``) built from the indexed example table.

        Returns:
            PandasChemStore: the populated store written under ``self.outputPath``.
        """
        store = PandasChemStore(
            f"{self.__class__.__name__}_test_basic",
            self.outputPath,
            pd.read_csv(self.exampleFileBasic),
            standardizer=PapyrusStandardizer(),
            identifier=InchiIdentifier(),
        )
        store.addLibrary(
            f"{store.name}_2",
            pd.read_csv(self.exampleFileIndex),
            smiles_col="smiles",
        )
        return store

    def _assertStoresMatch(self, actual, expected):
        """Assert that two stores agree on library count, size and SMILES order.

        Args:
            actual: the store being checked.
            expected: the store it is compared against.
        """
        self.assertEqual(actual.nLibs, expected.nLibs)
        self.assertEqual(len(actual), len(expected))
        self.assertListEqual(list(actual.smiles), list(expected.smiles))

    def checkSerialization(self, store):
        """Save ``store`` and verify that it can be restored consistently.

        The store is saved, re-created by name and from its meta file, and
        finally modified without saving to check that ``reload()`` brings it
        back in line with the saved copy.

        Args:
            store: the `PandasChemStore` to check. After this call the store
                has been saved and reloaded.
        """
        store.save()
        # create new and check consistency
        store2 = PandasChemStore(store.name, self.outputPath)
        self._assertStoresMatch(store2, store)
        # create from meta file and check consistency
        store2 = PandasChemStore.fromFile(store.metaFile)
        self._assertStoresMatch(store2, store)
        # add a molecule WITHOUT saving; after reload() the store is expected
        # to match the copy loaded from disk again (i.e. the addition is gone)
        len_before = len(store)
        added = store.addMols(
            ["CN1[C@H]2CC[C@@H]1[C@@H](C(OC)=O)[C@@H](OC(C3=CC=CC=C3)=O)C2"],
        )
        self.assertEqual(len(added), 1)
        self.assertEqual(len(store), len_before + 1)
        store.reload()
        self._assertStoresMatch(store, store2)
        # the data frame view must have one row per stored molecule
        self.assertEqual(len(store), len(store.getDF()))
        self.assertEqual(len(store2), len(store2.getDF()))

    def testInitsAndSaves(self):
        """Check the different ways of creating a store and that each of them
        survives a save/load round trip."""
        # test default
        store_default = PandasChemStore(
            f"{self.__class__.__name__}_test_basic",
            self.outputPath,
            pd.read_csv(self.exampleFileBasic),
            standardizer=PapyrusStandardizer(),
            identifier=InchiIdentifier(),
        )
        self.assertEqual(store_default.nLibs, 1)
        self.assertEqual(len(store_default), 2)
        self.checkSerialization(store_default)
        # adding a library under an already used name must fail
        # (presumably "<name>_library" is the library created on init -- confirm)
        duplicate_df = pd.read_csv(self.exampleFileBasic)
        with self.assertRaises(ValueError):
            store_default.addLibrary(
                f"{store_default.name}_library",
                duplicate_df,
            )
        # add a library with duplicated molecules
        store_default.addLibrary(
            f"{store_default.name}_2",
            pd.read_csv(self.exampleFileBasic),
        )
        self.assertEqual(store_default.nLibs, 2)
        self.assertEqual(len(store_default), 2)
        self.checkSerialization(store_default)
        # add a new library with additional compounds
        store_default.addLibrary(
            f"{store_default.name}_3",
            pd.read_csv(self.exampleFileIndex),
            smiles_col="smiles",
        )
        self.assertEqual(store_default.nLibs, 3)
        self.assertEqual(len(store_default), 3)
        self.checkSerialization(store_default)
        # test empty init
        store_empty = PandasChemStore(
            f"{self.__class__.__name__}_test_empty",
            self.outputPath,
            standardizer=PapyrusStandardizer(),
            identifier=InchiIdentifier(),
        )
        self.assertEqual(store_empty.nLibs, 1)
        self.assertEqual(len(store_empty), 0)
        self.checkSerialization(store_empty)
        # test with defaults
        df = pd.read_csv(self.exampleFileIndex)
        store_default = PandasChemStore(
            f"{self.__class__.__name__}_test_default",
            self.outputPath,
            df,
            smiles_col="smiles",
        )
        self.assertEqual(store_default.nLibs, 1)
        self.assertEqual(len(store_default), len(df))
        self.checkSerialization(store_default)
        # try from DF; keep the returned store so that the assertions below
        # exercise the fromDF result and not the previously created store
        df = pd.read_csv(self.exampleFileIndex)
        store_df = PandasChemStore.fromDF(
            df,
            name=f"{self.__class__.__name__}_test_default_df",
            path=self.outputPath,
            smiles_col="smiles",
        )
        self.assertEqual(store_df.nLibs, 1)
        self.assertEqual(len(store_df), len(df))
        self.checkSerialization(store_df)

    def testAddMols(self):
        """Check adding molecules with and without properties and that the
        store stays serializable after each step."""
        store = self.getStorage()
        len_before = len(store)
        added = store.addMols(["O=C(OCCN(CC)CC)c1ccc(N)cc1"], )
        self.assertEqual(len(added), 1)
        self.assertEqual(len(store), len_before + 1)
        self.checkSerialization(store)
        # add to the second library; getStorage() already created it, so the
        # number of libraries is expected to stay at 2
        store.addMols(
            ["O=C(OC(C)CN(CC)CC)c1ccc(N)cc1"],
            library=f"{store.name}_2",
        )
        self.assertEqual(store.nLibs, 2)
        self.assertEqual(len(store), len_before + 2)
        self.checkSerialization(store)
        # add with new properties
        mols = store.addMols(
            ["O=C(OC(CCC)CN(CC)CC)c1ccc(N)cc1", "O=C(OC(CCC)CN(CC)CC)c1ccc(N)cc1C"],
            props={"new_prop": [1, 2]},
        )
        self.assertEqual(len(mols), 2)
        for idx, mol in enumerate(mols):
            self.assertIn("new_prop", mol.props)
            self.assertEqual(mol.props["new_prop"], idx + 1)
        # the new property must be present on every molecule, old ones included
        for mol in store:
            self.assertIn("new_prop", mol.props)
        self.assertEqual(len(store), len_before + 4)
        self.checkSerialization(store)
        # add with existing properties
        mols = store.addMols(
            [
                "O=C(OC(CCC)CN(CC)CC)c1ccc(N)cc1C(C)C",
                "O=C(OC(CCC)CN(C(C)C)CC)c1ccc(N)cc1C",
            ],
            props={
                "TestProp1": [3, 4],
                "TestProp2": [5, 6]
            },
        )
        self.assertEqual(len(mols), 2)
        for mol in store:
            self.assertIn("TestProp1", mol.props)
            self.assertIn("TestProp2", mol.props)
        self.assertEqual(len(store), len_before + 6)
        self.checkSerialization(store)
        # add with existing properties and new ones
        mols = store.addMols(
            [
                "O=C(OC(C(O)C)CN(CC)CC)c1ccc(N)cc1C(C)C",
                "O=C(OC(CC(N)C)CN(C(C)C)CC)c1ccc(N)cc1C",
            ],
            props={
                "TestProp1": [3, 4],
                "TestProp2": [5, 6],
                "new_prop": [7, 8]
            },
        )
        self.assertEqual(len(mols), 2)
        for mol in store:
            self.assertIn("TestProp1", mol.props)
            self.assertIn("TestProp2", mol.props)
            self.assertIn("new_prop", mol.props)
        self.assertEqual(len(store), len_before + 8)
        self.checkSerialization(store)

    def testMolProcess(self):
        """Run a molecule processor over the store, serially and with two jobs,
        and check that every molecule gets a truthy result."""
        store = self.getStorage()
        result = pd.concat(list(store.processMols(CheckSmilesValid())))
        self.assertEqual(len(result), len(store))
        self.assertTrue(all(result))
        for idx in result.index:
            self.assertIn(idx, store)
        # test with parallel
        store.nJobs = 2
        result = list(store.processMols(CheckSmilesValid()))
        # three result chunks are expected here; how the store splits the data
        # into chunks is not visible from this test
        self.assertEqual(len(result), 3)
        result = pd.concat(result)
        self.assertEqual(len(result), len(store))
        self.assertTrue(all(result))
        for idx in result.index:
            self.assertIn(idx, store)

    def testSubsetting(self):
        """Check property subsets over all molecules and over selected IDs."""
        store = self.getStorage()
        mol_1 = next(iter(store))
        mol_2 = list(store)[1]
        # subset of properties only: all molecules are kept
        subset = store.getSubset(["TestProp1", "ExtraIndexColumn"])
        self.assertEqual(len(subset), len(store))
        for mol in [mol_1, mol_2]:
            self.assertIn(mol.id, subset)
            self.assertIn("TestProp1", subset[mol.id].props)
            self.assertIn("ExtraIndexColumn", subset[mol.id].props)
        # subset of properties and molecule IDs
        subset = store.getSubset(
            ["TestProp1", "ExtraIndexColumn"], [mol_2.id, mol_1.id]
        )
        self.assertEqual(len(subset), 2)
        for mol in [mol_1, mol_2]:
            self.assertIn(mol.id, subset)
            self.assertIn("TestProp1", subset[mol.id].props)
            self.assertIn("ExtraIndexColumn", subset[mol.id].props)

    def testSearch(self):
        """Check searching by property value and by SMARTS pattern, and removal
        of a molecule from a search result."""
        # by property
        store = self.getStorage()
        result = store.searchOnProperty("TestProp1", [1.0])
        result = result.getProperty("TestProp1")
        self.assertTrue(all(result == 1.0))
        # using a string
        result = store.searchOnProperty("ExtraIndexColumn", ["Molecule8"])
        result = result.getProperty("ExtraIndexColumn")
        self.assertTrue(all(result == "Molecule8"))
        self.assertEqual(len(result), 1)
        # find non-existing
        result = store.searchOnProperty("ExtraIndexColumn", ["MoleculeX"])
        self.assertEqual(len(result), 0)
        # by SMARTS
        result = store.searchWithSMARTS(
            ["N[C@H]"],
            name="test_smarts",
            use_chirality=True,
        )
        self.assertEqual(len(result), len(store) - 1)
        # using non-chiral match
        result = store.searchWithSMARTS(
            ["N[C@H]"],
            name="test_smarts",
            use_chirality=False,
        )
        self.assertEqual(len(result), len(store))
        # using multiple patterns
        result = store.searchWithSMARTS(
            ["N[C@H]", "C1CCC1"],
            name="test_smarts",
            operator="and",
            use_chirality=True,
        )
        self.assertEqual(len(result), 0)
        result = store.searchWithSMARTS(
            ["N[C@H]", "C1CCC1"],
            name="test_smarts",
            operator="and",
            use_chirality=False,
        )
        self.assertEqual(len(result), 1)
        # get single molecule and check that it has all the props
        result_mol = next(iter(result))
        result_mol = store.getMol(result_mol.id)
        for prop in store.getProperties():
            self.assertIn(prop, result_mol.props)
        # drop it and check that it is not there
        result.removeMol(result_mol.id)
        self.assertNotIn(result_mol.id, result)
        self.assertEqual(len(result), 0)


class TabularRepresentationStorageTest(StorageTest, TestCase):
    """Tests for `PandasRepresentationStore` using conformers generated with
    RDKit as the stored representations."""

    def setUp(self):
        """Run the shared fixture setup and build the parent chemical store
        (kept in ``self.main``) that the representation store is attached to."""
        super().setUp()
        main_store = PandasChemStore(
            f"{self.__class__.__name__}_test_basic_main",
            self.outputPath,
            pd.read_csv(self.exampleFileBasic),
            standardizer=PapyrusStandardizer(),
            identifier=InchiIdentifier(),
        )
        second_library = pd.read_csv(self.exampleFileIndex)
        main_store.addLibrary(
            f"{main_store.name}_2",
            second_library,
            smiles_col="smiles",
        )
        self.main = main_store

    def getStorage(self) -> PandasRepresentationStore:
        """Return a new representation store backed by ``self.main``."""
        store_name = f"{self.__class__.__name__}_test_basic"
        return PandasRepresentationStore(
            store_name,
            path=self.outputPath,
            chem_store=self.main,
        )

    def addConformers(self, store):
        """Embed conformers for every molecule of ``self.main`` and add them to
        ``store`` as representations.

        Args:
            store: the representation store that receives the conformers.
        """
        parents = []
        mol_blocks = []
        parent_smiles = []
        for parent in self.main:
            # embedding is done on the molecule with explicit hydrogens
            with_hs = Chem.AddHs(parent.as_rd_mol())
            EmbedMultipleConfs(
                with_hs,
                numConfs=3,
                randomSeed=42,
                pruneRmsThresh=0.5,
            )
            # one record (parent ID, SMILES, mol block) per embedded conformer
            for conformer in with_hs.GetConformers():
                parents.append(parent.id)
                parent_smiles.append(parent.smiles)
                mol_blocks.append(
                    Chem.MolToMolBlock(with_hs, confId=conformer.GetId())
                )
        # register the conformers as representations of their parents
        conformer_props = {
            "parent_id": parents,
            "sdf": mol_blocks,
        }
        store.addMols(parent_smiles, conformer_props)
        self.assertEqual(len(store.representations), len(parents))

    def testAddConformers(self):
        """Every molecule must expose representations that convert to RDKit
        molecules and that have no representations of their own."""
        store = self.getStorage()
        self.addConformers(store)
        for stored_mol in store:
            self.assertTrue(stored_mol.representations)
            for rep in stored_mol.representations:
                rd_mol = rep.as_rd_mol()
                self.assertTrue(rd_mol)
                self.assertFalse(rep.representations)

    @staticmethod
    def check_representations(mols):
        """Convert all representations of the given molecules to RDKit molecules.

        Args:
            mols: iterable of molecules exposing ``representations``.

        Returns:
            list: one RDKit molecule per representation, in iteration order.
        """
        converted = []
        for mol in mols:
            for rep in mol.representations:
                assert isinstance(rep, RepresentationMol)
                converted.append(rep.as_rd_mol())
        return converted

    def testParallel(self):
        """Apply `check_representations` with two jobs and validate the output."""
        store = self.getStorage()
        self.addConformers(store)
        # each yielded batch must only contain valid RDKit molecules
        store.nJobs = 2
        for batch in store.apply(self.check_representations):
            for rd_mol in batch:
                self.assertIsInstance(rd_mol, Chem.Mol)
                self.assertTrue(rd_mol)