Source code for drugex.training.scorers.similarity

"""
similarity

Created by: Sohvi Luukkonen
On: 07.10.22, 15:05
"""
import tqdm
import networkx

import numpy as np

from rdkit import Chem, DataStructs
from rdkit.Chem.Fraggle import FraggleSim
from rdkit.Chem import Descriptors as  rdFMCS

from drugex.utils.fingerprints import get_fingerprint
from drugex.training.scorers.interfaces import Scorer

class TverskyFingerprintSimilarity(Scorer):
    """
    Scoring function for similarity to a reference molecule.

    Tversky similarity between fingerprints. If both alpha and beta are set to 1,
    reduces to Tanimoto similarity.
    """

    def __init__(self, smiles: str, fp_type: str, alpha: float = 1., beta: float = 1., modifier=None):
        """
        Initialize the TverskyFingerprintSimilarity scorer.

        Parameters
        ----------
        smiles : str
            The SMILES string of the reference molecule.
        fp_type : str
            The type of fingerprint to use.
        alpha : float, optional
            The weight of the features of the reference compound.
        beta : float, optional
            The weight of the features of the compound to be scored.
        modifier : ScorerModifier, optional
            A modifier that can be used to modify the scores returned by this scorer.
        """
        super().__init__(modifier)
        self.smiles = smiles
        self.mol = Chem.MolFromSmiles(smiles)
        self.fp_type = fp_type
        # The reference fingerprint is computed once and reused for every scored molecule.
        self.fp = get_fingerprint(self.mol, fp_type=fp_type)
        self.alpha = alpha
        self.beta = beta

    def getScores(self, mols, frags=None):
        """
        Get Tversky similarity scores for a list of molecules.

        Parameters
        ----------
        mols : list
            The molecules to score. Each item is handed to ``get_fingerprint`` in the
            same way as the reference RDKit molecule in ``__init__``.
            NOTE(review): presumably RDKit molecules rather than SMILES strings --
            confirm against the caller.
        frags : list, optional
            A list of fragments used to generate the molecules. This is not used by this scorer.

        Returns
        -------
        scores : np.array
            One score per input molecule. Molecules whose fingerprint or similarity
            computation fails keep the default score of 0.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(tqdm.tqdm(mols)):
            try:
                fp = get_fingerprint(mol, fp_type=self.fp_type)
                scores[i] = DataStructs.TverskySimilarity(self.fp, fp, self.alpha, self.beta)
            except Exception:
                # Best effort: a failing molecule scores 0. `Exception` (rather than a
                # bare `except`) keeps KeyboardInterrupt/SystemExit working.
                continue
        return scores

    def getKey(self):
        """Return a descriptive identifier of this scorer and its settings."""
        return f"Fingerprint similarity (fp_type={self.fp_type}, Tversky weights={self.alpha},{self.beta}, smiles={self.smiles})"
class TverskyGraphSimilarity(Scorer):
    """
    Scoring function for similarity to a reference molecule.

    Tversky similarity between graphs, based on the maximum common substructure (MCS)
    of the reference and the scored molecule. If both alpha and beta are set to 1,
    reduces to Tanimoto similarity.
    """

    def __init__(self, smiles: str, alpha: float = 1., beta: float = 1., modifier=None):
        """
        Initialize the TverskyGraphSimilarity scorer.

        Parameters
        ----------
        smiles : str
            The SMILES string of the reference molecule.
        alpha : float, optional
            The weight of the features of the reference molecule, by default 1.
        beta : float, optional
            The weight of the features of the compound to be scored, by default 1.
        modifier : ScorerModifier, optional
            A ScorerModifier object to modify the scores, by default None.
        """
        super().__init__(modifier)
        self.smiles = smiles
        self.mol = Chem.MolFromSmiles(smiles)
        self.alpha = alpha
        self.beta = beta

    def getScores(self, mols, frags=None):
        """
        Calculate the Tversky graph similarity scores for a list of molecules.

        For each molecule the MCS with the reference is computed and the score is
        ``nmcs / (nmcs + alpha * nref + beta * nmol)``, where ``nmcs`` is the number of
        atoms in the MCS and ``nref`` / ``nmol`` are the atoms of the reference / scored
        molecule that are not part of the MCS.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to be scored.
        frags : list of rdkit molecules, optional
            The fragments used to generate the molecules, by default None.
            Not used by this scorer.

        Returns
        -------
        scores : np.array
            The scores for the molecules. Molecules for which the computation fails
            keep the default score of 0.
        """
        # Function-scope import of the real MCS module: at module level the name
        # `rdFMCS` is bound to `rdkit.Chem.Descriptors`, which has no `FindMCS`.
        from rdkit.Chem import rdFMCS

        scores = np.zeros(len(mols))
        for i, mol in enumerate(tqdm.tqdm(mols)):
            try:
                # The MCS must be taken pairwise between the reference and the current
                # molecule, not over the whole batch.
                mcs = rdFMCS.FindMCS([self.mol, mol])
                nmcs = mcs.numAtoms
                nref = self.mol.GetNumAtoms() - nmcs
                nmol = mol.GetNumAtoms() - nmcs
                scores[i] = nmcs / (nmcs + self.alpha * nref + self.beta * nmol)
            except Exception:
                # Best effort: a failing molecule (including a zero denominator)
                # scores 0. `Exception` keeps KeyboardInterrupt/SystemExit working.
                continue
        return scores

    def getKey(self):
        """Return a descriptive identifier of this scorer and its settings."""
        return f"Graph similarity (Tversky weights={self.alpha},{self.beta}, smiles={self.smiles})"
class FraggleSimilarity(Scorer):
    """
    Scoring function for similarity to a reference molecule.

    Fraggle similarity, using RDKit's implementation of the Fraggle similarity
    algorithm developed at GSK and described in this RDKit UGM presentation:
    https://github.com/rdkit/UGM_2013/blob/master/Presentations/Hussain.Fraggle.pdf
    """

    def __init__(self, smiles: str, trevsky_th: float = 0.8, modifier=None):
        """
        Initialize the Fraggle similarity scorer.

        Parameters
        ----------
        smiles : str
            SMILES string of the reference compound.
        trevsky_th : float, optional
            Tversky threshold used by Fraggle, by default 0.8.
            (The parameter name keeps its historical spelling for backward compatibility.)
        modifier : ScoreModifier, optional
            Score modifier to be applied to the scores, by default None
        """
        super().__init__(modifier)
        self.smiles = smiles
        self.mol = Chem.MolFromSmiles(smiles)
        self.th = trevsky_th

    def getScores(self, mols, frags=None):
        """
        Calculate the Fraggle similarity scores for a list of molecules.

        Parameters
        ----------
        mols : list of rdkit molecules
            List of molecules to be scored.
        frags : list of rdkit molecules, optional
            List of fragments used to generate molecules. Not used in this scorer, by default None

        Returns
        -------
        scores : np.array
            Array of scores. Molecules for which the computation fails keep the
            default score of 0.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(tqdm.tqdm(mols)):
            try:
                # GetFraggleSimilarity returns a (similarity, best fragment) tuple;
                # only the similarity is a score. The configured threshold is passed
                # explicitly so that `trevsky_th` actually takes effect.
                scores[i] = FraggleSim.GetFraggleSimilarity(self.mol, mol, tverskyThresh=self.th)[0]
            except Exception:
                # Best effort: a failing molecule scores 0. `Exception` keeps
                # KeyboardInterrupt/SystemExit working.
                continue
        return scores

    def getKey(self):
        """Return a descriptive identifier of this scorer and its settings."""
        return f"Fraggle similarity (Tversky threshold={self.th}, smiles={self.smiles})"
class GraphEditInverseDistance(Scorer):
    """
    Scoring function for similarity to a reference molecule.

    Inverse of the square root of the graph edit distance between two molecular graphs.

    WARNING: Extremely slow!
    TODO : See, if possible to speed up
    """

    def __init__(self, smiles, modifier=None):
        """
        Initialize the GraphEditInverseDistance scorer.

        Parameters
        ----------
        smiles : str
            The SMILES string of the reference molecule.
        modifier : ScorerModifier, optional
            A modifier applied to the scores returned by this scorer, by default None.
        """
        super().__init__(modifier)
        # Kept so that getKey() can identify the reference compound.
        self.smiles = smiles
        self.mol = Chem.MolFromSmiles(smiles)
        self.graph = self.get_graph(self.mol)

    def get_graph(self, mol):
        """
        Convert an RDKit molecule into a weighted networkx graph.

        Edge weights are the bond orders of the kekulized molecule; each atom
        additionally gets a self-loop whose weight is its atomic number.

        Parameters
        ----------
        mol : rdkit molecule
            The molecule to convert. It is not modified.

        Returns
        -------
        G : networkx.Graph
            Graph built from the bond-order adjacency matrix.
        """
        # Kekulize a copy: Chem.Kekulize works in place and would otherwise alter
        # the caller's molecule.
        mol = Chem.Mol(mol)
        Chem.Kekulize(mol)
        atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
        am = Chem.GetAdjacencyMatrix(mol, useBO=True)
        # Encode the element on the diagonal so it ends up as a self-loop weight.
        for i, atom in enumerate(atoms):
            am[i, i] = atom
        # `from_numpy_matrix` was removed in networkx 3.0; prefer `from_numpy_array`
        # and fall back only on installations that lack it.
        from_array = getattr(networkx, "from_numpy_array", None) or networkx.from_numpy_matrix
        return from_array(am)

    def getScores(self, mols, frags=None):
        """
        Calculate the inverse graph edit distance scores for a list of molecules.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to be scored.
        frags : list of rdkit molecules, optional
            The fragments used to generate the molecules. Not used by this scorer.

        Returns
        -------
        scores : np.array
            ``1 / sqrt(distance)`` per molecule. Molecules for which the computation
            fails keep the default score of 0.
            NOTE(review): a distance of 0 (graph identical to the reference) yields an
            infinite score -- confirm this is acceptable downstream.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(tqdm.tqdm(mols)):
            try:
                graph = self.get_graph(mol)
                # The generator yields successively better approximations; the last
                # one is used. Reset per molecule so that a value from the previous
                # molecule can never be reused.
                dist = None
                for dist in networkx.optimize_graph_edit_distance(
                    self.graph, graph, edge_match=lambda a, b: a['weight'] == b['weight']
                ):
                    pass
                if dist is None:
                    continue
                scores[i] = 1 / np.sqrt(dist)
            except Exception:
                # Best effort: a failing molecule scores 0. `Exception` keeps
                # KeyboardInterrupt/SystemExit working.
                continue
        return scores

    def getKey(self):
        """Return a descriptive identifier of this scorer and its settings."""
        return f"Graph edit inverse distance (smiles={self.smiles})"