Source code for drugex.training.scorers.properties

"""
properties

Created by: Martin Sicho
On: 06.06.22, 20:17
"""
import re
import tqdm

import numpy as np
from typing import List

from rdkit import Chem
from rdkit.Chem.QED import qed
from rdkit.Chem.GraphDescriptors import BertzCT
from rdkit.Chem.Fraggle import FraggleSim
from rdkit.Chem import Descriptors as desc, Crippen, AllChem, Lipinski

from drugex.training.scorers.interfaces import Scorer
from drugex.training.scorers.sascorer import calculateScore
from drugex.training.scorers.modifiers import Gaussian

[docs]class Property(Scorer): def __init__(self, prop='MW', modifier=None): super().__init__(modifier) self.prop = prop self.prop_dict = {'MW': desc.MolWt, 'logP': Crippen.MolLogP, 'HBA': AllChem.CalcNumLipinskiHBA, 'HBD': AllChem.CalcNumLipinskiHBD, 'Rotable': AllChem.CalcNumRotatableBonds, 'Amide': AllChem.CalcNumAmideBonds, 'Bridge': AllChem.CalcNumBridgeheadAtoms, 'Hetero': AllChem.CalcNumHeteroatoms, 'Heavy': Lipinski.HeavyAtomCount, 'Spiro': AllChem.CalcNumSpiroAtoms, 'FCSP3': AllChem.CalcFractionCSP3, 'Ring': Lipinski.RingCount, 'Aliphatic': AllChem.CalcNumAliphaticRings, 'Aromatic': AllChem.CalcNumAromaticRings, 'Saturated': AllChem.CalcNumSaturatedRings, 'HeteroR': AllChem.CalcNumHeterocycles, 'TPSA': AllChem.CalcTPSA, 'Valence': desc.NumValenceElectrons, 'MR': Crippen.MolMR, 'QED': qed, 'SA': calculateScore, 'Bertz': BertzCT}
[docs] def getScores(self, mols, frags=None): scores = np.zeros(len(mols)) for i, mol in enumerate(mols): try: scores[i] = self.prop_dict[self.prop](mol) except: continue return scores
[docs] def getKey(self): return self.prop
[docs]class AtomCounter(Scorer): def __init__(self, element: str, modifier=None) -> None: """ Initialize the AtomCounter scorer. Parameters ---------- element : str The element to count within the molecules. modifier : ScoreModifier, optional A `ScoreModifier` object to modify the scores, by default None. """ super().__init__(modifier) self.element = element
[docs] def getScores(self, mols, frags=None): """ Count the number of atoms of a given type in the molecules. Parameters ---------- mols : list of rdkit molecules The molecules to score. frags : list of rdkit molecules, optional The fragments used to generate the molecules, by default None. Returns ------- scores : np.array The scores for the molecules. """ # if the molecule contains H atoms, they may be implicit, so add them scores = np.zeros(len(mols)) for i, mol in enumerate(mols): try: if self.element in ['', 'H']: mol = Chem.AddHs(mol) if self.element == '': scores[i] = len(mol.GetAtoms()) else: scores[i] = sum(1 for a in mol.GetAtoms() if a.GetSymbol() == self.element) except: continue return scores
[docs] def getKey(self): return f"AtomCounter (element={self.element})"
[docs]class Isomer(Scorer): """ Scoring function for closeness to a molecular formula. The score penalizes deviations from the required number of atoms for each element type, and for the total number of atoms. F.i., if the target formula is C2H4, the scoring function is the average of three contributions: - number of C atoms with a Gaussian modifier with mu=2, sigma=1 - number of H atoms with a Gaussian modifier with mu=4, sigma=1 - total number of atoms with a Gaussian modifier with mu=6, sigma=2 """ def __init__(self, formula: str, mean_func='geometric', modifier=None) -> None: """ Initialize the Isomer scorer. Parameters ---------- formula : str The molecular formula to score against. mean_func : str, optional Which function to use for averaging the scores ('arithmetic' or 'geometric'), by default 'geometric' modifier : ScoreModifier, optional A `ScoreModifier` object to modify the scores, by default None. """ super().__init__(modifier) self.objs, self.mods = self.scoring_functions(formula) self.mean_func = mean_func
[docs] @staticmethod def parse_molecular_formula(formula: str): """ Parse a molecular formulat to get the element types and counts. Parameters ---------- formula : str The molecular formula to parse. Returns ------- results : list of tuples A list of tuples containing element types and number of occurrences. """ matches = re.findall(r'([A-Z][a-z]*)(\d*)', formula) # Convert matches to the required format results = [] for match in matches: # convert count to an integer, and set it to 1 if the count is not visible in the molecular formula count = 1 if not match[1] else int(match[1]) results.append((match[0], count)) return results
[docs] def scoring_functions(self, formula: str): """ Create the scoring functions for the molecular formula. Parameters ---------- formula : str The molecular formula to score against. Returns ------- objs : list of Scorer objects The scoring functions for each element type. mods : list of ScoreModifier objects The modifiers for each scoring function. """ element_occurrences = self.parse_molecular_formula(formula) total_n_atoms = sum(element_tuple[1] for element_tuple in element_occurrences) # scoring functions for each element objs = [AtomCounter(element) for element, n_atoms in element_occurrences] mods = [Gaussian(mu=n_atoms, sigma=1.0) for element, n_atoms in element_occurrences] # scoring functions for the total number of atoms objs.append(AtomCounter('')) mods.append(Gaussian(mu=total_n_atoms, sigma=2.0)) return objs, mods
[docs] def getScores(self, mols: list, frags=None) -> np.array: """ Get the scores for the molecules. Parameters ---------- mols : list of rdkit molecules The molecules to score. frags : list of rdkit molecules, optional The fragments used to generate the molecules, by default None. Returns ------- scores : np.array The scores for the molecules. """ # return the average of all scoring functions score = np.array([self.mods[i](obj(mols)) for i, obj in enumerate(self.objs)]) scores = score.prod(axis=0) ** (1.0 / len(score)) if self.mean_func == 'geometric' else np.mean(score, axis=0) return scores
[docs] def getKey(self): return f"Isomer (mean_func={self.mean_func})"
[docs]class Scaffold(Scorer): def __init__(self, smart, is_match, modifier=None): """ Initialize the Scaffold scorer. Parameters ---------- smart : str The SMARTS pattern to match. is_match : bool Whether the SMARTS pattern should be matched or not. modifier : ScoreModifier, optional A `ScoreModifier` object to modify the scores, by default None. """ super().__init__(modifier) self.smart = smart self.frag = Chem.MolFromSmarts(smart) self.is_match = is_match
[docs] def getScores(self, mols, frags=None): """ Get the scores for the molecules. Parameters ---------- mols : list of rdkit molecules The molecules to score. frags : list of rdkit molecules, optional The fragments used to generate the molecules, by default None. Returns ------- scores : np.array The scores for the molecules. """ scores = np.zeros(len(mols)) for i, mol in enumerate(tqdm.tqdm(mols)): try: match = mol.HasSubstructMatch(self.frag) scores[i] = (match == self.is_match) except: continue return scores
[docs] def getKey(self): return f"Scaffold(smart={self.smart},is_match={self.is_match})"
[docs]class Uniqueness(Scorer): """ Calculates the ratio of occurence of a molecule in a set of molecules """ def __init__(self, modifier=None): super().__init__(modifier)
[docs] def getScores(self, mols : List[str], frags=None): scores = np.zeros(len(mols)) for i, mol in enumerate(mols): scores[i] = (mols.count(mol)-1) / (len(mols)-1) return scores
[docs] def getKey(self): return "Unique"
[docs]class LipophilicEfficiency(Scorer): """ Calculates the lipophilic efficiency of a molecule: LiPE = pChEMBL value - logP """ def __init__(self, qsar_scorer, modifier=None): super().__init__(modifier) self.qsar_scorer = qsar_scorer self.key = f'LipE_{qsar_scorer.getKey()}'
[docs] def getScores(self, mols : List[str], frags=None): pChEMBL = self.qsar_scorer.getScores(mols) logP = Property('logP').getScores(mols) scores = pChEMBL - logP return scores
[docs] def getKey(self): return self.key
[docs]class LigandEfficiency(Scorer): """ Calculates the ligand efficiency of a molecule: LE = 1.4 * pChEMBL / nAtoms """ def __init__(self, qsar_scorer, modifier=None): super().__init__(modifier) self.qsar_scorer = qsar_scorer self.key = f'LE_{qsar_scorer.getKey()}'
[docs] def getScores(self, mols : List[str], frags=None): pChEMBL = self.qsar_scorer.getScores(mols) nAtoms = [mol.GerNumAtoms() for mol in mols] scores = 1.4 * pChEMBL / nAtoms return scores
[docs] def getKey(self): return self.key