"""
properties
Created by: Martin Sicho
On: 06.06.22, 20:17
"""
import re
import tqdm
import numpy as np
from typing import List
from rdkit import Chem
from rdkit.Chem.QED import qed
from rdkit.Chem.GraphDescriptors import BertzCT
from rdkit.Chem.Fraggle import FraggleSim
from rdkit.Chem import Descriptors as desc, Crippen, AllChem, Lipinski
from drugex.training.scorers.interfaces import Scorer
from drugex.training.scorers.sascorer import calculateScore
from drugex.training.scorers.modifiers import Gaussian
class Property(Scorer):
    """
    Scorer that evaluates one named molecular property per molecule.

    The property name selects a callable from ``prop_dict``; each callable is applied
    to a molecule and its return value is stored as the score.
    """

    def __init__(self, prop='MW', modifier=None):
        """
        Initialize the Property scorer.

        Parameters
        ----------
        prop : str, optional
            Key into ``prop_dict`` selecting the property to calculate, by default 'MW'.
        modifier : ScoreModifier, optional
            Forwarded unchanged to ``Scorer.__init__``, by default None.
        """
        super().__init__(modifier)
        self.prop = prop
        # property name -> callable taking a single molecule
        self.prop_dict = {
            'MW': desc.MolWt,
            'logP': Crippen.MolLogP,
            'HBA': AllChem.CalcNumLipinskiHBA,
            'HBD': AllChem.CalcNumLipinskiHBD,
            'Rotable': AllChem.CalcNumRotatableBonds,
            'Amide': AllChem.CalcNumAmideBonds,
            'Bridge': AllChem.CalcNumBridgeheadAtoms,
            'Hetero': AllChem.CalcNumHeteroatoms,
            'Heavy': Lipinski.HeavyAtomCount,
            'Spiro': AllChem.CalcNumSpiroAtoms,
            'FCSP3': AllChem.CalcFractionCSP3,
            'Ring': Lipinski.RingCount,
            'Aliphatic': AllChem.CalcNumAliphaticRings,
            'Aromatic': AllChem.CalcNumAromaticRings,
            'Saturated': AllChem.CalcNumSaturatedRings,
            'HeteroR': AllChem.CalcNumHeterocycles,
            'TPSA': AllChem.CalcTPSA,
            'Valence': desc.NumValenceElectrons,
            'MR': Crippen.MolMR,
            'QED': qed,
            'SA': calculateScore,
            'Bertz': BertzCT,
        }

    def getScores(self, mols, frags=None):
        """
        Calculate the selected property for every molecule.

        Parameters
        ----------
        mols : list
            The molecules to score; each item is passed directly to the property callable.
        frags : optional
            Not used by this scorer, by default None.

        Returns
        -------
        scores : np.array
            One value per molecule. Entries stay 0 when the calculation raises for a
            molecule, which includes the case where ``self.prop`` is not a key of
            ``prop_dict``.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(mols):
            try:
                scores[i] = self.prop_dict[self.prop](mol)
            except Exception:
                # Best-effort scoring: a failing molecule keeps its default score of 0.
                # Only ``Exception`` is caught so that KeyboardInterrupt/SystemExit
                # can still stop a long scoring run.
                continue
        return scores

    def getKey(self):
        """Return the name of the property this scorer calculates."""
        return self.prop
class AtomCounter(Scorer):
    """
    Scorer that counts the atoms of one element (or all atoms) in each molecule.
    """

    def __init__(self, element: str, modifier=None) -> None:
        """
        Initialize the AtomCounter scorer.

        Parameters
        ----------
        element : str
            The element symbol to count within the molecules. The empty string ``''``
            counts every atom, hydrogens included.
        modifier : ScoreModifier, optional
            A `ScoreModifier` object to modify the scores, by default None.
        """
        super().__init__(modifier)
        self.element = element

    def getScores(self, mols, frags=None):
        """
        Count the number of atoms of a given type in the molecules.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to score.
        frags : list of rdkit molecules, optional
            The fragments used to generate the molecules, by default None. Not used here.

        Returns
        -------
        scores : np.array
            The atom counts for the molecules. Entries stay 0 for molecules on which
            the counting raises.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(mols):
            try:
                if self.element in ['', 'H']:
                    # hydrogens may be implicit in the molecule, so add them
                    # before counting H atoms or the total number of atoms
                    mol = Chem.AddHs(mol)
                if self.element == '':
                    scores[i] = len(mol.GetAtoms())
                else:
                    scores[i] = sum(1 for a in mol.GetAtoms() if a.GetSymbol() == self.element)
            except Exception:
                # Best-effort scoring: a failing molecule keeps its default score of 0.
                # Only ``Exception`` is caught so that KeyboardInterrupt/SystemExit
                # still propagate.
                continue
        return scores

    def getKey(self):
        """Return the identifier of this scorer, including the counted element."""
        return f"AtomCounter (element={self.element})"
class Isomer(Scorer):
    """
    Scoring function for closeness to a molecular formula.

    The score penalizes deviations from the required number of atoms for each element type, and for the total
    number of atoms.
    F.i., if the target formula is C2H4, the scoring function is the average of three contributions:
    - number of C atoms with a Gaussian modifier with mu=2, sigma=1
    - number of H atoms with a Gaussian modifier with mu=4, sigma=1
    - total number of atoms with a Gaussian modifier with mu=6, sigma=2
    """

    def __init__(self, formula: str, mean_func='geometric', modifier=None) -> None:
        """
        Initialize the Isomer scorer.

        Parameters
        ----------
        formula : str
            The molecular formula to score against.
        mean_func : str, optional
            Which function to use for averaging the scores ('arithmetic' or 'geometric'), by default 'geometric'.
            Any value other than 'geometric' selects the arithmetic mean.
        modifier : ScoreModifier, optional
            A `ScoreModifier` object to modify the scores, by default None.
        """
        super().__init__(modifier)
        self.objs, self.mods = self.scoring_functions(formula)
        self.mean_func = mean_func

    @staticmethod
    def parse_molecular_formula(formula: str):
        """
        Parse a molecular formula into element symbols and atom counts.

        An element symbol is an uppercase letter followed by optional lowercase letters;
        an element without a trailing number counts once. Repeated symbols
        (e.g. 'CH3CH2OH') are summed into a single entry.

        Parameters
        ----------
        formula : str
            The molecular formula to parse, e.g. 'C2H4'.

        Returns
        -------
        element_occurrences : list of (str, int) tuples
            Element symbol and atom count, in order of first appearance in the formula.
        """
        # NOTE(review): `scoring_functions` calls this helper, but no definition was
        # present in this class; if the `Scorer` base class is meant to provide one,
        # reconcile the two implementations.
        counts = {}
        for symbol, number in re.findall(r'([A-Z][a-z]*)(\d*)', formula):
            counts[symbol] = counts.get(symbol, 0) + (int(number) if number else 1)
        return list(counts.items())

    def scoring_functions(self, formula: str):
        """
        Create the scoring functions for the molecular formula.

        Parameters
        ----------
        formula : str
            The molecular formula to score against.

        Returns
        -------
        objs : list of Scorer objects
            One `AtomCounter` per element type, followed by one counting all atoms.
        mods : list of ScoreModifier objects
            The `Gaussian` modifier for each scoring function, in the same order as `objs`.
        """
        element_occurrences = self.parse_molecular_formula(formula)
        total_n_atoms = sum(n_atoms for _, n_atoms in element_occurrences)

        # scoring functions for each element
        objs = [AtomCounter(element) for element, _ in element_occurrences]
        mods = [Gaussian(mu=n_atoms, sigma=1.0) for _, n_atoms in element_occurrences]

        # scoring function for the total number of atoms
        objs.append(AtomCounter(''))
        mods.append(Gaussian(mu=total_n_atoms, sigma=2.0))
        return objs, mods

    def getScores(self, mols: list, frags=None) -> np.array:
        """
        Get the scores for the molecules.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to score.
        frags : list of rdkit molecules, optional
            The fragments used to generate the molecules, by default None. Not used here.

        Returns
        -------
        scores : np.array
            The scores for the molecules: the geometric or arithmetic mean over all
            scoring functions, depending on `mean_func`.
        """
        # one row per scoring function, one column per molecule
        score = np.array([mod(obj(mols)) for obj, mod in zip(self.objs, self.mods)])
        if self.mean_func == 'geometric':
            return score.prod(axis=0) ** (1.0 / len(score))
        return np.mean(score, axis=0)

    def getKey(self):
        """Return the identifier of this scorer, including the averaging function."""
        return f"Isomer (mean_func={self.mean_func})"
class Scaffold(Scorer):
    """
    Scorer that checks molecules for the presence (or absence) of a SMARTS pattern.
    """

    def __init__(self, smart, is_match, modifier=None):
        """
        Initialize the Scaffold scorer.

        Parameters
        ----------
        smart : str
            The SMARTS pattern to match.
        is_match : bool
            Whether the SMARTS pattern should be matched or not.
        modifier : ScoreModifier, optional
            A `ScoreModifier` object to modify the scores, by default None.
        """
        super().__init__(modifier)
        self.smart = smart
        # query molecule built once from the SMARTS pattern and reused for every match
        self.frag = Chem.MolFromSmarts(smart)
        self.is_match = is_match

    def getScores(self, mols, frags=None):
        """
        Get the scores for the molecules.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to score.
        frags : list of rdkit molecules, optional
            The fragments used to generate the molecules, by default None. Not used here.

        Returns
        -------
        scores : np.array
            The scores for the molecules: 1 where the substructure match result equals
            `is_match`, 0 otherwise. Entries stay 0 for molecules on which matching raises.
        """
        scores = np.zeros(len(mols))
        for i, mol in enumerate(tqdm.tqdm(mols)):
            try:
                match = mol.HasSubstructMatch(self.frag)
                scores[i] = (match == self.is_match)
            except Exception:
                # Best-effort scoring: a failing molecule keeps its default score of 0.
                # Only ``Exception`` is caught so that KeyboardInterrupt/SystemExit
                # still propagate.
                continue
        return scores

    def getKey(self):
        """Return the identifier of this scorer, including the pattern and match flag."""
        return f"Scaffold(smart={self.smart},is_match={self.is_match})"
class Uniqueness(Scorer):
    """
    Calculates the ratio of occurrence of a molecule in a set of molecules.

    For each molecule the score is ``(occurrences - 1) / (n - 1)`` with ``n`` the number
    of molecules: 0 for a molecule that appears once, 1 when all molecules are identical.
    """

    def __init__(self, modifier=None):
        """
        Initialize the Uniqueness scorer.

        Parameters
        ----------
        modifier : ScoreModifier, optional
            Forwarded unchanged to ``Scorer.__init__``, by default None.
        """
        super().__init__(modifier)

    def getScores(self, mols: List[str], frags=None):
        """
        Score each molecule by how often it is duplicated within `mols`.

        Parameters
        ----------
        mols : list of str
            The molecules to score; entries are compared by equality.
        frags : optional
            Not used by this scorer, by default None.

        Returns
        -------
        scores : np.array
            One duplication ratio per molecule. All zeros when `mols` holds fewer
            than two molecules, since a single molecule cannot be a duplicate.
        """
        from collections import Counter

        n_mols = len(mols)
        if n_mols < 2:
            # avoids the division by zero of (n - 1) for a single molecule
            return np.zeros(n_mols)

        try:
            # count every distinct molecule once instead of rescanning the list per molecule
            occurrences = Counter(mols)
            per_mol = [occurrences[mol] for mol in mols]
        except TypeError:
            # unhashable entries cannot be counted in a dict; compare pairwise instead
            per_mol = [mols.count(mol) for mol in mols]

        return (np.array(per_mol, dtype=float) - 1) / (n_mols - 1)

    def getKey(self):
        """Return the identifier of this scorer."""
        return "Unique"
class LipophilicEfficiency(Scorer):
    """
    Lipophilic efficiency of a molecule, defined as LiPE = pChEMBL value - logP.

    The pChEMBL values come from the wrapped scorer and the logP values from
    a `Property('logP')` scorer.
    """

    def __init__(self, qsar_scorer, modifier=None):
        """
        Initialize the LipophilicEfficiency scorer.

        Parameters
        ----------
        qsar_scorer : Scorer
            Scorer whose `getScores` output is used as the pChEMBL value; its key is
            embedded in the key of this scorer.
        modifier : ScoreModifier, optional
            Forwarded unchanged to ``Scorer.__init__``, by default None.
        """
        super().__init__(modifier)
        self.qsar_scorer = qsar_scorer
        self.key = 'LipE_{}'.format(qsar_scorer.getKey())

    def getScores(self, mols : List[str], frags=None):
        """
        Compute the lipophilic efficiency of every molecule.

        Parameters
        ----------
        mols : list
            The molecules to score; handed to both the wrapped scorer and the logP scorer.
        frags : optional
            Not used by this scorer, by default None.

        Returns
        -------
        scores : np.array
            Element-wise difference between the wrapped scorer's output and logP.
        """
        activity = self.qsar_scorer.getScores(mols)
        lipophilicity = Property('logP').getScores(mols)
        return activity - lipophilicity

    def getKey(self):
        """Return the key built from the wrapped scorer's key at construction time."""
        return self.key
class LigandEfficiency(Scorer):
    """
    Calculates the ligand efficiency of a molecule: LE = 1.4 * pChEMBL / nAtoms
    """

    def __init__(self, qsar_scorer, modifier=None):
        """
        Initialize the LigandEfficiency scorer.

        Parameters
        ----------
        qsar_scorer : Scorer
            Scorer whose `getScores` output is used as the pChEMBL value; its key is
            embedded in the key of this scorer.
        modifier : ScoreModifier, optional
            Forwarded unchanged to ``Scorer.__init__``, by default None.
        """
        super().__init__(modifier)
        self.qsar_scorer = qsar_scorer
        self.key = f'LE_{qsar_scorer.getKey()}'

    def getScores(self, mols : List[str], frags=None):
        """
        Compute the ligand efficiency of every molecule.

        Parameters
        ----------
        mols : list of rdkit molecules
            The molecules to score; each must provide `GetNumAtoms()`.
        frags : optional
            Not used by this scorer, by default None.

        Returns
        -------
        scores : np.array
            ``1.4 * pChEMBL / nAtoms`` per molecule, with 0 for molecules that
            report zero atoms.
        """
        pChEMBL = np.asarray(self.qsar_scorer.getScores(mols), dtype=float)
        # NOTE(review): by RDKit default GetNumAtoms() leaves out implicit hydrogens,
        # i.e. this is presumably the heavy-atom count - confirm that is intended.
        nAtoms = np.array([mol.GetNumAtoms() for mol in mols], dtype=float)
        # leave the score at 0 instead of inf/nan when a molecule has no atoms
        scores = np.divide(1.4 * pChEMBL, nAtoms, out=np.zeros_like(nAtoms), where=nAtoms > 0)
        return scores

    def getKey(self):
        """Return the key built from the wrapped scorer's key at construction time."""
        return self.key