Source code for drugex.molecules.converters.fragmenters

"""
fragmenters

Created by: Martin Sicho
On: 06.05.22, 12:19
"""
import re
from itertools import combinations
from typing import List, Tuple, Union

import numpy as np
from rdkit import Chem
from rdkit.Chem import Recap, BRICS

from drugex.logs import logger
from drugex.molecules.converters.interfaces import ConversionException
from drugex.molecules.converters.standardizers import CleanSMILES


[docs]class Fragmenter(CleanSMILES): """ Reference implementation of the original fragmenter used in DrugEx v3. """ def __init__(self, n_frags : int, n_combs : int, method : str = 'recap', deep_clean : bool = True, max_bonds : int = 75, allow_single : bool = False): """ Args: n_frags: number of fragments to generate per compound n_combs: maximum number of combinations of the found leaf fragments method: fragmentation method to use. Possible values: ('recap', 'brics') deep_clean: deep clean the SMILES before fragmentation (see `CleanSMILES`) max_bonds: only accept molecules with the number of bonds below or equal to this threshold allow_single: return the fragment also for molecules that result in only one fragment """ super().__init__(deep_clean) self.nFrags = n_frags self.nCombs = n_combs self.method = method self.maxBonds = max_bonds self.allowSingle = allow_single if self.method not in ('recap', 'brics'): raise ConversionException(f"Unknown fragmentation method: {self.method}")
[docs] def getFragments(self, mol : Chem.Mol) -> Union[np.array, None]: """ Get fragments form an RDKit molecule Args: mol: instance of `rdkit.Chem.Mol` Returns: `numpy.array` of generated fragments """ # break molecule into leaf fragments if self.method == 'recap': frags = np.array(sorted(Recap.RecapDecompose(mol).GetLeaves().keys())) else: frags = BRICS.BRICSDecompose(mol) frags = np.array(sorted({re.sub(r'\[\d+\*\]', '*', f) for f in frags})) if len(frags) == 1 and not self.allowSingle: logger.warning(f"Only one retrieved fragment for molecule: {Chem.MolToSmiles(mol)}. Skipping...") return None return frags
[docs] def filterFragments(self, frags : List[str]) -> List[str]: """ Filter fragments to remove those that are contained in other fragments or are too small, and keep only the largest ones. Args: frags: `list` of fragments Returns: `list` of filtered fragments """ # replace connection tokens with [H] du, hy = Chem.MolFromSmiles('*'), Chem.MolFromSmiles('[H]') subs = np.array([Chem.MolFromSmiles(f) for f in frags]) subs = np.array([Chem.RemoveHs(Chem.ReplaceSubstructs(f, du, hy, replaceAll=True)[0]) for f in subs]) subs = np.array([m for m in subs if m.GetNumAtoms() > 1]) # remove fragments that contain other fragments (or are contained in other fragments?) match = np.array([[m.HasSubstructMatch(f) for f in subs] for m in subs]) frags = subs[match.sum(axis=0) == 1] # sort the fragments and only keep n_frag largest ones frags = sorted(frags, key=lambda x:-x.GetNumAtoms())[:self.nFrags] frags = [Chem.MolToSmiles(Chem.RemoveHs(f)) for f in frags] return frags
def __call__(self, smiles : str) -> Union[List[Tuple[str, str]], None]: """ Generate fragment-molecule pairs for a given SMILES string. Args: smiles: SMILES of the molecule to fragment Returns: a list of `tuple`s of format (fragment, smiles), smiles is the same as the input in "smiles" """ ret_frags = [] mol = Chem.MolFromSmiles(smiles) if mol is None: logger.warning(f"Molecule skipped due to invalid SMILES: {smiles}") return None if self.maxBonds and mol.GetNumBonds() >= self.maxBonds: logger.warning(f"Molecule skipped due to threshold on maximum bond count ({self.maxBonds}): {smiles}") return None frags = self.getFragments(mol) if frags is None: return None frags = self.filterFragments(frags) max_comb = min(self.nCombs, len(frags)) for ix in range(1, max_comb+1): # combine leaf fragments into larger fragments combs = combinations(frags, ix) for comb in combs: comb_frags = '.'.join(comb) #remove pair of fragment combinations if longer than original SMILES if len(comb_frags) > len(smiles): continue # check if substructure is in original molecule if mol.HasSubstructMatch(Chem.MolFromSmarts(comb_frags)): ret_frags.append((comb_frags, smiles)) return ret_frags
[docs]class FragmenterWithSelectedFragment(Fragmenter): """ Fragmenter that only returns fragments-molecule pairs where the input fragments contain the fragment specified in the constructor. If `exclusive=True`, only return fragments that contain the specified fragment and nothing else """ def __init__(self, fragment : str, n_frags : int, n_combs : int, method : str = 'recap', deep_clean : bool = True, max_bonds : int =75, allow_single : bool = False, exclusive : bool = False): """ Args: fragment: fragment to search for n_frags: number of fragments to generate per compound n_combs: maximum number of combinations of the found leaf fragments method: fragmentation method to use. Possible values: ('recap', 'brics') deep_clean: deep clean the SMILES before fragmentation (see `CleanSMILES`) max_bonds: only accept molecules with the number of bonds below or equal to this threshold allow_single: return the fragment also for molecules that result in only one fragment exclusive: if True, only return fragments that contain the specified fragment and nothing else """ super().__init__(n_frags, n_combs, method, deep_clean, max_bonds, allow_single) self.fragment = fragment self.exclusive = exclusive def __call__(self, smiles : str) -> Union[List[Tuple[str, str]], None]: """ Generate fragment-molecule pairs for a given SMILES string and only return those that contain the specified fragment. Args: smiles: SMILES of the molecule to fragment Returns: a list of `tuple`s of format (fragment, smiles), smiles is the same as the input in "smiles" """ ret_frags = [] mol = Chem.MolFromSmiles(smiles) if not mol.HasSubstructMatch(Chem.MolFromSmarts(self.fragment)): logger.warning(f"Molecule skipped due to missing the `{self.fragment}` fragment: {smiles}") return None if self.maxBonds and mol.GetNumBonds() >= self.maxBonds: logger.warning(f"Molecule skipped due to threshold on maximum bond count ({self.maxBonds}): {smiles}") return None frags = self.getFragments(mol) if frags is None: return None frags = self.filterFragments(frags) if self.exclusive: # only return fragments that contain the specified fragment and nothing else frags = [f for f in frags if Chem.CanonSmiles(f) == Chem.CanonSmiles(self.fragment)] if len(frags) == 0: logger.warning(f"Molecule skipped due to missing the `{self.fragment}` fragment: {smiles}") return None max_comb = min(self.nCombs, len(frags)) for ix in range(1, max_comb+1): # combine leaf fragments into larger fragments combs = combinations(frags, ix) for comb in combs: comb_frags = '.'.join(comb) # remove combination that do not contain the selected fragment if not Chem.MolFromSmiles(comb_frags).HasSubstructMatch(Chem.MolFromSmarts(self.fragment)): continue # remove pair of fragment combinations if longer than original SMILES if len(comb_frags) > len(smiles): continue # check if substructure is in original molecule if mol.HasSubstructMatch(Chem.MolFromSmarts(comb_frags)): ret_frags.append((comb_frags, smiles)) return ret_frags