Source code for qsprpred.data.chem.standardization

"""Functions to pre-process SMILES for QSPR modelling."""

import re
from typing import Any

import pandas as pd
from chembl_structure_pipeline import standardizer as chembl_stand
from rdkit import Chem
from rdkit.Chem import Mol
from rdkit.Chem.SaltRemover import SaltRemover

from ..processing.mol_processor import MolProcessorWithID
from ...logs import logger


class CheckSmilesValid(MolProcessorWithID):
    """Processor that flags which molecules RDKit can parse and sanitize."""

    def __call__(
        self, mols: list[str | Mol], props: dict[str, list[Any]], *args, **kwargs
    ) -> Any:
        """Check each molecule for validity.

        A molecule counts as valid when it is non-empty, can be parsed by
        `Chem.MolFromSmiles` (if given as a string) and passes
        `Chem.SanitizeMol`.

        Args:
            mols: molecules to check, as SMILES strings or RDKit molecules.
            props: molecule properties; the values stored under
                `self.idProp` are used as the index of the result.
            *args: not used.
            **kwargs: only `throw` is read (default `False`). When truthy,
                the first problem found is raised instead of being
                recorded as `False`.

        Returns:
            pd.Series: one boolean per molecule, indexed by
                `props[self.idProp]`.

        Raises:
            ValueError: if `throw` is set and a molecule is empty or cannot
                be parsed.
            Exception: if `throw` is set, whatever RDKit raised while parsing
                or sanitizing the molecule.
        """
        throw = kwargs.get("throw", False)
        ret = []
        for mol in mols:
            exception = None
            if not mol:
                # Nothing to parse; report the input as empty.
                exception = ValueError(f"Empty molecule: {mol}")
            else:
                try:
                    # Keep the input in `mol` so the error message can show
                    # the offending SMILES rather than the failed parse result.
                    parsed = Chem.MolFromSmiles(mol) if isinstance(mol, str) else mol
                    if not parsed:
                        raise ValueError(f"Invalid molecule: {mol}")
                    Chem.SanitizeMol(parsed)
                except Exception as exp:
                    exception = exp
            if exception is not None and throw:
                raise exception
            ret.append(exception is None)
        return pd.Series(ret, index=props[self.idProp])

    @property
    def supportsParallel(self) -> bool:
        """Always `True`."""
        return True
def neutralize_atoms(mol):
    """Neutralize charged atoms of a molecule one atom at a time.

    Taken from the RDKit Cookbook (https://www.rdkit.org/docs/Cookbook.html),
    which adapted it from
    https://baoilleach.blogspot.com/2019/12/no-charge-simple-approach-to.html

    The molecule is modified in place and the same object is returned.

    Arguments:
        mol: rdkit molecule to be neutralized

    Returns:
        mol: neutralized rdkit mol
    """
    charged_atom_query = Chem.MolFromSmarts(
        "[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]"
    )
    for match in mol.GetSubstructMatches(charged_atom_query):
        atom = mol.GetAtomWithIdx(match[0])
        formal_charge = atom.GetFormalCharge()
        total_hs = atom.GetTotalNumHs()
        atom.SetFormalCharge(0)
        # Offset the removed charge by adjusting the hydrogen count.
        atom.SetNumExplicitHs(total_hs - formal_charge)
        atom.UpdatePropertyCache()
    return mol
def chembl_smi_standardizer(
    smi: str, isomericSmiles: bool = True, sanitize: bool = True
) -> str | None:
    """Standardize SMILES using ChEMBL standardizer.

    Args:
        smi: SMILES string to be standardized.
        isomericSmiles: return the isomeric smiles. Defaults to True.
        sanitize: applies sanitization using the ChEMBL standardizer.
            Defaults to True.

    Returns:
        smiles (str | None): standardized canonical SMILES string, or `None`
            if the SMILES could not be parsed or standardization failed (a
            warning is logged in that case).
    """
    try:
        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            raise ValueError(f"Failed to parse SMILES: {smi}")
        standard_mol = chembl_stand.standardize_mol(mol, sanitize=sanitize)
        return Chem.MolToSmiles(
            standard_mol,
            kekuleSmiles=False,
            canonical=True,
            isomericSmiles=isomericSmiles,
        )
    except Exception as exp:
        # Deliberately broad: any failure while parsing, standardizing or
        # writing the SMILES is reported as a warning and turned into `None`.
        logger.warning(f"Could not standardize SMILES: {smi} due to: {exp}.")
        return None
def old_standardize_sanitize(smi: str) -> str | None:
    """Adaptation of the old QSPRpred molecule standardization/sanitization.

    Standardize the rdkit mol object and gets parent molecule using
    chembl_structure_pipeline, and applies some sanitization steps: sulfuric
    acid removal, selection of the largest fragment, salt stripping and
    neutralization of charged atoms.

    Using this function is not recommended and it will be deprecated within
    next releases.

    Arguments:
        smi: single SMILES string to be sanitized.

    Returns:
        sanitized SMILES string, or `None` (after printing a message) if an
        intermediate SMILES could not be parsed or salts could not be removed.
    """
    mol = Chem.MolFromSmiles(smi)
    standard_mol = chembl_stand.standardize_mol(mol)
    # get_parent_mol returns a tuple: the molecule in #0 and a Boolean in #1
    # that states whether there was an exclusion flag. For more details, check:
    # https://github.com/chembl/ChEMBL_Structure_Pipeline/wiki/Exclusion-Flag
    parent_mol = chembl_stand.get_parent_mol(standard_mol)[0]
    parent_smi = Chem.MolToSmiles(
        parent_mol, kekuleSmiles=False, canonical=True, isomericSmiles=True
    )

    salts = re.compile(r"\..?Cl|\..?Br|\..?Ca|\..?K|\..?Na|\..?Li|\..?Zn|/\..?Gd")
    s_acid_remover = re.compile(r"\.OS\(\=O\)\(\=O\)O")
    # NOTE(review): this also matches the "B" of "Br", so bromine-containing
    # SMILES skip neutralization as well — confirm whether that is intended.
    boron_pattern = re.compile(r"B")
    remover = SaltRemover(defnData="[Cl,Br,Ca,K,Na,Zn]")
    pattern = Chem.MolFromSmarts("[+1!h0!$([*]~[-1,-2,-3,-4]),-1!$([*]~[+1,+2,+3,+4])]")

    mol = Chem.MolFromSmiles(parent_smi)

    # Removing sulfuric acid (smiles = .OS(=O)(=O)O)
    if s_acid_remover.search(parent_smi):
        parent_smi = s_acid_remover.sub("", parent_smi)
        # MolFromSmiles signals failure by returning None, not by raising, so
        # check the result and keep `mol` in sync with the edited SMILES.
        mol = Chem.MolFromSmiles(parent_smi)
        if mol is None:
            print(f"{parent_smi} could not be parsed after removing sulfuric acids!")
            return None

    # Removing external molecules by splitting on . and picking the largest smiles
    if "." in parent_smi:
        parent_smi = max(parent_smi.split("."), key=len)
        mol = Chem.MolFromSmiles(parent_smi)
        if mol is None:
            print(f"Compound, ({parent_smi}) could not be parsed!!")
            return None

    # Trying to remove the salts
    # NOTE(review): every alternative in `salts` needs a literal "." and
    # parent_smi no longer contains one at this point, so this branch looks
    # unreachable — confirm before relying on it.
    if salts.search(parent_smi):
        res, _ = remover.StripMolWithDeleted(mol)
        # avoid neutralizing smiles with boron atoms
        if res is not None and not boron_pattern.search(parent_smi):
            neutralize_atoms(res)
        # If the salts are still there, give up on this compound
        if salts.search(Chem.MolToSmiles(res)):
            print(f"Unable to remove salts from compound {parent_smi}")
            return None
        parent_smi = Chem.MolToSmiles(res)
        mol = Chem.MolFromSmiles(parent_smi)

    # Are the molecules charged according to the "pattern" variable?
    if mol.GetSubstructMatches(pattern):
        res, _ = remover.StripMolWithDeleted(mol)
        # avoid neutralizing smiles with boron atoms
        if res is not None and not boron_pattern.search(parent_smi):
            neutralize_atoms(res)
        if salts.search(Chem.MolToSmiles(res)):
            print(
                f"Unable to remove salts from compound {parent_smi} after neutralizing"
            )
            return None
        parent_smi = Chem.MolToSmiles(res)

    return parent_smi