Source code for qsprpred.data.chem.standardizers.papyrus

from typing import Literal

from papyrus_structure_pipeline import standardizer as Papyrus_standardizer
from papyrus_structure_pipeline.standardizer import StandardizationResult
from rdkit import Chem
from rdkit.Chem.MolStandardize.rdMolStandardize import FragmentParent

from .base import ChemStandardizer


class PapyrusStandardizer(ChemStandardizer):
    """Papyrus standardizer

    Uses Papyrus (>v05.6) standardization protocol to standardize SMILES.

    Béquignon, O.J.M., Bongers, B.J., Jespers, W. et al. Papyrus: a large-scale
    curated dataset aimed at bioactivity predictions. J Cheminform 15, 3 (2023).
    https://doi.org/10.1186/s13321-022-00672-x

    Note:
        ``extra_organic_atoms``, ``extra_metals`` and ``extra_salts`` are added
        to the module-level lists of ``papyrus_structure_pipeline.standardizer``
        (``ORGANIC_ATOMS``, ``METALS``, ``SALTS``), i.e. to shared state.

    Attributes:
        settings (dict): Settings of the standardizer
    """

    # Keys stored in the settings that are computed from constructor arguments
    # and therefore must not be passed back to ``__init__``.
    _DERIVED_SETTINGS = ("tautomer_allow_stereo_removal",)

    def __init__(
        self,
        keep_stereo: bool = True,
        canonize: bool = True,
        mixture_handling: Literal["keep_largest", "filter", "keep"] = "keep_largest",
        remove_additional_salts: bool = True,
        remove_additional_metals: bool = True,
        filter_inorganic: bool = False,
        filter_non_small_molecule: bool = True,
        small_molecule_min_mw: float = 200,
        small_molecule_max_mw: float = 800,
        canonicalize_tautomer: bool = True,
        tautomer_max_tautomers: int = 2**32 - 1,
        extra_organic_atoms: list | None = None,
        extra_metals: list | None = None,
        extra_salts: list | None = None,
        uncharge: bool = True,
    ):
        """Initialize Papyrus standardizer

        Args:
            keep_stereo (bool, optional): Keep stereochemistry.
            canonize (bool, optional): Canonicalize SMILES.
            mixture_handling (Literal["keep_largest", "filter", "keep"], optional):
                How to handle mixtures. Defaults to "keep_largest".
            remove_additional_salts (bool, optional): Removes a custom set of
                fragments if present in the molecule object.
            remove_additional_metals (bool, optional): Removes metal fragments
                if present in the molecule object. Ignored if
                remove_additional_salts is set to False.
            filter_inorganic (bool, optional): Filter inorganic molecules.
            filter_non_small_molecule (bool, optional): Filter non-small
                molecules.
            small_molecule_min_mw (float, optional): Minimum molecular weight
                of small molecules.
            small_molecule_max_mw (float, optional): Maximum molecular weight
                of small molecules.
            canonicalize_tautomer (bool, optional): Canonicalize tautomers.
            tautomer_max_tautomers (int, optional): Maximum number of tautomers
                to consider by the tautomer search algorithm (<2^32).
            extra_organic_atoms (list, optional): Extra organic atoms to
                consider in addition to the default set
                (Papyrus_standardizer.ORGANIC_ATOMS).
            extra_metals (list, optional): Extra metals to consider in addition
                to the default set (Papyrus_standardizer.METALS).
            extra_salts (list, optional): Extra salts to consider in addition
                to the default set (Papyrus_standardizer.SALTS).
            uncharge (bool, optional): Uncharge molecules.
        """
        self._settings = {
            "keep_stereo": keep_stereo,
            "canonize": canonize,
            "remove_additional_salts": remove_additional_salts,
            "remove_additional_metals": remove_additional_metals,
            "filter_inorganic": filter_inorganic,
            "filter_non_small_molecule": filter_non_small_molecule,
            "canonicalize_tautomer": canonicalize_tautomer,
            "small_molecule_min_mw": small_molecule_min_mw,
            "small_molecule_max_mw": small_molecule_max_mw,
            # Derived: stereo may only be dropped during tautomer
            # canonicalization when the caller does not want to keep it.
            "tautomer_allow_stereo_removal": not keep_stereo,
            "tautomer_max_tautomers": tautomer_max_tautomers,
            # Sorted so that the ID from getID() does not depend on input order.
            "extra_organic_atoms": (
                sorted(extra_organic_atoms) if extra_organic_atoms else []
            ),
            "extra_metals": sorted(extra_metals) if extra_metals else [],
            "extra_salts": sorted(extra_salts) if extra_salts else [],
            "mixture_handling": mixture_handling,
            "uncharge": uncharge,
        }
        # Register the extras with the Papyrus module-level lists. Only missing
        # entries are added so repeated instantiation (e.g. via fromSettings)
        # does not keep growing the shared lists with duplicates.
        self._extend_unique(
            Papyrus_standardizer.ORGANIC_ATOMS,
            self._settings["extra_organic_atoms"],
        )
        self._extend_unique(
            Papyrus_standardizer.METALS, self._settings["extra_metals"]
        )
        self._extend_unique(
            Papyrus_standardizer.SALTS, self._settings["extra_salts"]
        )

    @staticmethod
    def _extend_unique(target: list, extras: list) -> None:
        """Append to ``target`` every item of ``extras`` it does not contain yet.

        Args:
            target (list): list that is modified in place
            extras (list): candidate items to add
        """
        for item in extras:
            if item not in target:
                target.append(item)

    def _fix_errors(
        self, mol: Chem.Mol, error: StandardizationResult
    ) -> Chem.Mol | None:
        """Attempts to fix mixture molecules by keeping the largest fragment.

        Args:
            mol (Chem.Mol): RDKit molecule object
            error (StandardizationResult): Error code

        Returns:
            Chem.Mol | None: Fixed molecule or None if molecule cannot be fixed
        """
        # Only mixtures are recoverable, and only if the user asked for it.
        if (
            error == StandardizationResult.MIXTURE_MOLECULE
            and self._settings["mixture_handling"] == "keep_largest"
        ):
            return FragmentParent(mol)
        return None

    def convertSMILES(self, smiles: str, verbose: bool = False) -> str | None:
        """Standardize SMILES using Papyrus standardization protocol.

        Args:
            smiles (str): SMILES to be standardized
            verbose (bool, optional): Print verbose output. Defaults to False.

        Returns:
            str | None: the standardized SMILES, or None if the input could
                not be parsed or was rejected by the standardizer
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        if mol is None:
            # MolFromSmiles returns None for SMILES it cannot parse; there is
            # nothing to standardize in that case.
            if verbose:
                print("SMILES rejected", smiles)
                print("\tCause:", "SMILES could not be parsed")
            return None
        settings = self._settings
        out = Papyrus_standardizer.standardize(
            mol,
            return_type=True,
            remove_additional_salts=settings["remove_additional_salts"],
            remove_additional_metals=settings["remove_additional_metals"],
            # Mixtures are flagged unless they are explicitly kept; flagged
            # mixtures may still be rescued by _fix_errors ("keep_largest").
            filter_mixtures=settings["mixture_handling"] != "keep",
            filter_inorganic=settings["filter_inorganic"],
            filter_non_small_molecule=settings["filter_non_small_molecule"],
            small_molecule_min_mw=settings["small_molecule_min_mw"],
            small_molecule_max_mw=settings["small_molecule_max_mw"],
            canonicalize_tautomer=settings["canonicalize_tautomer"],
            tautomer_max_tautomers=settings["tautomer_max_tautomers"],
            tautomer_allow_stereo_removal=settings["tautomer_allow_stereo_removal"],
            uncharge=settings["uncharge"],
        )
        # With return_type=True the first element is the molecule and the
        # remaining element(s) describe the outcome of the standardization.
        results = list(out[1:])
        if StandardizationResult.CORRECT_MOLECULE in results:
            if not out[0]:
                return None
            return Chem.MolToSmiles(
                out[0],
                canonical=settings["canonize"],
                isomericSmiles=settings["keep_stereo"],
            )

        fixed = self._fix_errors(mol, results[-1])
        if not fixed:
            if verbose:
                print("SMILES rejected", smiles)
                print("\tCause:", results)
            return None
        # Re-run the whole protocol on the repaired molecule, keeping the
        # caller's verbosity so a rejection in the second pass is reported too.
        return self.convertSMILES(
            Chem.MolToSmiles(
                fixed,
                isomericSmiles=settings["keep_stereo"],
                canonical=settings["canonize"],
            ),
            verbose=verbose,
        )

    @property
    def settings(self) -> dict:
        """Settings of the standardizer (the internal dictionary itself)."""
        return self._settings

    def getID(self) -> str:
        """Get the ID of the standardizer.

        In this case, the ID is based on the settings of the standardizer.
        It starts with 'PapyrusStandardizer' followed by a tilde and the
        settings (sorted by key) concatenated with a colon.

        Returns:
            str: ID of the standardizer
        """
        sorted_keys = sorted(self._settings.keys())
        return "PapyrusStandardizer~" + ":".join(
            [f"{key}={self._settings[key]!s}" for key in sorted_keys]
        )

    def fromSettings(self, settings: dict) -> "PapyrusStandardizer":
        """Create a Papyrus standardizer from settings.

        Keys that the standardizer derives itself (see ``_DERIVED_SETTINGS``)
        are ignored, so the dictionary returned by ``settings`` can be passed
        back in directly.

        Args:
            settings (dict): settings of the standardizer

        Returns:
            PapyrusStandardizer: a Papyrus standardizer
        """
        init_kwargs = {
            key: value
            for key, value in settings.items()
            if key not in self._DERIVED_SETTINGS
        }
        return PapyrusStandardizer(**init_kwargs)