Source code for qsprpred.data.chem.standardizers.papyrus

from typing import Literal

from papyrus_structure_pipeline import standardizer as Papyrus_standardizer
from papyrus_structure_pipeline.standardizer import StandardizationResult
from rdkit import Chem
from rdkit.Chem.MolStandardize.rdMolStandardize import FragmentParent

from .base import ChemStandardizer



[docs]
class PapyrusStandardizer(ChemStandardizer):
    """Papyrus standardizer

    Uses Papyrus (>v05.6) standardization protecol to standardize SMILES.

    Béquignon, O.J.M., Bongers, B.J., Jespers, W. et al.
    Papyrus: a large-scale curated dataset aimed at bioactivity predictions.
    J Cheminform 15, 3 (2023). https://doi.org/10.1186/s13321-022-00672-x

    Attributes:
        settings (dict): Settings of the standardizer
    """
    def __init__(
        self,
        keep_stereo: bool = True,
        canonize: bool = True,
        mixture_handling: Literal["keep_largest", "filter", "keep"] = "keep_largest",
        remove_additional_salts: bool = True,
        remove_additional_metals: bool = True,
        filter_inorganic: bool = False,
        filter_non_small_molecule: bool = True,
        small_molecule_min_mw: float = 200,
        small_molecule_max_mw: float = 800,
        canonicalize_tautomer: bool = True,
        tautomer_max_tautomers: int = 2**32 - 1,
        extra_organic_atoms: list | None = None,
        extra_metals: list | None = None,
        extra_salts: list | None = None,
        uncharge: bool = True,
    ):
        """Initialize Papyrus standardizer

        Args:
            keep_stereo (bool, optional): Keep stereochemistry.
            canonize (bool, optional): Canonicalize SMILES.
            mixture_handling (Literal["keep_largest", "filter", "keep"], optional):
                How to handle mixtures. Defaults to "keep_largest".
            remove_additional_salts (bool, optional):
                Removes a custom set of fragments if present in the molecule object.
            remove_additional_metals (bool, optional):
                Removes metal fragments if present in the molecule object.
                Ignored if remove_additional_salts is set to False.
            filter_inorganic (bool, optional): Filter inorganic molecules.
            filter_non_small_molecule (bool, optional): Filter non-small molecules.
            small_molecule_min_mw (float, optional):
                Minimum molecular weight of small molecules.
            small_molecule_max_mw (float, optional):
                Maximum molecular weight of small molecules.
            canonicalize_tautomer (bool, optional): Canonicalize tautomers.
            tautomer_max_tautomers (int, optional):
                Maximum number of tautomers to consider by the tautomer search
                algorithm (<2^32).
            extra_organic_atoms (list, optional):
                Extra organic atoms to consider in addition to the default set
                (Papyrus_standardizer.ORGANIC_ATOMS).
            extra_metals (list, optional):
                Extra metals to consider in addition to the default set
                (Papyrus_standardizer.METALS).
            extra_salts (list, optional):
                Extra salts to consider in addition to the default set
                (Papyrus_standardizer.SALTS).
            uncharge (bool, optional): Uncharge molecules.
        """

        self._settings = {
            "keep_stereo": keep_stereo,
            "canonize": canonize,
            "remove_additional_salts": remove_additional_salts,
            "remove_additional_metals": remove_additional_metals,
            "filter_inorganic": filter_inorganic,
            "filter_non_small_molecule": filter_non_small_molecule,
            "canonicalize_tautomer": canonicalize_tautomer,
            "small_molecule_min_mw": small_molecule_min_mw,
            "small_molecule_max_mw": small_molecule_max_mw,
            "tautomer_allow_stereo_removal": not keep_stereo,
            "tautomer_max_tautomers": tautomer_max_tautomers,
            "extra_organic_atoms":
                (sorted(extra_organic_atoms) if extra_organic_atoms else []),
            "extra_metals": sorted(extra_metals) if extra_metals else [],
            "extra_salts": sorted(extra_salts) if extra_salts else [],
            "mixture_handling": mixture_handling,
            "uncharge": uncharge,
        }
        if self._settings["extra_organic_atoms"]:
            Papyrus_standardizer.ORGANIC_ATOMS.extend(
                self._settings["extra_organic_atoms"]
            )
        if self._settings["extra_metals"]:
            Papyrus_standardizer.METALS.extend(self._settings["extra_metals"])
        if self._settings["extra_salts"]:
            Papyrus_standardizer.SALTS.extend(self._settings["extra_salts"])

    def _fix_errors(
        self, mol: Chem.Mol, error: StandardizationResult
    ) -> Chem.Mol | None:
        """Attempts to fix mixture molecules by keeping the largest fragment.

        Args:
            mol (Chem.Mol): RDKit molecule object
            error (StandardizationResult): Error code

        Returns:
            Chem.Mol | None: Fixed molecule or None if molecule cannot be fixed
        """
        if (
            error == StandardizationResult.MIXTURE_MOLECULE and
            self._settings["mixture_handling"] == "keep_largest"
        ):
            mol = FragmentParent(mol)
            return mol
        return None


[docs]
    def convertSMILES(self, smiles: str, verbose: bool = False) -> str | None:
        """Standardize SMILES using Papyrus standardization protocol.

        Args:
            smiles (str): SMILES to be standardized
            verbose (bool, optional): Print verbose output. Defaults to False.

        Returns:
            tuple[str | None, str]:
                a tuple where the first element is the standardized SMILES and the
                second element is the original SMILES
        """
        mol = Chem.MolFromSmiles(smiles, sanitize=False)
        out = Papyrus_standardizer.standardize(
            mol,
            return_type=True,
            remove_additional_salts=self._settings["remove_additional_salts"],
            remove_additional_metals=self._settings["remove_additional_metals"],
            filter_mixtures=(
                False if self._settings["mixture_handling"] == "keep" else True
            ),
            filter_inorganic=self._settings["filter_inorganic"],
            filter_non_small_molecule=self._settings["filter_non_small_molecule"],
            small_molecule_min_mw=self._settings["small_molecule_min_mw"],
            small_molecule_max_mw=self._settings["small_molecule_max_mw"],
            canonicalize_tautomer=self._settings["canonicalize_tautomer"],
            tautomer_max_tautomers=self._settings["tautomer_max_tautomers"],
            tautomer_allow_stereo_removal=self.
            _settings["tautomer_allow_stereo_removal"],
            uncharge=self._settings["uncharge"],
        )
        results = list(out[1:])
        if StandardizationResult.CORRECT_MOLECULE not in results:
            mol = self._fix_errors(mol, results[-1])
            if not mol:
                if verbose:
                    print("SMILES rejected", smiles)
                    print("\tCause:", results)
                return None
            else:
                return self.convertSMILES(
                    Chem.MolToSmiles(
                        mol,
                        isomericSmiles=self._settings["keep_stereo"],
                        canonical=self._settings["canonize"],
                    )
                )
        else:
            return (
                Chem.MolToSmiles(
                    out[0],
                    canonical=self._settings["canonize"],
                    isomericSmiles=self._settings["keep_stereo"],
                ) if out[0] else None
            )


    @property
    def settings(self) -> dict:
        return self._settings


[docs]
    def getID(self) -> str:
        """Get the ID of the standardizer.

        In this case, the ID is based on the settings of the standardizer.
        It starts with 'PapyrusStandardizer' followed by a tilde and
        the settings concatenated with a colon.

        Returns:
            str: ID of the standardizer
        """
        sorted_keys = sorted(self._settings.keys())
        return "PapyrusStandardizer~" + ":".join(
            [f"{key}={self._settings[key]!s}" for key in sorted_keys]
        )



[docs]
    def fromSettings(self, settings: dict) -> "PapyrusStandardizer":
        """Create a Papyrus standardizer from settings.

        Args:
            settings (dict): settings of the standardizer

        Returns:
            PapyrusStandardizer: a Papyrus standardizer
        """
        return PapyrusStandardizer(**settings)