"""Module with definitions of various extra descriptor sets:
- `Mordred`: Descriptors from molecular descriptor calculation software Mordred.
- `Mold2`: Descriptors from molecular descriptor calculation software Mold2.
- `PaDEL`: Descriptors from molecular descriptor calculation software PaDEL.
- `ExtendedValenceSignature`: Signature descriptors based on extended valence
  sequences.
- `ProteinDescriptorSet`: Abstract base class for protein descriptor sets.
- `ProDec`: Protein descriptors from the ProDec package.
"""
import logging
import os
import zipfile
from abc import abstractmethod
from typing import Optional, Any
import mordred
import numpy as np
import pandas as pd
import prodec
from Mold2_pywrapper import Mold2 as Mold2_calculator
from PaDEL_pywrapper import PaDEL as PaDELCalculator
from PaDEL_pywrapper import descriptors as PaDEL_descriptors
from Signature_pywrapper import Signature as Signature_calculator
from mordred import descriptors as Mordred_descriptors
from rdkit import Chem
from rdkit.Chem import Mol
from qsprpred.data.descriptors.sets import DescriptorSet
from qsprpred.extra.data.utils.msa_calculator import MSAProvider, ClustalMSA
class Mordred(DescriptorSet):
    """Descriptors from molecular descriptor calculation software Mordred.

    From https://github.com/mordred-descriptor/mordred.

    Attributes:
        descriptors (list[str]): names of the Mordred descriptors that are
            calculated, in the same order as the columns of the calculated array
        version (str): version of mordred
        ignore3D (bool): ignore 3D information
        config (str): path to config file if available
    """

    def __init__(
        self,
        descs: list[str] | None = None,
        version: str | None = None,
        ignore_3D: bool = False,
        config: str | None = None,
    ):
        """
        Initialize the descriptor with the same arguments as you would pass
        to the `Calculator` of Mordred, except the `descs` argument, which can
        also be a `list` of mordred descriptor names instead of a mordred
        descriptor module.

        Args:
            descs (list[str]): List of Mordred descriptor names, a Mordred descriptor
                module or `None` (or an empty list) for all mordred descriptors
            version (str): version of mordred
            ignore_3D (bool): ignore 3D information
            config (str): path to config file if available
        """
        super().__init__()
        if descs:
            # if mordred descriptor module is passed,
            # convert to list of descriptor instances
            if not isinstance(descs, list):
                descs = mordred.Calculator(descs).descriptors
        else:
            # use all mordred descriptors if no descriptors are specified
            descs = mordred.Calculator(Mordred_descriptors).descriptors
        # init member variables (the `descriptors` setter below reads them)
        self.version = version
        self.ignore3D = ignore_3D
        self.config = config
        self._mordred = None
        # convert to list of descriptor names if descriptor instances are passed;
        # the assignment goes through the setter, which builds the calculator
        self.descriptors = [str(d) for d in descs]

    def getDescriptors(
        self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
    ) -> np.ndarray:
        """Calculate the selected Mordred descriptors for the given molecules.

        Args:
            mols (list[Mol]): molecules, passed through `self.iterMols`
            props (dict[str, list[Any]]): molecule properties, not used here
            *args: additional arguments, not used
            **kwargs: additional keyword arguments, not used

        Returns:
            np.ndarray: descriptor values, one row per molecule; values that
                cannot be converted to a number are replaced by NaN
        """
        df = self._mordred.pandas(self.iterMols(mols), quiet=True, nproc=1)
        df = df.apply(pd.to_numeric, errors="coerce")  # replace errors by nan values
        return df.values

    @property
    def descriptors(self) -> list[str]:
        """Names of the descriptors held by the underlying Mordred calculator."""
        return self._descriptors

    @descriptors.setter
    def descriptors(self, names: list[str]):
        """Set the descriptors to calculate.

        Converts a list of Mordred descriptor names to Mordred descriptor instances,
        which is used to initialize a Mordred calculator with the specified descriptors.
        The descriptor instances are registered in the order given by `names`
        (duplicates removed), and the stored names are read back from the
        calculator so that they always describe the calculated columns. Names
        unknown to Mordred, and descriptors the calculator itself leaves out,
        are therefore not part of `descriptors` afterwards.

        Args:
            names (list[str]): List of Mordred descriptor names.
        """
        # one lookup table instead of a list membership test per descriptor
        by_name = {
            str(desc): desc
            for desc in mordred.Calculator(Mordred_descriptors).descriptors
        }
        selected = [by_name[name] for name in dict.fromkeys(names) if name in by_name]
        self._mordred = mordred.Calculator(
            selected,
            version=self.version,
            ignore_3D=self.ignore3D,
            config=self.config,
        )
        # keep names and calculated columns aligned in both order and count
        self._descriptors = [str(desc) for desc in self._mordred.descriptors]

    def __str__(self):
        return "Mordred"
class Mold2(DescriptorSet):
    """Descriptors from molecular descriptor calculation software Mold2.

    From https://github.com/OlivierBeq/Mold2_pywrapper.

    By default all Mold2 descriptors are calculated; a subset can be selected
    either at initialization or through the `descriptors` property.

    Arguments:
        descs: names of Mold2 descriptors to be calculated (e.g. D001)
    """

    def __init__(self, descs: list[str] | None = None):
        """Initialize a Mold2 descriptor calculator.

        Args:
            descs (list[str] | None):
                names of Mold2 descriptors to be calculated (e.g. D001),
                `None` or an empty list selects all descriptors

        Raises:
            ValueError: if `descs` contains names that are not Mold2 descriptors
        """
        super().__init__()
        self._descs = descs
        self._mold2 = Mold2_calculator()
        # the available descriptor names are taken from the columns of a
        # calculation on a single-carbon molecule
        self._defaultDescs = self._mold2.calculate(
            [Chem.MolFromSmiles("C")], show_banner=False
        ).columns.tolist()
        # apply the requested subset through the setter so it is validated;
        # previously `descs` was stored but never used
        self.descriptors = descs or None

    @property
    def supportsParallel(self) -> bool:
        return False

    def getDescriptors(
        self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
    ) -> np.ndarray:
        """Calculate the selected Mold2 descriptors for the given molecules.

        Args:
            mols (list[Mol]): molecules, passed through `self.iterMols`
            props (dict[str, list[Any]]): molecule properties, not used here
            *args: additional arguments, not used
            **kwargs: additional keyword arguments, not used

        Returns:
            np.ndarray: descriptor values with columns ordered as `descriptors`
        """
        values = self._mold2.calculate(self.iterMols(mols), show_banner=False)
        # keep only the selected columns
        return values[self._descriptors].values

    @property
    def descriptors(self) -> list[str]:
        """Names of the Mold2 descriptors that are returned."""
        return self._descriptors

    @descriptors.setter
    def descriptors(self, names: list[str] | None = None):
        """Set the descriptors to calculate.

        The selection is stored in the order of the default Mold2 descriptor
        list, not in the order of `names`.

        Args:
            names (list[str] | None): list of Mold2 descriptor names,
                `None` selects all descriptors

        Raises:
            ValueError: if `names` contains names that are not Mold2 descriptors
        """
        if names is None:
            self._descriptors = self._defaultDescs[:]
            self._keepindices = list(range(len(self._descriptors)))
            return
        requested = set(names)
        # Find descriptors not part of Mold2
        remainder = requested.difference(self._defaultDescs)
        if remainder:
            raise ValueError(
                "names are not valid Mold2 descriptor names: "
                f'{", ".join(sorted(remainder))}'
            )
        kept = [
            (index, desc_name)
            for index, desc_name in enumerate(self._defaultDescs)
            if desc_name in requested
        ]
        self._keepindices = [index for index, _ in kept]
        self._descriptors = [desc_name for _, desc_name in kept]

    def __str__(self):
        return "Mold2"
class PaDEL(DescriptorSet):
    """Descriptors from molecular descriptor calculation software PaDEL.

    From https://github.com/OlivierBeq/PaDEL_pywrapper.

    Attributes:
        descriptors (list[str]): list of PaDEL descriptor names
        nJobs (int): number of jobs passed to the PaDEL calculator
    """

    _notJSON = ["_nameMapping", "_padel", "_descriptors", *DescriptorSet._notJSON]

    def __init__(
        self,
        descs: list[str] | None = None,
        ignore_3d: bool = True,
        n_jobs: int | None = None,
    ):
        """Initialize a PaDEL calculator

        Args:
            descs: list of PaDEL descriptor short names, `None` for all
            ignore_3d (bool): skip 3D descriptor calculation
            n_jobs (int | None): number of jobs used by the calculator,
                falls back to `os.cpu_count()` if not given (or 0)

        Raises:
            ValueError: if `descs` contains names that are not PaDEL descriptors
        """
        super().__init__()
        self.nJobs = n_jobs or os.cpu_count()
        self._descs = descs
        self._ignore3D = ignore_3d
        # Initialize name mapping
        self._initMapping()
        # Initialize descriptors and calculator (the setter handles `None`)
        self.descriptors = descs

    @property
    def supportsParallel(self) -> bool:
        return False

    def _initMapping(self):
        """Build the mapping from descriptor names to PaDEL descriptor objects."""
        self._nameMapping = {}
        for descriptor in PaDEL_descriptors:
            # Skip if desc is 3D and set to be ignored
            if self._ignore3D and descriptor.is_3D:
                continue
            for name in descriptor.description.name:
                self._nameMapping[name] = descriptor

    def __setstate__(self, state):
        """Restore the state and rebuild the members excluded via `_notJSON`."""
        self.__dict__.update(state)
        self._initMapping()
        # re-creates `_descriptors` and the calculator from the kept names
        self.descriptors = state["_keep"]

    def getDescriptors(
        self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
    ) -> np.ndarray:
        """Calculate the selected PaDEL descriptors for the given molecules.

        Args:
            mols (list[Mol]): molecules, passed through `self.iterMols`;
                hydrogens are added before calculation
            props (dict[str, list[Any]]): molecule properties, not used here
            *args: additional arguments, not used
            **kwargs: additional keyword arguments, not used

        Returns:
            np.ndarray: descriptor values, columns follow the order of
                `descriptors` (names missing from the calculator output are skipped)
        """
        mols = [Chem.AddHs(mol) for mol in self.iterMols(mols)]
        df = self._padel.calculate(mols, show_banner=False, njobs=self.nJobs)
        # select in the order of `descriptors`; a set intersection would give
        # an arbitrary column order that can differ between processes
        calculated = set(df.columns)
        ordered = [name for name in self._keep if name in calculated]
        return df[ordered].values

    @property
    def descriptors(self) -> list[str]:
        """Names of the PaDEL descriptors that are returned."""
        return self._keep

    @descriptors.setter
    def descriptors(self, names: list[str] | None = None):
        """Set the descriptors to calculate.

        Args:
            names (list[str] | None): list of PaDEL descriptor names,
                `None` selects every name in the name mapping

        Raises:
            ValueError: if `names` contains names that are not in the name mapping
        """
        if names is None:
            # every mapped name is kept, so every mapped descriptor is needed
            keep = list(self._nameMapping)
        else:
            remainder = set(names).difference(self._nameMapping)
            if remainder:
                raise ValueError(
                    "names are not valid PaDEL descriptor names: "
                    f"{', '.join(remainder)}"
                )
            keep = names
        # convert from name to PaDEL descriptor objects, each one only once
        # and in a reproducible order
        self._descriptors = list(
            dict.fromkeys(self._nameMapping[name] for name in keep)
        )
        # Instantiate calculator
        self._padel = PaDELCalculator(self._descriptors, ignore_3D=self._ignore3D)
        # Set names to keep when calculating
        self._keep = keep

    def __str__(self):
        return "PaDEL"
class ExtendedValenceSignature(DescriptorSet):
    """SMILES signature based on extended valence sequence from
    The Signature Molecular Descriptor.

    1. Using Extended Valence Sequences in QSAR and QSPR Studies
    Jean-Loup Faulon, Donald P. Visco, and Ramdas S. Pophale
    Journal of Chemical Information and Computer Sciences 2003 43 (3), 707-720
    DOI: 10.1021/ci020345w
    """

    def __init__(self, depth: int | list[int]):
        """Initialize a ExtendedValenceSignature calculator

        Args:
            depth: depth of the signature
        """
        super().__init__()
        self._depth = depth
        self._signature = Signature_calculator()
        self._descriptors = []
        # Flag initialization of descriptors after first calculation
        self._descriptors_init = False

    @property
    def supportsParallel(self) -> bool:
        return False

    def getDescriptors(
        self, mols: list[Mol], props: dict[str, list[Any]], *args, **kwargs
    ) -> np.ndarray:
        """Calculate the signature descriptors for the given molecules.

        The first calculation defines `descriptors` from the columns that were
        found. Later calculations return exactly those columns in that order;
        columns not produced for the current molecules are filled with 0,
        matching the 0 used for missing values within a calculation.

        Args:
            mols (list[Mol]): molecules, passed through `self.iterMols`;
                hydrogens are added before calculation
            props (dict[str, list[Any]]): molecule properties, not used here
            *args: additional arguments, not used
            **kwargs: additional keyword arguments, not used

        Returns:
            np.ndarray: descriptor values with columns ordered as `descriptors`
        """
        mols = [Chem.AddHs(mol) for mol in self.iterMols(mols)]
        df = self._signature.calculate(
            mols, depth=self._depth, show_banner=False, njobs=1
        ).fillna(0)
        if not self._descriptors_init:
            # the setter also marks the descriptors as initialized
            self.descriptors = df.columns.tolist()
        else:
            # a set intersection would return the columns in arbitrary order and
            # drop unseen signatures, so the array would not match `descriptors`
            df = df.reindex(columns=self.descriptors, fill_value=0)
        return df.values

    @property
    def descriptors(self) -> list[str]:
        """Names of the signature descriptors that are returned."""
        return self._descriptors

    @descriptors.setter
    def descriptors(self, names: list[str] | None = None):
        """Set the descriptors to return.

        Args:
            names (list[str] | None): descriptor names, `None` stores an empty list
        """
        self._descriptors = [] if names is None else names
        # NOTE(review): the flag is set for `None` as well, so an empty selection
        # is not re-initialized by the next calculation - confirm this is intended
        self._descriptors_init = True

    def __str__(self):
        return "ExtendedValenceSignature"
class ProteinDescriptorSet(DescriptorSet):
    """Abstract base class for protein descriptor sets."""

    @abstractmethod
    def getProteinDescriptors(
        self, acc_keys: list[str], sequences: Optional[dict[str, str]] = None, **kwargs
    ) -> pd.DataFrame:
        """
        Calculate the protein descriptors for a given target.

        Args:
            acc_keys (list[str]):
                target accession keys, the resulting data frame
                will be indexed by these keys
            sequences (dict[str, str]):
                optional mapping of accession keys to protein sequences
            **kwargs:
                additional data passed from `ProteinDescriptorCalculator`

        Returns:
            pd.DataFrame:
                a data frame of descriptor values of shape (acc_keys, n_descriptors),
                indexed by `acc_keys`
        """

    def getDescriptors(
        self,
        mols: list[Mol],
        props: dict[str, list[Any] | dict[str, str]],
        *args,
        **kwargs,
    ) -> np.ndarray:
        """Get array of calculated protein descriptors for given targets.

        Args:
            mols (list[Mol]): list of molecules, not used
            props (dict[str, list[Any] | dict[str, str]]): dictionary of properties
                for the molecules, including the accession keys
            *args: additional arguments, not used
            **kwargs: additional keyword arguments, passed to `getProteinDescriptors`

        Returns:
            np.ndarray: array of calculated protein descriptors, one row per
                entry of `props["acc_keys"]`
        """
        # calculate the descriptors once per unique accession key
        unique_keys = sorted(set(props["acc_keys"]))
        values = self.getProteinDescriptors(unique_keys, **kwargs).reset_index()
        # an index column called "ID" holds the accession keys
        values.rename(columns={"ID": "acc_keys"}, inplace=True)
        # create a data frame with the same order of acc_keys as in props
        rows = pd.DataFrame({"acc_keys": props["acc_keys"]})
        # merge the calculated values with the data frame to attach them to the rows
        merged = rows.merge(
            values, left_on="acc_keys", right_on="acc_keys", how="left"
        ).set_index("acc_keys")
        return merged.values

    @property
    def requiredProps(self) -> list[str]:
        """Required properties: `acc_keys` followed by those of the parent class."""
        existing = super().requiredProps
        return ["acc_keys", *existing]

    @property
    def supportsParallel(self) -> bool:
        # declared as a property like in the other descriptor sets of this module;
        # as a plain method, reading the attribute gave a (truthy) bound method
        return False
class ProDec(ProteinDescriptorSet):
    """Protein descriptors from the ProDec package.

    See https://github.com/OlivierBeq/ProDEC.

    Attributes:
        sets (list[str]):
            list of ProDec descriptor names (see https://github.com/OlivierBeq/ProDEC)
        factory (prodec.ProteinDescriptors):
            factory to calculate descriptors
        msaProvider (MSAProvider):
            callable that creates the multiple sequence alignment
        msa (dict[str, str] | None):
            the multiple sequence alignment, created on first calculation
    """

    def __init__(
        self,
        sets: list[str] | None = None,
        msa_provider: MSAProvider | None = None,
    ):
        """Initialize a ProDec calculator.

        Args:
            sets:
                list of ProDec descriptor names, if `None`, all available are used
                (see https://github.com/OlivierBeq/ProDEC)
            msa_provider:
                provider of the multiple sequence alignment, if `None`,
                a new `ClustalMSA` instance is created
        """
        super().__init__()
        self.factory = prodec.ProteinDescriptors()
        self.sets = self.factory.available_descriptors if sets is None else sets
        self._descriptors = []
        # create the default provider per instance; a `ClustalMSA()` default in
        # the signature is built once at definition time and shared by all
        # instances
        self.msaProvider = ClustalMSA() if msa_provider is None else msa_provider
        self.msa = None

    def __getstate__(self):
        """Return the state of the parent class without the factory."""
        # work on a copy so that the dictionary of the parent is left untouched
        o_dict = dict(super().__getstate__())
        # Remove factory from state
        o_dict.pop("factory", None)
        return o_dict

    def __setstate__(self, state):
        """Restore the state and create a new factory."""
        super().__setstate__(state)
        # Add factory to state
        self.factory = prodec.ProteinDescriptors()

    @staticmethod
    def calculateDescriptor(
        factory: prodec.ProteinDescriptors, msa: dict[str, str], descriptor: str
    ):
        """
        Calculate a protein descriptor for given targets
        using a given multiple sequence alignment.

        Args:
            factory (ProteinDescriptors):
                factory to create the descriptor
            msa (dict):
                mapping of accession keys to sequences from the multiple
                sequence alignment
            descriptor (str):
                name of the descriptor to calculate (see
                https://github.com/OlivierBeq/ProDEC)

        Returns:
            a data frame of descriptor values of shape (acc_keys, n_descriptors),
            indexed by acc_keys
        """
        # Get protein descriptor from ProDEC
        prodec_descriptor = factory.get_descriptor(descriptor)
        # Calculate descriptor features for aligned sequences of interest
        return prodec_descriptor.pandas_get(msa.values(), ids=msa.keys())

    def getProteinDescriptors(
        self, acc_keys: list[str], sequences: Optional[dict[str, str]] = None, **kwargs
    ) -> pd.DataFrame:
        """
        Calculate the protein descriptors for a given target.

        Args:
            acc_keys:
                target accession keys; not read by this implementation, the
                rows of the result are the entries of the alignment
            sequences:
                optional mapping of accession keys to protein sequences,
                passed to the MSA provider
            **kwargs:
                any additional data passed from `ProteinDescriptorCalculator`,
                forwarded to the MSA provider

        Returns:
            a data frame of descriptor values of shape (acc_keys, n_descriptors),
            indexed by the `ID` column and with columns ordered as `descriptors`
        """
        # calculate MSA (only once, it is reused by later calls)
        if not self.msa:
            self.msa = self.msaProvider(sequences, **kwargs)
        # calculate descriptors
        frames = [
            self.calculateDescriptor(self.factory, self.msa, descriptor)
            for descriptor in self.sets
        ]
        df = pd.concat(frames, axis=1)
        df.set_index("ID", inplace=True, drop=True)
        # the first calculation defines the descriptors if none were set
        if not self._descriptors:
            self._descriptors = sorted(df.columns.tolist())
        # keep only the requested descriptors, in the order of `descriptors`
        return df[self.descriptors]

    @property
    def descriptors(self) -> list[str]:
        """Sorted names of the descriptors that are returned."""
        return sorted(self._descriptors)

    @descriptors.setter
    def descriptors(self, value):
        self._descriptors = value

    def __str__(self):
        return "ProDec_" + "_".join(self.sets)