Source code for qsprpred.data.processing.target_transformers
import pandas as pd
from .step import Step
from abc import abstractmethod
import numpy as np
from typing import Literal, ClassVar
from sklearn.preprocessing import LabelEncoder
[docs]
class TargetTransformer(Step):
    """Abstract base for steps that transform the target data.

    Concrete subclasses implement `transform`, which receives the features
    together with the (optional) target data and returns both.
    """

    @abstractmethod
    def transform(
        self, X: pd.DataFrame, y: pd.DataFrame | None = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Transform the target data.

        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame | None): target data (to be transformed)

        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: (transformed) target data; the annotation
                allows `None` because `y` is optional and the concrete
                transformers in this module return `None` when no target
                data is passed.
        """
[docs]
class Discretizer(TargetTransformer):
    """Discretizes the target data into bins.

    Note. using this step in a pipeline may break the subsequent model training
    as the discretizer does not update the `targetProperties` of the dataset.
    It is recommended to use the `makeClassification` method of the dataset instead,
    see the documentation of the `QSPRDataSet` class.

    Attributes:
        target (str): name of the target property to be discretized
        th (list[float]): thresholds for the bins
        le (LabelEncoder): label encoder for multi-class discretization, only
            set by `transform` if more than one threshold is provided.
    """

    def __init__(self, target: str, th: list[float] | float):
        """Initialize the discretizer.

        Args:
            target (str): name of the target property to be discretized
            th (list[float] | float): thresholds for the bins.
                If a single number is provided, it will be used as a single
                threshold (binary discretization).
                If a list is provided, it should contain either exactly one
                value or at least four values (bin edges).

        Raises:
            AssertionError: if the threshold list is empty or contains
                two or three values.
        """
        self._fitted = False
        self.target = target
        if isinstance(th, (int, float)):
            th = [th]
        assert len(th) > 0, "Threshold list must contain at least one value."
        if len(th) > 1:
            assert len(th) > 3, (
                "For multi-class classification, set at least 4 thresholds. "
                "These define the lower and upper bounds of the bins, e.g. "
                "[1, 2, 3, 4] will create bins (1,2], (2,3], (3,4]."
            )
        # Store a copy so later changes to the caller's list cannot alter
        # the thresholds of this step.
        self.th = list(th)

    def transform(
        self, X: pd.DataFrame, y: pd.DataFrame | None = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Discretize the target data into bins.

        With a single threshold the target becomes 1.0 where the value is
        strictly greater than the threshold and 0.0 otherwise. With several
        thresholds the values are binned with `pd.cut` and the resulting
        intervals are label-encoded. Missing target values stay NaN in both
        cases. The input `y` is not modified; all of its other columns are
        carried over unchanged into the returned frame.

        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame | None): target data to be discretized

        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: (discretized) target data

        Raises:
            ValueError: if the target column is not present in `y`.
            AssertionError: if, for multi-class discretization, a target
                value lies outside the range spanned by the thresholds.
        """
        if y is None:
            return X, y
        if isinstance(y, pd.Series):
            y = pd.DataFrame(y)
        if self.target not in y.columns:
            raise ValueError(f"Target {self.target} not found in target data.")

        values = y[self.target]
        observed = values.notna().to_numpy()
        if not observed.any():
            # Nothing to discretize (empty or all-missing target).
            return X, y

        if len(self.th) > 1:
            # `Series.max`/`min` skip missing values.
            assert values.max() <= max(self.th), (
                "Make sure final threshold value is not smaller "
                "than largest value of property"
            )
            assert values.min() >= min(self.th), (
                "Make sure first threshold value is not larger "
                "than smallest value of property"
            )
            intervals = pd.cut(
                values, bins=self.th, include_lowest=True
            ).astype(str)
            # NOTE: the encoder numbers the classes by the sorted order of the
            # interval *strings*, which is not necessarily the numeric bin
            # order; use `getIntervals` to map codes back to intervals.
            self.le = LabelEncoder()
            codes = self.le.fit_transform(intervals)
            discretized = np.where(observed, codes, np.nan).astype(float)
        else:
            discretized = np.where(
                observed, values > self.th[0], np.nan
            ).astype(float)

        # Replace only the target column so that any other column in `y`
        # is preserved, and leave the caller's frame untouched.
        y_transformed = y.copy()
        y_transformed[self.target] = discretized
        return X, y_transformed

    def getIntervals(self, discrete_values: pd.Series) -> pd.Series:
        """Transform the discretized values to intervals.

        Args:
            discrete_values (pd.Series): discretized values; missing values
                are allowed and are returned as NaN.

        Returns:
            pd.Series: intervals (as strings) corresponding to the
                discretized values, on the index of `discrete_values`

        Raises:
            ValueError: if the discretizer is not fitted, or if no label
                encoder is available (binary discretization).
        """
        if not self.fitted:
            raise ValueError("Discretizer has not been fitted yet.")
        if not hasattr(self, "le"):
            raise ValueError(
                "No label encoder found, intervals can only be retrieved "
                "for multi-class discretization."
            )
        # `transform` emits NaN for missing targets; casting those to int
        # would fail, so only the observed codes are decoded.
        observed = discrete_values.notna().to_numpy()
        intervals = np.full(len(discrete_values), np.nan, dtype=object)
        intervals[observed] = self.le.inverse_transform(
            discrete_values.to_numpy()[observed].astype(int)
        )
        return pd.Series(intervals, index=discrete_values.index)
[docs]
class SimpleTargetTransformer(TargetTransformer):
    """Applies a simple transformation to the target data.

    Attributes:
        transform_dict (dict): dictionary of available transformations
        inverse_transform_dict (dict): inverse function for each available
            transformation, under the same keys as `transform_dict`
        target (str): name of the target property to be transformed
        transformation (str): key of the selected transformation
    """

    # NOTE(review): presumably lists the attributes the base class leaves out
    # of JSON serialization -- confirm against `Step`.
    _notJSON: ClassVar = ["transform_dict", "inverse_transform_dict"]
    # NOTE(review): each lambda resolves numpy via `__import__` instead of the
    # module-level `np`; this looks deliberate (self-contained callables) --
    # confirm before simplifying.
    transform_dict = {
        "log10": lambda x: (__import__("numpy").log10(x)),
        "log2": lambda x: (__import__("numpy").log2(x)),
        "log": lambda x: (__import__("numpy").log(x)),
        "sqrt": lambda x: (__import__("numpy").sqrt(x)),
        "cbrt": lambda x: (__import__("numpy").cbrt(x)),
        "exp": lambda x: (__import__("numpy").exp(x)),
        "square": lambda x: __import__("numpy").power(x, 2),
        "cube": lambda x: __import__("numpy").power(x, 3),
        "reciprocal": lambda x: __import__("numpy").reciprocal(x),
    }
    inverse_transform_dict = {
        "log10": lambda x: (__import__("numpy").power(10, x)),
        "log2": lambda x: (__import__("numpy").power(2, x)),
        "log": lambda x: (__import__("numpy").exp(x)),
        "sqrt": lambda x: (__import__("numpy").power(x, 2)),
        "cbrt": lambda x: (__import__("numpy").power(x, 3)),
        "exp": lambda x: (__import__("numpy").log(x)),
        "square": lambda x: __import__("numpy").sqrt(x),
        "cube": lambda x: __import__("numpy").cbrt(x),
        "reciprocal": lambda x: __import__("numpy").reciprocal(x),
    }

    def __init__(
        self,
        target: str,
        transformation: Literal[
            "log10", "log2", "log", "sqrt", "cbrt", "exp", "square", "cube",
            "reciprocal"
        ],
    ):
        """Initialize the SimpleTargetTransformer.

        Args:
            target (str): name of the target property to be transformed
            transformation (str): transformer function, should be
                one of log10, log2, log, sqrt, cbrt, exp, square, cube, reciprocal

        Raises:
            ValueError: if `transformation` is not a key of `transform_dict`.
        """
        # Validate first so that a failed construction sets no attributes.
        if transformation not in self.transform_dict:
            raise ValueError(
                f"Transformation {transformation} not recognized. "
                f"Available transformations: {list(self.transform_dict.keys())}"
            )
        self._fitted = False
        self.target = target
        self.transformation = transformation

    def _applyToTarget(
        self, func: callable, X: pd.DataFrame, y: pd.DataFrame | None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Apply `func` to the target column of a copy of `y`.

        Shared implementation of `transform` and `inverseTransform`.

        Args:
            func (callable): function applied to the target column
            X (pd.DataFrame): features, returned unchanged
            y (pd.DataFrame | None): target data

        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: target data with the target column replaced
                by `func(column)`, or None if `y` is None

        Raises:
            ValueError: if the target column is not present in `y`.
        """
        if y is None:
            return X, None
        if isinstance(y, pd.Series):
            # Accept a bare Series, as the Discretizer does.
            y = pd.DataFrame(y)
        if self.target not in y.columns:
            raise ValueError(f"Target {self.target} not found in target data.")
        # Work on a copy so the caller's frame is not modified in place.
        y_transformed = y.copy()
        y_transformed[self.target] = func(y[self.target])
        return X, y_transformed

    def transform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Transform the target data using the selected transformation.

        Only the target column is changed; `X` and the other columns of `y`
        are passed through, and the input `y` itself is not modified.

        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame | None): target data to be transformed

        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: (transformed) target data

        Raises:
            ValueError: if the target column is not present in `y`.
        """
        return self._applyToTarget(self.getTransformer(), X, y)

    def inverseTransform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Transform the target data using the inverse of the transformation.

        Only the target column is changed; `X` and the other columns of `y`
        are passed through, and the input `y` itself is not modified.

        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame | None): target data to be transformed back

        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: (inverse transformed) target data

        Raises:
            ValueError: if the target column is not present in `y`.
        """
        return self._applyToTarget(self.getInverseTransformer(), X, y)

    def getTransformer(self) -> callable:
        """Get the transformer function

        Returns:
            callable: transformer function
        """
        return self.transform_dict[self.transformation]

    def getInverseTransformer(self) -> callable:
        """Get the inverse transformer function

        Returns:
            callable: inverse transformer function
        """
        return self.inverse_transform_dict[self.transformation]