Source code for qsprpred.data.processing.target_transformers

import pandas as pd
from .step import Step
from abc import abstractmethod
import numpy as np
from typing import Literal, ClassVar
from sklearn.preprocessing import LabelEncoder


[docs]
class TargetTransformer(Step):
    """Base class for steps that transform the target data.

    Subclasses implement `transform`, which is declared abstract here. It
    receives the features together with the optional target data and returns
    both.
    """


[docs]
    @abstractmethod
    def transform(self, X: pd.DataFrame, y: None | pd.DataFrame = None) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Transform the target data.
        
        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame): target data (to be transformed)
        
        Returns:
            pd.DataFrame: data
            pd.DataFrame: (transformed) target data
        """
        pass


    

[docs]
class Discretizer(TargetTransformer):
    """Discretizes the target data into bins.

    With a single threshold the target becomes binary: 1.0 for values strictly
    greater than the threshold and 0.0 otherwise. With several thresholds the
    values are binned with `pandas.cut` (lowest edge included) and the
    resulting intervals are label-encoded. Missing target values stay NaN in
    both cases.

    Note: using this step in a pipeline may break the subsequent model training
    as the discretizer does not update the `targetProperties` of the dataset.
    It is recommended to use the `makeClassification` method of the dataset instead,
    see the documentation of the `QSPRDataSet` class.

    Attributes:
        target (str): name of the target property to be discretized
        th (list[float]): thresholds for the bins
        le (LabelEncoder): label encoder for multi-class discretization, only
            used if more than one threshold is provided. It is created by
            `transform`, so it does not exist before the first call.
    """
    
    def __init__(self, target: str, th: list[float] | float):
        """Initialize the discretizer.
        
        Args:
            target (str): name of the target property to be discretized
            th (list[float] | float): thresholds for the bins.
                If a single float is provided, it will be used as a single threshold.
                If a list is provided, it should contain at least one value.
        """
        self._fitted = False
        self.target = target
        if isinstance(th, float) or isinstance(th, int):
            th = [th]
        assert len(th) > 0, "Threshold list must contain at least one value."
        if len(th) > 1:
            assert len(th) > 3, (
                "For multi-class classification, set at least 4 thresholds. "
                "These define the lower and upper bounds of the bins, e.g. "
                "[1, 2, 3, 4] will create bins (1,2], (2,3], (3,4]."
            )
        
        self.th = th
    

[docs]
    def transform(self, X: pd.DataFrame, y: pd.DataFrame | None = None) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Discretize the target data into bins.
        
        Args:
            X (pd.DataFrame): features
            y (pd.DataFrame | None): target data to be discretized
        
        Returns:
            pd.DataFrame: data
            pd.DataFrame | None: (discretized) target data
        """
        if y is None:
            return X, y
        if isinstance(y, pd.Series):
            y = pd.DataFrame(y)
        if self.target not in y.columns:
            raise ValueError(f"Target {self.target} not found in target data.")
        if y[self.target].isna().all():
            return X, y
        if len(self.th) > 1:
            assert max(y[self.target].dropna()) <= max(self.th), (
                "Make sure final threshold value is not smaller "
                "than largest value of property"
            )
            assert min(y[self.target].dropna()) >= min(self.th), (
                "Make sure first threshold value is not larger "
                "than smallest value of property"
            )
            y_intervals = pd.cut(
                y[self.target], bins=self.th, include_lowest=True
            ).astype(str)
            self.le = LabelEncoder()
            encoded_intervals = self.le.fit_transform(y_intervals)
            y_transformed = pd.DataFrame(
                np.where(
                    y[self.target].notna(), encoded_intervals, np.nan
                ).astype(float),
                columns=[self.target],
                index=y.index
            )         
        else:
            binary_target = y[self.target] > self.th[0]
            y_transformed = pd.DataFrame(
                binary_target.where(y[self.target].notna(), np.nan).astype(float),
                columns=[self.target],
                index=y.index
            )
        return X, y_transformed

    

[docs]
    def getIntervals(self, discrete_values: pd.Series) -> pd.Series:
        """Transform the discretized values to intervals.

        Missing codes, which `transform` emits for missing target values, are
        passed through as NaN instead of failing the integer conversion.

        Args:
            discrete_values (pd.Series): discretized values
        Returns:
            pd.Series: intervals corresponding to the discretized values,
                with NaN wherever the discretized value is missing

        Raises:
            ValueError: if the discretizer is not fitted, or if no label
                encoder exists (binary discretization).
        """
        if not self.fitted:
            raise ValueError("Discretizer has not been fitted yet.")
        if not hasattr(self, "le"):
            raise ValueError(
                "No label encoder found, intervals can only be retrieved "
                "for multi-class discretization."
            )
        known = discrete_values.notna().to_numpy()
        if known.all():
            return pd.Series(
                self.le.inverse_transform(discrete_values.astype(int)),
                index=discrete_values.index
            )
        # decode only the known codes; the remaining slots keep NaN
        intervals = np.full(len(discrete_values), np.nan, dtype=object)
        if known.any():
            intervals[known] = self.le.inverse_transform(
                discrete_values[known].astype(int)
            )
        return pd.Series(intervals, index=discrete_values.index)



    

[docs]
class SimpleTargetTransformer(TargetTransformer):
    """Applies a simple transformation to the target data.

    The transformation is selected by name at construction time and looked up
    in the class-level tables below.

    Attributes:
        target (str): name of the target property to be transformed
        transformation (str): name of the selected transformation, a key of
            `transform_dict`
        transform_dict (dict): available transformations, keyed by name
        inverse_transform_dict (dict): inverse of each transformation, keyed
            by the same names
    """
    # NOTE(review): presumably the attributes the base class leaves out of JSON
    # serialization (lambdas are not serializable) - confirm against `Step`.
    _notJSON: ClassVar = ["transform_dict", "inverse_transform_dict"]
    
    # Forward transformations.
    # NOTE(review): numpy is resolved through `__import__` inside each lambda
    # rather than through the module-level `np`; presumably this keeps the
    # lambdas self-contained - confirm before simplifying.
    transform_dict = {
        "log10": lambda x: (__import__("numpy").log10(x)),
        "log2": lambda x: (__import__("numpy").log2(x)),
        "log": lambda x: (__import__("numpy").log(x)),
        "sqrt": lambda x: (__import__("numpy").sqrt(x)),
        "cbrt": lambda x: (__import__("numpy").cbrt(x)),
        "exp": lambda x: (__import__("numpy").exp(x)),
        "square": lambda x: __import__("numpy").power(x, 2),
        "cube": lambda x: __import__("numpy").power(x, 3),
        "reciprocal": lambda x: __import__("numpy").reciprocal(x),
    }
    # Inverse of each entry of `transform_dict`, under the same key.
    # The inverse of "square" is `sqrt`, which yields the non-negative root,
    # so the sign of a negative original value is not recovered.
    inverse_transform_dict = {
        "log10": lambda x: (__import__("numpy").power(10, x)),
        "log2": lambda x: (__import__("numpy").power(2, x)),
        "log": lambda x: (__import__("numpy").exp(x)),
        "sqrt": lambda x: (__import__("numpy").power(x, 2)),
        "cbrt": lambda x: (__import__("numpy").power(x, 3)),
        "exp": lambda x: (__import__("numpy").log(x)),
        "square": lambda x: __import__("numpy").sqrt(x),
        "cube": lambda x: __import__("numpy").cbrt(x),
        "reciprocal": lambda x: __import__("numpy").reciprocal(x),
    }
    
    
    def __init__(
        self, 
        target: str,
        transformation: Literal["log10", "log2", "log", "sqrt", "cbrt", "exp", "square", "cube", "reciprocal"]
    ):
        """Initialize the SklearnStep
        
        Args:
            target (str): name of the target property to be transformed
            transformation (str): transformer function, should be 
            one of log10, log2, log, sqrt, cbrt, exp, square, cube, reciprocal
        """
        self._fitted = False
        self.target = target
        self.transformation = transformation
        if transformation not in self.transform_dict:
            raise ValueError(
                f"Transformation {transformation} not recognized. "
                f"Available transformations: {list(self.transform_dict.keys())}"
            )
        
    

[docs]
    def transform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Transform the data using the transformer
        
        Args:
            X (pd.DataFrame): data to be transformed
            y (pd.DataFrame | None): target data to be transformed

        Returns:
            pd.DataFrame: transformed data
            pd.DataFrame | None: (transformed) target data
        """
        if y is None:
            return X, None
        if self.target not in y.columns:
            raise ValueError(f"Target {self.target} not found in target data.")
        
        y[self.target] = self.getTransformer()(y[self.target])
        return X, y

    

[docs]
    def inverseTransform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Inverse transform the data using the inverse transformer
        
        Args:
            X (pd.DataFrame): data to be transformed
            y (pd.DataFrame | None): target data to be transformed

        Returns:
            pd.DataFrame: transformed data
            pd.DataFrame | None: (transformed) target data
        """
        if y is None:
            return X, None
        if self.target not in y.columns:
            raise ValueError(f"Target {self.target} not found in target data.")
        
        y[self.target] = self.getInverseTransformer()(y[self.target])
        return X, y

        

[docs]
    def getTransformer(self) -> callable:
        """Look up the forward transformation selected for this step.

        Returns:
            callable: the entry of `transform_dict` stored under
                `self.transformation`
        """
        forward_functions = self.transform_dict
        return forward_functions[self.transformation]



[docs]
    def getInverseTransformer(self) -> callable:
        """Look up the inverse of the transformation selected for this step.

        Returns:
            callable: the entry of `inverse_transform_dict` stored under
                `self.transformation`
        """
        inverse_functions = self.inverse_transform_dict
        return inverse_functions[self.transformation]