Source code for qsprpred.data.processing.step

from abc import abstractmethod
import pandas as pd
from ...utils.serialization import JSONSerializable
from ...utils.interfaces.randomized import Randomized

class Step(JSONSerializable):
    """Base class for a single preprocessing operation on a dataset.

    A step can optionally be fitted on training data and is then applied
    to data through `transform`.
    """

    def __init__(self, **kwargs):
        """Create an unfitted step.

        Args:
            **kwargs: forwarded unchanged to the parent initializer
        """
        # The flag is set before delegating, mirroring the original order.
        self._fitted = False
        super().__init__(**kwargs)

    def fit(self, X: pd.DataFrame, y: None | pd.DataFrame = None):
        """Fit the step to the dataset.

        The base implementation learns nothing from the data and only
        records that the step has been fitted. Steps that need to derive
        parameters from the data should override this method.

        Args:
            X (pd.DataFrame): training data
            y (pd.DataFrame | None): training targets
        """
        self._fitted = True

    @abstractmethod
    def transform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Apply the step to the dataset.

        Note:
            Implementations should not modify the original data.

        Args:
            X (pd.DataFrame): data to be transformed
            y (pd.DataFrame | None): target data to be transformed

        Returns:
            tuple[pd.DataFrame, pd.DataFrame | None]: the transformed data
                and the (transformed) target data
        """
        ...

    def fitTransform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Fit the step on the given data, then transform that same data.

        Args:
            X (pd.DataFrame): training data
            y (pd.DataFrame | None): training targets

        Returns:
            tuple[pd.DataFrame, pd.DataFrame | None]: the transformed data
                and the (transformed) target data
        """
        self.fit(X, y)
        transformed = self.transform(X, y)
        return transformed

    @property
    def fitted(self) -> bool:
        """Whether the step has been fitted.

        Returns:
            bool: True if the step is fitted, False otherwise
        """
        return self._fitted
class DummyStep(Step):
    """No-op step: hands its input back untouched."""

    def transform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Return the input data as-is.

        Args:
            X (pd.DataFrame): data to be transformed
            y (pd.DataFrame | None): target data to be transformed

        Returns:
            tuple[pd.DataFrame, pd.DataFrame | None]: the very same `X` and
                `y` objects that were passed in (no copy is made)
        """
        passthrough = (X, y)
        return passthrough
class Shuffle(Step, Randomized):
    """Step that shuffles the rows of the data.

    Attributes:
        seed (int | None): Seed used to randomize the shuffle.
        randomState (int | None): Property that reads and writes `seed`.
    """

    def __init__(self, seed: int | None = None):
        """Initialize the shuffle step.

        Args:
            seed (int | None): Seed to randomize the shuffle.
        """
        # NOTE(review): the parent initializers are not invoked here; the
        # fitted flag is set directly instead -- confirm this is intentional.
        self._fitted = False
        self.seed = seed

    @property
    def randomState(self) -> int | None:
        """Get the random state for the object."""
        return self.seed

    @randomState.setter
    def randomState(self, seed: int | None):
        """Set the random state for the object.

        Args:
            seed (int | None): The seed to use to randomize the action.
                If `None`, a random seed is used instead of a fixed one.
        """
        self.seed = seed

    def transform(
        self, X: pd.DataFrame, y: None | pd.DataFrame = None
    ) -> tuple[pd.DataFrame, pd.DataFrame | None]:
        """Shuffle the rows of the data, keeping the targets in step.

        The inputs are not modified; shuffled copies are returned.

        Targets are normally matched to the shuffled data by index label.
        When the index of `y` contains duplicate labels, a label lookup
        returns every matching row for each requested label, so the result
        would have more rows than `X`. In that case, if `y` has as many
        rows as `X`, both frames are permuted by position instead; this
        assumes the rows of `X` and `y` correspond one-to-one in order.

        Args:
            X (pd.DataFrame): data to be shuffled
            y (pd.DataFrame | None): target data to be shuffled

        Returns:
            tuple[pd.DataFrame, pd.DataFrame | None]: the shuffled data and
                the shuffled target data (`None` if no targets were given)
        """
        if y is None:
            return X.sample(frac=1, random_state=self.randomState), None

        if y.index.is_unique or len(y) != len(X):
            # Label lookup is unambiguous (or positional pairing is not
            # possible): keep the label-based alignment.
            X_shuffled = X.sample(frac=1, random_state=self.randomState)
            return X_shuffled, y.loc[X_shuffled.index]

        # Duplicate target labels: draw one positional permutation and apply
        # it to both frames so that rows stay paired.
        order = (
            pd.Series(range(len(X)))
            .sample(frac=1, random_state=self.randomState)
            .to_numpy()
        )
        return X.iloc[order], y.iloc[order]