Source code for qsprpred.data.processing.imputers

from abc import abstractmethod

import pandas as pd
from sklearn.impute._base import _BaseImputer

from .step import Step


class Imputer(Step):
    """Processing step interface for filling in missing values.

    The class declares a single abstract hook, ``transform``, which receives
    the feature frame and (optionally) the target frame and hands both back
    as a pair.
    """

    @abstractmethod
    def transform(
        self,
        X: pd.DataFrame,
        y: None | pd.DataFrame = None,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Impute values in the dataset.

        Args:
            X (pd.DataFrame): features (to be imputed)
            y (pd.DataFrame, optional): target data (to be imputed)

        Returns:
            pd.DataFrame: (imputed) data
            pd.DataFrame: (imputed) target data
        """
class TargetImputer(Imputer):
    """Imputer that fills missing values in target columns only.

    The wrapped ``imputer`` object is fitted on, and applied to, the selected
    columns of ``y``; the feature frame ``X`` is handed back untouched.
    """

    def __init__(
        self,
        imputer: _BaseImputer,
        target_properties: list[str] | None = None,
    ):
        """Initialize the target imputer.

        Args:
            imputer (callable): imputer object, e.g. from sklearn.impute;
                it must provide ``fit`` and ``transform`` methods
            target_properties (list[str], optional): names of the target
                columns to impute. When None, the column list of the ``y``
                passed to ``fit`` is used.
        """
        self.imputer = imputer
        self.target_properties = target_properties
        # Flipped to True by fit(); transform() refuses to run before that.
        self._fitted = False

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """Fit the wrapped imputer on the selected target columns.

        Args:
            X (pd.DataFrame): training data features (not used for fitting)
            y (pd.DataFrame): training targets
        """
        columns = self.target_properties
        if columns is None:
            # No explicit selection: adopt every column of y and remember the
            # choice on the instance so transform() uses the same list.
            columns = y.columns.tolist()
            self.target_properties = columns
        self.imputer.fit(y[columns])
        self._fitted = True

    def transform(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Impute values in the selected target columns.

        Args:
            X (pd.DataFrame): features (returned as passed in)
            y (pd.DataFrame): target data (to be imputed)

        Returns:
            pd.DataFrame: the feature data, unchanged
            pd.DataFrame: a copy of ``y`` with the selected columns imputed

        Raises:
            ValueError: if ``fit`` has not been called yet
        """
        if not self._fitted:
            raise ValueError("Imputer not fitted.")
        # Work on a copy so the caller's frame is not modified.
        y_imputed = y.copy()
        columns = self.target_properties
        filled = self.imputer.transform(y[columns])
        y_imputed[columns] = filled
        return X, y_imputed
class FeatureImputer(Imputer):
    """Imputer that fills missing values in feature columns only.

    Columns are selected by prefix (see ``get_features_to_be_imputed``); the
    wrapped ``imputer`` object is fitted on, and applied to, those columns of
    ``X``, while the target frame ``y`` is handed back untouched.
    """

    def __init__(
        self,
        imputer: _BaseImputer,
        feature_properties: list[str] | None = None,
    ):
        """Initialize the feature imputer.

        Args:
            imputer (callable): imputer object, e.g. from sklearn.impute;
                it must provide ``fit`` and ``transform`` methods
            feature_properties (list[str], optional): feature properties to
                impute. When None, the column list of the ``X`` passed to
                ``fit`` is used. Entries may be a DescriptorSet name or
                feature names prefixed by the DescriptorSet name, e.g.
                ['RDKitDesc', 'MorganFP_0', 'MorganFP_1']. Entries are
                matched as column-name prefixes, so 'MorganFP_1' also selects
                a column named 'MorganFP_10'.
        """
        self.imputer = imputer
        self.feature_properties = feature_properties
        # Flipped to True by fit(); transform() refuses to run before that.
        self._fitted = False

    def fit(self, X: pd.DataFrame, y: pd.DataFrame):
        """Fit the wrapped imputer on the selected feature columns.

        Args:
            X (pd.DataFrame): training data features
            y (pd.DataFrame): training targets (not used for fitting)
        """
        if self.feature_properties is None:
            # No explicit selection: adopt every column name of X and keep the
            # list on the instance for later transform() calls.
            self.feature_properties = X.columns.tolist()
        self.imputer.fit(X[self.get_features_to_be_imputed(X)])
        self._fitted = True

    def transform(
        self,
        X: pd.DataFrame,
        y: pd.DataFrame,
    ) -> tuple[pd.DataFrame, pd.DataFrame]:
        """Impute values in the selected feature columns.

        Args:
            X (pd.DataFrame): features (to be imputed)
            y (pd.DataFrame): target data (returned as passed in)

        Returns:
            pd.DataFrame: a copy of ``X`` with the selected columns imputed
            pd.DataFrame: the target data, unchanged

        Raises:
            ValueError: if ``fit`` has not been called yet
        """
        if not self._fitted:
            raise ValueError("Imputer not fitted.")
        # Work on a copy so the caller's frame is not modified.
        X_imputed = X.copy()
        # The selection is recomputed from the columns of this X.
        selected = self.get_features_to_be_imputed(X)
        filled = self.imputer.transform(X[selected])
        X_imputed[selected] = filled
        return X_imputed, y

    def get_features_to_be_imputed(self, X: pd.DataFrame) -> list[str]:
        """Get the features that will be imputed.

        A column is selected when its name starts with at least one entry of
        ``feature_properties``. Selected names keep the column order of ``X``.

        Args:
            X (pd.DataFrame): features

        Returns:
            list[str]: features to be imputed
        """
        selected = []
        for column in X.columns.tolist():
            for prefix in self.feature_properties:
                if column.startswith(prefix):
                    selected.append(column)
                    # One matching prefix is enough; do not add twice.
                    break
        return selected