Source code for qsprpred.models.scikit_learn

"""Here the QSPRmodel classes can be found.

At the moment there is a class for sklearn type models. However, one for a pytorch DNN
model can be found in `qsprpred.deep`. To add more types a model class implementing
the `QSPRModel` interface can be added.
"""

import os
from typing import Any

import ml2json
import numpy as np
import pandas as pd
from sklearn.exceptions import NotFittedError
from sklearn.utils.validation import check_is_fitted

from .model import QSPRModel
from ..data.tables.qspr import QSPRDataset
from ..logs import logger
from ..tasks import ModelTasks


class SklearnModel(QSPRModel):
    """QSPRModel class for sklearn type models.

    Wrap your sklearn model class in this class to use it with the
    `QSPRModel` interface.
    """

    def __init__(
        self,
        base_dir: str,
        alg=None,
        name: str | None = None,
        parameters: dict | None = None,
        autoload: bool = True,
        random_state: int | None = None,
    ):
        """Initialize SklearnModel model.

        Args:
            base_dir (str): base directory for model
            alg (Type): sklearn model class
            name (str): customized model name
            parameters (dict): model parameters
            autoload (bool): load model from file
            random_state (int): seed for the random state

        Raises:
            Exception: whatever `alg` raises when it cannot be instantiated
                with the given parameters (logged, then re-raised unchanged).
        """
        super().__init__(base_dir, alg, name, parameters, autoload, random_state)
        # Dry-run instantiation: the created object is discarded, the call only
        # verifies early that `alg` accepts the configured parameters.
        try:
            if self.parameters is not None:
                self.alg(**self.parameters)
            else:
                self.alg()
        except Exception:
            # `Exception` rather than a bare `except` so that KeyboardInterrupt
            # and SystemExit are not reported as an initialization failure.
            logger.error(
                f"Cannot initialize alg {self.alg} with parameters {self.parameters}."
            )
            raise
        # Push the parameters onto an already present estimator, but only when
        # that estimator has not been fitted yet.
        if (
            (self.parameters not in [None, {}])
            and hasattr(self, "estimator")
            and self.estimator is not None
        ):
            try:
                check_is_fitted(self.estimator)
            except NotFittedError:
                self.estimator.set_params(**self.parameters)
        # log some things
        logger.info("parameters: %s" % self.parameters)
        logger.debug(f'Model "{self.name}" initialized in: "{self.baseDir}"')

    @property
    def supportsEarlyStopping(self) -> bool:
        """Whether the model supports early stopping or not."""
        return False

    def loadEstimator(self, params: dict | None = None) -> Any:
        """Load estimator from alg and params.

        The parameters actually used are those returned by
        `self.getParameters(params)`.

        Args:
            params (dict): parameters

        Returns:
            Any: a new, unfitted instance of `self.alg`
        """
        new_parameters = self.getParameters(params)
        if new_parameters is not None:
            return self.alg(**new_parameters)
        return self.alg()

    def loadEstimatorFromFile(
        self, params: dict | None = None, fallback_load: bool = True
    ):
        """Load estimator from file.

        The estimator is read from `{self.outPrefix}.json`. When the file
        exists, `self.alg` is overwritten with the class of the loaded
        estimator.

        Args:
            params (dict): parameters
            fallback_load (bool): if `True`, init estimator from alg and params
                if no estimator found at path

        Returns:
            the estimator loaded from file, or a freshly created estimator
            when the file is missing and `fallback_load` is `True`

        Raises:
            FileNotFoundError: if the file is missing and `fallback_load`
                is `False`
        """
        path = f"{self.outPrefix}.json"
        if os.path.isfile(path):
            estimator = ml2json.from_json(path)
            # keep `alg` in sync with what was actually stored on disk
            self.alg = estimator.__class__
            if params is not None:
                new_parameters = self.getParameters(params)
                if new_parameters is not None:
                    estimator = estimator.set_params(**new_parameters)
            return estimator
        if fallback_load:
            logger.warning(
                f"No estimator found at {path}, creating unfitted estimator instead. "
                "Set fallback_load to False to prevent this."
            )
            return self.loadEstimator(params)
        raise FileNotFoundError(
            f"No estimator found at {path}, loading estimator from file failed."
        )

    def saveEstimator(self) -> str:
        """See `QSPRModel.saveEstimator`.

        Returns:
            str: path of the JSON file (`{self.outPrefix}.json`) the estimator
                was written to
        """
        estimator_path = f"{self.outPrefix}.json"
        ml2json.to_json(self.estimator, estimator_path)
        return estimator_path

    def fit(
        self,
        X: pd.DataFrame | np.ndarray,
        y: pd.DataFrame | np.ndarray,
        estimator: Any = None,
        mode: Any = None,
        monitor: None = None,
        **kwargs,
    ):
        """Fit the estimator on the given data.

        Args:
            X (pd.DataFrame | np.ndarray): data matrix to fit on
            y (pd.DataFrame | np.ndarray): target matrix to fit on
            estimator (Any): estimator to fit, `self.estimator` is used
                if `None`
            mode (Any): accepted but not used by this implementation
            monitor (None): accepted but not used by this implementation
            **kwargs: accepted but not used by this implementation

        Returns:
            the return value of `estimator.fit(X, y)`

        Raises:
            ValueError: if the task is `ModelTasks.MULTITASK_MIXED`
            NotImplementedError: if the task is
                `ModelTasks.MULTITASK_MULTICLASS`
        """
        # check for incompatible tasks
        if self.task == ModelTasks.MULTITASK_MIXED:
            raise ValueError(
                "MultiTask with a mix of classification and regression tasks "
                "is not supported for sklearn models."
            )
        if self.task == ModelTasks.MULTITASK_MULTICLASS:
            raise NotImplementedError(
                "At the moment there are no supported metrics "
                "for multi-task multi-class/mix multi-and-single class classification."
            )
        estimator = self.estimator if estimator is None else estimator
        X, y = self.convertToNumpy(X, y)
        # sklearn models expect 1d arrays
        # for single target regression and classification
        if not self.task.isMultiTask():
            y = y.ravel()
        return estimator.fit(X, y)

    def predict(
        self, X: pd.DataFrame | np.ndarray | QSPRDataset, estimator: Any = None
    ):
        """See `QSPRModel.predict`.

        Args:
            X (pd.DataFrame | np.ndarray | QSPRDataset): data to predict for
            estimator (Any): estimator to use, `self.estimator` is used
                if `None`

        Returns:
            predictions of the estimator; a 1d result is reshaped to a
            single column 2d array
        """
        estimator = self.estimator if estimator is None else estimator
        X = self.convertToNumpy(X)
        preds = estimator.predict(X)
        # Most sklearn regression models return 1d arrays for single target
        # regression and sklearn single task classification models return
        # 1d arrays. However, QSPRpred expects 2d arrays in every case.
        if preds.ndim == 1:
            preds = preds.reshape(-1, 1)
        return preds

    def predictProba(
        self, X: pd.DataFrame | np.ndarray | QSPRDataset, estimator: Any = None
    ):
        """See `QSPRModel.predictProba`.

        Args:
            X (pd.DataFrame | np.ndarray | QSPRDataset): data to predict for
            estimator (Any): estimator to use, `self.estimator` is used
                if `None`

        Returns:
            the result of `estimator.predict_proba`; a single numpy array
            is wrapped in a one-element list
        """
        estimator = self.estimator if estimator is None else estimator
        X = self.convertToNumpy(X)
        preds = estimator.predict_proba(X)
        # if preds is a numpy array, convert it to a list
        # to be consistent with the multiclass-multitask case
        if isinstance(preds, np.ndarray):
            preds = [preds]
        return preds