Source code for qsprpred.plotting.regression

"""Module for plotting regression models."""
from abc import ABC
from copy import deepcopy
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics

from ..data import QSPRDataset
from ..models import QSPRModel
from ..plotting.base_plot import ModelPlot
from ..tasks import ModelTasks


[docs]class RegressionPlot(ModelPlot, ABC): """Base class for all regression plots."""
[docs] def getSupportedTasks(self) -> list[ModelTasks]: """Return a list of supported model tasks.""" return [ModelTasks.REGRESSION, ModelTasks.MULTITASK_REGRESSION]
[docs] def prepareAssessment(self, assessment_df: pd.DataFrame) -> pd.DataFrame: """Prepare assessment dataframe for plotting Args: assessment_df (pd.DataFrame): the assessment dataframe containing the experimental and predicted values for each property. The dataframe should have the following columns: QSPRID, Fold (opt.), <property_name>_<suffixes>_<Label/Prediction> Returns: pd.DataFrame: The dataframe containing the assessment results, columns: QSPRID, Fold, Property, Label, Prediction, Set """ # change all property columns into one column id_vars = ["QSPRID", "Fold"] if "Fold" in assessment_df.columns else ["QSPRID"] df = assessment_df.melt(id_vars=id_vars) # split the variable (<property_name>_<suffixes>_<Label/Prediction>) column # into the property name and the type (Label or Prediction) df[["Property", "type"]] = df["variable"].str.rsplit("_", n=1, expand=True) df.drop("variable", axis=1, inplace=True) # pivot the dataframe so that Label and Prediction are separate columns df = df.pivot_table( index=[*id_vars, "Property"], columns="type", values="value" ) df.reset_index(inplace=True) df.columns.name = None # Add Fold column if it doesn't exist (for independent test set) if "Fold" not in df.columns: df["Fold"] = "Independent Test" df["Set"] = "Independent Test" else: df["Set"] = "Cross Validation" return df
[docs] def prepareRegressionResults( self, ) -> pd.DataFrame: """Prepare regression results dataframe for plotting. Returns: pd.DataFrame: the dataframe containing the regression results, columns: Model, QSPRID, Fold, Property, Label, Prediction, Set """ model_results = {} for m, model in enumerate(self.models): # Read in and prepare the cross-validation and independent test set results df_cv = self.prepareAssessment(pd.read_table(self.cvPaths[model])) df_ind = self.prepareAssessment(pd.read_table(self.indPaths[model])) # concatenate the cross-validation and independent test set results df = pd.concat([df_cv, df_ind]) print(model.name) model_results[model.name] = df # concatenate the results from all models and add the model name as a column df = ( pd.concat( model_results.values(), keys=model_results.keys(), names=["Model"] ) .reset_index(level=1, drop=True) .reset_index() ) self.results = df return df
[docs] def getSummary(self): """calculate the R2 and RMSE for each model per set (cross-validation or independent test)""" if not hasattr(self, "results"): self.prepareRegressionResults() df = deepcopy(self.results) df_summary = ( df.groupby(["Model", "Fold", "Property"]) .apply( lambda x: pd.Series( { "R2": metrics.r2_score(x["Label"], x["Prediction"]), "RMSE": metrics.root_mean_squared_error( x["Label"], x["Prediction"] ), } ) ) .reset_index() ) df_summary["Set"] = df_summary["Fold"].apply( lambda x: "Independent Test" if x == "Independent Test" else "Cross Validation" ) self.summary = df_summary return df_summary
[docs]class CorrelationPlot(RegressionPlot): """Class to plot the results of regression models. Plot predicted pX_train vs real pX_train."""
[docs] def make( self, save: bool = True, show: bool = False, out_path: str | None = None, ) -> tuple[sns.FacetGrid, pd.DataFrame]: """Plot the results of regression models. Plot predicted pX_train vs real pX_train. Args: save (bool): whether to save the plot show (bool): whether to show the plot out_path (str | None): path to save the plot to, e.g. "results/plot.png", if `None`, the plot will be saved to each model's output directory. Returns: g (sns.FacetGrid): the seaborn FacetGrid object used to make the plot pd.DataFrame: the summary data used to make the plot """ # prepare the dataframe for plotting df = self.prepareRegressionResults() if not hasattr(self, "summary"): self.getSummary() # plot the results g = sns.FacetGrid( df, col="Property", row="Model", hue="Set", margin_titles=True, height=4, sharex=False, sharey=False, ) g.map(sns.scatterplot, "Label", "Prediction", s=7, edgecolor="none") # set x and y range to be the same for each plot for ax in g.axes_dict.values(): x_min, x_max = ax.get_xlim() y_min, y_max = ax.get_ylim() ax_min = min(x_min, y_min) ax_max = max(x_max, y_max) pad = (ax_max - ax_min) * 0.1 ax.set_xlim(ax_min - pad, ax_max + pad) ax.set_ylim(ax_min - pad, ax_max + pad) ax.set_aspect("equal", "box") for ax in g.axes_dict.values(): ax.axline((0, 0), slope=1, c=".2", ls="--") g.add_legend() # save the plot if save: if out_path is not None: plt.savefig(out_path, dpi=300) else: for model in self.models: plt.savefig(f"{model.outPrefix}_correlation.png", dpi=300) # show the plot if show: plt.show() plt.clf() return g, self.summary
[docs]class WilliamsPlot(RegressionPlot): """Williams plot; plot of standardized residuals versus leverages""" def __init__(self, models: list[QSPRModel], datasets: list[QSPRDataset]): super().__init__(models) self.datasets = datasets
[docs] def make( self, save: bool = True, show: bool = False, out_path: str | None = None, ) -> tuple[sns.FacetGrid, pd.DataFrame, List[float]]: """make Williams plot Args: save (bool): whether to save the plot show (bool): whether to show the plot out_path (str | None): path to save the plot to, e.g. "results/plot.png", if `None`, the plot will be saved to each model's output directory. Returns: g (sns.FacetGrid): the seaborn FacetGrid object used to make the plot pd.DataFrame: the leverages and standardized residuals for each compound dict[str, float]: the h* values for the datasets """ def calculateLeverages( features_train: pd.DataFrame, features_test: pd.DataFrame ) -> pd.DataFrame: """Calculate the leverages for each compound in the dataset. Args: features_train (pd.DataFrame): the features for each compound in the training set features_test (pd.DataFrame): the features for each compound in the test set Returns: pd.DataFrame: the leverages for each compound in the dataset float: the h* value for the dataset """ X_train = features_train.values X_test = features_test.values # assert the number of samples is greater than the number of features assert X_train.shape[0] > X_train.shape[1], ( f"The number of samples ({X_train.shape[0]}) should be greater than the " f"number of features ({X_train.shape[1]}) for calculating the leverages." ) # get the diagonal elements of the hat matrix # these are the leverages pinv_XTX = np.linalg.pinv(X_train.T @ X_train) leverages_train = np.diag(X_train @ pinv_XTX @ X_train.T) leverages_train = pd.Series(leverages_train, index=features_train.index) leverages_test = np.diag(X_test @ pinv_XTX @ X_test.T) leverages_test = pd.Series(leverages_test, index=features_test.index) leverages = pd.concat([leverages_train, leverages_test]) # h* = (3(p+1)/N) is the cutoff for high leverage points # p is the number of features # N is the number of compounds p = X_train.shape[1] N = X_train.shape[0] h_star = (3 * (p + 1)) / N # print waring if h* > 1 if h_star > 1: print( f"Warning: h* = {h_star} is greater than 1, this may indicate that the " "number of samples is too small for the number of features. " "Leverage values are between 0 and 1, so h* should be less than 1." ) return leverages, h_star # prepare the dataframe for plotting df = self.prepareRegressionResults() # calculate the leverages and h* for each model model_leverages = {} model_h_star = {} model_p = {} # number of descriptors for model, dataset in zip(self.models, self.datasets): model_name = model.name if dataset.hasFeatures: features = dataset.getFeatures() leverages, h_star = calculateLeverages(*features) model_leverages[model_name] = leverages model_h_star[model_name] = h_star model_p[model_name] = features[0].shape[1] else: raise ValueError( f"Dataset {dataset.name} does not have features, to" " calculate leverages, the dataset should have features." ) # Add the levarages to the dataframe df["leverage"] = df.apply( lambda x: model_leverages[x["Model"]][x["QSPRID"]], axis=1 ) df["n_features"] = df["Model"].apply(lambda x: model_p[x]) # calculate the residuals df["residual"] = df["Label"] - df["Prediction"] # calculate the residuals standard deviation df["n_samples"] = df.groupby(["Model", "Set", "Property"])[ "residual" ].transform("count") # calculate degrees of freedom df["df"] = df["n_samples"] - df["n_features"] - 1 RSE = {} # check if the degrees of freedom is greater than 0 for each model, property, and set for (model, set, property), df_ in df.groupby(["Model", "Set", "Property"]): if set == "Cross Validation": if df_["df"].iloc[0] <= 0: print(f"{model} {set} {property}") print(df_[["n_samples", "n_features"]].iloc[0]) raise ValueError( "Degrees of freedom is less than or equal to 0 for some models, " "properties trainingset. Check the number of samples and features, the " "number of samples should be greater than the number of features." ) RSE[(model, property)] = np.sqrt( (1 / df_["df"].iloc[0]) * np.sum(df_["residual"] ** 2) ) # add the residual standard error to the df df["RSE"] = df.apply(lambda x: RSE[(x["Model"], x["Property"])], axis=1) # calculate the standardized residuals df["std_resid"] = df["residual"] / (df["RSE"] * np.sqrt(1 - df["leverage"])) # plot the results g = sns.FacetGrid( df, col="Property", row="Model", margin_titles=True, height=4, sharex=False, sharey=False, hue="Set", ) g.map(sns.scatterplot, "leverage", "std_resid", s=7, edgecolor="none") # add the h* line to each plot based on the model's h* # and add hlines at +/- 3 for k, ax in g.axes_dict.items(): ax.axvline(model_h_star[k[0]], c=".2", ls="--") ax.axhline(2, c=".2", ls="--") ax.axhline(-2, c=".2", ls="--") # set y axis tile to standardized residuals g.set_ylabels("Studentized Residuals") g.add_legend() # save the plot if save: if out_path is not None: plt.savefig(out_path, dpi=300) else: for model in self.models: plt.savefig(f"{model.outPrefix}_williamsplot.png", dpi=300) # show the plot if show: plt.show() plt.clf() return ( g, df[["Model", "Fold", "Property", "leverage", "std_resid", "QSPRID"]], model_h_star, )