Source code for qsprpred.plotting.regression

"""Module for plotting regression models."""
from abc import ABC
from copy import deepcopy
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn import metrics

from ..data import QSPRDataset
from ..models import QSPRModel
from ..plotting.base_plot import ModelPlot
from ..tasks import ModelTasks


[docs]class RegressionPlot(ModelPlot, ABC):
    """Base class for all regression plots."""

[docs]    def getSupportedTasks(self) -> list[ModelTasks]:
        """Return a list of supported model tasks."""
        return [ModelTasks.REGRESSION, ModelTasks.MULTITASK_REGRESSION]

[docs]    def prepareAssessment(self, assessment_df: pd.DataFrame) -> pd.DataFrame:
        """Prepare assessment dataframe for plotting

        Args:
            assessment_df (pd.DataFrame):
                the assessment dataframe containing the experimental and predicted
                values for each property. The dataframe should have the following
                columns:
                QSPRID, Fold (opt.), <property_name>_<suffixes>_<Label/Prediction>

        Returns:
            pd.DataFrame:
                The dataframe containing the assessment results,
                columns: QSPRID, Fold, Property, Label, Prediction, Set
        """
        # change all property columns into one column
        id_vars = ["QSPRID", "Fold"] if "Fold" in assessment_df.columns else ["QSPRID"]
        df = assessment_df.melt(id_vars=id_vars)
        # split the variable (<property_name>_<suffixes>_<Label/Prediction>) column
        # into the property name and the type (Label or Prediction)
        df[["Property", "type"]] = df["variable"].str.rsplit("_", n=1, expand=True)
        df.drop("variable", axis=1, inplace=True)
        # pivot the dataframe so that Label and Prediction are separate columns
        df = df.pivot_table(
            index=[*id_vars, "Property"], columns="type", values="value"
        )
        df.reset_index(inplace=True)
        df.columns.name = None
        # Add Fold column if it doesn't exist (for independent test set)
        if "Fold" not in df.columns:
            df["Fold"] = "Independent Test"
            df["Set"] = "Independent Test"
        else:
            df["Set"] = "Cross Validation"
        return df

[docs]    def prepareRegressionResults(
        self,
    ) -> pd.DataFrame:
        """Prepare regression results dataframe for plotting.

        Returns:
            pd.DataFrame:
                the dataframe containing the regression results,
                columns: Model, QSPRID, Fold, Property, Label, Prediction, Set
        """
        model_results = {}
        for m, model in enumerate(self.models):
            # Read in and prepare the cross-validation and independent test set results
            df_cv = self.prepareAssessment(pd.read_table(self.cvPaths[model]))
            df_ind = self.prepareAssessment(pd.read_table(self.indPaths[model]))
            # concatenate the cross-validation and independent test set results
            df = pd.concat([df_cv, df_ind])
            print(model.name)
            model_results[model.name] = df
        # concatenate the results from all models and add the model name as a column
        df = (
            pd.concat(
                model_results.values(), keys=model_results.keys(), names=["Model"]
            )
            .reset_index(level=1, drop=True)
            .reset_index()
        )

        self.results = df
        return df

[docs]    def getSummary(self):
        """calculate the R2 and RMSE for each model per set (cross-validation or independent test)"""
        if not hasattr(self, "results"):
            self.prepareRegressionResults()
        df = deepcopy(self.results)
        df_summary = (
            df.groupby(["Model", "Fold", "Property"])
            .apply(
                lambda x: pd.Series(
                    {
                        "R2": metrics.r2_score(x["Label"], x["Prediction"]),
                        "RMSE": metrics.root_mean_squared_error(
                            x["Label"], x["Prediction"]
                        ),
                    }
                )
            )
            .reset_index()
        )
        df_summary["Set"] = df_summary["Fold"].apply(
            lambda x: "Independent Test"
            if x == "Independent Test"
            else "Cross Validation"
        )
        self.summary = df_summary
        return df_summary


[docs]class CorrelationPlot(RegressionPlot):
    """Class to plot the results of regression models. Plot predicted pX_train vs real pX_train."""

[docs]    def make(
        self,
        save: bool = True,
        show: bool = False,
        out_path: str | None = None,
    ) -> tuple[sns.FacetGrid, pd.DataFrame]:
        """Plot the results of regression models. Plot predicted pX_train vs real pX_train.

        Args:
            save (bool):
                whether to save the plot
            show (bool):
                whether to show the plot
            out_path (str | None):
                path to save the plot to, e.g. "results/plot.png", if `None`, the plot
                will be saved to each model's output directory.

        Returns:
            g (sns.FacetGrid):
                the seaborn FacetGrid object used to make the plot
            pd.DataFrame:
                the summary data used to make the plot
        """
        # prepare the dataframe for plotting
        df = self.prepareRegressionResults()

        if not hasattr(self, "summary"):
            self.getSummary()

        # plot the results
        g = sns.FacetGrid(
            df,
            col="Property",
            row="Model",
            hue="Set",
            margin_titles=True,
            height=4,
            sharex=False,
            sharey=False,
        )
        g.map(sns.scatterplot, "Label", "Prediction", s=7, edgecolor="none")
        # set x and y range to be the same for each plot
        for ax in g.axes_dict.values():
            x_min, x_max = ax.get_xlim()
            y_min, y_max = ax.get_ylim()
            ax_min = min(x_min, y_min)
            ax_max = max(x_max, y_max)
            pad = (ax_max - ax_min) * 0.1
            ax.set_xlim(ax_min - pad, ax_max + pad)
            ax.set_ylim(ax_min - pad, ax_max + pad)
            ax.set_aspect("equal", "box")
        for ax in g.axes_dict.values():
            ax.axline((0, 0), slope=1, c=".2", ls="--")

        g.add_legend()

        # save the plot
        if save:
            if out_path is not None:
                plt.savefig(out_path, dpi=300)
            else:
                for model in self.models:
                    plt.savefig(f"{model.outPrefix}_correlation.png", dpi=300)
        # show the plot
        if show:
            plt.show()

        plt.clf()
        return g, self.summary


[docs]class WilliamsPlot(RegressionPlot):
    """Williams plot; plot of standardized residuals versus leverages"""

    def __init__(self, models: list[QSPRModel], datasets: list[QSPRDataset]):
        super().__init__(models)
        self.datasets = datasets

[docs]    def make(
        self,
        save: bool = True,
        show: bool = False,
        out_path: str | None = None,
    ) -> tuple[sns.FacetGrid, pd.DataFrame, List[float]]:
        """make Williams plot

        Args:
            save (bool):
                whether to save the plot
            show (bool):
                whether to show the plot
            out_path (str | None):
                path to save the plot to, e.g. "results/plot.png", if `None`, the plot
                will be saved to each model's output directory.

        Returns:
            g (sns.FacetGrid):
                the seaborn FacetGrid object used to make the plot
            pd.DataFrame:
                the leverages and standardized residuals for each compound
            dict[str, float]:
                the h* values for the datasets
        """

        def calculateLeverages(
            features_train: pd.DataFrame, features_test: pd.DataFrame
        ) -> pd.DataFrame:
            """Calculate the leverages for each compound in the dataset.

            Args:
                features_train (pd.DataFrame):
                    the features for each compound in the training set
                features_test (pd.DataFrame):
                    the features for each compound in the test set

            Returns:
                pd.DataFrame:
                    the leverages for each compound in the dataset
                float:
                    the h* value for the dataset
            """
            X_train = features_train.values
            X_test = features_test.values

            # assert the number of samples is greater than the number of features
            assert X_train.shape[0] > X_train.shape[1], (
                f"The number of samples ({X_train.shape[0]}) should be greater than the "
                f"number of features ({X_train.shape[1]}) for calculating the leverages."
            )

            # get the diagonal elements of the hat matrix
            # these are the leverages
            pinv_XTX = np.linalg.pinv(X_train.T @ X_train)
            leverages_train = np.diag(X_train @ pinv_XTX @ X_train.T)
            leverages_train = pd.Series(leverages_train, index=features_train.index)

            leverages_test = np.diag(X_test @ pinv_XTX @ X_test.T)
            leverages_test = pd.Series(leverages_test, index=features_test.index)

            leverages = pd.concat([leverages_train, leverages_test])

            # h* = (3(p+1)/N) is the cutoff for high leverage points
            # p is the number of features
            # N is the number of compounds
            p = X_train.shape[1]
            N = X_train.shape[0]
            h_star = (3 * (p + 1)) / N

            # print waring if h* > 1
            if h_star > 1:
                print(
                    f"Warning: h* = {h_star} is greater than 1, this may indicate that the "
                    "number of samples is too small for the number of features. "
                    "Leverage values are between 0 and 1, so h* should be less than 1."
                )

            return leverages, h_star

        # prepare the dataframe for plotting
        df = self.prepareRegressionResults()

        # calculate the leverages and h* for each model
        model_leverages = {}
        model_h_star = {}
        model_p = {}  # number of descriptors
        for model, dataset in zip(self.models, self.datasets):
            model_name = model.name
            if dataset.hasFeatures:
                features = dataset.getFeatures()
                leverages, h_star = calculateLeverages(*features)
                model_leverages[model_name] = leverages
                model_h_star[model_name] = h_star
                model_p[model_name] = features[0].shape[1]
            else:
                raise ValueError(
                    f"Dataset {dataset.name} does not have features, to"
                    " calculate leverages, the dataset should have features."
                )

        # Add the levarages to the dataframe
        df["leverage"] = df.apply(
            lambda x: model_leverages[x["Model"]][x["QSPRID"]], axis=1
        )
        df["n_features"] = df["Model"].apply(lambda x: model_p[x])

        # calculate the residuals
        df["residual"] = df["Label"] - df["Prediction"]

        # calculate the residuals standard deviation
        df["n_samples"] = df.groupby(["Model", "Set", "Property"])[
            "residual"
        ].transform("count")

        # calculate degrees of freedom
        df["df"] = df["n_samples"] - df["n_features"] - 1

        RSE = {}
        # check if the degrees of freedom is greater than 0 for each model, property, and set
        for (model, set, property), df_ in df.groupby(["Model", "Set", "Property"]):
            if set == "Cross Validation":
                if df_["df"].iloc[0] <= 0:
                    print(f"{model} {set} {property}")
                    print(df_[["n_samples", "n_features"]].iloc[0])
                    raise ValueError(
                        "Degrees of freedom is less than or equal to 0 for some models, "
                        "properties trainingset. Check the number of samples and features, the "
                        "number of samples should be greater than the number of features."
                    )
                RSE[(model, property)] = np.sqrt(
                    (1 / df_["df"].iloc[0]) * np.sum(df_["residual"] ** 2)
                )

        # add the residual standard error to the df
        df["RSE"] = df.apply(lambda x: RSE[(x["Model"], x["Property"])], axis=1)

        # calculate the standardized residuals
        df["std_resid"] = df["residual"] / (df["RSE"] * np.sqrt(1 - df["leverage"]))

        # plot the results
        g = sns.FacetGrid(
            df,
            col="Property",
            row="Model",
            margin_titles=True,
            height=4,
            sharex=False,
            sharey=False,
            hue="Set",
        )
        g.map(sns.scatterplot, "leverage", "std_resid", s=7, edgecolor="none")
        # add the h* line to each plot based on the model's h*
        # and add hlines at +/- 3
        for k, ax in g.axes_dict.items():
            ax.axvline(model_h_star[k[0]], c=".2", ls="--")
            ax.axhline(2, c=".2", ls="--")
            ax.axhline(-2, c=".2", ls="--")

        # set y axis tile to standardized residuals
        g.set_ylabels("Studentized Residuals")
        g.add_legend()

        # save the plot
        if save:
            if out_path is not None:
                plt.savefig(out_path, dpi=300)
            else:
                for model in self.models:
                    plt.savefig(f"{model.outPrefix}_williamsplot.png", dpi=300)
        # show the plot
        if show:
            plt.show()
        plt.clf()
        return (
            g,
            df[["Model", "Fold", "Property", "leverage", "std_resid", "QSPRID"]],
            model_h_star,
        )