Source code for qsprpred.models.assessment.metrics.regression

import numpy as np
import scipy.stats

from qsprpred.models.assessment.metrics.base import Metric


class KSlope(Metric):
    """Calculate the slope of the regression line through the origin
    between the predicted and observed values.

    Reference: Tropsha, A., & Golbraikh, A. (2010). In J.-L. Faulon & A. Bender
    (Eds.), Handbook of Chemoinformatics Algorithms.
    https://www.taylorfrancis.com/books/9781420082999

    Attributes:
        name (str): Name of the scoring function (k_slope).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the slope of the regression line through the origin
        between the predicted and observed values.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The slope of the regression line through the origin.
        """
        num, denom = 0, 0
        for i in range(len(y_true)):
            num += y_true[i] * y_pred[i]
            denom += y_true[i] ** 2
        return num / denom if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "k_slope"


class RPrime20(KSlope):
    """Calculate the coefficient of determination for the regression line
    through the origin between the predicted and observed values.

    Reference: Tropsha, A., & Golbraikh, A. (2010). In J.-L. Faulon & A. Bender
    (Eds.), Handbook of Chemoinformatics Algorithms.
    https://www.taylorfrancis.com/books/9781420082999

    Attributes:
        name (str): Name of the scoring function (r_prime_2_0).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the coefficient of determination for the regression line
        through the origin between the predicted and observed values.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The coefficient of determination.
        """
        # get the slope of the regression line through the origin
        k = super().__call__(y_true, y_pred)
        y_pred_mean = y_pred.mean()
        num, denom = 0, 0
        for i in range(len(y_true)):
            # squared residual from the through-origin line (y_pred = k * y_true)
            num += (y_pred[i] - k * y_true[i]) ** 2
            denom += (y_pred[i] - y_pred_mean) ** 2
        return 1 - num / denom if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "r_prime_2_0"


class KPrimeSlope(Metric):
    """Calculate the slope of the regression line through the origin
    between the observed and predicted values.

    Reference: Tropsha, A., & Golbraikh, A. (2010). In J.-L. Faulon & A. Bender
    (Eds.), Handbook of Chemoinformatics Algorithms.
    https://www.taylorfrancis.com/books/9781420082999

    Attributes:
        name (str): Name of the scoring function (k_prime_slope).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the slope of the regression line through the origin
        between the observed and predicted values.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The slope of the regression line through the origin.
        """
        num, denom = 0, 0
        for i in range(len(y_true)):
            num += y_true[i] * y_pred[i]
            denom += y_pred[i] ** 2
        return num / denom if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "k_prime_slope"


class R20(KPrimeSlope):
    """Calculate the coefficient of determination for the regression line
    through the origin between the observed and predicted values.

    Reference: Tropsha, A., & Golbraikh, A. (2010). In J.-L. Faulon & A. Bender
    (Eds.), Handbook of Chemoinformatics Algorithms.
    https://www.taylorfrancis.com/books/9781420082999

    Attributes:
        name (str): Name of the scoring function (r_2_0).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the coefficient of determination for the regression line
        through the origin between the observed and predicted values.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The coefficient of determination.
        """
        # get the slope of the regression line through the origin
        k_prime = super().__call__(y_true, y_pred)
        y_true_mean = y_true.mean()
        num, denom = 0, 0
        for i in range(len(y_true)):
            # squared residual from the through-origin line (y_true = k' * y_pred)
            num += (y_true[i] - k_prime * y_pred[i]) ** 2
            denom += (y_true[i] - y_true_mean) ** 2
        return 1 - num / denom if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "r_2_0"
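

# The four scorers above (k_slope, k_prime_slope, r_2_0, r_prime_2_0) are
# typically inspected together in Tropsha-style external validation of
# regression models. The helper below is a minimal illustrative sketch, not
# part of the qsprpred API: the 0.85-1.15 slope window is the commonly cited
# Golbraikh & Tropsha criterion, and it assumes the scorers can be
# instantiated with no arguments, as their definitions above suggest.
def _example_slope_criterion(y_true: np.array, y_pred: np.array) -> bool:
    """Illustration only: is either through-origin slope close to unity?

    Assumes single-task, 1D ``y_true`` and ``y_pred``.
    """
    k = KSlope()(y_true, y_pred)
    k_prime = KPrimeSlope()(y_true, y_pred)
    return 0.85 <= k <= 1.15 or 0.85 <= k_prime <= 1.15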


class Pearson(Metric):
    """Calculate the Pearson correlation coefficient.

    Attributes:
        name (str): Name of the scoring function (pearson).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the Pearson correlation coefficient.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, 1)

        Returns:
            float: The Pearson correlation coefficient.
        """
        y_pred = y_pred.flatten()
        return scipy.stats.pearsonr(y_true, y_pred)[0] if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "pearson"


class Spearman(Metric):
    """Calculate the Spearman rank correlation coefficient.

    Attributes:
        name (str): Name of the scoring function (spearman).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the Spearman rank correlation coefficient.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The Spearman rank correlation coefficient.
        """
        return scipy.stats.spearmanr(y_true, y_pred)[0] if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "spearman"


class Kendall(Metric):
    """Calculate the Kendall rank correlation coefficient.

    Attributes:
        name (str): Name of the scoring function (kendall).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the Kendall rank correlation coefficient.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The Kendall rank correlation coefficient.
        """
        return scipy.stats.kendalltau(y_true, y_pred)[0] if len(y_pred) >= 2 else 0

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "kendall"


class AverageFoldError(Metric):
    """Calculate the average fold error (AFE).

    Attributes:
        name (str): Name of the scoring function (average_fold_error).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the average fold error.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The average fold error.
        """
        return 10 ** (np.mean(np.log10(y_pred / y_true)))

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "average_fold_error"


class AbsoluteAverageFoldError(Metric):
    """Calculate the absolute average fold error (AAFE).

    The AAFE is also known as the geometric mean fold error (GMFE).

    Attributes:
        name (str): Name of the scoring function (absolute_average_fold_error).
    """
    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the absolute average fold error.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The absolute average fold error.
        """
        return 10 ** (np.mean(np.abs(np.log10(y_pred / y_true))))

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return "absolute_average_fold_error"
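

# Worked example contrasting the two fold-error metrics above (toy values,
# for illustration only): with y_true = np.array([1.0, 1.0]) and
# y_pred = np.array([2.0, 0.5]) the signed log10 ratios are +0.301 and -0.301.
# They cancel, so AverageFoldError()(y_true, y_pred) == 1.0 (no apparent bias),
# whereas AbsoluteAverageFoldError()(y_true, y_pred) == 2.0, reflecting that
# every prediction is in fact off by two-fold.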


class PercentageWithinFoldError(Metric):
    """Calculate the percentage of predictions within a certain fold error.

    Attributes:
        name (str): Name of the scoring function (percentage_within_{x}_fold_error).
    """
    def __init__(self, fold_error: float = 2):
        """Initialize the percentage within fold error scorer.

        Args:
            fold_error (float): The fold error threshold. Defaults to 2.
        """
        self.fold_error = fold_error

    def __call__(self, y_true: np.array, y_pred: np.array) -> float:
        """Calculate the percentage of predictions within a specified fold error.

        Args:
            y_true (np.array): Ground truth (correct) target values. 1d array.
            y_pred (np.array): 2D array (n_samples, n_tasks)

        Returns:
            float: The percentage of predictions within a fold error.
        """
        fold_errors = np.abs(np.log10(y_pred / y_true))
        return np.mean(fold_errors < np.log10(self.fold_error)) * 100

    def __str__(self) -> str:
        """Return the name of the scorer."""
        return f"percentage_within_{self.fold_error}_fold_error"
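

# Minimal usage sketch (illustration only): the toy arrays below are invented,
# and single-task 1D predictions are used for simplicity even though the
# docstrings above describe 2D (n_samples, n_tasks) inputs. It is assumed that
# the Metric base class requires nothing beyond __call__ for direct
# instantiation of these scorers.
if __name__ == "__main__":
    y_true = np.array([1.2, 2.4, 3.1, 4.8, 5.0])
    y_pred = np.array([1.0, 2.6, 3.0, 4.5, 5.5])
    scorers = [
        KSlope(),
        R20(),
        Pearson(),
        Spearman(),
        Kendall(),
        AbsoluteAverageFoldError(),
        PercentageWithinFoldError(fold_error=2),
    ]
    for scorer in scorers:
        print(f"{scorer}: {scorer(y_true, y_pred):.3f}")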