# Source code for qsprpred.benchmarks.replica

import json
import os
from copy import deepcopy

import numpy as np
import pandas as pd

from .settings.benchmark import DataPrepSettings
from ..data import QSPRDataset
from ..data.descriptors.sets import DescriptorSet
from ..data.sources.data_source import DataSource
from ..logs import logger
from ..models.assessment.methods import ModelAssessor
from ..models.hyperparam_optimization import HyperparameterOptimization
from ..models.model import QSPRModel
from ..models.monitors import NullMonitor
from ..tasks import TargetProperty
from ..utils.serialization import JSONSerializable


[docs]class Replica(JSONSerializable):
    """Class that determines settings for a single replica of a benchmarking run.

    Attributes:
        idx (int):
            Index of the replica. This is not an identifier, but rather a number
            that indicates the order of the replica in the benchmarking run.
        name (str):
            Name of the replica.
        dataSource (DataSource):
            Data source to use.
        descriptors (list[DescriptorSet]):
            Descriptor sets to use.
        targetProps (list[TargetProperty]):
            Target properties to use.
        prepSettings (DataPrepSettings):
            Data preparation settings to use.
        model (QSPRModel):
            Current model. Use `initModel` to prepare it.
        optimizer (HyperparameterOptimization):
            Hyperparameter optimizer to use.
        assessors (list[ModelAssessor]):
            Model assessors to use.
        randomSeed (int):
            Random seed to use for all random operations within the replica.
        ds (QSPRDataset):
            Initialized data set. Only available after `initData` has been called.
        results (pd.DataFrame):
            Results from the replica. Only available after
            `runAssessment` has been called.
    """

    _notJSON = JSONSerializable._notJSON + ["ds", "results", "model"]

    def __init__(
        self,
        idx: int,
        name: str,
        data_source: DataSource,
        descriptors: list[DescriptorSet],
        target_props: list[TargetProperty],
        prep_settings: DataPrepSettings,
        model: QSPRModel,
        optimizer: HyperparameterOptimization,
        assessors: list[ModelAssessor],
        random_seed: int,
    ):
        """Store the settings of one benchmarking replica.

        Args:
            idx (int):
                Position of the replica in the benchmarking run.
            name (str):
                Name of the replica.
            data_source (DataSource):
                Source the data set is fetched from.
            descriptors (list[DescriptorSet]):
                Descriptor sets to calculate.
            target_props (list[TargetProperty]):
                Target properties to model.
            prep_settings (DataPrepSettings):
                Settings forwarded to the data set preparation.
            model (QSPRModel):
                Model template; the replica keeps its own deep copy.
            optimizer (HyperparameterOptimization):
                Hyperparameter optimizer, or `None` to skip optimization.
            assessors (list[ModelAssessor]):
                Assessors to run on the model.
            random_seed (int):
                Seed used for the data set and the model.
        """
        # NOTE: the assignment order below is deliberately kept stable because
        # it determines the order of the entries in the instance `__dict__`.
        self.idx, self.name = idx, name
        self.dataSource, self.descriptors = data_source, descriptors
        self.targetProps, self.prepSettings = target_props, prep_settings
        self.optimizer, self.assessors = optimizer, assessors
        self.randomSeed = random_seed
        # populated later by `initData` and `runAssessment`, respectively
        self.ds = self.results = None
        # work on a private copy so the caller's model object is left untouched
        self.model = deepcopy(model)

    def __getstate__(self):
        """Build the serializable state of the replica.

        The model entry holds whatever `self.model.save()` returns (the same
        value is later handed to `QSPRModel.fromFile` in `__setstate__`), while
        the data set and the results are always stored as `None`.

        Returns:
            dict:
                The state from the parent class with the `model`, `ds` and
                `results` entries overridden.
        """
        state = super().__getstate__()
        state["model"] = self.model.save()
        # transient members are never persisted
        for transient in ("ds", "results"):
            state[transient] = None
        return state

    def __setstate__(self, state):
        """Restore the replica from a state created by `__getstate__`.

        Args:
            state (dict):
                Saved state; its `model` entry is passed to
                `QSPRModel.fromFile` to reload the model.
        """
        super().__setstate__(state)
        restored_model = QSPRModel.fromFile(state["model"])
        self.model = restored_model
        # data set and results are not persisted and have to be recomputed
        self.ds = self.results = None

    def __str__(self):
        return self.id

    @property
    def requiresGpu(self) -> bool:
        """Whether the model requires a GPU.

        Returns:
            bool:
                Whether the model requires a GPU.
        """
        return hasattr(self.model, "setGPUs")

[docs]    def setGPUs(self, gpus: list[int]):
        """Sets the GPUs to use for the model.

        Args:
            gpus (list[int]):
                List of GPU indices to use.
        """
        if hasattr(self.model, "setGPUs"):
            self.model.setGPUs(gpus)
        else:
            raise ValueError("Model does not support GPU usage.")

[docs]    def getGPUs(self) -> list[int]:
        """Gets the GPUs to use for the model.

        Returns:
            list[int]:
                List of GPU indices to use.
        """
        if hasattr(self.model, "getGPUs"):
            return self.model.getGPUs()
        else:
            return []

    @property
    def id(self) -> str:
        """A unique identifier for the replica.

        Returns:
            str:
                A unique identifier for the replica.
        """
        return f"{self.name}_{self.randomSeed}"

[docs]    def initData(self, reload=False):
        """Initializes the data set for this replica.

        Args:
            reload (bool, optional):
                Whether to overwrite all existing data and
                reinitialize from scratch. Defaults to `False`.
        """
        self.ds = self.dataSource.getDataSet(
            deepcopy(self.targetProps),
            overwrite=reload,
            random_state=self.randomSeed,
        )
        self.ds.dropInvalids()

[docs]    def addDescriptors(self, reload: bool = False):
        """Adds descriptors to the current data set. Make sure to call
        `initData` first to get it from the source.

        Args:
            reload (bool, optional):
                Whether to overwrite all existing data and
                reinitialize from scratch. Defaults to `False`.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        desc_id = "_".join(sorted([str(d) for d in self.descriptors]))
        self.ds.name = f"{self.ds.name}_{desc_id}"
        if self.requiresGpu:
            self.ds.name = f"{self.ds.name}_gpu"
        # attempt to load the data set with descriptors
        if os.path.exists(self.ds.metaFile) and not reload:
            logger.info(f"Reloading existing {self.ds.name} from cache...")
            self.ds = QSPRDataset.fromFile(self.ds.metaFile)
            self.ds.setRandomState(self.randomSeed)
            self.ds.setTargetProperties(deepcopy(self.targetProps))
        else:
            logger.info(f"Data set {self.ds.name} not yet found. It will be created.")
            # calculate descriptors if necessary
            logger.info(f"Calculating descriptors for {self.ds.name}.")
            self.ds.addDescriptors(deepcopy(self.descriptors), recalculate=True)
            self.ds.setTargetProperties(deepcopy(self.targetProps))
            self.ds.setRandomState(self.randomSeed)
            self.ds.save()

[docs]    def prepData(self):
        """Prepares the data set for this replica.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        self.ds.prepareDataset(
            **deepcopy(self.prepSettings.__dict__),
        )

[docs]    def initModel(self):
        """Initializes the model for this replica. This includes
        initializing the model from the data set and optimizing
        the hyperparameters if an optimizer is specified.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        self.model.name = f"{self.id}_{self.ds.name}"
        self.model.initFromDataset(self.ds)
        self.model.initRandomState(self.randomSeed)
        if self.optimizer is not None:
            self.optimizer.optimize(self.model, self.ds)
        self.model.save()

[docs]    def runAssessment(self):
        """Runs the model assessment for this replica. This includes
        running all model assessors and saving the results.

        The results are saved in the `results` attribute. They can be
        accessed by calling `createReport`, which combines the relevant information
        from the replica and the results into one `pd.DataFrame`.

        Raises:
            ValueError:
                If the model has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        if self.model is None:
            raise ValueError("Model not initialized. Call initModel first.")
        self.results = None
        for assessor in self.assessors:
            scores = assessor(self.model, self.ds, save=True)
            if isinstance(scores, float):
                scores = np.array([scores])
            scores_df = pd.DataFrame()
            for i, fold_score in enumerate(scores):
                if isinstance(fold_score, float):
                    if self.model.isMultiTask:
                        tp = self.targetProps[i]
                    else:
                        tp = self.targetProps[0]
                    score_df = pd.DataFrame(
                        {
                            "Assessor": [assessor.__class__.__name__],
                            "ScoreFunc": [
                                (
                                    assessor.scoreFunc.name
                                    if hasattr(assessor.scoreFunc, "name")
                                    else assessor.scoreFunc.__name__
                                )
                            ],
                            "Score": [fold_score],
                            "TargetProperty": [tp.name],
                            "TargetTask": [tp.task.name],
                        }
                    )
                    scores_df = pd.concat([scores_df, score_df])
                else:
                    for tp_score, tp in zip(fold_score, self.targetProps):
                        score_df = pd.DataFrame(
                            {
                                "Assessor": [assessor.__class__.__name__],
                                "ScoreFunc": [
                                    (
                                        assessor.scoreFunc.name
                                        if hasattr(assessor.scoreFunc, "name")
                                        else assessor.scoreFunc.__name__
                                    )
                                ],
                                "Score": [tp_score],
                                "TargetProperty": [tp.name],
                                "TargetTask": [tp.task.name],
                            }
                        )
                        scores_df = pd.concat([scores_df, score_df])
            if self.results is None:
                self.results = scores_df
            else:
                self.results = pd.concat([self.results, scores_df])

[docs]    def createReport(self):
        """Creates a report from the results of this replica.

        Returns:
            pd.DataFrame:
                A `pd.DataFrame` with the results of this replica.

        Raises:
            ValueError:
                If the results have not been calculated.
        """
        if self.results is None:
            raise ValueError("Results not available. Call runAssessment first.")
        results = self.results.copy()
        results["ModelFile"] = self.model.metaFile
        results["Algorithm"] = self.model.alg.__name__
        results["AlgorithmParams"] = json.dumps(self.model.parameters)
        results["ReplicaID"] = self.id
        results["DataSet"] = self.ds.name
        out_file = f"{self.model.outPrefix}_replica.json"
        for assessor in self.assessors:
            # FIXME: some problems in monitor serialization now prevent this
            assessor.monitor = NullMonitor()
        results["ReplicaFile"] = self.toFile(out_file)
        return results