import json
import os
from copy import deepcopy
import numpy as np
import pandas as pd
from .settings.benchmark import DataPrepSettings
from ..data import QSPRDataset
from ..data.descriptors.sets import DescriptorSet
from ..data.sources.data_source import DataSource
from ..logs import logger
from ..models.assessment.methods import ModelAssessor
from ..models.hyperparam_optimization import HyperparameterOptimization
from ..models.model import QSPRModel
from ..models.monitors import NullMonitor
from ..tasks import TargetProperty
from ..utils.serialization import JSONSerializable
class Replica(JSONSerializable):
    """Class that determines settings for a single replica of a benchmarking run.

    Attributes:
        idx (int):
            Index of the replica. This is not an identifier, but rather a number
            that indicates the order of the replica in the benchmarking run.
        name (str):
            Name of the replica.
        dataSource (DataSource):
            Data source to use.
        descriptors (list[DescriptorSet]):
            Descriptor sets to use.
        targetProps (list[TargetProperty]):
            Target properties to use.
        prepSettings (DataPrepSettings):
            Data preparation settings to use.
        model (QSPRModel):
            Current model. Use `initModel` to prepare it.
        optimizer (HyperparameterOptimization):
            Hyperparameter optimizer to use.
        assessors (list[ModelAssessor]):
            Model assessors to use.
        randomSeed (int):
            Random seed to use for all random operations within the replica.
        ds (QSPRDataset):
            Initialized data set. Only available after `initData` has been called.
        results (pd.DataFrame):
            Results from the replica. Only available after
            `runAssessment` has been called.
    """

    # transient members excluded from JSON serialization (restored in __setstate__)
    _notJSON = JSONSerializable._notJSON + ["ds", "results", "model"]

    def __init__(
        self,
        idx: int,
        name: str,
        data_source: DataSource,
        descriptors: list[DescriptorSet],
        target_props: list[TargetProperty],
        prep_settings: DataPrepSettings,
        model: QSPRModel,
        optimizer: HyperparameterOptimization,
        assessors: list[ModelAssessor],
        random_seed: int,
    ):
        """Initialize the replica settings.

        Args:
            idx (int): Order of the replica in the benchmarking run.
            name (str): Name of the replica.
            data_source (DataSource): Data source to use.
            descriptors (list[DescriptorSet]): Descriptor sets to use.
            target_props (list[TargetProperty]): Target properties to use.
            prep_settings (DataPrepSettings): Data preparation settings to use.
            model (QSPRModel): Model template; a deep copy is stored so the
                original passed-in model is never mutated by this replica.
            optimizer (HyperparameterOptimization): Hyperparameter optimizer.
            assessors (list[ModelAssessor]): Model assessors to use.
            random_seed (int): Seed for all random operations in the replica.
        """
        self.idx = idx
        self.name = name
        self.dataSource = data_source
        self.descriptors = descriptors
        self.targetProps = target_props
        self.prepSettings = prep_settings
        self.optimizer = optimizer
        self.assessors = assessors
        self.randomSeed = random_seed
        self.ds = None
        self.results = None
        # deep copy so that hyperparameter optimization / renaming below does
        # not affect the model object shared with other replicas
        self.model = deepcopy(model)

    def __getstate__(self):
        """Serialize state; the model is persisted via its own save mechanism."""
        o_dict = super().__getstate__()
        # store the path to the saved model instead of the live object
        o_dict["model"] = self.model.save()
        o_dict["ds"] = None
        o_dict["results"] = None
        return o_dict

    def __setstate__(self, state):
        """Restore state; the model is reloaded from its saved meta file."""
        super().__setstate__(state)
        self.model = QSPRModel.fromFile(state["model"])
        self.ds = None
        self.results = None

    def __str__(self):
        """Return the unique replica identifier."""
        return self.id

    @property
    def requiresGpu(self) -> bool:
        """Whether the model requires a GPU.

        Returns:
            bool:
                Whether the model requires a GPU.
        """
        # duck-typed check: models supporting GPUs expose a `setGPUs` method
        return hasattr(self.model, "setGPUs")

    def setGPUs(self, gpus: list[int]):
        """Sets the GPUs to use for the model.

        Args:
            gpus (list[int]):
                List of GPU indices to use.

        Raises:
            ValueError:
                If the model does not support GPU usage.
        """
        if hasattr(self.model, "setGPUs"):
            self.model.setGPUs(gpus)
        else:
            raise ValueError("Model does not support GPU usage.")

    def getGPUs(self) -> list[int]:
        """Gets the GPUs to use for the model.

        Returns:
            list[int]:
                List of GPU indices to use. Empty if the model does not
                support GPUs.
        """
        if hasattr(self.model, "getGPUs"):
            return self.model.getGPUs()
        else:
            return []

    @property
    def id(self) -> str:
        """A unique identifier for the replica.

        Returns:
            str:
                A unique identifier for the replica.
        """
        return f"{self.name}_{self.randomSeed}"

    def initData(self, reload=False):
        """Initializes the data set for this replica.

        Args:
            reload (bool, optional):
                Whether to overwrite all existing data and
                reinitialize from scratch. Defaults to `False`.
        """
        self.ds = self.dataSource.getDataSet(
            deepcopy(self.targetProps),
            overwrite=reload,
            random_state=self.randomSeed,
        )
        self.ds.dropInvalids()

    def addDescriptors(self, reload: bool = False):
        """Adds descriptors to the current data set. Make sure to call
        `initData` first to get it from the source.

        Args:
            reload (bool, optional):
                Whether to overwrite all existing data and
                reinitialize from scratch. Defaults to `False`.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        # make the data set name unique per descriptor combination so cached
        # sets with different descriptors do not collide on disk
        desc_id = "_".join(sorted([str(d) for d in self.descriptors]))
        self.ds.name = f"{self.ds.name}_{desc_id}"
        if self.requiresGpu:
            self.ds.name = f"{self.ds.name}_gpu"
        # attempt to load the data set with descriptors
        if os.path.exists(self.ds.metaFile) and not reload:
            logger.info(f"Reloading existing {self.ds.name} from cache...")
            self.ds = QSPRDataset.fromFile(self.ds.metaFile)
            self.ds.setRandomState(self.randomSeed)
            self.ds.setTargetProperties(deepcopy(self.targetProps))
        else:
            logger.info(f"Data set {self.ds.name} not yet found. It will be created.")
            # calculate descriptors if necessary
            logger.info(f"Calculating descriptors for {self.ds.name}.")
            self.ds.addDescriptors(deepcopy(self.descriptors), recalculate=True)
            self.ds.setTargetProperties(deepcopy(self.targetProps))
            self.ds.setRandomState(self.randomSeed)
            self.ds.save()

    def prepData(self):
        """Prepares the data set for this replica.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        self.ds.prepareDataset(
            # deep copy so prepareDataset cannot mutate the stored settings
            **deepcopy(self.prepSettings.__dict__),
        )

    def initModel(self):
        """Initializes the model for this replica. This includes
        initializing the model from the data set and optimizing
        the hyperparameters if an optimizer is specified.

        Raises:
            ValueError:
                If the data set has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        self.model.name = f"{self.id}_{self.ds.name}"
        self.model.initFromDataset(self.ds)
        self.model.initRandomState(self.randomSeed)
        if self.optimizer is not None:
            self.optimizer.optimize(self.model, self.ds)
        self.model.save()

    @staticmethod
    def _scoreFuncName(assessor) -> str:
        """Resolve a display name for the assessor's scoring function."""
        return (
            assessor.scoreFunc.name
            if hasattr(assessor.scoreFunc, "name")
            else assessor.scoreFunc.__name__
        )

    def _scoreRecord(self, assessor, score, tp) -> pd.DataFrame:
        """Build a single-row results frame for one (score, target property) pair."""
        return pd.DataFrame(
            {
                "Assessor": [assessor.__class__.__name__],
                "ScoreFunc": [self._scoreFuncName(assessor)],
                "Score": [score],
                "TargetProperty": [tp.name],
                "TargetTask": [tp.task.name],
            }
        )

    def runAssessment(self):
        """Runs the model assessment for this replica. This includes
        running all model assessors and saving the results.

        The results are saved in the `results` attribute. They can be
        accessed by calling `createReport`, which combines the relevant
        information from the replica and the results into one `pd.DataFrame`.

        Raises:
            ValueError:
                If the data set or the model has not been initialized.
        """
        if self.ds is None:
            raise ValueError("Data set not initialized. Call initData first.")
        if self.model is None:
            raise ValueError("Model not initialized. Call initModel first.")
        self.results = None
        for assessor in self.assessors:
            scores = assessor(self.model, self.ds, save=True)
            if isinstance(scores, float):
                # normalize a single overall score to a one-element array
                scores = np.array([scores])
            records = []
            for i, fold_score in enumerate(scores):
                if isinstance(fold_score, float):
                    # one scalar per fold: attribute it to the matching target
                    # property for multi-task models, otherwise to the only one
                    tp = (
                        self.targetProps[i]
                        if self.model.isMultiTask
                        else self.targetProps[0]
                    )
                    records.append(self._scoreRecord(assessor, fold_score, tp))
                else:
                    # one score per target property within this fold
                    for tp_score, tp in zip(fold_score, self.targetProps):
                        records.append(self._scoreRecord(assessor, tp_score, tp))
            # concatenate once per assessor instead of per record
            scores_df = pd.concat(records) if records else pd.DataFrame()
            if self.results is None:
                self.results = scores_df
            else:
                self.results = pd.concat([self.results, scores_df])

    def createReport(self):
        """Creates a report from the results of this replica.

        Returns:
            pd.DataFrame:
                A `pd.DataFrame` with the results of this replica.

        Raises:
            ValueError:
                If the results have not been calculated.
        """
        if self.results is None:
            raise ValueError("Results not available. Call runAssessment first.")
        results = self.results.copy()
        results["ModelFile"] = self.model.metaFile
        results["Algorithm"] = self.model.alg.__name__
        results["AlgorithmParams"] = json.dumps(self.model.parameters)
        results["ReplicaID"] = self.id
        results["DataSet"] = self.ds.name
        out_file = f"{self.model.outPrefix}_replica.json"
        for assessor in self.assessors:
            # FIXME: some problems in monitor serialization now prevent this
            assessor.monitor = NullMonitor()
        results["ReplicaFile"] = self.toFile(out_file)
        return results