# Source code for drugex.training.scorers.smiles

"""
scorers

Created by: Martin Sicho
On: 03.06.22, 13:28
"""
import numpy as np
import pandas as pd
from rdkit import Chem

class SmilesChecker:
    """Validity and fragment-containment checks for SMILES strings."""

    @staticmethod
    def checkSmiles(smiles, frags=None, no_multifrag_smiles=True):
        """
        Check the validity of SMILES strings and, optionally, whether they
        contain given fragments.

        Parameters
        ----------
        smiles : list of str
            List of SMILES strings to check.
        frags : list of str, optional
            Fragment SMILES, matched to `smiles` by position. An entry may
            hold several fragments separated by '.'; the molecule must
            contain all of them to count as accurate.
        no_multifrag_smiles : bool, optional
            If True, SMILES strings that contain more than one fragment
            (i.e. a '.') will be marked as invalid.

        Returns
        -------
        scores : pd.DataFrame
            One row per input SMILES, in input order, with a 'Valid' column
            and, when `frags` is given, an 'Accurate' column. Values are
            1 (pass) or 0 (fail). An empty data frame is returned when
            `smiles` is empty.
        """
        valid = []
        accurate = []
        for j, smile in enumerate(smiles):
            # 1. Check if the SMILES can be parsed by rdkit. The molecule is
            # determined afresh for every entry so that a failed parse can
            # never be scored against the molecule of a previous entry.
            mol = SmilesChecker._parseSmiles(smile, no_multifrag_smiles)
            valid.append(0 if mol is None else 1)
            if frags is not None:
                # 2. Check if the SMILES contains the given fragments.
                matched = SmilesChecker._containsFragments(mol, frags, j)
                accurate.append(1 if matched else 0)

        if not valid:
            return pd.DataFrame()

        columns = {'Valid': valid}
        if frags is not None:
            columns['Accurate'] = accurate
        # Build the frame in one go instead of enlarging it cell by cell.
        # NOTE(review): scores are stored as floats (0.0 / 1.0); enlarging an
        # empty frame through .loc appears to give float columns too --
        # confirm if any caller depends on the exact dtype.
        return pd.DataFrame(columns, dtype=float)

    @staticmethod
    def _parseSmiles(smile, no_multifrag_smiles):
        """Return the rdkit molecule for `smile`, or None if it is invalid."""
        # Missing, empty and non-string entries are invalid by definition.
        if not isinstance(smile, str) or not smile:
            return None
        if no_multifrag_smiles and '.' in smile:
            return None
        try:
            return Chem.MolFromSmiles(smile)
        except Exception:
            # Best effort: anything rdkit cannot digest counts as invalid.
            return None

    @staticmethod
    def _containsFragments(mol, frags, j):
        """Return True if `mol` contains every fragment listed in `frags[j]`."""
        if mol is None:
            return False
        try:
            subs = [Chem.MolFromSmiles(sub) for sub in frags[j].split('.')]
            return all(mol.HasSubstructMatch(sub) for sub in subs)
        except Exception:
            # A missing or unparsable fragment entry counts as not matched.
            return False