"""
corpus
Created by: Martin Sicho
On: 26.04.22, 16:47
"""
from rdkit import Chem
from drugex.logs import logger
from drugex.data.corpus.interfaces import Corpus
from drugex.data.corpus.vocabulary import VocSmiles, VocGraph
class SequenceCorpus(Corpus):
    """
    A `Corpus` to encode molecules for the sequence-based models.
    """

    def __init__(self, molecules, vocabulary=None, update_voc=True, throw=False, check_unique=True):
        """
        Create a sequence corpus.

        Args:
            molecules: an `iterable`, `MolSupplier` or a `list`-like data structure to supply sequence representations of molecules (i.e. SMILES strings)
            vocabulary: a `SequenceVocabulary` instance to be used for encoding and collecting tokens. If `None` (the default), a new `VocSmiles(False)` is created for this corpus.
            update_voc: `True` if the tokens in the vocabulary should be updated with new tokens derived from the data (the `SequenceVocabulary.addWordsFromSeq()` method is used for splitting instead of doing simply `SequenceVocabulary.splitSequence()`)
            throw: `True` if molecules that contain tokens that are not in the vocabulary should be thrown out of corpus (the `SequenceVocabulary.removeIfNew()` method is used for splitting instead of doing simply `SequenceVocabulary.splitSequence()`). Ignored when `update_voc` is also `True`.
            check_unique: Skip identical sequences in "molecules".
        """
        super().__init__(molecules)
        # The default vocabulary is built per instance. A default created in the
        # signature would be evaluated once and shared by every corpus, so the
        # vocabulary updates done by one corpus could affect the others.
        self.vocabulary = VocSmiles(False) if vocabulary is None else vocabulary
        self.updateVoc = update_voc
        self.throw = throw
        if self.updateVoc and self.throw:
            # processMolecule() tests updateVoc first, so update_voc takes precedence.
            logger.warning("update_voc and throw cannot both be true at same time, defaulting to update_voc")
        self.checkUnique = check_unique
        # Sequences already encoded by this corpus (only filled when checkUnique is set).
        self._unique = set()

    def saveVoc(self, path):
        """
        Save the current state of the vocabulary to a file.

        Args:
            path: Path to the generated file.

        Returns:
            `None`
        """
        self.vocabulary.toFile(path)

    def getVoc(self):
        """
        Return current vocabulary.

        Returns:
            The vocabulary object currently held by this corpus.
        """
        return self.vocabulary

    def processMolecule(self, seq):
        """
        Generate the encoding for the given molecule sequence.

        Args:
            seq: molecule as a sequence (i.e. SMILES string)

        Returns:
            a flat `list` obtained from the first item of the output of the
            vocabulary's `encode()` for this sequence, or `None` if the sequence
            was already processed (with `check_unique` enabled) or if splitting
            it produced no tokens.
        """
        if self.checkUnique and seq in self._unique:
            return None

        # Choose the splitting strategy; update_voc has priority over throw.
        if self.updateVoc:
            tokens = self.vocabulary.addWordsFromSeq(seq)
        elif self.throw:
            tokens = self.vocabulary.removeIfNew(seq)
        else:
            tokens = self.vocabulary.splitSequence(seq)

        if not tokens:
            # Nothing to encode. NOTE(review): presumably this is how
            # removeIfNew() signals a rejected sequence -- confirm.
            return None

        if self.checkUnique:
            self._unique.add(seq)
        # The last token is left out before encoding.
        # NOTE(review): presumably an end-of-sequence marker that encode()
        # handles itself -- confirm against the vocabulary implementation.
        output = self.vocabulary.encode([tokens[:-1]])
        return output[0].reshape(-1).tolist()