# Source code for drugex.data.corpus.corpus

"""
corpus

Created by: Martin Sicho
On: 26.04.22, 16:47
"""
from rdkit import Chem

from drugex.logs import logger
from drugex.data.corpus.interfaces import Corpus
from drugex.data.corpus.vocabulary import VocSmiles, VocGraph


class SequenceCorpus(Corpus):
    """
    A `Corpus` to encode molecules for the sequence-based models.

    Every supplied sequence is split into tokens by the attached vocabulary and
    the tokens are then encoded with that same vocabulary. Depending on the
    settings, splitting either extends the vocabulary with new tokens, rejects
    sequences with unknown tokens, or just splits the sequence.
    """

    def __init__(self, molecules, vocabulary=None, update_voc=True, throw=False, check_unique=True):
        """
        Create a sequence corpus.

        Args:
            molecules: an `iterable`, `MolSupplier` or a `list`-like data structure to supply
                sequence representations of molecules (i.e. SMILES strings)
            vocabulary: a `SequenceVocabulary` instance to be used for encoding and collecting
                tokens. If `None` (the default), a new `VocSmiles(False)` is created for this
                corpus.
            update_voc: `True` if the tokens in the vocabulary should be updated with new tokens
                derived from the data (the `SequenceVocabulary.addWordsFromSeq()` method is used
                for splitting instead of doing simply `SequenceVocabulary.splitSequence()`)
            throw: `True` if molecules that contain tokens that are not in the vocabulary should
                be thrown out of the corpus (the `SequenceVocabulary.removeIfNew()` method is used
                for splitting instead of doing simply `SequenceVocabulary.splitSequence()`).
                Ignored when `update_voc` is also `True`.
            check_unique: Skip identical sequences in "molecules".
        """

        super().__init__(molecules)
        # The vocabulary is built here rather than in the signature: a default
        # instance in the signature is created only once and would be shared
        # (and extended, when update_voc is on) by every corpus relying on it.
        self.vocabulary = VocSmiles(False) if vocabulary is None else vocabulary
        self.updateVoc = update_voc
        self.throw = throw
        if self.updateVoc and self.throw:
            # processMolecule() tests updateVoc first, so update_voc wins.
            logger.warning("update_voc and throw cannot both be true at same time, defaulting to update_voc")
        self.checkUnique = check_unique
        # Sequences that were already encoded; only consulted when checkUnique is set.
        self._unique = set()

    def saveVoc(self, path):
        """
        Save the current state of the vocabulary to a file.

        Args:
            path: Path to the generated file.

        Returns:
            `None`
        """

        self.vocabulary.toFile(path)

    def getVoc(self):
        """
        Return current vocabulary.

        Returns:
            The vocabulary object held by this corpus (the instance itself, not a copy).
        """

        return self.vocabulary

    def processMolecule(self, seq):
        """
        Generate the encoding for the given molecule sequence.

        The sequence is split with `addWordsFromSeq()` if `update_voc` was set,
        otherwise with `removeIfNew()` if `throw` was set, and otherwise with
        `splitSequence()`.

        Args:
            seq: molecule as a sequence (i.e. SMILES string)

        Returns:
            The encoded sequence as a flat `list`, or `None` if the sequence is a
            duplicate (with `check_unique` enabled) or if splitting produced no tokens.
        """

        if self.checkUnique and seq in self._unique:
            return None

        if self.updateVoc:
            tokens = self.vocabulary.addWordsFromSeq(seq)
        elif self.throw:
            tokens = self.vocabulary.removeIfNew(seq)
        else:
            tokens = self.vocabulary.splitSequence(seq)

        if not tokens:
            # Nothing to encode; the sequence is not registered as seen.
            return None

        if self.checkUnique:
            self._unique.add(seq)

        # The last token is left out of the encoded input.
        # NOTE(review): presumably it is an end-of-sequence marker that encode()
        # adds back itself - confirm against the vocabulary implementation.
        output = self.vocabulary.encode([tokens[:-1]])
        # Flatten the first (and only) encoded entry into a plain Python list.
        return output[0].reshape(-1).tolist()