Source code for drugex.data.corpus.interfaces

"""
interfaces

Created by: Martin Sicho
On: 26.04.22, 13:12
"""
from abc import ABC, abstractmethod

from drugex.logs import logger
from drugex.molecules.interfaces import MolSupplier


class Vocabulary(ABC):
    """
    Interface shared by all vocabularies.

    A vocabulary holds the "words" that are used to encode molecules
    and to decode them back.
    """

    def __init__(self, words):
        """
        Args:
            words: the words contained in this vocabulary
        """
        self.words = words

    def __add__(self, other):
        """
        Combine two vocabularies into a new instance of this object's type.

        The new instance is built from ``other.words`` followed by ``self.words``.
        """
        merged_words = other.words + self.words
        own_type = type(self)
        return own_type(merged_words)

    @abstractmethod
    def encode(self, tokens, frags=None):
        """Encode the given tokens (implemented by subclasses)."""

    @abstractmethod
    def decode(self, representation):
        """Decode the given representation (implemented by subclasses)."""

    @staticmethod
    @abstractmethod
    def fromFile(path):
        """Create a vocabulary from the file at ``path`` (implemented by subclasses)."""

    @abstractmethod
    def toFile(self, path):
        """Save this vocabulary to the file at ``path`` (implemented by subclasses)."""
class SequenceVocabulary(Vocabulary, ABC):
    """
    Generic vocabulary for sequence-based models.
    """

    def __init__(self, encode_frags, words, max_len=100, min_len=10):
        """
        Args:
            encode_frags: boolean indicating if used to also encode fragments
            words: iterable of words in this vocabulary
            max_len: the maximum number of tokens allowed in one sequence (inclusive)
            min_len: a sequence must contain strictly more tokens than this value
        """
        super().__init__(words)
        if encode_frags:
            # Allow fragments for fragment-based models
            self.control = ('_', 'GO', 'EOS')  # '_' used during model fitting
            self.special = list(self.control) + ['.']
        else:
            self.control = ('GO', 'EOS')
            self.special = list(self.control)
        # wordSet only ever holds the non-special words; the special ones
        # are prepended separately in updateIndex().
        self.wordSet = set()
        if words:
            self.wordSet = set(x for x in words if x not in self.special)
        self.updateIndex()
        self.max_len = max_len
        self.min_len = min_len

    @abstractmethod
    def splitSequence(self, seq):
        """Split a sequence into its tokens (implemented by subclasses)."""

    def toFile(self, path):
        """
        Write the non-special words of this vocabulary to a file, one word per line.

        Args:
            path: path of the file to write (overwritten if it exists)
        """
        # The context manager guarantees the handle is closed even if write() fails.
        with open(path, 'w') as out:
            out.write('\n'.join([x for x in self.words if x not in self.special]))

    def _checkLength(self, token, ignoreConstraints):
        """
        Tell whether a token list satisfies the min/max length constraints.

        Logs a warning when the constraints are violated.

        Args:
            token: list of tokens as returned by `splitSequence`
            ignoreConstraints: if true, the constraints are not checked at all

        Returns:
            `True` if the token list is acceptable, `False` otherwise
        """
        if ignoreConstraints or (self.min_len < len(token) <= self.max_len):
            return True
        logger.warning(f"Molecule does not meet min/max words requirements (min: {self.min_len}, max: {self.max_len}). Words found: {set(token)} (occurrence count: {len(token)}). It will be ignored.")
        return False

    def addWordsFromSeq(self, seq, ignoreConstraints=False):
        """
        Split a sequence and add any words not yet known to the vocabulary.

        Args:
            seq: the sequence to split with `splitSequence`
            ignoreConstraints: if true, skip the min/max length check

        Returns:
            the tokens of the sequence, or `None` if the sequence violates
            the length constraints
        """
        token = self.splitSequence(seq)
        if not self._checkLength(token, ignoreConstraints):
            return None
        # Special tokens are excluded so that wordSet keeps the same invariant
        # __init__ establishes (and removeIfNew relies on); they are part of
        # the index regardless, so no rebuild is needed for them.
        diff = set(token) - self.wordSet - set(self.special)
        if diff:
            self.wordSet.update(diff)
            self.updateIndex()
        return token

    def removeIfNew(self, seq, ignoreConstraints=False):
        """
        Split a sequence and reject it if it contains words unknown to the vocabulary.

        Args:
            seq: the sequence to split with `splitSequence`
            ignoreConstraints: if true, skip the min/max length check

        Returns:
            the tokens of the sequence, or `None` if the sequence violates
            the length constraints or contains unknown words
        """
        token = self.splitSequence(seq)
        if not self._checkLength(token, ignoreConstraints):
            return None
        diff = set(token) - self.wordSet - set(self.special)
        if diff:
            logger.warning(f"Tokens: {set(diff)} do not occur in voc. Molecule: {seq} will be ignored.")
            return None
        return token

    def updateIndex(self):
        """
        Rebuild `words`, `size` and the token/index lookup tables.

        The special words come first (in their declared order), followed by
        the remaining words in sorted order.
        """
        self.words = self.special + [x for x in sorted(self.wordSet) if x not in self.special]
        self.size = len(self.words)
        self.tk2ix = {tk: ix for ix, tk in enumerate(self.words)}
        self.ix2tk = {ix: tk for tk, ix in self.tk2ix.items()}
class Corpus(MolSupplier, ABC):
    """
    A `MolSupplier` that turns the supplied molecules into encoded data.
    """

    def __init__(self, molecules):
        """
        Args:
            molecules: an `iterable`, `MolSupplier` or a `list`-like data structure to supply molecules
        """
        super().__init__()
        # Anything that is not already an iterator gets wrapped in one.
        if hasattr(molecules, "__next__"):
            self.molecules = molecules
        else:
            self.molecules = iter(molecules)

    def next(self):
        """
        Fetch the next item from the underlying molecule iterator.
        """
        source = self.molecules
        return next(source)

    def convert(self, representation):
        """
        Encode one molecule with `processMolecule`.

        If processing raises, the problem is logged and the result of
        ``next(self)`` is returned instead.
        NOTE(review): ``next(self)`` needs ``__next__``, which is not defined
        in this class - presumably inherited from `MolSupplier`; confirm.

        Args:
            representation: the molecule to process

        Returns:
            whatever `processMolecule` returns for the molecule
        """
        try:
            return self.processMolecule(representation)
        except Exception as exp:
            logger.warning(f'Exception occurred when generating corpus data for molecule: {representation}. Cause:')
            logger.exception(exp)
            return next(self)

    @abstractmethod
    def processMolecule(self, molecule):
        """
        Process one molecule.

        Args:
            molecule: a molecule instance (representation depends on the implementation).

        Returns:
            encoded data of the molecule (i.e. data associated with one input sample to the desired DrugEx model)
        """

    @abstractmethod
    def getVoc(self):
        """
        Corpus should keep track of the `Vocabulary` used to encode molecules.
        This method should return its current state.

        Returns:
            currently used `Vocabulary`
        """