# Source code for drugex.data.utils

import os

from drugex import VERSION
from drugex.logs import logger

def getVocPaths(data_path, voc_files, mol_type):
    """
    Get paths to the vocabulary files that exist on disk.

    Each entry of ``voc_files`` is looked up inside ``data_path``. The exact
    name is tried first; if no such file exists, the name with the suffix
    ``_{mol_type}.txt.vocab`` appended is tried. Entries that match neither
    candidate are skipped and a warning is logged for each of them.

    Parameters
    ----------
    data_path : str
        Path to data directory.
    voc_files : list
        List of vocabulary file names (or name prefixes, see above).
    mol_type : str
        Molecule type used to build the fallback file name
        ``{voc_file}_{mol_type}.txt.vocab``.

    Returns
    -------
    list
        Paths to the vocabulary files that were found, in the order of
        ``voc_files``. The list is empty when nothing was found; in that case
        a warning announcing the use of internal defaults is logged, but this
        function itself does not load any defaults.
    """

    voc_paths = []
    for voc_file in voc_files:
        exact_path = f'{data_path}/{voc_file}'
        suffixed_path = f'{exact_path}_{mol_type}.txt.vocab'
        # The exact name takes precedence over the suffixed variant.
        if os.path.exists(exact_path):
            voc_paths.append(exact_path)
        elif os.path.exists(suffixed_path):
            voc_paths.append(suffixed_path)
        else:
            logger.warning(f'Could not find vocabulary file {voc_file} in {data_path}.')

    if not voc_paths:
        logger.warning(f'No vocabulary files found. Using internal defaults for DrugEx v{VERSION}.')

    return voc_paths

def getDataPaths(data_path, input_prefix, mol_type, unique_frags):
    """
    Get paths to training and test data files.

    Paths are built by plain string concatenation of ``data_path`` and the
    file name, so ``data_path`` has to end with a path separator.

    Parameters
    ----------
    data_path : str
        Path to data directory (including the trailing separator).
    input_prefix : str
        Prefix of data files. If a file with the exact name exists, it is used for both training and testing.
    mol_type : str
        Type of molecules in data files. Either 'smiles' or 'graph'.
    unique_frags : bool
        Whether to use unique fragments or not. If true, the training file is
        ``{input_prefix}_unique_{mol_type}.txt`` instead of
        ``{input_prefix}_train_{mol_type}.txt``.

    Returns
    -------
    Tuple[str, str]
        Paths to training and test data files.

    Raises
    ------
    AssertionError
        If the training or the test file does not exist.
    """

    exact_path = data_path + input_prefix
    # If exact data path was given as input, that data is both used for training and testing
    if os.path.exists(exact_path):
        train_path = exact_path
        test_path = exact_path
    # Else if prefix was given, read separate train and test sets
    else:
        train_kind = 'unique' if unique_frags else 'train'
        train_path = data_path + '_'.join([input_prefix, train_kind, mol_type]) + '.txt'
        test_path = data_path + '_'.join([input_prefix, 'test', mol_type]) + '.txt'

    # Raise explicitly instead of using ``assert`` so the check also runs
    # under ``python -O``; AssertionError is kept for existing callers.
    for path in (train_path, test_path):
        if not os.path.exists(path):
            raise AssertionError(f'{path} does not exist')

    logger.info(f'Loading training data from {train_path}')
    logger.info(f'Loading validation data from {test_path}')

    return train_path, test_path