Source code for drugex.utils.download

# -*- coding: utf-8 -*-

"""Utility functions to download files."""

import os
import hashlib
import zipfile
import tarfile

import requests
from tqdm.auto import tqdm


CHUNKSIZE = 1048576  # 1 MB
RETRIES = 3


[docs]def download_file(url, out_path, extract_out_path=None, byte_size=None, sha256sum=None, progress=True, callback=None) -> None: """ Download a file and extract its content if is a ZIP or TAR.GZ file. Args: url (str): URL of the file to be downloaded. out_path (str): Path the file should be written to extract_out_path (str): Path to extract the content of the ZIP/TAR.GZ file into byte_size (int): Size in bytes of the file to be downloaded; ignored if None sha256sum (str): SHA256 checksum to compare that of the downloaded file to progress (bool): should progress be shown callback (Callable[[], Any]): callback function to be called after each chunk of the file is downloaded """ os.makedirs(os.path.dirname(out_path), exist_ok=True) os.makedirs(os.path.dirname(extract_out_path), exist_ok=True) fname = os.path.basename(out_path) fdir = os.path.dirname(out_path) # Ensure path exists if not os.path.isdir(fdir): raise ValueError(f'Path {fdir} does not exist, download aborted.') if progress: pbar = tqdm(total=byte_size, desc=f'Downloading file {fname}', unit='B', unit_scale=True) # Download file correct = False # ensure file is not corrupted retries = RETRIES while not correct and retries > 0: # Allow few failures session = requests.session() res = session.get(url, headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) " "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/39.0.2171.95 " "Safari/537.36"}, stream=True, verify=True) with open(out_path, 'wb') as fh: for chunk in res.iter_content(chunk_size=CHUNKSIZE): fh.write(chunk) if progress: pbar.update(len(chunk)) if callback is not None: callback() correct = check_sha256sum(out_path, sha256sum) if sha256sum is not None else True if not correct: retries -= 1 if progress: if retries > 0: message = f'SHA256 hash unexpected for {fname}. Remaining download attempts: {retries}' else: message = f'SHA256 hash unexpected for {fname}. All {RETRIES} attempts failed.' pbar.write(message) os.remove(out_path) if retries == 0: if progress: pbar.close() raise IOError(f'Download failed for {fname}') # Extract if ZIP file if fname.endswith('.zip'): with zipfile.ZipFile(out_path) as zip_handle: for name in zip_handle.namelist(): subpath = extract_out_path if extract_out_path is not None else out_path zip_handle.extract(name, subpath) os.remove(out_path) if progress: pbar.close() # Extract if TAR.GZ file elif fname.endswith(('.tar.gz', '.tgz')): with tarfile.open(out_path) as tar_handle: tar_handle.extractall(extract_out_path if extract_out_path is not None else out_path) os.remove(out_path) if progress: pbar.close()
[docs]def check_sha256sum(filename, sha256): """Check the SHA256 checksum of a file corresponds to that given. Args: filename (str): Path the file to check sha256 (str): SHA256 checksum to compare that of the downloaded file to Returns: True if the file's SHA256 checksum and the one provided match """ if not (isinstance(sha256, str) and len(sha256) == 64): raise ValueError("SHA256 must be 64 chars: {}".format(sha256)) sha256_actual = sha256sum(filename) return sha256_actual == sha256
[docs]def sha256sum(filename, blocksize=None): """Calculate the SHA256 cheksum of a file. Args: filename: path of the file blocksize: size of iterative chunks for calculation (default: 64 KiB) Returns: SHA256 checksum is hexadecimal """ if blocksize is None: blocksize = 65536 hash = hashlib.sha256() with open(filename, "rb") as fh: for block in iter(lambda: fh.read(blocksize), b""): hash.update(block) return hash.hexdigest()