Source code for file_utils

from __future__ import absolute_import
from __future__ import print_function

import tarfile
import os
import sys
import shutil
import hashlib
from six.moves.urllib.request import urlopen
from six.moves.urllib.error import URLError, HTTPError

from generic_utils import Progbar
from modac_utils import get_file_from_modac


# Under Python 2, 'urlretrieve' relies on FancyURLopener from the legacy
# urllib module, which is known to have issues with proxy management
if sys.version_info[0] == 2:
    def urlretrieve(url, filename, reporthook=None, data=None):
        def chunk_read(response, chunk_size=8192, reporthook=None):
            total_size = response.info().get('Content-Length').strip()
            total_size = int(total_size)
            count = 0
            while True:
                chunk = response.read(chunk_size)
                count += 1
                if not chunk:
                    # final callback reports completion; guard against a
                    # missing hook
                    if reporthook:
                        reporthook(count, total_size, total_size)
                    break
                if reporthook:
                    reporthook(count, chunk_size, total_size)
                yield chunk

        response = urlopen(url, data)
        with open(filename, 'wb') as fd:
            for chunk in chunk_read(response, reporthook=reporthook):
                fd.write(chunk)
else:
    from six.moves.urllib.request import urlretrieve
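
# Either branch exposes the same interface as the stdlib urlretrieve:
# urlretrieve(url, filename, reporthook), where the hook is called after
# each chunk with (block count, block size, total size). A minimal sketch
# of a progress hook (the URL below is a placeholder, not a real endpoint):
#
#     def report(count, block_size, total_size):
#         print('%d / %d bytes' % (min(count * block_size, total_size),
#                                  total_size))
#
#     urlretrieve('https://example.com/file.bin', 'file.bin', report)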


# Note: the datadir argument was never actually used by callers, so it
# now defaults to None.
def get_file(fname, origin, unpack=False,
             md5_hash=None, cache_subdir='common', datadir=None):
    """
    Downloads a file from a URL if it is not already in the cache.
    Passing the MD5 hash will verify the file after download as well as
    if it is already present in the cache.

    Parameters
    ----------
    fname : string
        name of the file
    origin : string
        original URL of the file
    unpack : boolean
        whether the file should be decompressed
    md5_hash : string
        MD5 hash of the file for verification
    cache_subdir : string
        directory being used as the cache
    datadir : string
        if set, datadir becomes its setting (which could be e.g. an
        absolute path) and cache_subdir no longer matters

    Returns
    ----------
    Path to the downloaded file (or to the unpacked directory when the
    archive is decompressed)
    """
    if datadir is None:
        file_path = os.path.dirname(os.path.realpath(__file__))
        datadir_base = os.path.expanduser(os.path.join(file_path, '..', 'Data'))
        datadir = os.path.join(datadir_base, cache_subdir)
    if not os.path.exists(datadir):
        os.makedirs(datadir)

    # Infer the unpacked target path from the archive extension; any
    # recognized archive suffix forces unpack=True
    if fname.endswith('.tar.gz'):
        fnamesplit = fname.split('.tar.gz')
        unpack_fpath = os.path.join(datadir, fnamesplit[0])
        unpack = True
    elif fname.endswith('.tgz'):
        fnamesplit = fname.split('.tgz')
        unpack_fpath = os.path.join(datadir, fnamesplit[0])
        unpack = True
    elif fname.endswith('.zip'):
        fnamesplit = fname.split('.zip')
        unpack_fpath = os.path.join(datadir, fnamesplit[0])
        unpack = True
    else:
        unpack_fpath = None

    fpath = os.path.join(datadir, fname)
    if not os.path.exists(os.path.dirname(fpath)):
        os.makedirs(os.path.dirname(fpath))

    download = False
    if os.path.exists(fpath) or (unpack_fpath is not None and
                                 os.path.exists(unpack_fpath)):
        # file found; verify integrity if a hash was provided
        if md5_hash is not None:
            if not validate_file(fpath, md5_hash):
                print('A local file was found, but it seems to be '
                      'incomplete or outdated.')
                download = True
    else:
        download = True

    if download:
        if 'modac.cancer.gov' in origin:
            get_file_from_modac(fpath, origin)
        else:
            print('Downloading data from', origin)
            global progbar
            progbar = None

            def dl_progress(count, block_size, total_size):
                global progbar
                if progbar is None:
                    progbar = Progbar(total_size)
                else:
                    progbar.update(count * block_size)

            error_msg = 'URL fetch failure on {}: {} -- {}'
            try:
                try:
                    urlretrieve(origin, fpath, dl_progress)
                # HTTPError subclasses URLError, so it must be caught first
                except HTTPError as e:
                    raise Exception(error_msg.format(origin, e.code, e.msg))
                except URLError as e:
                    raise Exception(error_msg.format(origin, e.errno, e.reason))
            except (Exception, KeyboardInterrupt) as e:
                print('Error:', e)
                # remove the partial download before re-raising
                if os.path.exists(fpath):
                    os.remove(fpath)
                raise
            progbar = None
            print()

    if unpack:
        if not os.path.exists(unpack_fpath):
            print('Unpacking file...')
            try:
                shutil.unpack_archive(fpath, datadir)
            except (Exception, KeyboardInterrupt) as e:
                print('Error:', e)
                # remove the partially unpacked result before re-raising
                if os.path.exists(unpack_fpath):
                    if os.path.isfile(unpack_fpath):
                        os.remove(unpack_fpath)
                    else:
                        shutil.rmtree(unpack_fpath)
                raise
        return unpack_fpath

    print()
    return fpath
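
# Example usage (a minimal sketch; the URL and MD5 hash below are
# placeholders, not a real endpoint):
#
#     path = get_file('mnist.tar.gz',
#                     'https://example.com/data/mnist.tar.gz',
#                     md5_hash='d41d8cd98f00b204e9800998ecf8427e',
#                     cache_subdir='examples')
#
# Because the name ends in '.tar.gz', unpack is forced to True and the
# returned path points at the unpacked directory rather than the archive.
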

def validate_file(fpath, md5_hash):
    """
    Validates a file against an MD5 hash

    Parameters
    ----------
    fpath : string
        path to the file being validated
    md5_hash : string
        the MD5 hash being validated against

    Returns
    ----------
    boolean
        Whether the file is valid
    """
    hasher = hashlib.md5()
    with open(fpath, 'rb') as f:
        buf = f.read()
        hasher.update(buf)
    return str(hasher.hexdigest()) == str(md5_hash)
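
# Example (a sketch; 'some_file' is a hypothetical local file). Computing
# the expected digest with hashlib first makes the check pass by
# construction:
#
#     with open('some_file', 'rb') as f:
#         expected = hashlib.md5(f.read()).hexdigest()
#     assert validate_file('some_file', expected)
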

def directory_from_parameters(params, commonroot='Output'):
    """
    Construct output directory path with unique IDs from parameters

    Parameters
    ----------
    params : python dictionary
        Dictionary of parameters read
    commonroot : string
        String to specify the common folder to store results.

    Returns
    ----------
    Path to the output directory (commonroot nested with the
    'experiment_id' and 'run_id' values from params)
    """
    if commonroot in set(['.', './']):
        # Same directory --> convert to absolute path
        outdir = os.path.abspath('.')
    else:
        # Create the path specified
        outdir = os.path.abspath(os.path.join('.', commonroot))
        if not os.path.exists(outdir):
            os.makedirs(outdir)

    outdir = os.path.abspath(os.path.join(outdir, params['experiment_id']))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    outdir = os.path.abspath(os.path.join(outdir, params['run_id']))
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    return outdir
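
# Example (a sketch with hypothetical IDs): creates and returns
# Output/EXP000/RUN000 under the current working directory:
#
#     params = {'experiment_id': 'EXP000', 'run_id': 'RUN000'}
#     outdir = directory_from_parameters(params)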