# Source code for dynamo_consistency.datatypes

# pylint: disable=too-many-locals, too-many-branches, too-many-statements, too-complex
#
# Here there be dragons
#

"""
Module defines the datatypes that are used for storage and comparison.
There is also a powerful create_dirinfo function that takes a filler function
or object and uses the multiprocessing module to recursively list directories
in parallel.

:author: Daniel Abercrombie <dabercro@mit.edu>
"""


import os
import time
import hashlib
import cPickle
import logging
import multiprocessing

from Queue import Empty

from . import config

LOG = logging.getLogger(__name__)
"""
The maximum age, in days, of files and directories to ignore in this check.
This variable should be reset once in a while by daemons that run while an
operator might be adjusting the configuration.
(This note describes ``DirectoryInfo.ignore_age``, not the ``LOG`` object above.)
"""


def create_dirinfo(location, first_dir, filler,
                   object_params=None, callback=None):
    """ Create the directory information

    :param str location: This is the beginning of the path where we will find ``first_dir``.
                         For example, to find the first directory ``mc``, we also have to
                         say where it is. For using CMS LFNs, location would be
                         ``/store`` (where ``mc`` is inside).
                         This is a path.
    :param str first_dir: The name of the first directory that is inside the path of ``location``.
                          This should not be a path,
                          but the name of the directory to list recursively.
    :param filler: This is either a function that lists the directory contents given just a path
                   of ``os.path.join(location, first_dir)``, or it is a constructor that
                   does the same thing with a member function called ``list``.
                   If ``filler`` is an object constructor, the parameters for the object
                   creation must be passed through the parameter ``object_params``.
                   Both listings must return the following tuple:

                     - A bool saying whether the listing was successful or not
                     - A list of tuples of sub-directories and their mod times
                     - A list of tuples of files inside, their sizes, and their mod times

    :type filler: function or constructor
    :param list object_params: This only needs to be set when filler is an object constructor.
                               Each element in the list is a tuple of arguments to pass
                               to the constructor. One worker process is started per element.
    :param function callback: A function that is called every time the master process has
                              finished checking the child processes.
                              This can happen very many times at large sites.
                              The function is called with the main DirectoryInfo as its argument
    :returns: A :py:class:`DirectoryInfo` object containing everything from the directory
              listings of ``os.path.join(location, first_dir)`` with name ``first_dir``.
    :rtype: DirectoryInfo
    """

    LOG.debug('Called create_dirinfo(%s, %s, %s, %s)',
              location, first_dir, filler, object_params)

    # Determine the number of threads
    if object_params is not None:
        n_threads = len(object_params)
    else:
        n_threads = config.config_dict()['NumThreads'] or multiprocessing.cpu_count()

    # First directory is location + first_dir
    starting_dir = os.path.join(location, first_dir)
    LOG.info('Listing directory %s with %i threads', starting_dir, n_threads)

    # Initialize queue and connection lists
    out_queue = multiprocessing.Queue()
    in_queue = multiprocessing.Queue()

    master_conns = []
    slave_conns = []
    send_to_master = []

    for _ in xrange(n_threads):
        con1, con2 = multiprocessing.Pipe()
        master_conns.append(con1)
        slave_conns.append(con2)
        send_to_master.append(multiprocessing.Queue())

    # Put in the first element for the queue
    # They go like, (full path of the next listing to do,
    #                name of sub-node to place the listing (blank for first level),
    #                list of previous directories, list of previous files (for retries),
    #                list of queue numbers that have failed so far)
    in_queue.put((starting_dir, '', [], [], []))

    def run_queue(i_queue):
        """
        Runs over one of the queues.
        When the queue is finished, it checks back with the master for permission to stop.

        :param int i_queue: The number of the thread
        """

        thread_log = logging.getLogger('%s--thread%i' % (__name__, i_queue))

        thread_log.debug('Running queue: %i', i_queue)
        running = True

        # Get the connection for this thread
        conn = slave_conns[i_queue]

        if object_params:
            # Create the object with the parameters here
            params = object_params[i_queue]
            thread_log.debug('Params for this object: %s', params)
            thread_object = filler(*params)
            filler_func = thread_object.list
        else:
            # Otherwise, use the filler function directly passed
            filler_func = filler

        while running:
            try:
                location, name, prev_dirs, prev_files, failed_list = in_queue.get(True, 3)
                if i_queue in failed_list:
                    thread_log.warning('Got previously failed call, putting back and sleeping')
                    in_queue.put((location, name, prev_dirs, prev_files, failed_list))
                    time.sleep(10)
                    # The job is back in the queue for some other worker to pick up.
                    # Processing it here as well would list the directory twice.
                    continue

                thread_log.debug('Getting directory with (%s, %s, %s)',
                                 location, name, failed_list)

                # Call filler
                full_path = os.path.join(location, name)
                thread_log.debug('Full path is %s', full_path)
                okay, directories, files = filler_func(full_path)
                thread_log.debug('Got from filler: Good? %s, %i directories, %i files',
                                 okay, len(directories), len(files))

                # If not okay, add _unlisted_ flag
                if not okay:

                    directories = list(set(directories + prev_dirs))
                    files = list(set(files + prev_files))

                    thread_log.debug('Full dirs, and files: %s, %s', directories, files)

                    thread_log.error('Giving up directory %s', full_path)
                    # _unlisted_ is used as a flag to tell our comparer something went wrong
                    files.append(('_unlisted_', 0, 0))

                # Send results to master queue
                out_queue.put((name, directories, files, len(failed_list)))

                # Add each directory into the input queue
                for directory, _ in directories:
                    joined_name = os.path.join(name, directory)
                    in_queue.put((location, joined_name, [], [], []))

                # Tell master that a job finished,
                # so it can build the final object
                send_to_master[i_queue].put(('O', time.time()))

                thread_log.debug('Finished one job with (%s, %s)', location, name)
            except Empty:
                # Report empty
                thread_log.debug('Worker finished input queue')
                send_to_master[i_queue].put(('A', 0))

                # Block until the main process answers
                message = conn.recv()
                thread_log.debug('Message from master: %s', message)
                # If permission, close
                if message == 'Close':
                    conn.close()
                    running = False
                else:
                    thread_log.debug('Worker going back to check queue')

    # Spawn processes to run on this run_queue function
    processes = []

    for i_queue in range(n_threads):
        process = multiprocessing.Process(target=run_queue, args=(i_queue,))
        process.start()
        processes.append(process)

    # Build the DirectoryInfo
    building = True
    dir_info = DirectoryInfo(first_dir)

    while building:
        try:
            # Get the info from the queue
            name, directories, files, _ = out_queue.get(True, 1)

            # Create the nodes and files
            built = dir_info.get_node(name)
            built.add_files(files)

            # Set correct node mtime for directories
            for directory, mtime in directories:
                built.get_node(directory).mtime = mtime

        except Empty:
            # When empty, check on the status of the workers
            LOG.debug('Empty queue for building.')
            LOG.info('Number of files so far built: %8i  nodes: %8i',
                     dir_info.get_num_files(), dir_info.count_nodes())

            # Process the dir_info with some callback
            if callback:
                callback(dir_info)

            # Ends only if all threads are done at the beginning of this check
            threads_done = 0
            for i_conn, conn in enumerate(master_conns):
                # Each worker reports through the queue with the same index as its pipe
                status_queue = send_to_master[i_conn]

                LOG.debug('Waiting for thread %i', i_conn)
                message, timestamp = status_queue.get(True)

                LOG.debug('Recieved message %s', message)
                # Count the number of threads saying they are finished at the beginning
                if message == 'A':
                    threads_done += 1
                    LOG.info('Threads saying done: %i', threads_done)
                    # Send back to work, just in case not all threads are done
                    conn.send('Work')
                elif message == 'O':
                    # This thread wasn't finished at the beginning, so threads_done
                    # will not reach n_threads if the master reaches this point in the code
                    LOG.debug('Found one job, about to cycle')
                    now = time.time()
                    cycle = True
                    while cycle:
                        # Cycle through timestamps so that we do not have a backlog
                        try:
                            message, timestamp = status_queue.get(True, 1)
                            if message == 'A':
                                LOG.debug('Found end to pipe.')
                                conn.send('Work')
                                cycle = False
                            elif timestamp > now:
                                cycle = False
                        except Empty:
                            cycle = False

                else:
                    LOG.error('Weird message from pipe')

            # Check if all the threads were finished
            if threads_done == n_threads:
                LOG.debug('Done building')
                # Break out of loop of checking
                building = False

    LOG.debug('Closing all connections')
    # Tell connections to close
    for conn in master_conns:
        conn.send('Close')
        conn.close()

    LOG.debug('Waiting for processes')
    # Wait for processes to join
    for proc in processes:
        proc.join()

    return dir_info


class NotEmpty(Exception):
    """
    Raised when something tries to delete a directory from a
    :py:class:`DirectoryInfo` while that directory is not empty
    """


class BadPath(Exception):
    """
    Raised by various methods of a :py:class:`DirectoryInfo`
    when the path they are given doesn't make sense
    """


class DirectoryInfo(object):
    """
    Stores all of the information of the contents of a directory

    :param str name: The name of the directory
    :param list directories: If this is set, the list is used as the
                             sub-directories of this node.
    :param list files: List of tuples containing information about files
                       in the directory. Passed to :py:func:`add_files`
                       whenever ``directories`` or ``files`` is given.
    """

    # Shared by all nodes. Filled from the configuration by the first constructor call.
    ignore_age = None

    __slots__ = ('directories', 'timestamp', 'name', 'hash', 'files', 'mtime', 'can_compare')

    def __init__(self, name='', directories=None, files=None):
        # Only the first node created has to read the configuration
        if DirectoryInfo.ignore_age is None:
            DirectoryInfo.ignore_age = float(config.config_dict()['IgnoreAge'])

        self.name = name
        self.timestamp = time.time()
        self.directories = directories if directories else []

        self.hash = None
        self.mtime = None
        self.can_compare = False

        # Is only None until filled for the first time.
        # If still None for some reason during comparison, errors will be thrown
        self.files = None

        # Being handed any contents at all counts as filling the node
        if not (directories is None and files is None):
            self.add_files(files)


[docs]    def get_files(self, min_age=0, path=''):
        """
        Get the list of files that are older than some age

        :param int min_age: The minimum age, in seconds, of files to list
        :param str path: The path to this file. Used for recursive calls
        :returns: List of full file paths
        :rtype: list
        """

        output = []
        for fil in self.files:
            # Only list old files
            if (self.timestamp - fil['mtime']) > min_age and fil['name'] != '_unlisted_':
                output.append(os.path.join(path, self.name, fil['name']))

        for directory in self.directories:
            output.extend(directory.get_files(min_age, os.path.join(path, self.name)))

        return output


[docs]    def add_files(self, files):
        """
        Set the files for this :py:class:`DirectoryInfo` node

        :param list files: The tuples of file information.
                           Each element consists of file name, size, and mod time.
        :returns: self for chaining calls
        :rtype: :py:class:`DirectoryInfo`
        """

        # This is where we know that the directory has been properly filled
        if self.files is None:
            self.files = []

        # Get the list of new files
        existing_names = [fi['name'] for fi in self.files]
        sorted_files = [fi for fi in sorted(files or []) \
                            if fi[0] not in existing_names]

        for file_info in sorted_files:
            name, size, mtime = file_info[:3]

            if len(file_info) > 3:
                block = file_info[3]
            else:
                block = ''

            self.files.append({
                'name': name,
                'size': long(size),
                'mtime': mtime,
                'block': block,
                'hash': hashlib.sha1(
                    '%s %i' % (name, size)    # We are not comparing mtime for now
                    ).hexdigest(),
                'can_compare': bool(mtime + DirectoryInfo.ignore_age * 24 * 3600 < self.timestamp
                                    and name != '_unlisted_')
                })

        self.files.sort(key=lambda x: x['name'])

        return self

[docs]    def add_file_list(self, file_infos):
        """
        Add a list of tuples containing file_name, file_size to the node.
        This is most useful when you get a list of files from some other source
        and want to easily convert that list into a :py:func:`DirectoryInfo`

        :param list file_infos: The list of files (full path, size in bytes[, timestamp])
        """

        files = []
        directory = ''

        for file_info in file_infos:

            name, size = file_info[:2]
            if len(file_info) > 2:
                timestamp = file_info[2]
            else:
                timestamp = 0

            new_dir = os.path.dirname(name[len(self.name):].lstrip('/'))

            if directory == new_dir:
                # If in the old directory, append to the list of files
                files.append((os.path.basename(name), size, timestamp))
            else:
                # When changing directories, append the files gathered in the last directory
                self.get_node(directory).add_files(files)
                # Get the new directory name
                directory = new_dir
                # Reset the files list
                files = [(os.path.basename(name), size, timestamp)]

        # Add data from the last directory
        self.get_node(directory).add_files(files)

[docs]    def setup_hash(self):
        """
        Set the hashes for this :py:class:`DirectoryInfo`
        """

        if self.files is None:
            return

        hasher = hashlib.sha1()

        # Sort the sub-directories and files
        self.directories.sort(key=lambda x: x.name)
        self.files.sort(key=lambda x: x['name'])

        hasher.update(self.name)

        for directory in self.directories:
            # Recursively make the hash for each subdirectory first
            directory.setup_hash()
            # Can compare if a subdirectory asks for it
            self.can_compare = self.can_compare or directory.can_compare

            # Ignore newer directories or any others that don't want to be compared
            if directory.can_compare:
                hasher.update('%s %s' % (directory.name, directory.hash))

        for file_info in self.files:
            if file_info['can_compare']:
                # Add files that can be compared, and set self to be compared
                self.can_compare = True
                hasher.update('%s %s' % (file_info['name'], file_info['hash']))

        # Add empty directories that are not too new to comparison
        if not (self.directories or self.files) and self.mtime and \
                self.mtime + DirectoryInfo.ignore_age * 24 * 3600 < self.timestamp:
            self.can_compare = True

        # Calculate hash
        self.hash = hasher.hexdigest()


[docs]    def save(self, file_name):
        """
        Save this :py:class:`DirectoryInfo` in a file.

        :param str file_name: is the location to save the file
        """

        with open(file_name, 'w') as outfile:
            cPickle.dump(self, outfile, protocol=cPickle.HIGHEST_PROTOCOL)

[docs]    def display(self, path=''):
        """
        Print out the contents of this :py:class:`DirectoryInfo`

        :param str path: The full path to this :py:class:`DirectoryInfo` instance
        """
        print self.displays(path)

[docs]    def displays(self, path=''):
        """
        Get the string to print out the contents of this :py:class:`DirectoryInfo`.

        :param str path: The full path to this :py:class:`DirectoryInfo` instance
        :returns: The display string
        :rtype: str
        """
        # This is in a separate function for unit test assertion errors, which likes strings

        if not path:
            path = self.name

        output = 'compare: %i mtime: %s my hash: %s path: %s' % \
            (int(self.can_compare), str(self.mtime), self.hash, path)

        for file_info in self.files:
            output += ('\nmtime: %i size: %i my hash:%s name: %s' %
                       (file_info['mtime'], file_info['size'],
                        file_info['hash'], file_info['name']))

        for directory in self.directories:
            # Recursively get displays for sub-directories
            output += '\n' + directory.displays(os.path.join(path, directory.name))

        return output

[docs]    def get_node(self, path, make_new=True):
        """ Get the node that corresponds to the path given.
        If the node does not exist yet, and ``make_new`` is True, the node is created.

        :param str path: Path to the desired node from current node.
                         If the path does not exist yet, empty nodes will be created.
        :param str make_new: Bool to create new node if none exists at path or not
        :returns: A node with the proper path, unless make_new is False and the node doesn't exist
        :rtype: DirectoryInfo or None
        """

        # If any path left
        if path:
            split_path = path.split('/')
            return_name = '/'.join(split_path[1:])

            # Search for if directory exists
            for directory in self.directories:
                if split_path[0] == directory.name:
                    return directory.get_node(return_name, make_new)

            # If not, make a new directory, or None
            if make_new:
                # If we're making a new directory, then this should have non-None self.files
                if self.files is None:
                    self.files = []

                new_dir = DirectoryInfo(split_path[0])
                self.directories.append(new_dir)
                return new_dir.get_node(return_name, make_new)

            return None

        # If no path, just return self
        return self

[docs]    def get_directory_size(self):
        """ Report the total size used by this directory and its subdirectories.

        :returns: Size of files in directory, in bytes
        :rtype: int
        """

        return sum([di.get_directory_size() for di in self.directories],
                   sum([fi['size'] for fi in self.files]))

[docs]    def get_unlisted(self, path=''):
        """
        :param str path: Path to prepend to the name, used in recursive calls
        :returns: List of directories that were unlisted
        :rtype: list
        """
        here = os.path.join(path, self.name)
        output = [name for d in self.directories for name in d.get_unlisted(here)]
        if '_unlisted_' in [f['name'] for f in self.files]:
            output.append(here)

        return output

[docs]    def get_num_files(self, unlisted=False, place_new=False):
        """ Report the total number of files stored.

        :param bool unlisted: If true, return number of unlisted directories,
                              Otherwise return only successfully listed files
        :param bool place_new: If true, pretend there's one more file inside
                               any new directory or if files is None.
                               This prevents listing of empty directories to include
                               directories that should not actually be deleted.
        :returns: The number of files in the directory tree structure
        :rtype: int
        """

        if self.files is None:
            return int(place_new)

        num_files = len([fi for fi in self.files \
                             if (fi['name'] == '_unlisted_') == unlisted])
        for directory in self.directories:
            num_files += directory.get_num_files(unlisted, place_new)

        if place_new and (not self.can_compare or self.mtime is None):
            num_files += 1

        return num_files

    def _grab_first(self, levels=100):
        """ Used for debugging.
        Grabs the subdirectories by the first in the list.

        :param int levels: is the number of levels of directories to bypass
        :returns: The proper :py:class:`DirectoryInfo` level
        :rtype: DirectoryInfo
        """

        output = self

        for _ in xrange(levels):
            if output.directories:
                output = output.directories[0]
            else:
                break

        return output

[docs]    def compare(self, other, path='', check=None):
        """ Does one way comparison with a different tree

        :param DirectoryInfo other: The directory tree to compare this one to
        :param str path: Is the path to get to this location so far
        :param check: An optional function that double checks a file name.
                      If the checking function returns ``True`` for a file name,
                      the file will not be included in the output.
        :type check: function
        :returns: Tuple of list of files and directories that are present and not in the other tree
                  and the size of the files that corresponds to
        :rtype: list, list, long
        """

        extra_files = []
        extra_dirs = []
        extra_size = long(0)

        if '_unlisted_' in [fi['name'] for fi in self.files]:
            return extra_files, extra_dirs, extra_size

        here = os.path.join(path, self.name)

        if other:
            # If there is a match in the hash, then the nodes are effectively identical
            # Otherwise, do these recursive comparisons

            logging.debug('Hashes: %s -- %s, can compare: %i -- %i',
                          self.hash, other.hash, self.can_compare, other.can_compare)

            if self.hash != other.hash and other.can_compare:
                for directory in self.directories:
                    # Ignore not comparable directories (usually new ones)
                    if not directory.can_compare:
                        continue

                    # Recursive check of extra files and directories here
                    new_other = other.get_node(directory.name, False)
                    more_files, more_dirs, more_size = directory.compare(new_other, here, check)
                    extra_size += more_size
                    extra_files.extend(more_files)
                    if new_other:
                        extra_dirs.extend(more_dirs)
                    elif '_unlisted_' not in [fi['name'] for fi in other.files]:
                        # If the subdirectory does not exist, and '_unlisted_' not thrown
                        # mark that whole directory as being extra.
                        # At the moment this is redundant with all the files,
                        # but gives a good place to prune file system directories
                        # after files have been deleted
                        extra_dirs.append(os.path.join(here, directory.name))

                for file_info in self.files:
                    if not file_info['can_compare']:
                        continue

                    # See if each file exists and has the correct hash
                    # Say all files are fine in a directory that is even partially '_unlisted_'
                    found = False
                    for to_match in other.files:
                        if file_info['hash'] == to_match['hash'] or \
                                to_match['name'] == '_unlisted_':
                            found = True
                            break

                    full_name = os.path.join(path, self.name, file_info['name'])

                    if not found and (check is None or not check(full_name)):
                        extra_size += file_info['size']
                        extra_files.append(full_name)
        else:
            # If no other node to compare, all files are extra (not in the other tree)

            LOG.debug('Nothing to compare, files: %s', self.files)
            LOG.debug('Nothing to compare, directories: %s',
                      [(di.name, di.can_compare) for di in self.directories])

            for file_info in [fi for fi in self.files if fi['can_compare']]:
                full_name = os.path.join(path, self.name, file_info['name'])

                if check is None or not check(full_name):
                    extra_files.append(os.path.join(path, self.name, file_info['name']))
                    extra_size += file_info['size']

            # All directories are extra too
            for directory in [di for di in self.directories if di.can_compare]:
                more_files, _, more_size = directory.compare(None, here, check)
                extra_size += more_size
                extra_files.extend(more_files)

        return extra_files, extra_dirs, extra_size

[docs]    def count_nodes(self, empty=False):
        """
        :param bool empty: If True, only return the number of empty nodes
        :returns: The total number of nodes in this Directory Info. This corresponds
                  to approximately the number of listing requests required to build the data.
        :rtype: int
        """

        count_this = 0 if self.files is None or (empty and self.get_num_files() != 0) else 1
        return sum([directory.count_nodes(empty) for directory in self.directories], count_this)

[docs]    def empty_nodes_set(self):
        """
        This function recursively builds the entire list of empty  directories that can be deleted

        :returns: The set of empty directories to delete
        :rtype: set
        """

        output = set()

        if not self.can_compare or \
                (self.mtime is not None and
                 self.mtime + DirectoryInfo.ignore_age * 24 * 3600 > self.timestamp):
            return output

        # Count direct subdirectories that are removed
        count_sub = 0

        for directory in self.directories:
            # Add all the elements from the other set
            for sub in directory.empty_nodes_set():
                if '/' not in sub:
                    count_sub += 1
                output.add(os.path.join(self.name, sub))

        if not (self.get_num_files(place_new=True) or self.mtime is None) and \
                count_sub == len(self.directories):
            output.add(self.name)

        return output

[docs]    def empty_nodes_list(self):
        """
        This function should be used to get the nodes to delete in
        the proper order for non-recursive deletion

        :returns: The list of empty directories to delete in the order to delete
        :rtype: list
        """
        # Don't want to recursively sort, so we send this to a helpful set function
        return sorted(self.empty_nodes_set(), reverse=True)

[docs]    def listdir(self, *args, **kwargs):
        """
        Get the list of directory names within a :py:class:`DirectoryInfo`.
        Adding an argument will display the contents of the next directory.
        For example, if ``dir.listdir()`` returns::

            0: data
            1: mc

        ``dir.listdir(1)`` then lists the contents of ``mc`` and ``dir.listdir(1, 0)``
        lists the contents of the first subdirectory in ``mc``.

        :param args: Is a list of indices to list the subdirectories
        :param kwargs: Supports 'printing' which is set to a bool. Defaults as True.
        :returns: The :py:class:`DirectoryInfo` that is being listed
        :rtype: DirectoryInfo
        """

        printing = kwargs.get('printing', True)

        # Print the contents of a directory picked next, and return that DirectoryInfo
        if args:
            return self.directories[args[0]].listdir(*args[1:], printing=printing)

        # If we got to the last directory of the args, print the files contained
        elif printing:
            print '\nDirectories:'

            # Get the formatting width for printing the directory names
            if self.directories:
                width = max([len(di.name) for di in self.directories]) + 2
            else:
                width = 0

            # Print information for each directory
            for index, directory in enumerate(self.directories):
                print '%3i: %-{0}s Hash: %s  Num Files: %7i  Dirs Unlisted: %7i'.format(width) % \
                    (index, directory.name, directory.hash,
                     directory.get_num_files(), directory.get_num_files(True))

            if self.files:
                print 'Files:'

            for file_info in self.files:
                print file_info

        return self

[docs]    def get_file(self, file_name):
        """
        Get the file dictionary based off the name.

        :param str file_name: The LFN of the file
        :returns: Dictionary of file information
        :rtype: dict
        :raises BadPath: if the file_name does not start with ``self.name``
        """

        if not file_name.startswith(self.name):
            raise BadPath('self.name is %s, file_name is %s' % (self.name, file_name))

        exploded_name = file_name[len(self.name) + 1:].split('/')
        desired_name = exploded_name[-1]
        node = self.get_node('/'.join(exploded_name[:-1]))

        for file_info in node.files:

            if file_info['name'] == desired_name:
                return file_info

        return None

[docs]    def remove_node(self, path_name):
        """
        Remove an empty node from the DirectoryInfo

        :param str path_name: The path to the node, including the ``self.name`` at the beginning
        :returns: self for chaining
        :rtype: :py:class:`DirectoryInfo`
        :raises NotEmpty: if the directory is not empty or ``self.files`` is None
        :raises BadPath: if the path_name does not start with the ``self.name``
        """

        LOG.debug('Would like to remove %s', path_name)

        if not path_name.startswith(self.name):
            raise BadPath('self.name is %s, path_name is %s' % (self.name, path_name))

        exploded_name = path_name[len(self.name) + 1:].split('/')
        parent = self.get_node('/'.join(exploded_name[:-1]))

        # If the directory doesn't exist, we'll get some TypeError things
        node = parent.get_node(exploded_name[-1], make_new=False)

        if node.files:
            raise NotEmpty('This directory has files %s' % node.files)
        if node.directories:
            raise NotEmpty('This directory contains subdirectories %s' %
                           [d.name for d in node.directories])
        if node.files is None:
            raise NotEmpty('The files list is still None')
        if node.mtime + DirectoryInfo.ignore_age * 24 * 3600 > node.timestamp:
            raise NotEmpty('This directory is not old enough?')

        parent.directories.remove(node)
        return self


def get_info(file_name):
    """
    Get the :py:class:`DirectoryInfo` from a file.

    .. warning::
       The contents of the file are unpickled, and unpickling can run arbitrary code.
       Only use this on files that come from a trusted source.

    :param str file_name: is the location of the saved information
    :returns: Saved info
    :rtype: DirectoryInfo
    """

    # The with statement closes the file, even if the load raises an exception.
    # A pickle is a byte stream, so the file is read in binary mode
    # (under Python 2 on POSIX systems, this reads the same bytes as 'r').
    with open(file_name, 'rb') as infile:
        return cPickle.load(infile)


def compare(inventory, listing, output_base=None, orphan_check=None, missing_check=None):
    """
    Find the differences between two trees, and optionally write them into ASCII files.

    The missing files are what ``inventory.compare(listing)`` reports, and
    the orphan files are what ``listing.compare(inventory)`` reports.
    If ``output_base`` is given, the missing files are written one per line to
    ``<output_base>_missing.txt`` and the orphan files to ``<output_base>_orphan.txt``.

    :param DirectoryInfo inventory: The tree of files that should be at a site
    :param DirectoryInfo listing: The tree of files that are listed remotely
    :param str output_base: The beginning of the names of the report files.
                            No files are written if this is left empty.
    :param function orphan_check: A function that double checks each expected orphan.
                                  The function takes as an input, an LFN.
                                  If the function returns true, the LFN will not be
                                  listed as an orphan.
    :param function missing_check: A function that double checks each expected missing file.
                                   The function takes as an input, an LFN.
                                   If the function returns true, the LFN will not be
                                   listed as missing.
    :returns: The list of missing files, the size reported for the missing files,
              the list of orphan files, and the size reported for the orphan files
    :rtype: tuple
    """

    LOG.info('About to perform comparison. Results will be in files starting with %s',
             output_base)

    # Files that are in the inventory, but not in the listing
    LOG.debug('Double checking missing with %s', missing_check)
    missing, _, m_size = inventory.compare(listing, check=missing_check)
    LOG.info('There are %i missing files', len(missing))
    LOG.info('Size: %i', m_size)

    # Files that are in the listing, but not in the inventory
    LOG.debug('Double checking orphans with %s', orphan_check)
    orphan, _, o_size = listing.compare(inventory, check=orphan_check)
    LOG.info('There are %i orphan files', len(orphan))
    LOG.info('Size: %i', o_size)

    results = (missing, m_size, orphan, o_size)

    # Without a base name, there is nowhere to write the reports
    if not output_base:
        return results

    # Missing report is written first, followed by the orphan report
    reports = (('%s_missing.txt', missing), ('%s_orphan.txt', orphan))
    for name_template, lfns in reports:
        with open(name_template % output_base, 'w') as report_file:
            for lfn in lfns:
                report_file.write(lfn + '\n')

    return results