# Source code for dynamo_consistency.checkphedex

# pylint: disable=import-error

"""
A module that provides functions to check the comparison results against
the list of files and deletions in PhEDEx.

:author: Daniel Abercrombie <dabercro@mit.edu>
"""

import time
import logging

from cmstoolbox.webtools import get_json
from . import config

LOG = logging.getLogger(__name__)

def set_of_deletions(site):
    """
    Get the set of datasets with approved deletion requests at a given site that were created
    within the number of days matching the **IgnoreAge** configuration parameter.
    This request is done via the PhEDEx ``deleterequests`` API.

    Both block-level and dataset-level entries of each request are considered.
    For block-level entries, only the part of the block name before the ``#``
    is kept, so that the result contains dataset names only.

    :param str site: The site that we want the list of deletion requests for.
    :returns: Datasets that are in deletion requests. The set is empty if
              the PhEDEx request gave back an empty (false) result.
    :rtype: set
    """

    # Cutoff timestamp: now minus IgnoreAge days (IgnoreAge defaults to 0 when not configured)
    created_since = int(
        time.time() - float(config.config_dict().get('IgnoreAge', 0)) * 24 * 3600)

    # Get deletion requests in PhEDEx
    deletion_request = get_json(
        'cmsweb.cern.ch', '/phedex/datasvc/json/prod/deleterequests',
        {'node': site, 'approval': 'approved', 'create_since': created_since},
        use_https=True)

    datasets_for_deletion = set()

    # Nothing came back, so there is nothing to report
    if not deletion_request:
        return datasets_for_deletion

    # Walk each request once and add names straight into the set, rather than
    # concatenating all the per-request lists first (sum(lists, []) is quadratic)
    for request in deletion_request['phedex']['request']:
        dbs = request['data']['dbs']

        # Block-level deletions: strip the '#...' suffix to get the dataset name
        datasets_for_deletion.update(
            block['name'].split('#')[0] for block in dbs['block'])

        # Dataset-level deletions: the name is used as is
        datasets_for_deletion.update(
            dataset['name'] for dataset in dbs['dataset'])

    return datasets_for_deletion


def check_for_datasets(site, orphan_list_file):
    """
    Checks PhEDEx exhaustively to see if a dataset should exist at a site,
    according to PhEDEx, but has files marked as orphans according to our check.
    This is done via the PhEDEx ``filereplicas`` API.
    The number of filereplicas for each dataset is printed to the terminal.
    Datasets that contain any filereplicas are returned by this function.

    The dataset name is built from the ``/``-separated components of each LFN as
    ``/<component 4>/<component 3>-<component 6>/<component 5>``
    (counting from 0, where component 0 is the empty string before the leading ``/``).
    Each distinct dataset is only queried once.
    Blank lines are ignored, and lines with fewer than seven components
    are skipped with a warning, since no dataset name can be built from them.

    :param str site: The name of the site to check
    :param str orphan_list_file: Path to a text file containing the LFNs
                                 that are listed as orphans at the site, one per line
    :returns: The list of number of files and datasets for each dataset that is
              supposed to have at least 1 file at the site.
    :rtype: list of tuples
    """

    # Datasets that have already been looked up in PhEDEx
    checked = set()

    output = []

    with open(orphan_list_file) as orphans:
        for line in orphans:
            # Strip the newline so that it cannot end up inside the dataset name
            lfn = line.strip()
            if not lfn:
                continue

            split_name = lfn.split('/')
            if len(split_name) < 7:
                LOG.warning('Skipping LFN with too few components to name a dataset: %s', lfn)
                continue

            dataset = '/%s/%s-%s/%s' % (split_name[4], split_name[3], split_name[6], split_name[5])

            if dataset in checked:
                continue
            checked.add(dataset)

            # NOTE(review): unlike set_of_deletions, an empty response is not guarded
            # against here -- confirm whether get_json can return a false value
            phedex_response = get_json(
                'cmsweb.cern.ch', '/phedex/datasvc/json/prod/filereplicas',
                {'node': site, 'dataset': dataset},
                use_https=True)

            num_files = sum(len(block['file']) for block in phedex_response['phedex']['block'])

            # Single pre-formatted string: prints the same text under Python 2 and 3
            print('%s %s' % (num_files, dataset))

            if num_files:
                output.append((num_files, dataset))

    return output