Source code for dynamo_consistency.checkphedex
# pylint: disable=import-error
"""
A module that provides functions to check the comparison results against
the list of files and deletions in PhEDEx.
:author: Daniel Abercrombie <dabercro@mit.edu>
"""
import time
import logging
from common.interface.mysql import MySQL
from CMSToolBox.webtools import get_json
from . import config
from . import datatypes
from . import cache_tree
LOG = logging.getLogger(__name__)
def set_of_deletions(site):
    """
    Get the set of datasets with approved deletion requests at a given site that were created
    within the number of days matching the **IgnoreAge** configuration parameter.
    This request is done via the PhEDEx ``deleterequests`` API.

    Both the block entries and the dataset entries of every request are collected.
    Block names are reduced to the part before the ``#`` character.

    :param str site: The site that we want the list of deletion requests for.
    :returns: Datasets that are in deletion requests.
              The set is empty if the PhEDEx call did not give a usable response.
    :rtype: set
    """

    # IgnoreAge is given in days, so convert it to seconds before
    # subtracting it from the current time
    created_since = int(
        time.time() - float(config.config_dict().get('IgnoreAge', 0)) * 24 * 3600)

    # Get deletion requests in PhEDEx
    deletion_request = get_json(
        'cmsweb.cern.ch', '/phedex/datasvc/json/prod/deleterequests',
        {'node': site, 'approval': 'approved', 'create_since': created_since},
        use_https=True)

    datasets_for_deletion = set()

    if not deletion_request:
        return datasets_for_deletion

    # Every request keeps its blocks and datasets in separate lists under data/dbs.
    # Walk the requests once and fill the set directly. Flattening the per-request
    # lists with sum(..., []) would copy the growing list again for each request.
    for request in deletion_request['phedex']['request']:
        dbs = request['data']['dbs']

        for block in dbs['block']:
            datasets_for_deletion.add(block['name'].split('#')[0])

        for dataset in dbs['dataset']:
            datasets_for_deletion.add(dataset['name'])

    return datasets_for_deletion
@cache_tree('InventoryAge', 'phedexlisting')
def get_phedex_tree(site):
    """
    Build the tree of files that PhEDEx lists for a site.
    Uses the InventoryAge configuration to determine when to refresh cache.

    The names of the datasets with replicas at the site are read from the
    ``dynamo`` MySQL database. PhEDEx is first asked about groups of datasets
    through wildcard queries without retries. The datasets of every group
    that fails are then requested one at a time, with retries.

    :param str site: The site to get information from PhEDEx for.
    :returns: A tree containing file replicas that are supposed to be at the site
    :rtype: dynamo_consistency.datatypes.DirectoryInfo
    """

    tree = datatypes.DirectoryInfo('/store')
    valid_list = config.config_dict().get('DirectoryList', [])

    sql = MySQL(config_file='/etc/my.cnf', db='dynamo', config_group='mysql-dynamo')
    query = ('SELECT datasets.name '
             'FROM sites INNER JOIN dataset_replicas INNER JOIN datasets '
             'WHERE dataset_replicas.dataset_id=datasets.id AND '
             'dataset_replicas.site_id=sites.id and sites.name=%s')
    datasets = sql.query(query, site)

    def add_files(dataset, retries):
        """
        Request the file replicas of a dataset from PhEDEx and add them to the tree.

        :param str dataset: Dataset to get from PhEDEx
        :param int retries: The number of times to retry PhEDEx call
        :returns: Whether or not the addition was successful
        :rtype: bool
        """

        LOG.info('Getting PhEDEx contents for %s', dataset)

        response = get_json(
            'cmsweb.cern.ch', '/phedex/datasvc/json/prod/filereplicas',
            {'node': site, 'dataset': dataset},
            retries=retries,
            use_https=True)

        if not response:
            LOG.warning('Bad response from PhEDEx for %s', dataset)
            return False

        n_files = 0

        for block in response['phedex']['block']:
            LOG.debug('%s', block)

            in_block = []
            for replica in block['file']:
                # Skip files whose third '/'-separated piece of the name
                # is not listed in the DirectoryList configuration
                if replica['name'].split('/')[2] not in valid_list:
                    continue

                # Fall back to the current time when PhEDEx gives no creation time
                in_block.append(
                    (replica['name'], replica['bytes'],
                     int(replica['replica'][0]['time_create'] or time.time()),
                     block['name']))

            n_files += len(in_block)
            tree.add_file_list(in_block)

        LOG.info('%i files', n_files)
        return True

    def prefix(name):
        """
        :param str name: A dataset name
        :returns: The first three characters of the second '/'-separated piece of the name
        :rtype: str
        """
        return name.split('/')[1][:3]

    # First pass: one wildcard request, without retries, for each distinct prefix
    separate = []
    for primary in set(prefix(name) for name in datasets):
        if add_files('/%s*/*/*' % primary, 0):
            continue
        separate.append(primary)

    # Second pass: retry, one by one, the datasets whose wildcard request failed
    for name in datasets:
        if prefix(name) not in separate:
            continue

        if not add_files(name, 5):
            LOG.critical('Cannot get %s from PhEDEx. Do not trust results...', name)

    return tree
def check_for_datasets(site, orphan_list_file):
    """
    Checks PhEDEx exhaustively to see if a dataset should exist at a site,
    according to PhEDEx, but has files marked as orphans according to our check.
    This is done via the PhEDEx ``filereplicas`` API.
    The number of filereplicas for each dataset is printed to the terminal.
    Datasets that contain any filereplicas are returned by this function.

    Lines of the orphan file that have too few ``/``-separated pieces to build
    a dataset name from (blank lines, for example) are skipped.
    A dataset that PhEDEx gives a bad response for is logged with a warning and
    is left out of the output, so a warning means the output may be incomplete.

    :param str site: The name of the site to check
    :param str orphan_list_file: Path of a file that lists, one per line,
                                 the LFNs that are listed as orphans at the site
    :returns: The list of number of files and datasets for each dataset that is
              supposed to have at least 1 file at the site.
    :rtype: list of tuples
    """

    datasets = set()
    output = []

    with open(orphan_list_file) as orphans:
        for line in orphans:
            # Strip the line ending so that it never ends up inside a dataset name
            split_name = line.strip().split('/')

            # The dataset name is built from pieces 3 through 6,
            # so shorter lines cannot be turned into a dataset name
            if len(split_name) < 7:
                continue

            dataset = '/%s/%s-%s/%s' % (split_name[4], split_name[3], split_name[6], split_name[5])

            if dataset in datasets:
                continue

            # Remember the dataset first, so that PhEDEx is asked about it only once,
            # even when the response turns out to be bad
            datasets.add(dataset)

            phedex_response = get_json(
                'cmsweb.cern.ch', '/phedex/datasvc/json/prod/filereplicas',
                {'node': site, 'dataset': dataset},
                use_https=True)

            if not phedex_response:
                LOG.warning('Bad response from PhEDEx for %s', dataset)
                continue

            num_files = sum(len(block['file']) for block in phedex_response['phedex']['block'])

            # A single formatted string gives the same output under Python 2 and 3
            print('%s %s' % (num_files, dataset))

            if num_files:
                output.append((num_files, dataset))

    return output