Source code for classes

"""
classes
=======

The classes module provides one class and three functions.
The class: DataNode

The core functionality of :mod:`cmipdata` is to organize a large number of
model output files into a logical structure so that further processing
can be done. Data is organized into a tree-like structure using the class
DataNode as the nodes of a tree. The entire tree structure will be referred
to as an ensemble. At each level of the tree the level is specified by the
genre attribute.

Various methods exist to interact with the ensemble, and its
constituent elements.

The :func:`mkensemble` function is used to create ensembles (trees of
:class:`DataNode` objects), while :func:`match_models` finds models common to
two ensembles and :func:`match_realizations` matches realizations between two
ensembles. Once
created, an ensemble can be used to harness the power of
the :mod:`preprocessing_tools` to apply systematic operations to all files.

.. moduleauthor:: Neil Swart <neil.swart@ec.gc.ca>
"""

import os
import glob
import copy

class DataNode(object):
    """ Defines a cmipdata DataNode.

    A DataNode is one node of the tree that makes up an ensemble. As built
    by :func:`mkensemble` the genres nest, from the root down, as::

        ensemble > model > experiment > realization > variable > ncfile

    Attributes
    ----------
    genre : string
            The level of the tree this DataNode sits at
    name : string
           The name of the particular genre
    children : list
               List of DataNodes of the genre beneath the current DataNode
    parent : DataNode
             for genre 'ensemble' the parent is None
    start_date : string
                 only set when passed in; mkensemble sets it for 'ncfile'
    end_date : string
               only set when passed in; mkensemble sets it for 'ncfile'
    realm : string
            only set when passed in; mkensemble sets it for 'variable',
            where it contains the realm of the variable
    """

    def __init__(self, genre, name, parent=None, **kwargs):
        """
        Parameters
        ----------
        genre : string
        name : string
        parent : DataNode, optional
        **kwargs :
            every key is stored as an attribute of the same name.
            Keys used by mkensemble:
                'start_date'
                'end_date'
                'realm'
        """
        self.genre = genre
        self.name = name
        self.children = []
        self.parent = parent
        for k, v in kwargs.items():
            setattr(self, k, v)

    def add(self, child):
        """ Add DataNode to children

        Parameters
        ----------
        child : DataNode
        """
        self.children.append(child)

    def delete(self, child):
        """ Delete DataNode from children

        Parameters
        ----------
        child : DataNode

        Raises
        ------
        ValueError
            if child is not in children
        """
        self.children.remove(child)

    def getNameWithoutDates(self):
        """ Return string name with the dates removed if present

        The suffix '_<start_date>-<end_date>.nc' is removed from the name.
        A DataNode that has no start_date or end_date attribute returns its
        name unchanged.

        Returns
        -------
        string
        """
        start = getattr(self, 'start_date', None)
        end = getattr(self, 'end_date', None)
        if start is None or end is None:
            # Only nodes created with date keywords carry these attributes.
            return self.name
        return self.name.replace('_' + start + '-' + end + '.nc', "")

    def getChild(self, input_name):
        """ Returns DataNode given the name of the DataNode if it is
            in children

        Parameters
        ----------
        input_name : string

        Returns
        -------
        DataNode : Returns None if the DataNode is not in children
        """
        for child in self.children:
            if child.name == input_name:
                return child
        return None

    def mer(self):
        """ Returns a generator containing lists of length 3 with
            the DataNode genre:'realization'
            the DataNode genre:'experiment' (the parent of the realization)
            string model-experiment-realization

        Returns
        -------
        generator
        """
        for obj in self.objects('realization'):
            experiment = obj.parent
            yield [obj,
                   experiment,
                   experiment.parent.name + '-' + experiment.name + '-' + obj.name]

    def lister(self, genre, unique=True):
        """ Returns a list of names of a particular genre

        Parameters
        ----------
        genre : string
                the genre of returned list
        unique : boolean
                 if True removes duplicates from the list; the first
                 occurrence of each name is kept, in tree order

        Return
        ------
        list of strings
        """
        names = [node.name for node in self.objects(genre)]
        if not unique:
            return names
        # De-duplicate by hand rather than with list(set(...)) so that the
        # result has a reproducible order.
        seen = set()
        ordered = []
        for name in names:
            if name not in seen:
                seen.add(name)
                ordered.append(name)
        return ordered

    def objects(self, genre):
        """ Returns a list of the DataNodes of a particular genre found at
            or below this DataNode

        The search does not continue below a DataNode that matches.

        Parameters
        ----------
        genre : string
                the genre of the returned DataNodes

        Return
        ------
        list of DataNodes
        """
        def walk(item):
            if item.genre == genre:
                yield item
            else:
                for child in item.children:
                    for value in walk(child):
                        yield value

        return list(walk(self))

    def parentobject(self, genre):
        """ Returns the closest DataNode of a particular genre, starting
            with this DataNode and moving up through its parents

        Parameters
        ----------
        genre : string
                the genre of returned DataNode

        Return
        ------
        DataNode : Returns None if no DataNode of that genre is found
        """
        node = self
        while node is not None:
            if node.genre == genre:
                return node
            node = node.parent
        return None

    def _checkfile(self):
        """ Removes files from ensemble if they are not in the directory
        """
        # objects() returns a list, so deleting from the parents while
        # looping here is safe.
        for f in self.objects('ncfile'):
            if not os.path.isfile(f.name):
                f.parent.delete(f)

    def squeeze(self):
        """ Remove any empty elements from the ensemble

        First drops 'ncfile' DataNodes whose file no longer exists, then
        removes every DataNode below this one that has no children and is
        not an 'ncfile'. If this DataNode ends up empty it is removed from
        its own parent as well (and so on upwards); a DataNode of genre
        'ensemble' is never removed.
        """
        self._checkfile()

        def prune(node):
            # Walk a snapshot of the children: the real list shrinks as
            # empty nodes are removed, which would make a direct loop skip
            # siblings. Children are pruned first so that a node emptied
            # by the pruning is itself removed.
            for child in list(node.children):
                prune(child)
                if not child.children and child.genre != 'ncfile':
                    node.delete(child)
                    print('Removing ' + child.name + ' from ' + node.name)

        prune(self)

        node = self
        while (not node.children
               and node.genre not in ('ncfile', 'ensemble')
               and node.parent is not None
               and node in node.parent.children):
            parent = node.parent
            parent.delete(node)
            print('Removing ' + node.name + ' from ' + parent.name)
            node = parent

    def getDictionary(self):
        """ Returns a dictionary which has the genres and their names
            for this DataNode and all its ancestors, excluding the
            DataNode of genre 'ensemble'
        """
        values = {}
        node = self
        # Also stop at a missing parent so a DataNode that is not attached
        # to an ensemble does not fail.
        while node is not None and node.genre != 'ensemble':
            values[node.genre] = node.name
            node = node.parent
        return values

    def sinfo(self, listOfGenres=('variable', 'model', 'experiment',
                                  'realization', 'ncfile')):
        """ Prints the number of models, experiments, realizations,
            variables and files in the DataNode

        Parameters
        ----------
        listOfGenres : sequence of strings
                       the genres to report on. Realizations are counted
                       as DataNodes; every other genre is counted by
                       unique name.
        """
        print("This ensemble contains:")
        for key in listOfGenres:
            if key == 'realization':
                # The same realization name occurs under many models, so
                # count the nodes rather than the unique names.
                count = len(self.objects(key))
            else:
                count = len(self.lister(key))
            print(str(count) + " " + key + "s")

    def _detail_lines(self):
        """ Returns a generator of the lines reported by fulldetails and
            fulldetails_tofile: one line per DataNode for the five levels
            below this one, indented with one tab per level. Names on the
            first level are followed by ':'.
        """
        def walk(node, depth):
            for child in node.children:
                yield '\t' * depth + child.name + (':' if depth == 0 else '')
                if depth < 4:
                    for line in walk(child, depth + 1):
                        yield line

        return walk(self, 0)

    def fulldetails(self):
        """ Prints the names of the DataNodes in the five levels below
            this DataNode (models, experiments, realizations, variables
            and files when called on an ensemble)
        """
        for line in self._detail_lines():
            print(line)

    def fulldetails_tofile(self, fi):
        """ Writes the same listing as fulldetails to a file

        Parameters
        ----------
        fi : string
             name of the file to write; an existing file is overwritten
        """
        with open(fi, 'w') as f:
            for line in self._detail_lines():
                f.write(line + '\n')
def mkensemble(filepattern, experiment='*', prefix='', kwargs=''):
    """Creates and returns a cmipdata ensemble from a list of
    filenames matching filepattern.

    Optionally specifying prefix will remove prefix from each filename
    before the parsing is done. This is useful, for example, to remove
    pre-pended paths used in filepattern (see example 2).

    Once the list of matching filenames is derived, the model, experiment,
    realization, variable, start_date and end_date fields are extracted by
    parsing the filenames against a specified file naming convention. By
    default this is the CMIP5 convention, which is::

        variable_realm_model_experiment_realization_startdate-enddate.nc

    If the default CMIP5 naming convention is not used by your files,
    an arbitrary naming convention for the parsing may be specified by
    the dictionary kwargs (see example 3).

    Parameters
    ----------
    filepattern : string
                A string that by default is matched against all files in
                the current directory. But filepattern could include a
                full path to reference files not in the current directory,
                and can also include wildcards.

    experiment : string
                Not used by this function; kept so that existing calls
                keep working.

    prefix : string
                A pattern occurring in filepattern before the start of the
                official filename, as defined by the file naming
                convention. For instance, a path preceding the filename.
                It is removed from the filename for parsing only.

    kwargs : dictionary
                The file naming convention: 'separator' is the string
                that separates the fields of a filename, and 'variable',
                'realm', 'model', 'experiment', 'realization' and 'dates'
                give the position of each field. When left empty the
                CMIP5 convention shown above is used.

    Returns
    -------
    DataNode
                the root (genre 'ensemble') of the new ensemble. The
                'ncfile' DataNodes are named with the path of the file
                exactly as it was matched by filepattern.

    Raises
    ------
    IndexError
                if a matched filename has fewer fields than the naming
                convention requires, or its dates field holds no '-'.

    EXAMPLES
    --------

    1. Create ensemble of all sea-level pressure files from the historical
       experiment in the current directory::

        ens = mkensemble('psl*historical*.nc')

    2. Create ensemble of all sea-level pressure files from all experiments
       in a non-local directory::

        ens = mkensemble('/home/ncs/ra40/cmip5/sam/c5_slp/psl*'
                      , prefix='/home/ncs/ra40/cmip5/sam/c5_slp/')

    3. Create ensemble defining a custom file naming convention::

        kwargs = {'separator':'_', 'variable':0, 'realm':1, 'model':2,
                  'experiment':3, 'realization':4, 'dates':5}
        ens = mkensemble('psl*.nc', kwargs=kwargs)

    """
    def get_or_add(parent, genre, name, **attrs):
        # Return the child of parent called name, creating it if necessary.
        node = parent.getChild(name)
        if node is None:
            node = DataNode(genre, name, parent=parent, **attrs)
            parent.add(node)
        return node

    # find all files matching filepattern
    filenames = sorted(glob.glob(filepattern))

    if not kwargs:
        kwargs = {'separator': '_', 'variable': 0, 'realm': 1, 'model': 2,
                  'experiment': 3, 'realization': 4, 'dates': 5}
    separator = kwargs['separator']

    # Initialize the ensemble object
    ens = DataNode('ensemble', 'ensemble')

    # Loop over all files and place each one in the tree
    for path in filenames:
        # The prefix is dropped for parsing only. The node keeps the path
        # as matched on disk, so it stays valid even when the prefix is
        # missing from it or occurs in it more than once.
        name = path.replace(prefix, '') if prefix else path
        fields = name.split(separator)

        variablename = fields[kwargs['variable']]
        realm = fields[kwargs['realm']]
        modelname = fields[kwargs['model']]
        experimentname = fields[kwargs['experiment']]
        realization = fields[kwargs['realization']]
        dates = fields[kwargs['dates']].split('-')
        start_date = dates[0]
        # the end date is followed by the file extension
        end_date = dates[1].split('.')[0]

        m = get_or_add(ens, 'model', modelname)
        e = get_or_add(m, 'experiment', experimentname)
        r = get_or_add(e, 'realization', realization)
        v = get_or_add(r, 'variable', variablename, realm=realm)
        get_or_add(v, 'ncfile', path,
                   start_date=start_date, end_date=end_date)

    ens.sinfo()
    print('\n For more details use ens.fulldetails() \n')
    return ens
def match_models(ens1, ens2, delete=False):
    """
    Find common models between two ensembles.

    Models that are present in only one of the two ensembles are removed
    from that ensemble, and both ensembles are then squeezed. The
    ensembles are modified in place.

    Parameters
    ----------
    ens1 : cmipdata ensemble
    ens2 : cmipdata ensemble
           the two cmipdata ensembles to compare.
    delete : boolean
             if True the files of the removed models are also deleted
             from disk.

    Returns
    -------
    ens1 : cmipdata ensemble
    ens2 : cmipdata ensemble
           two ensembles with matching models.
    """
    def remove_files(node):
        # Delete from disk every file held below node.
        for path in node.lister('ncfile'):
            try:
                # os.remove instead of a shell 'rm -f' string, which
                # breaks on names with spaces or shell metacharacters.
                os.remove(path)
            except OSError as err:
                # As with 'rm -f', a file that is already gone is fine.
                if os.path.exists(path):
                    print('could not delete %s: %s' % (path, err))

    # get lists of the models in both ensembles
    ens1_modelnames = ens1.lister('model')
    ens2_modelnames = ens2.lister('model')

    # find the model misses
    model_misses = set(ens1_modelnames).symmetric_difference(ens2_modelnames)

    # remove the models that are not in both ensembles
    for name in sorted(model_misses):
        for label, ens in (('ens1', ens1), ('ens2', ens2)):
            m = ens.getChild(name)
            if m is None:
                continue
            if delete:
                remove_files(m)
            print('deleting %s from %s' % (m.name, label))
            ens.delete(m)

    ens1.squeeze()
    ens2.squeeze()
    return ens1, ens2
def match_realizations(ens1, ens2, delete=False):
    """
    Find common realizations between two ensembles.

    Realizations are compared by their model-experiment-realization
    string. Those present in only one of the two ensembles are removed
    from their experiment, and both ensembles are then squeezed. The
    ensembles are modified in place.

    Parameters
    ----------
    ens1 : cmipdata ensemble
    ens2 : cmipdata ensemble
           the two cmipdata ensembles to compare.
    delete : boolean
             if True the files of the removed realizations are also
             deleted from disk.

    Returns
    -------
    ens1 : cmipdata ensemble
    ens2 : cmipdata ensemble
           two ensembles with matching realizations.
    """
    def remove_files(node):
        # Delete from disk every file held below node.
        for path in node.lister('ncfile'):
            try:
                # os.remove instead of a shell 'rm -f' string, which
                # breaks on names with spaces or shell metacharacters.
                os.remove(path)
            except OSError as err:
                # As with 'rm -f', a file that is already gone is fine.
                if os.path.exists(path):
                    print('could not delete %s: %s' % (path, err))

    # each entry is [realization, experiment, 'model-experiment-realization']
    mer_e1 = list(ens1.mer())
    mer_e2 = list(ens2.mer())

    # make sets of strings of form: model-experiment-realization
    mer_string_e1 = set(entry[2] for entry in mer_e1)
    mer_string_e2 = set(entry[2] for entry in mer_e2)

    # find matching and non-matching realizations
    matches = mer_string_e1.intersection(mer_string_e2)
    misses = mer_string_e1.symmetric_difference(mer_string_e2)

    print('misses: %d matches: %d' % (len(misses), len(matches)))

    # delete realizations not in both ensembles
    for realization, experiment, label in mer_e1 + mer_e2:
        if label not in misses:
            continue
        if delete:
            remove_files(realization)
        experiment.delete(realization)

    ens1.squeeze()
    ens2.squeeze()
    return ens1, ens2
# Running this module as a script does nothing.
if __name__ == "__main__": pass