diff --git a/storage.py b/storage.py index 81968a7..b6d8219 100644 --- a/storage.py +++ b/storage.py @@ -6,10 +6,11 @@ import numpy as np import os import h5py import collections - import logging log = logging.getLogger(__name__) +_array_cache = {} + def unwrapArray(a,recursive=True,readH5pyDataset=True): """ This function takes an object (like a dictionary) and recursively unwraps it solving issues like: @@ -42,31 +43,54 @@ def unwrapArray(a,recursive=True,readH5pyDataset=True): if isinstance(a,np.ndarray) and a.dtype.char == "S": a = a.astype(str) return a -def dictToH5Group(d,group): +def dictToH5Group(d,group,link_copy=True): """ helper function that transform (recursive) a dictionary into an - hdf group by creating subgroups """ + hdf group by creating subgroups + link_copy = True, tries to save space in the hdf file by creating an internal link. + the current implementation uses memory though ... + """ + global _array_cache for key,value in d.items(): + TOTRY = True + if isinstance(value,(list,tuple)): value = np.asarray(value) if isinstance(value,dict): group.create_group(key) dictToH5Group(value,group[key]) - else: - # h5py can't handle numpy unicode arrays - if isinstance(value,np.ndarray) and value.dtype.char == "U": - value = np.asarray([vv.encode('ascii') for vv in value]) - group[key] = value + elif value is None: + group[key] = "NONE_PYTHON_OBJECT" + elif isinstance(value,np.ndarray): + # take care of unicode (h5py can't handle numpy unicode arrays) + if value.dtype.char == "U": value = np.asarray([vv.encode('ascii') for vv in value]) # check if it is list of array elif isinstance(value,np.ndarray) and value.ndim == 1 and isinstance(value[0],np.ndarray): group.create_group(key) group[key].attrs["IS_LIST_OF_ARRAYS"] = True - for index,array in enumerate(value): group["%s/index%05d"%(key,index)] = array - # h5py can't save None - elif value is None: - group[key] = "NONE_PYTHON_OBJECT" + for index,array in enumerate(value): dictToH5Group( { "index%010d"%index : array},group[key] ); + TOTRY = False; # don't even try to save as generic call group[key]=value else: - try: + if link_copy: + found_address = None + for address,(file_handle,array) in _array_cache.items(): + if np.array_equal(array,value) and group.file == file_handle: + log.info("Found array in cache, asked for %s/%s, found as %s"%(group.name,key,address)) + found_address = address + if found_address is not None: + value = group.file[found_address] + try: + if TOTRY: group[key] = value - except TypeError: - log.error("Can't save %s"%(key)) + if link_copy: + log.info("Addind array %s to cache"%(group.name)) + _array_cache[ group[key].name ] = (group.file,value) + except Exception as e: + log.warning("Can't save %s, error was %s"%(key,e)) + # try saving everything else that is not dict or array + else: + try: + group[key] = value + except Exception as e: + log.error("Can't save %s, error was %s"%(key,e)) + def dictToH5(h5,d): """ Save a dictionary into an hdf5 file @@ -125,7 +149,8 @@ def read(fname): else: raise ValueError("Extension must be h5, npy or npz, it was %s"%extension) -def save(fname,d): +def save(fname,d,link_copy=True): + """ link_copy is used by hdf5 saving only, it allows to creat link of identical arrays (saving space) """ # make sure the object is dict (recursively) this allows reading it # without the DataStorage module d = objToDict(d,recursive=True) @@ -261,7 +286,7 @@ class DataStorage(dict): keys = [k for k in keys if k[0] != '_' ] return keys - def save(self,fname=None): + def save(self,fname=None,link_copy=True): if fname is None: fname = self.filename assert fname is not None - save(fname,self) + save(fname,self,link_copy=link_copy)