2017-01-20 10:41:31 +01:00
|
|
|
""" npy/npz/hdf5 file based storage;
|
2017-01-07 23:53:12 +01:00
|
|
|
this module adds the possibility to dump and load objects in files and
|
|
|
|
a more convenient way of accessing the data via attributes (data.key) thanks
|
|
|
|
to the DataStorage class """
|
2017-01-05 19:22:37 +01:00
|
|
|
import numpy as np
|
|
|
|
import os
|
|
|
|
import h5py
|
|
|
|
import collections
|
|
|
|
|
2017-01-10 22:43:22 +01:00
|
|
|
import logging
|
2017-03-03 17:51:47 +01:00
|
|
|
log = logging.getLogger(__name__)
|
2017-01-05 19:22:37 +01:00
|
|
|
|
2017-01-07 23:53:12 +01:00
|
|
|
def unwrapArray(a,recursive=True,readH5pyDataset=True):
    """ Unwrap an object (like a dictionary) read back from a file.

        It solves issues like:
          * the fact that many objects are packaged as 0d array
        This function has also some specific hacks for handling h5py limits:
          * restore the None python object (stored as a sentinel string)
          * convert byte-string arrays back to numpy unicode arrays

        a : object
          h5py dataset/group, npz file, dict, list, array, ...
        recursive : bool
          if True the values of dict-like objects and the elements of lists
          are unwrapped too
        readH5pyDataset : bool
          if True h5py datasets are read into memory; scalar datasets are
          read in any case.
          NOTE(review): the flag is not forwarded to the recursive calls, so
          nested datasets are always read - confirm this is intended before
          changing it (h5ToDict closes the file before returning)

        Returns the unwrapped object; dictionaries are returned as DataStorage
    """
    # h5py dataset: convert to array (scalar datasets are always read)
    if isinstance(a,h5py.Dataset) and (readH5pyDataset or a.shape == ()):
        a = a[...]
    # 0d arrays are just a wrapper around a single python object
    if isinstance(a,np.ndarray) and a.ndim == 0 : a = a.item()
    # h5py can't save numpy unicode: byte strings are converted back
    if isinstance(a,np.ndarray) and a.dtype.char == "S": a = a.astype(str)
    if recursive:
        if hasattr(a,"items"): # dict, h5py groups, npz file
            a = dict(a) # convert to dict, otherwise can't assign values
            for key,value in a.items(): a[key] = unwrapArray(value)
        elif isinstance(a,list):
            for index,value in enumerate(a): a[index] = unwrapArray(value)
    if isinstance(a,dict): a = DataStorage(a)
    # restore None that cannot be saved in h5py; depending on the h5py
    # version the sentinel is read back as str or as bytes
    if isinstance(a,str) and a == "NONE_PYTHON_OBJECT": a = None
    if isinstance(a,bytes) and a == b"NONE_PYTHON_OBJECT": a = None
    return a
|
|
|
|
|
2017-01-05 19:22:37 +01:00
|
|
|
def dictToH5Group(d,group):
    """ helper function that transforms (recursively) a dictionary into an
        hdf group by creating subgroups

        d : dict
          data to store; nested dictionaries become subgroups
        group : h5py group (or file root) the entries are written into

        Values that h5py refuses with a TypeError are skipped and reported
        through the module logger
    """
    for key,value in d.items():
        if isinstance(value,dict):
            # create_group returns the new subgroup
            dictToH5Group(value,group.create_group(key))
            continue
        # h5py can't handle numpy unicode arrays: store them as ASCII bytes.
        # np.char.encode works element-wise, whatever the array shape is
        if isinstance(value,np.ndarray) and value.dtype.char == "U":
            value = np.char.encode(value,'ascii')
        # h5py can't save None, a sentinel string is stored instead
        if value is None: value="NONE_PYTHON_OBJECT"
        try:
            group[key] = value
        except TypeError:
            log.error("Can't save %s",key)
|
2017-01-05 19:22:37 +01:00
|
|
|
|
|
|
|
def dictToH5(h5,d):
    """ Save a dictionary into an hdf5 file
        TODO: add capability of saving list of array
        h5py is not capable of handling dictionaries natively

        h5 : name of the file to write (opened with mode "w")
        d : dict
          (nested) dictionary to store
    """
    # the context manager closes the file even if writing fails
    with h5py.File(h5,mode="w") as h5file:
        dictToH5Group(d,h5file["/"])
|
|
|
|
|
2017-01-07 23:53:12 +01:00
|
|
|
def h5ToDict(h5,readH5pyDataset=True):
    """ Open the hdf5 file h5 read-only and return its unwrapped content

        readH5pyDataset is handed over to unwrapArray; the file is closed
        before returning
    """
    with h5py.File(h5,"r") as h5file:
        return unwrapArray(h5file,recursive=True,readH5pyDataset=readH5pyDataset)
|
|
|
|
|
|
|
|
def npzToDict(npzFile):
    """ Read an npz file and return its (recursively unwrapped) content

        npzFile : file name (or file object) accepted by np.load
    """
    # object entries (nested dictionaries, None, ...) are stored pickled and
    # recent numpy versions refuse to load them unless allow_pickle is set.
    # SECURITY: unpickling can run arbitrary code, only read trusted files
    with np.load(npzFile,allow_pickle=True) as npz: d = dict(npz)
    return unwrapArray(d,recursive=True)
|
|
|
|
|
2017-01-20 10:41:31 +01:00
|
|
|
def npyToDict(npyFile):
    """ Read an npy file holding a single object (as written by dictToNpy)
        and return its (recursively unwrapped) content

        npyFile : file name (or file object) accepted by np.load
    """
    # the dictionary is stored as a pickled 0d object array and recent numpy
    # versions refuse to load it unless allow_pickle is set.
    # SECURITY: unpickling can run arbitrary code, only read trusted files
    content = np.load(npyFile,allow_pickle=True).item()
    return unwrapArray(content,recursive=True)
|
|
|
|
|
2017-01-05 19:22:37 +01:00
|
|
|
def dictToNpz(npzFile,d):
    """ Store each entry of the dictionary d as a named array in npzFile """
    np.savez(npzFile,**d)
|
2017-01-20 10:41:31 +01:00
|
|
|
def dictToNpy(npyFile,d):
    """ Store the object d (usually a dictionary) in npyFile with np.save """
    np.save(npyFile,d)
|
|
|
|
|
|
|
|
def objToDict(o,recursive=True):
    """ convert a DataStorage to a dictionary (useful for saving); it should
        work for other objects having an .items() method too

        o : object to convert; objects without .items() are returned as they are
        recursive : bool
          if True the values are converted too, else only the outer object

        TODO: this function does not catch a list of DataStorage instances like
        objToDict( ( DataStorage(), DataStorage() ) )
        is not converted !!
    """
    if not hasattr(o,"items"): return o
    d = dict()
    for k,v in o.items():
        if not recursive:
            d[k] = v
            continue
        try:
            d[k] = objToDict(v)
        except Exception as e:
            # best effort: keep the value as it is when it can't be converted
            log.info("In objToDict, could not convert key %s to dict, error was %s",k,e)
            d[k] = v
    return d
|
|
|
|
|
2017-01-05 19:22:37 +01:00
|
|
|
|
|
|
|
def read(fname):
    """ Load a storage file and return its content as DataStorage

        The reader is chosen from the file extension (.npz, .npy or .h5);
        any other extension raises ValueError
    """
    _, extension = os.path.splitext(fname)
    log.info("Reading storage file %s"%fname)
    if extension not in (".npz",".npy",".h5"):
        raise ValueError("Extension must be h5, npy or npz, it was %s"%extension)
    if extension == ".npz":
        content = npzToDict(fname)
    elif extension == ".npy":
        content = npyToDict(fname)
    else:
        content = h5ToDict(fname)
    return DataStorage(content)
|
2017-01-05 19:22:37 +01:00
|
|
|
|
|
|
|
def save(fname,d):
    """ Write d to fname; the format follows the extension (.npz, .h5, .npy)

        Any failure (unsupported extension included) is logged and not
        propagated; in that case None is returned
    """
    # plain (nested) dictionaries can be read back without the DataStorage
    # module
    d = objToDict(d,recursive=True)
    d['filename'] = fname
    _, extension = os.path.splitext(fname)
    log.info("Saving storage file %s"%fname)
    try:
        if extension == ".npz": return dictToNpz(fname,d)
        if extension == ".h5": return dictToH5(fname,d)
        if extension == ".npy": return dictToNpy(fname,d)
        raise ValueError("Extension must be h5, npy or npz, it was %s"%extension)
    except Exception:
        log.exception("Could not save %s"%fname)
|
2017-01-05 19:22:37 +01:00
|
|
|
|
2017-01-20 10:41:31 +01:00
|
|
|
|
2017-01-07 23:53:12 +01:00
|
|
|
class DataStorage(dict):
    """ Storage for dict like object: a dictionary whose entries can also be
        accessed as attributes (data.key is the same as data['key']).

        filename : str
          file used by .save() when no file name is given
        recursive : bool
          recursively convert dict-like objects to DataStorage

        It can save data to file (format npy,npz or h5)

        To initialize it:

          data = DataStorage( a=(1,2,3),b="add",filename='store.npz' )

          # recursively by default
          # data.a will be a DataStorage instance
          data = DataStorage( a=dict( b = 1) )

          # data.a will be a dictionary
          data = DataStorage( a=dict( b = 1),recursive=False )

          # reads from file if it exists
          data = DataStorage( 'mysaveddata.npz' )

          # DOES NOT READ FROM FILE (even if it exists)!!
          data = DataStorage( filename = 'mysaveddata.npz' )

          # create empty storage (with default filename)
          data = DataStorage()

        Notes:
          * keys have to be strings (they are also set as attributes); a key
            named like a dict method (e.g. 'keys') hides that method
          * 'filename' and the names starting with '_' are stored like any
            other entry but they are not listed by keys(), str() and repr()
    """
    def __init__(self,*args,filename='data_storage.npz',recursive=True,**kwargs):
        self.filename = filename
        self._recursive = recursive

        # keyword arguments (if any) are the data, else the first positional
        # argument is either a dictionary or a file name
        if kwargs:
            fileOrDict = dict(kwargs)
        elif args:
            fileOrDict = args[0]
        else:
            fileOrDict = dict()

        if isinstance(fileOrDict,dict):
            d = fileOrDict
        elif isinstance(fileOrDict,str):
            if os.path.isfile(fileOrDict):
                d = read(fileOrDict)
            else:
                self.filename=fileOrDict
                d = dict()
        else:
            raise ValueError("Invalid DataStorage definition")

        # setattr stores each value both as attribute (data.key) and as item
        # (data['key']); the input dictionary itself is never modified
        for k,v in d.items():
            if recursive and isinstance(v,dict) and not isinstance(v,DataStorage):
                v = DataStorage(v)
            setattr(self,k,v)

    def __setitem__(self, key, value):
        # __setattr__ takes care of both the item and the attribute, so that
        # data['key'] and data.key are always the same object
        setattr(self,key,value)

    def __setattr__(self, key, value):
        """ allows to add fields with data.test=4 """
        # _recursive does not exist yet while the instance is being created;
        # values that are already DataStorage are stored as they are
        if getattr(self,"_recursive",False) and isinstance(value,dict) \
           and not isinstance(value,DataStorage):
            value = DataStorage(value)
        super().__setitem__(key, value)
        super().__setattr__(key,value)

    def __delitem__(self, key):
        delattr(self,key)
        super().__delitem__(key)

    def __str__(self):
        return "DataStorage obj containing: %s" % ",".join(sorted(self.keys()))

    def __repr__(self):
        keys = sorted(self.keys())
        if len(keys) == 0: return "Empty DataStorage"
        nchars = max(map(len,keys))
        fmt = "%%%ds %%s" % (nchars)
        s = ["DataStorage obj containing (sorted): ",]
        for k in keys:
            obj = self[k]
            if isinstance(obj,np.ndarray):
                value_str = "array, size %s, type %s"% ("x".join(map(str,obj.shape)),obj.dtype)
            elif isinstance(obj,(list,tuple)) and len(obj) > 0 and \
                 all( isinstance(v,np.ndarray) for v in obj ):
                value_str = "list of arrays, shapes " + ",".join([str(v.shape) for v in obj[:5]]) + " ..."
            else:
                # everything else is shown as (at most) the first 50 chars
                value_str = str(obj)
                if len(value_str) > 50: value_str = value_str[:50] + " ..."
            s.append( fmt % (k,value_str) )
        return "\n".join(s)

    def keys(self):
        """ list of the data keys ('filename' and names starting with '_'
            are left out) """
        all_keys = super().keys()
        return [k for k in all_keys if k != 'filename' and not k.startswith('_')]

    def save(self,fname=None):
        """ save to fname (default: self.filename) with the module level
            save function """
        if fname is None: fname = self.filename
        assert fname is not None
        save(fname,self)
|