New functionality when saving to an HDF5 file: it checks whether an identical array has already been saved and, if so, creates a link to the already-saved one. This can save a lot of space (but uses RAM for caching).
This commit is contained in:
parent
fdd45061eb
commit
f7d0b88faf
59
storage.py
59
storage.py
|
@ -6,10 +6,11 @@ import numpy as np
|
||||||
import os
|
import os
|
||||||
import h5py
|
import h5py
|
||||||
import collections
|
import collections
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_array_cache = {}
|
||||||
|
|
||||||
def unwrapArray(a,recursive=True,readH5pyDataset=True):
|
def unwrapArray(a,recursive=True,readH5pyDataset=True):
|
||||||
""" This function takes an object (like a dictionary) and recursively
|
""" This function takes an object (like a dictionary) and recursively
|
||||||
unwraps it solving issues like:
|
unwraps it solving issues like:
|
||||||
|
@ -42,31 +43,54 @@ def unwrapArray(a,recursive=True,readH5pyDataset=True):
|
||||||
if isinstance(a,np.ndarray) and a.dtype.char == "S": a = a.astype(str)
|
if isinstance(a,np.ndarray) and a.dtype.char == "S": a = a.astype(str)
|
||||||
return a
|
return a
|
||||||
|
|
||||||
def dictToH5Group(d,group):
|
def dictToH5Group(d,group,link_copy=True):
|
||||||
""" helper function that transform (recursive) a dictionary into an
|
""" helper function that transform (recursive) a dictionary into an
|
||||||
hdf group by creating subgroups """
|
hdf group by creating subgroups
|
||||||
|
link_copy = True, tries to save space in the hdf file by creating an internal link.
|
||||||
|
the current implementation uses memory though ...
|
||||||
|
"""
|
||||||
|
global _array_cache
|
||||||
for key,value in d.items():
|
for key,value in d.items():
|
||||||
|
TOTRY = True
|
||||||
|
if isinstance(value,(list,tuple)): value = np.asarray(value)
|
||||||
if isinstance(value,dict):
|
if isinstance(value,dict):
|
||||||
group.create_group(key)
|
group.create_group(key)
|
||||||
dictToH5Group(value,group[key])
|
dictToH5Group(value,group[key])
|
||||||
else:
|
elif value is None:
|
||||||
# h5py can't handle numpy unicode arrays
|
group[key] = "NONE_PYTHON_OBJECT"
|
||||||
if isinstance(value,np.ndarray) and value.dtype.char == "U":
|
elif isinstance(value,np.ndarray):
|
||||||
value = np.asarray([vv.encode('ascii') for vv in value])
|
# take care of unicode (h5py can't handle numpy unicode arrays)
|
||||||
group[key] = value
|
if value.dtype.char == "U": value = np.asarray([vv.encode('ascii') for vv in value])
|
||||||
# check if it is list of array
|
# check if it is list of array
|
||||||
elif isinstance(value,np.ndarray) and value.ndim == 1 and isinstance(value[0],np.ndarray):
|
elif isinstance(value,np.ndarray) and value.ndim == 1 and isinstance(value[0],np.ndarray):
|
||||||
group.create_group(key)
|
group.create_group(key)
|
||||||
group[key].attrs["IS_LIST_OF_ARRAYS"] = True
|
group[key].attrs["IS_LIST_OF_ARRAYS"] = True
|
||||||
for index,array in enumerate(value): group["%s/index%05d"%(key,index)] = array
|
for index,array in enumerate(value): dictToH5Group( { "index%010d"%index : array},group[key] );
|
||||||
# h5py can't save None
|
TOTRY = False; # don't even try to save as generic call group[key]=value
|
||||||
elif value is None:
|
else:
|
||||||
group[key] = "NONE_PYTHON_OBJECT"
|
if link_copy:
|
||||||
|
found_address = None
|
||||||
|
for address,(file_handle,array) in _array_cache.items():
|
||||||
|
if np.array_equal(array,value) and group.file == file_handle:
|
||||||
|
log.info("Found array in cache, asked for %s/%s, found as %s"%(group.name,key,address))
|
||||||
|
found_address = address
|
||||||
|
if found_address is not None:
|
||||||
|
value = group.file[found_address]
|
||||||
|
try:
|
||||||
|
if TOTRY:
|
||||||
|
group[key] = value
|
||||||
|
if link_copy:
|
||||||
|
log.info("Addind array %s to cache"%(group.name))
|
||||||
|
_array_cache[ group[key].name ] = (group.file,value)
|
||||||
|
except Exception as e:
|
||||||
|
log.warning("Can't save %s, error was %s"%(key,e))
|
||||||
|
# try saving everything else that is not dict or array
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
group[key] = value
|
group[key] = value
|
||||||
except TypeError:
|
except Exception as e:
|
||||||
log.error("Can't save %s"%(key))
|
log.error("Can't save %s, error was %s"%(key,e))
|
||||||
|
|
||||||
|
|
||||||
def dictToH5(h5,d):
|
def dictToH5(h5,d):
|
||||||
""" Save a dictionary into an hdf5 file
|
""" Save a dictionary into an hdf5 file
|
||||||
|
@ -125,7 +149,8 @@ def read(fname):
|
||||||
else:
|
else:
|
||||||
raise ValueError("Extension must be h5, npy or npz, it was %s"%extension)
|
raise ValueError("Extension must be h5, npy or npz, it was %s"%extension)
|
||||||
|
|
||||||
def save(fname,d):
|
def save(fname,d,link_copy=True):
|
||||||
|
""" link_copy is used by hdf5 saving only, it allows to creat link of identical arrays (saving space) """
|
||||||
# make sure the object is dict (recursively) this allows reading it
|
# make sure the object is dict (recursively) this allows reading it
|
||||||
# without the DataStorage module
|
# without the DataStorage module
|
||||||
d = objToDict(d,recursive=True)
|
d = objToDict(d,recursive=True)
|
||||||
|
@ -261,7 +286,7 @@ class DataStorage(dict):
|
||||||
keys = [k for k in keys if k[0] != '_' ]
|
keys = [k for k in keys if k[0] != '_' ]
|
||||||
return keys
|
return keys
|
||||||
|
|
||||||
def save(self,fname=None):
|
def save(self,fname=None,link_copy=True):
|
||||||
if fname is None: fname = self.filename
|
if fname is None: fname = self.filename
|
||||||
assert fname is not None
|
assert fname is not None
|
||||||
save(fname,self)
|
save(fname,self,link_copy=link_copy)
|
||||||
|
|
Loading…
Reference in New Issue