New functionality when saving to an HDF5 file: it checks whether an identical array has already been saved and, if so, creates a link to the already saved one. This can save a lot of space (but uses RAM for caching).

2017-03-13 13:27:39 +01:00 · 2017-03-13 13:27:39 +01:00 · f7d0b88faf
parent fdd45061eb
commit f7d0b88faf
1 changed files with 43 additions and 18 deletions
--- a/storage.py
+++ b/storage.py
@ -6,10 +6,11 @@ import numpy as np
 import os
 import h5py 
 import collections
-
 import logging
 log = logging.getLogger(__name__) 

+_array_cache = {}
+
 def unwrapArray(a,recursive=True,readH5pyDataset=True):
  """ This function takes an object (like a dictionary) and recursively
      unwraps it solving issues like:
@ -42,31 +43,54 @@ def unwrapArray(a,recursive=True,readH5pyDataset=True):
  if isinstance(a,np.ndarray) and a.dtype.char == "S": a = a.astype(str)
  return a

-def dictToH5Group(d,group):
+def dictToH5Group(d,group,link_copy=True):
  """ helper function that transform (recursive) a dictionary into an
-      hdf group by creating subgroups """
+      hdf group by creating subgroups 
+      link_copy = True, tries to save space in the hdf file by creating an internal link.
+                  the current implementation uses memory though ...
+  """
+  global _array_cache
  for key,value in d.items():
+    TOTRY = True
+    if isinstance(value,(list,tuple)): value = np.asarray(value)
    if isinstance(value,dict):
      group.create_group(key)
      dictToH5Group(value,group[key])
-    else:
-      # h5py can't handle numpy unicode arrays
-      if isinstance(value,np.ndarray) and value.dtype.char == "U":
-        value = np.asarray([vv.encode('ascii') for vv in value])
-        group[key] = value
+    elif value is None:
+      group[key] = "NONE_PYTHON_OBJECT"
+    elif isinstance(value,np.ndarray):
+      # take care of unicode (h5py can't handle numpy unicode arrays)
+      if value.dtype.char == "U": value = np.asarray([vv.encode('ascii') for vv in value])
      # check if it is list of array
      elif isinstance(value,np.ndarray) and value.ndim == 1 and isinstance(value[0],np.ndarray):
        group.create_group(key)
        group[key].attrs["IS_LIST_OF_ARRAYS"] = True
-        for index,array in enumerate(value): group["%s/index%05d"%(key,index)] = array
-      # h5py can't save None
-      elif value is None:
-        group[key] = "NONE_PYTHON_OBJECT"
+        for index,array in enumerate(value): dictToH5Group( { "index%010d"%index : array},group[key] );
+        TOTRY = False; # don't even try to save as generic call group[key]=value
      else:
-        try:
+        if link_copy:
+          found_address = None
+          for address,(file_handle,array) in _array_cache.items():
+            if np.array_equal(array,value) and group.file == file_handle:
+              log.info("Found array in cache, asked for %s/%s, found as %s"%(group.name,key,address))
+              found_address = address
+            if found_address is not None:
+              value = group.file[found_address]
+      try:
+        if TOTRY:
          group[key] = value
-        except TypeError:
-          log.error("Can't save %s"%(key))
+          if link_copy:
+            log.info("Addind array %s to cache"%(group.name))
+            _array_cache[ group[key].name ] = (group.file,value)
+      except Exception as e:
+        log.warning("Can't save %s, error was %s"%(key,e))
+    # try saving everything else that is not dict or array
+    else:
+      try:
+        group[key] = value
+      except Exception as e:
+        log.error("Can't save %s, error was %s"%(key,e))
+ 

 def dictToH5(h5,d):
  """ Save a dictionary into an hdf5 file
@ -125,7 +149,8 @@ def read(fname):
  else:
    raise ValueError("Extension must be h5, npy or npz, it was %s"%extension) 

-def save(fname,d):
+def save(fname,d,link_copy=True):
+  """ link_copy is used by hdf5 saving only, it allows to creat link of identical arrays (saving space) """
  # make sure the object is dict (recursively) this allows reading it
  # without the DataStorage module
  d = objToDict(d,recursive=True)
@ -261,7 +286,7 @@ class DataStorage(dict):
    keys = [k for k in keys if k[0] != '_' ]
    return keys

-  def save(self,fname=None):
+  def save(self,fname=None,link_copy=True):
    if fname is None: fname = self.filename
    assert fname is not None
-    save(fname,self)
+    save(fname,self,link_copy=link_copy)