import h5py
import numpy as np
import pickle
import zlib
import os
from typing import *
def _obj2uint(obj: object, compression: int=9, protocol: int=2) -> np.ndarray:
"""Transform a python object in a numpy array of uint8
Arguments
---------
obj: object
The object to encode
compression: int, default=9
the compression level of ``zlib``
protocol: int, default=2
the protocol used by ``pickle.dumps``
Returns
-------
An array encoding in bytes (uint8) the object pickled
"""
zstr = zlib.compress(pickle.dumps(obj, protocol=protocol), compression)
return np.fromstring(zstr, dtype=np.uint8)
def _uint2obj(uint: np.ndarray) -> object:
"""Recover the original object encoded in a uint8 numpy array
Arguments
---------
uint: np.ndarray
A uint8 numpy array generated by the ``_obj2uint`` function
Returns
-------
A python object that was encoded in the numpy array
"""
return pickle.loads(zlib.decompress(uint.tobytes()))
def dump_hdf5(obj: object, filename: str,
              data_compression: int=7, chunks: Tuple =(2048, 2048),
              noarray_compression: int=9, pickle_protocol: int=2) -> None:
    """Dump all attributes of a python object to hdf5.

    ``np.ndarray`` attributes are stored directly as gzip-compressed
    datasets.  Every other attribute is pickled + zlib-compressed into a
    uint8 vector and stored under the attribute name prefixed with ``&``,
    which ``load_hdf5`` uses to know it must unpickle on the way back.

    Arguments
    ---------
    obj: object
        a python object
    filename: str
        the name of the file to be saved
    data_compression: int
        the level of compression used by hdf5
    chunks: Tuple, default=(2048, 2048)
        The size of the chunks to be used for compression/random access
    noarray_compression: int, default=9
        the compression level of ``zlib``, used when the attribute is not an array
    pickle_protocol: int, default=2
        the protocol used by ``pickle.dumps`` when the attribute is not an array

    Returns
    -------
    Nothing but it creates a file ``filename``
    """
    # "w" mode already truncates, but remove explicitly to guarantee a
    # clean file regardless of pre-existing HDF5 content.
    if os.path.isfile(filename):
        os.remove(filename)
    # Context manager guarantees the file is closed even if a dataset
    # creation below raises (the original leaked the handle on error).
    with h5py.File(filename, "w") as _file:
        for k in obj.__dict__.keys():
            attribute = getattr(obj, k)
            # NOTE: exact-type check (not isinstance) mirrors load_hdf5's
            # round-trip contract: only plain ndarrays go in as-is.
            if type(attribute) is np.ndarray:
                # Chunk size must not exceed the dataset shape in any dimension.
                chunk_size = tuple((min(chunks[i], attribute.shape[i])
                                    for i in range(len(attribute.shape))))
                _file.create_dataset(k, data=attribute, chunks=chunk_size,
                                     compression="gzip", compression_opts=data_compression,
                                     fletcher32=False, shuffle=False)
            else:
                serialized = _obj2uint(attribute, compression=noarray_compression,
                                       protocol=pickle_protocol)
                # "&" prefix marks the dataset as a pickled blob for load_hdf5.
                _file.create_dataset(f"&{k}", data=serialized,
                                     chunks=tuple((min(1024, len(serialized)),)),
                                     compression="gzip", compression_opts=data_compression,
                                     fletcher32=False, shuffle=False)
def load_hdf5(filename: str, obj_class: Type[object]) -> object:
    """Load all attributes from a hdf5 encoded python object.

    Arguments
    ---------
    filename: str
        the name of the file to be loaded
    obj_class: Type[object]
        the type of object to be generated

    Returns
    -------
    An object that has been filled with the attributes stored in the hdf5 file

    Note
    ----
    The trick to create an empty object was adapted from Guido van Rossum answer:
    https://stackoverflow.com/questions/2168964/python-creating-class-instance-without-calling-initializer
    """
    # Build a blank instance without running __init__ (which may require
    # arguments or perform work we don't want on load).
    obj = obj_class.__new__(obj_class)
    # Context manager guarantees the file is closed even if decoding a
    # dataset below raises (the original leaked the handle on error).
    with h5py.File(filename, "r") as _file:
        for k in _file.keys():
            if k.startswith("&"):
                # "&"-prefixed datasets were written by dump_hdf5 as
                # pickled+zlib'ed blobs; strip the marker and unpickle.
                setattr(obj, k[1:], _uint2obj(_file[k][:]))
            else:
                # Plain datasets are ndarray attributes; [:] materializes
                # the data as a numpy array.
                setattr(obj, k, _file[k][:])
    return obj