Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Compress/Zip numpy arrays in Memory

My memory is too small for my data, so I tried packing it in memory.

The following code works, but I have to remember the dtype of the data myself, which is kind of awkward because I have lots of different data types.

Any better suggestions? A shorter running time would also be appreciated.

import numpy as np    
import zlib

A = np.arange(10000)
dtype = A.dtype  # the compressed bytes carry no type information, so keep it

B = zlib.compress(A, 1)  # level 1 = fastest, weakest compression
# np.frombuffer is the replacement for the deprecated binary mode of
# np.fromstring. It returns a read-only view on the decompressed bytes;
# append .copy() if the array must be writable.
C = np.frombuffer(zlib.decompress(B), dtype)
np.testing.assert_allclose(A, C)
like image 994
Okapi575 Avatar asked Aug 19 '16 09:08

Okapi575


2 Answers

You could try using numpy's builtin array compressor np.savez_compressed(). This will save you the hassle of keeping track of the data types, but would probably give similar performance to your method. Here's an example:

import io
import numpy as np

A = np.arange(0, 10000)

# np.savez_compressed() writes to a path or a file-like object; an in-memory
# BytesIO keeps the compressed archive in RAM instead of on disk.
compressed_array = io.BytesIO()
np.savez_compressed(compressed_array, A)

# Read it back: rewind the buffer first, then pull the array out of the
# archive. Arrays passed positionally are stored as 'arr_0', 'arr_1', ...
compressed_array.seek(0, io.SEEK_SET)
archive = np.load(compressed_array)
decompressed_array = archive['arr_0']

>>> print(len(compressed_array.getvalue()))    # compressed array size
15364
>>> assert A.dtype == decompressed_array.dtype
>>> assert all(A == decompressed_array)

Note that any size reduction depends on the distribution of your data. Random data is inherently incompressible, so you might not see much benefit by attempting to compress it.

like image 191
mhawke Avatar answered Nov 03 '22 21:11

mhawke


I want to post my final code, in case it helps anyone. It can compress in RAM with different packing algorithms or, alternatively, if there is not enough RAM, store the data in an HDF5 file. Any speedups or advice for better code are appreciated.

import zlib,bz2
import numpy as np
import h5py
import os

class packdataclass():
    """Pack numpy arrays in memory or park them in an HDF5 file on disk.

    Supported values for ``packalg``:

    * ``'nocompress'``    -- keep the array as it is (bool arrays are still
      bit-packed).
    * ``'zlib'``          -- compress the raw bytes with zlib, level 1.
    * ``'bz2'``           -- compress the raw bytes with bz2, level 1.
    * ``'hdf5_on_drive'`` -- write every array as a gzip-compressed dataset
      into the HDF5 file ``Filename``.

    ``compress()`` returns a dict holding the payload and the metadata
    (algorithm, dtype, shape) that ``decompress()`` needs to rebuild the
    array, so callers never have to track dtypes themselves.
    """

    def __init__(self,packalg='nocompress',Filename=None):
        """Remember the algorithm and, for HDF5, create a fresh file.

        Args:
            packalg: name of the packing algorithm (see class docstring).
            Filename: path of the HDF5 file; required only for
                ``'hdf5_on_drive'``. An existing file at that path is
                deleted and replaced.

        Raises:
            ValueError: if ``'hdf5_on_drive'`` is requested without a
                ``Filename``.
        """
        self.packalg=packalg
        self.Filename=Filename
        self.Running_Number=0  # name of the next HDF5 dataset
        if self.packalg=='hdf5_on_drive':
            if Filename is None:
                raise ValueError("packalg 'hdf5_on_drive' requires a Filename")
            if os.path.isfile(Filename):
                os.remove(Filename)
            with h5py.File(self.Filename,'w') as hdf5_file:
                hdf5_file.create_dataset("TMP_File", data="0")

    def clean_up(self):
        """Delete the HDF5 file, if this instance uses one and it exists."""
        if self.packalg=='hdf5_on_drive':
            if os.path.isfile(self.Filename):
                os.remove(self.Filename)

    def compress(self, array):
        """Pack ``array`` and return a dict describing the result.

        Bool arrays are bit-packed first (8 values per byte) and tagged with
        the type ``'bitfield'``; the number of valid bits is stored under
        ``'len_bool_array'``.

        Args:
            array: the numpy array (or array-like) to pack.

        Returns:
            dict with the keys ``'compression'``, ``'type'``, ``'shape'``,
            ``'data'`` and, for bool input, ``'len_bool_array'``.

        Raises:
            ValueError: if ``self.packalg`` is not a known algorithm.
        """
        array = np.asarray(array)
        Returndict={'compression':self.packalg,
                    'type':array.dtype,
                    'shape':array.shape}
        if array.dtype==np.bool_:
            # array.size (not len) so that N-D bool arrays keep all elements.
            Returndict['len_bool_array']=array.size
            array=np.packbits(array, axis=None)  # 8 bools -> one uint8
            Returndict['type']='bitfield'

        if self.packalg == 'nocompress':
            Returndict['data'] = array
        elif self.packalg == 'zlib':
            # zlib/bz2 need a C-contiguous buffer; this is a no-op for
            # arrays that already are contiguous.
            Returndict['data'] = zlib.compress(np.ascontiguousarray(array),1)
        elif self.packalg == 'bz2':
            Returndict['data'] = bz2.compress(np.ascontiguousarray(array),1)
        elif self.packalg == 'hdf5_on_drive':
            with h5py.File(self.Filename,'r+') as hdf5_file:
                Returndict['data']=str(self.Running_Number)
                hdf5_file.create_dataset(Returndict['data'], data=array,
                                         dtype=array.dtype,
                                         compression='gzip',
                                         compression_opts=4)
            self.Running_Number+=1
        else:
            raise ValueError("Algorithm for packing {} is unknown".format(self.packalg))

        return Returndict

    def decompress(self, data):
        """Rebuild the array from a dict produced by ``compress()``.

        Args:
            data: dict as returned by ``compress()``. Dicts without a
                ``'shape'`` entry are accepted; the result is then returned
                without reshaping.

        Returns:
            The restored numpy array with its original dtype (and shape, if
            recorded). Arrays restored from zlib/bz2 are writable copies.

        Raises:
            ValueError: if ``data['compression']`` is not a known algorithm.
        """
        packalg = data['compression']
        stored_type = data['type']
        # 'type' is either a numpy dtype or the marker string 'bitfield'.
        # Checking for str first avoids comparing a dtype with a string.
        is_bitfield = isinstance(stored_type, str) and stored_type == 'bitfield'

        if packalg == 'nocompress':
            data_decompressed = data['data']
        elif packalg == 'hdf5_on_drive':
            # h5py already returns a typed array; no byte parsing needed.
            with h5py.File(self.Filename, "r") as Readfile:
                data_decompressed = np.array(Readfile[data['data']])
        elif packalg in ('zlib', 'bz2'):
            if packalg == 'zlib':
                raw = zlib.decompress(data['data'])
            else:
                raw = bz2.decompress(data['data'])
            raw_dtype = np.uint8 if is_bitfield else stored_type
            # frombuffer replaces the deprecated binary np.fromstring; the
            # copy makes the result writable, as fromstring's result was.
            data_decompressed = np.frombuffer(raw, raw_dtype).copy()
        else:
            raise ValueError("Algorithm for packing {} is unknown".format(packalg))

        if is_bitfield:
            # Drop the padding bits packbits added to fill the last byte.
            unpacked = np.unpackbits(data_decompressed)
            data_decompressed = unpacked[:data['len_bool_array']].astype(np.bool_)

        shape = data.get('shape')
        if shape is not None and data_decompressed.shape != tuple(shape):
            data_decompressed = data_decompressed.reshape(shape)
        return data_decompressed
like image 27
Okapi575 Avatar answered Nov 03 '22 20:11

Okapi575