My memory is too small for my data, so I tried packing it in memory.
The following code works, but I have to remember the type of the data, which is kind of awkward (there are lots of different data types).
Any better suggestions? A smaller running time would also be appreciated.
# Compress a numpy array in RAM with zlib. The dtype has to be remembered
# separately, because zlib only sees a flat byte buffer.
import numpy as np
import zlib

A = np.arange(10000)
dtype = A.dtype  # keep the element type so the bytes can be decoded later
B = zlib.compress(A, 1)  # level 1: fastest compression
# np.frombuffer replaces the deprecated (and now removed) np.fromstring.
C = np.frombuffer(zlib.decompress(B), dtype)
np.testing.assert_allclose(A, C)
You could try using NumPy's built-in array compressor, np.savez_compressed(). This will save you the hassle of keeping track of the data types, but would probably give similar performance to your method. Here's an example:
# Round-trip a numpy array through np.savez_compressed() without touching
# the disk: the .npz container records the dtype for us.
import io

import numpy as np

A = np.arange(10000)
# np.savez_compressed() requires a file-like object to write to;
# io.BytesIO keeps the compressed archive in RAM.
compressed_array = io.BytesIO()
np.savez_compressed(compressed_array, A)
# Seek back to the beginning of the file-like object before reading.
compressed_array.seek(0)
# Unnamed arrays are stored under 'arr_0', 'arr_1', ...
decompressed_array = np.load(compressed_array)['arr_0']
print(len(compressed_array.getvalue()))  # compressed size in bytes (~15 kB here)
assert A.dtype == decompressed_array.dtype
assert all(A == decompressed_array)
Note that any size reduction depends on the distribution of your data. Random data is inherently incompressible, so you might not see much benefit by attempting to compress it.
I want to post my final code, in case it helps anyone. It can compress in RAM with different compression algorithms, or alternatively, if there is not enough RAM, store the data in an HDF5 file. Any speedups or advice for better code are appreciated.
import zlib,bz2
import numpy as np
import h5py
import os
class packdataclass():
    """Pack/unpack numpy arrays, either compressed in RAM or spilled to disk.

    packalg selects the backend: 'nocompress', 'zlib', 'bz2', or
    'hdf5_on_drive'. compress() returns a plain dict that records the
    compression method and dtype, so decompress() needs no extra state
    (except the HDF5 file, for the on-drive backend).
    """

    def __init__(self, packalg='nocompress', Filename=None):
        """Set up the chosen backend.

        Filename is only used (and required) for 'hdf5_on_drive'; any
        pre-existing file at that path is deleted and recreated empty.
        """
        self.packalg = packalg
        if self.packalg == 'hdf5_on_drive':
            if Filename is None:
                raise ValueError("Filename is required for 'hdf5_on_drive'")
            self.Filename = Filename
            self.Running_Number = 0  # counter used to give each dataset a unique name
            if os.path.isfile(Filename):
                os.remove(Filename)
            # Create the file up front so compress() can open it with mode 'r+'.
            with h5py.File(self.Filename, 'w') as hdf5_file:
                hdf5_file.create_dataset("TMP_File", data="0")

    def clean_up(self):
        """Remove the HDF5 scratch file, if this instance created one."""
        if self.packalg == 'hdf5_on_drive' and os.path.isfile(self.Filename):
            os.remove(self.Filename)

    def compress(self, array):
        """Pack *array* and return a dict suitable for decompress().

        Boolean arrays are bit-packed first (8 bools per byte); their
        original length is stored so the trailing pad bits can be dropped
        on decompression.
        """
        Returndict = {'compression': self.packalg, 'type': array.dtype}
        # np.bool_ (not the removed alias np.bool) is numpy's boolean dtype.
        if array.dtype == np.bool_:
            Returndict['len_bool_array'] = len(array)
            array = np.packbits(array.astype(np.uint8))  # 8 bools -> one uint8
            Returndict['type'] = 'bitfield'
        if self.packalg == 'nocompress':
            Returndict['data'] = array
        elif self.packalg == 'zlib':
            Returndict['data'] = zlib.compress(array, 1)
        elif self.packalg == 'bz2':
            Returndict['data'] = bz2.compress(array, 1)
        elif self.packalg == 'hdf5_on_drive':
            with h5py.File(self.Filename, 'r+') as hdf5_file:
                Returndict['data'] = str(self.Running_Number)  # dataset name
                hdf5_file.create_dataset(Returndict['data'], data=array,
                                         dtype=array.dtype,
                                         compression='gzip', compression_opts=4)
            self.Running_Number += 1
        else:
            raise ValueError("Algorithm for packing {} is unknown".format(self.packalg))
        return Returndict

    def decompress(self, data):
        """Invert compress(): return the original numpy array from *data*."""
        # 'bitfield' is a plain string marker; guard the comparison so a real
        # np.dtype is never compared against it.
        is_bitfield = (not isinstance(data['type'], np.dtype)
                       and data['type'] == 'bitfield')
        # dtype the raw decompressed bytes must be decoded as.
        dtype = np.uint8 if is_bitfield else data['type']
        if data['compression'] == 'nocompress':
            data_decompressed = data['data']  # already an ndarray
        elif data['compression'] == 'zlib':
            # np.frombuffer replaces the deprecated np.fromstring.
            data_decompressed = np.frombuffer(zlib.decompress(data['data']), dtype)
        elif data['compression'] == 'bz2':
            data_decompressed = np.frombuffer(bz2.decompress(data['data']), dtype)
        elif data['compression'] == 'hdf5_on_drive':
            with h5py.File(self.Filename, "r") as Readfile:
                data_decompressed = np.array(Readfile[data['data']])
        else:
            raise ValueError(
                "Algorithm for unpacking {} is unknown".format(data['compression']))
        if is_bitfield:
            # Drop the pad bits that np.packbits added to fill the last byte.
            return np.unpackbits(data_decompressed).astype(bool)[:data['len_bool_array']]
        return data_decompressed
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With