I've got a multidimensional numpy array that I'm trying to stick into a pandas data frame. I'd like to flatten the array, and create a pandas index that reflects the pre-flattened array indices.
Note: I'm using 3D to keep the example small, but I'd like to generalize to at least 4D.
A = np.random.rand(2,3,4)
array([[[ 0.43793885,  0.40078139,  0.48078691,  0.05334248],
        [ 0.76331509,  0.82514441,  0.86169078,  0.86496111],
        [ 0.75572665,  0.80860943,  0.79995337,  0.63123724]],

       [[ 0.20648946,  0.57042315,  0.71777265,  0.34155005],
        [ 0.30843717,  0.39381407,  0.12623462,  0.93481552],
        [ 0.3267771 ,  0.64097038,  0.30405215,  0.57726629]]])
df = pd.DataFrame(A.flatten())
I'm trying to generate x/y/z columns like this:
A z y x
0 0.437939 0 0 0
1 0.400781 0 0 1
2 0.480787 0 0 2
3 0.053342 0 0 3
4 0.763315 0 1 0
5 0.825144 0 1 1
6 0.861691 0 1 2
7 0.864961 0 1 3
...
21 0.640970 1 2 1
22 0.304052 1 2 2
23 0.577266 1 2 3
I've tried setting this up using np.meshgrid, but I'm going wrong somewhere:
dimnames = ['z', 'y', 'x']
ranges = [np.arange(x) for x in A.shape]
ix = [x.flatten() for x in np.meshgrid(*ranges)]
for name, col in zip(dimnames, ix):
    df[name] = col
df = df.set_index(dimnames).squeeze()
This result looks somewhat sensible, but the indices are wrong:
df
z  y  x
0  0  0    0.437939
      1    0.400781
      2    0.480787
      3    0.053342
1  0  0    0.763315
      1    0.825144
      2    0.861691
      3    0.864961
0  1  0    0.755727
      1    0.808609
      2    0.799953
      3    0.631237
1  1  0    0.206489
      1    0.570423
      2    0.717773
      3    0.341550
0  2  0    0.308437
      1    0.393814
      2    0.126235
      3    0.934816
1  2  0    0.326777
      1    0.640970
      2    0.304052
      3    0.577266
print A[0,1,0]
0.76331508999999997
print df.loc[0,1,0]
0.75572665000000006
How can I create the index columns to reflect the shape of A?
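(For reference, the mismatch above appears to come from np.meshgrid's default indexing='xy', which swaps the first two axes; a minimal sketch of the corrected attempt, assuming the same A and dimnames as above:)

ranges = [np.arange(s) for s in A.shape]
# indexing='ij' keeps the grids in the same axis order as A.shape
ix = [m.flatten() for m in np.meshgrid(*ranges, indexing='ij')]
df = pd.DataFrame(A.flatten())
for name, col in zip(dimnames, ix):
    df[name] = col
df = df.set_index(dimnames).squeeze()
# df.loc[0, 1, 0] should now agree with A[0, 1, 0]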
You could use pd.MultiIndex.from_product:
import numpy as np
import pandas as pd
import string

def using_multiindex(A, columns):
    shape = A.shape
    index = pd.MultiIndex.from_product([range(s) for s in shape], names=columns)
    df = pd.DataFrame({'A': A.flatten()}, index=index).reset_index()
    return df
A = np.array([[[ 0.43793885,  0.40078139,  0.48078691,  0.05334248],
               [ 0.76331509,  0.82514441,  0.86169078,  0.86496111],
               [ 0.75572665,  0.80860943,  0.79995337,  0.63123724]],
              [[ 0.20648946,  0.57042315,  0.71777265,  0.34155005],
               [ 0.30843717,  0.39381407,  0.12623462,  0.93481552],
               [ 0.3267771 ,  0.64097038,  0.30405215,  0.57726629]]])
df = using_multiindex(A, list('ZYX'))
yields
Z Y X A
0 0 0 0 0.437939
1 0 0 1 0.400781
2 0 0 2 0.480787
3 0 0 3 0.053342
...
21 1 2 1 0.640970
22 1 2 2 0.304052
23 1 2 3 0.577266
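If you would rather keep the MultiIndex as the index of a Series (as in the set_index(...).squeeze() step from the question), you can simply skip the reset_index; a small sketch under that assumption:

index = pd.MultiIndex.from_product([range(s) for s in A.shape], names=list('ZYX'))
s = pd.Series(A.flatten(), index=index)
# s.loc[0, 1, 0] agrees with A[0, 1, 0]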
Or, if performance is a top priority, consider using senderle's cartesian_product. (See the code below.)
Here is a benchmark for A with shape (100, 100, 100):
In [321]: %timeit using_cartesian_product(A, columns)
100 loops, best of 3: 13.8 ms per loop
In [318]: %timeit using_multiindex(A, columns)
10 loops, best of 3: 35.6 ms per loop
In [320]: %timeit indices_merged_arr_generic(A, columns)
10 loops, best of 3: 29.1 ms per loop
In [319]: %timeit using_product(A)
1 loop, best of 3: 461 ms per loop
This is the setup I used for the benchmark:
import numpy as np
import pandas as pd
import functools
import itertools as IT
import string
product = IT.product
def cartesian_product_broadcasted(*arrays):
    """
    http://stackoverflow.com/a/11146645/190597 (senderle)
    """
    broadcastable = np.ix_(*arrays)
    broadcasted = np.broadcast_arrays(*broadcastable)
    dtype = np.result_type(*arrays)
    rows, cols = functools.reduce(np.multiply, broadcasted[0].shape), len(broadcasted)
    out = np.empty(rows * cols, dtype=dtype)
    start, end = 0, rows
    for a in broadcasted:
        out[start:end] = a.reshape(-1)
        start, end = end, end + rows
    return out.reshape(cols, rows).T
def using_cartesian_product(A, columns):
    shape = A.shape
    coords = cartesian_product_broadcasted(*[np.arange(s, dtype='int') for s in shape])
    df = pd.DataFrame(coords, columns=columns)
    df['A'] = A.flatten()
    return df
def using_multiindex(A, columns):
    shape = A.shape
    index = pd.MultiIndex.from_product([range(s) for s in shape], names=columns)
    df = pd.DataFrame({'A': A.flatten()}, index=index).reset_index()
    return df
def indices_merged_arr_generic(arr, columns):
    n = arr.ndim
    grid = np.ogrid[tuple(map(slice, arr.shape))]
    out = np.empty(arr.shape + (n+1,), dtype=arr.dtype)
    for i in range(n):
        out[..., i] = grid[i]
    out[..., -1] = arr
    out.shape = (-1, n+1)
    # the coordinate columns come first and the values last, so label them accordingly
    df = pd.DataFrame(out, columns=columns + ['A'])
    return df
def using_product(A):
    x, y, z = A.shape
    x_, y_, z_ = zip(*product(range(x), range(y), range(z)))
    df = pd.DataFrame(A.flatten()).assign(x=x_, y=y_, z=z_)
    return df
A = np.random.random((100,100,100))
shape = A.shape
columns = list(string.ascii_uppercase[-len(shape):][::-1])
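As a quick sanity check, the faster functions can be compared against each other; a small sketch, assuming the benchmark setup above:

df1 = using_multiindex(A, columns)
df2 = using_cartesian_product(A, columns)
df3 = indices_merged_arr_generic(A, columns)
# all three enumerate the coordinates in C (row-major) order, so the rows line up
assert np.allclose(df1[columns + ['A']].values, df2[columns + ['A']].values)
assert np.allclose(df1[columns + ['A']].values, df3[columns + ['A']].values)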
Alternatively, you can build the coordinate columns with itertools.product (note that this labels axes 0/1/2 as x/y/z, the reverse of the question's z/y/x convention):
from itertools import product
np.random.seed(0)
A = np.random.rand(2, 3, 4)
x, y, z = A.shape
x_, y_, z_ = zip(*product(range(x), range(y), range(z)))
df = pd.DataFrame(A.flatten()).assign(x=x_, y=y_, z=z_)
>>> df
0 x y z
0 0.548814 0 0 0
1 0.715189 0 0 1
2 0.602763 0 0 2
3 0.544883 0 0 3
4 0.423655 0 1 0
5 0.645894 0 1 1
6 0.437587 0 1 2
7 0.891773 0 1 3
8 0.963663 0 2 0
9 0.383442 0 2 1
10 0.791725 0 2 2
11 0.528895 0 2 3
12 0.568045 1 0 0
13 0.925597 1 0 1
14 0.071036 1 0 2
15 0.087129 1 0 3
16 0.020218 1 1 0
17 0.832620 1 1 1
18 0.778157 1 1 2
19 0.870012 1 1 3
20 0.978618 1 2 0
21 0.799159 1 2 1
22 0.461479 1 2 2
23 0.780529 1 2 3
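A related sketch that builds the same coordinate columns with np.unravel_index, assuming A and the z/y/x labelling from the question; it generalizes directly to any number of dimensions:

dimnames = ['z', 'y', 'x']
# tuple of coordinate arrays, enumerated in the same C order as A.flatten()
coords = np.unravel_index(np.arange(A.size), A.shape)
df = pd.DataFrame(A.flatten(), columns=['A'])
for name, coord in zip(dimnames, coords):
    df[name] = coord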