In a broad sense, I'm trying to calculate how much of the red path/trajectory falls in-between the black paths for many different trials (see plot below). I circled a couple examples, where for (0, 1, 3) approx 30-40% of the red path falls in-between the two black paths, but for (2, 1, 3) only about 1-2% of the red path is in-between the two black paths.
I have two dataframes, df_R & df_H.
df_R contains the position data for the red paths (in X & Z). Note that X & Z are both positional/spatial data. These data do not have a date/time-like index. Also note that this is 2D data from a Unity Application (for some reason X & Z are the equivalent of Y & X). All paths/trajectories are 1000 points in length in both dataframes.
Preview of df_R:
(0, 1, 1)_mean_X (0, 1, 1)_mean_Z ... (2, 2, 3)_mean_X (2, 2, 3)_mean_Z
0 -15.856713 5.002617 ... -15.600160 -5.010470
1 -15.831320 5.003529 ... -15.566172 -5.012251
2 -15.805927 5.004441 ... -15.532184 -5.014032
3 -15.780534 5.005353 ... -15.498196 -5.015814
4 -15.755141 5.006265 ... -15.464208 -5.017595
.. ... ... ... ... ...
95 -12.818362 5.429729 ... -12.391177 -5.391595
96 -12.783905 5.437335 ... -12.357563 -5.396919
97 -12.749456 5.444990 ... -12.323950 -5.402243
98 -12.715017 5.452697 ... -12.290336 -5.407567
99 -12.680594 5.460469 ... -12.256722 -5.412891
df_H contains the position data for the black paths, which includes a 'top' and 'bottom' column for X and for Z, corresponding to the two black paths in each plot.
Preview of df_H:
(0, 1, 1)_top_X (0, 1, 1)_bottom_X ... (2, 2, 3)_top_Z (2, 2, 3)_bottom_Z
0 -16.000000 -16.000000 ... -5.000000 -5.000000
1 -16.000000 -16.000000 ... -5.000000 -5.000000
2 -16.000000 -16.000000 ... -5.000000 -5.000000
3 -16.000000 -16.000000 ... -5.000000 -5.000000
4 -16.000000 -16.000000 ... -5.000000 -5.000000
.. ... ... ... ... ...
95 -15.000971 -15.417215 ... -4.993461 -5.011372
96 -14.979947 -15.402014 ... -4.993399 -5.013007
97 -14.957949 -15.385840 ... -4.993291 -5.014463
98 -14.934171 -15.368649 ... -4.993186 -5.015692
99 -14.908484 -15.349371 ... -4.993069 -5.016940
I need to see whether the X/Z value in df_R falls within the range of the X/Z values in df_H. However, the solution needs to NOT rely on time or index (or row-wise comparisons).
I have been trying to implement this for a while but am stuck. This is what I've been trying but it's not working because it relies on time.
import pandas as pd
import numpy as np
def CI_analysis(df_H, df_R):
    """Flag, point by point, whether each red ("mean") trajectory lies strictly
    between its matching top and bottom boundary trajectories.

    Parameters
    ----------
    df_H : pd.DataFrame
        Boundary data; for every condition four columns named
        ``<cond>_top_X``, ``<cond>_bottom_X``, ``<cond>_top_Z``, ``<cond>_bottom_Z``.
    df_R : pd.DataFrame
        Red-path data; for every condition two columns named
        ``<cond>_mean_X`` and ``<cond>_mean_Z``.

    Returns
    -------
    tuple of pd.DataFrame
        ``(CI_inside_X, CI_inside_Z, CI_inside, CI_inside_avg)`` — the first
        three hold 0/1 flags per point, the last is a one-row frame with the
        per-condition fraction of points inside on both axes.

    Note
    ----
    Like the original loops, this compares row i of df_R against row i of
    df_H (a positional comparison) — columns are matched by position after
    filtering, so both frames must list conditions in the same order.
    """
    # Separate the boundary columns by axis.
    df_H_top_X = df_H.filter(regex='top_X')
    df_H_bottom_X = df_H.filter(regex='bottom_X')
    df_H_top_Z = df_H.filter(regex='top_Z')
    df_H_bottom_Z = df_H.filter(regex='bottom_Z')
    # BUG FIX: the original referenced an undefined name `CI_raycast` here;
    # the function parameter is `df_R`.
    df_R_X = df_R.filter(regex='mean_X')
    df_R_Z = df_R.filter(regex='mean_Z')

    # Vectorised replacement for the original element-by-element loops:
    # compare the underlying arrays positionally (same row, same group column),
    # producing the same 0/1 integer flags.
    CI_inside_X = pd.DataFrame(
        ((df_R_X.to_numpy() < df_H_top_X.to_numpy())
         & (df_R_X.to_numpy() > df_H_bottom_X.to_numpy())).astype(int),
        columns=df_R_X.columns,
    )
    CI_inside_Z = pd.DataFrame(
        ((df_R_Z.to_numpy() < df_H_top_Z.to_numpy())
         & (df_R_Z.to_numpy() > df_H_bottom_Z.to_numpy())).astype(int),
        columns=df_R_Z.columns,
    )

    # A point counts only when it is inside on both axes.  Column labels come
    # from the X flags, matching the original behaviour.
    CI_inside = pd.DataFrame(
        CI_inside_X.to_numpy() & CI_inside_Z.to_numpy(),
        columns=CI_inside_X.columns,
    )

    # Per-condition fraction of points inside, as a single-row dataframe.
    CI_inside_avg = pd.DataFrame(CI_inside.mean(axis=0)).transpose()
    return CI_inside_X, CI_inside_Z, CI_inside, CI_inside_avg
Lastly, here is code to reproduce the two dataframes df_R & df_H (with random numbers):
# Column labels: one (a, b, c) condition tuple per trial, with a in 0-2,
# b in 1-2, c in 1-3 (18 conditions total).
_triples = [f'({a}, {b}, {c})'
            for a in range(3) for b in range(1, 3) for c in range(1, 4)]

# df_R carries two columns per condition: the mean X and mean Z paths.
df_R_cols = [f'{t}_mean_{axis}' for t in _triples for axis in ('X', 'Z')]

# df_H carries four columns per condition, ordered
# top_X, bottom_X, top_Z, bottom_Z.
df_H_cols = [
    f'{t}_{pos}_{axis}'
    for t in _triples
    for axis in ('X', 'Z')
    for pos in ('top', 'bottom')
]

# Dummy data standing in for the real trajectories (random integers).
df_R = pd.DataFrame(np.random.randint(0, 100, size=(1000, 36)), columns=df_R_cols)
df_H = pd.DataFrame(np.random.randint(0, 100, size=(1000, 72)), columns=df_H_cols)
Just an idea
If I understand the discussion right, the problem is that the data was sampled at different points, so you can't simply compare the values row by row. And sometimes the bottom line is switched with the top line.
My idea would be to interpolate the black trajectories at the same x-values as the red trajectory. My answer concentrates on that idea. I borrowed some code from the previous answers for iterating over the data sets.
# Approach: interpolate the black (top/bottom) trajectories at the X positions
# of the red trajectory, so the comparison does not depend on time/index.
# NOTE(review): this snippet relies on `import pandas as pd`, `import numpy as np`
# and `from scipy import interpolate` being in scope, and on the two pickle
# files existing next to the script.
df_H = pd.read_pickle('df_H.pickle')
df_R = pd.read_pickle('df_R.pickle')

# Column groups: 4 boundary columns per condition in df_H, 2 mean columns in df_R.
dfh_groups = [df_H.columns[x:x + 4] for x in range(0, len(df_H.columns), 4)]
dfr_groups = [df_R.columns[x:x + 2] for x in range(0, len(df_R.columns), 2)]

# One "percentage inside" value per condition label, e.g. '(0, 1, 3)'.
df_result = pd.DataFrame(columns=['Percentage'])

for i in range(len(dfr_groups)):
    # Condition label shared by the red and boundary columns.
    label = dfr_groups[i][0].split('_')[0]
    X_R = df_R[dfr_groups[i][0]].to_numpy()
    Y_R = df_R[dfr_groups[i][1]].to_numpy()
    # NOTE(review): this indexing assumes the 4 boundary columns are ordered
    # (top_X, top_Y, bottom_X, bottom_Y), but the preview earlier in the post
    # shows (top_X, bottom_X, top_Z, bottom_Z) — verify against the pickle.
    X_H_Top = df_H[dfh_groups[i][0]].to_numpy()
    Y_H_Top = df_H[dfh_groups[i][1]].to_numpy()
    X_H_Bottom = df_H[dfh_groups[i][2]].to_numpy()
    Y_H_Bottom = df_H[dfh_groups[i][3]].to_numpy()
    # Interpolate df_H to match the data points from df_R
    bottom = interpolate.interp1d(X_H_Bottom,Y_H_Bottom)
    top = interpolate.interp1d(X_H_Top,Y_H_Top)
    # Respect the interpolation boundaries, so drop every row not in range from
    # X_H_(Bottom/Top) — interp1d raises for queries outside its sampled X range.
    X_H_Bottom = X_R[(X_R > np.amin(X_H_Bottom)) & (X_R < np.amax(X_H_Bottom))]
    X_H_Top = X_R[(X_R > np.amin(X_H_Top)) & (X_R < np.amax(X_H_Top))]
    # X positions of the red path that lie inside BOTH boundary ranges.
    # NOTE(review): np.intersect1d returns a *sorted* array, so the equality
    # walk below assumes X_R is monotonically increasing — confirm for real data.
    minimal_X = np.intersect1d(X_H_Bottom, X_H_Top)
    # Calculate the new values at the data points from df_R
    Y_I_Bottom = bottom(minimal_X)
    Y_I_Top = top(minimal_X)
    # Plot (disabled; requires matplotlib.pyplot as plt)
    '''
    plt.plot(X_R, Y_R,'r-',minimal_X, Y_I_Bottom,'k-', minimal_X, Y_I_Top,'k-')
    plt.show()
    '''
    # Count datapoints of df_R within bottom and top
    # NOTE(review): the inner loop variable shadows the outer `i`; harmless
    # with Python's for-loop semantics here, but worth renaming.
    minimal_x_idx = 0
    nr_points_within = 0
    for i in range(0,len(X_R)):
        if minimal_x_idx >= len(minimal_X):
            break
        elif X_R[i] != minimal_X[minimal_x_idx]:
            continue
        else:
            # Check if datapoint is within the band, even if the bottom and
            # top trajectories cross (either ordering is accepted).
            if (Y_R[i] > np.amin(Y_I_Bottom[minimal_x_idx]) and Y_R[i] < np.amax(Y_I_Top[minimal_x_idx]))\
            or (Y_R[i] < np.amin(Y_I_Bottom[minimal_x_idx]) and Y_R[i] > np.amax(Y_I_Top[minimal_x_idx])):
                nr_points_within += 1
            minimal_x_idx += 1
    # Depends on definition whether points outside of the interpolation range
    # should be counted as outside or be dropped.
    percent_within = (nr_points_within * 100) / len(minimal_X)
    df_result.loc[label] = [percent_within]

print(df_result)
I think, and I really hope, there are more elegant ways to implement this — especially the for-loop at the end.
I tested it a few times and it worked quite well, at least at first glance. For your marked examples I got 71.8% (0, 1, 3) and 0.8% (2, 1, 3) falling within.
I just compared each row after the interpolation. But at this point you could go a step further. For example, you could take the spline interpolation coefficients and calculate the intersections of the trajectories. That way you could compute either the percentage of the projection onto the x-axis, or the percentage of the trajectory's length that falls within — maybe with a nice error estimate. I hope that helped a little bit.
A little more detailed explanation, based on a comment
First, I renamed your Z axis to Y in my variables and in this explanation; I hope that is not too confusing. With the SciPy function interp1d I do a spline interpolation of the bottom/top trajectories. Basically, that means I model two mathematical functions based on the given X/Y values of the bottom and top trajectories. These functions return continuous output, one for bottom and one for top: for every X value I get the corresponding Y value of the trajectory, even for X values that do not appear in the data. That is done by so-called spline interpolation: between every pair of X/Y values in the data a line is calculated (m * x + t). You can also pass the keyword 'cubic', in which case a cubic (third-degree) polynomial is calculated instead. With this model I can then look up which values the bottom and top trajectories take at the X values given by the red trajectory.
But this method has its limits; that is why I need to drop some values. The interpolation is only defined between the min and max of the X values given by the dataset. For example, if the red trajectory has a smaller minimal X value x1 than the bottom trajectory in the dataset, I cannot get the corresponding Y value for x1, because the interpolation of the bottom trajectory is not defined at x1. Therefore I limit myself to the range in which the interpolation is well defined for both bottom and top.
PS.: Here my output for the whole dataset:
Percentage
(0, 1, 1) 3.427419
(0, 1, 2) 76.488396
(0, 1, 3) 71.802618
(0, 2, 1) 6.889564
(0, 2, 2) 16.330645
(0, 2, 3) 59.233098
(1, 1, 1) 13.373860
(1, 1, 2) 45.262097
(1, 1, 3) 91.084093
(1, 2, 1) 0.505051
(1, 2, 2) 1.010101
(1, 2, 3) 41.253792
(2, 1, 1) 4.853387
(2, 1, 2) 12.916246
(2, 1, 3) 0.808081
(2, 2, 1) 0.101112
(2, 2, 2) 0.708502
(2, 2, 3) 88.810484
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# create a reproducible dataframe
np.random.seed(365)
# NOTE(review): df_R_cols and df_H_cols must already be defined (see the
# column-name lists earlier in the post).
df_R = pd.DataFrame(np.random.randint(0,100,size=(1000, 36)), columns=df_R_cols)
df_H = pd.DataFrame(np.random.randint(0,100,size=(1000, 72)), columns=df_H_cols)

# create groups of column names: 18 groups
# df_H groups are (top_X, bottom_X, top_Z, bottom_Z); df_R groups are (mean_X, mean_Z)
dfh_groups = [df_H.columns[x:x+4] for x in range(0, len(df_H.columns), 4)]
dfr_groups = [df_R.columns[x:x+2] for x in range(0, len(df_R.columns), 2)]

# create empty lists for pandas Series
x_series = list()
z_series = list()
both_series = list()

for i in range(len(dfr_groups)):
    # print the groups
    print(dfr_groups[i])
    print(dfh_groups[i])
    # extract the groups of column names
    rx, rz = dfr_groups[i]
    htx, hbx, htz, hbz = dfh_groups[i]
    # check if _mean is between _top & _bottom
    # (positional, row-by-row comparison: rows are aligned by index)
    x_between = (df_R.loc[:, rx] < df_H.loc[:, htx]) & (df_R.loc[:, rx] > df_H.loc[:, hbx])
    z_between = (df_R.loc[:, rz] < df_H.loc[:, htz]) & (df_R.loc[:, rz] > df_H.loc[:, hbz])
    # check if x & z meet the criteria
    both_between = x_between & z_between
    # name the pandas Series after the condition tuple, e.g. '(0, 1, 1)_x'
    name = rx.split('_')[0]
    x_between.rename(f'{name}_x', inplace=True)
    z_between.rename(f'{name}_z', inplace=True)
    both_between.rename(f'{name}_xz', inplace=True)
    # append Series to lists
    x_series.append(x_between)
    z_series.append(z_between)
    both_series.append(both_between)
    # the following section of the loop is only used for visualization;
    # it is not necessary, other than for the plots
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(16, 6))
    # spatial view: Z on the horizontal axis, X on the vertical axis
    ax1.plot(df_R.loc[:, rz], df_R.loc[:, rx], label='mid')
    ax1.plot(df_H.loc[:, htz], df_H.loc[:, htx], label='top')
    ax1.plot(df_H.loc[:, hbz], df_H.loc[:, hbx], label='bottom')
    ax1.set_title(f'{name}\nboth: {both_between.mean()}\nx: {x_between.mean()}\nz: {z_between.mean()}')
    ax1.set_xlabel('Z-val')
    ax1.set_ylabel('X-val')
    ax1.legend()
    # plot x, z, and mean with respect to the index
    ax2.plot(df_R.index, df_R.loc[:, rx], label='x_mean')
    ax2.plot(df_H.index, df_H.loc[:, htx], label='x_top')
    ax2.plot(df_H.index, df_H.loc[:, hbx], label='x_bot')
    ax2.plot(df_R.index, df_R.loc[:, rz], label='z_mean')
    ax2.plot(df_H.index, df_H.loc[:, htz], label='z_top')
    ax2.plot(df_H.index, df_H.loc[:, hbz], label='z_bot')
    ax2.set_title('top, bottom and mean plotted with the x-axis as the index')
    ax2.legend()
    plt.show()

# concat all the Series into dataframes and set the type to int (True -> 1, False -> 0)
df_x_between = pd.concat(x_series, axis=1).astype(int)
df_z_between = pd.concat(z_series, axis=1).astype(int)
df_both_between = pd.concat(both_series, axis=1).astype(int)

# calculate the mean: one-row dataframe with the fraction of points inside per condition
df_both_between.mean(axis=0).to_frame().T
The condition from the OP,
(val < df_H_top_X.iloc[i,c]) & (val > df_H_bottom_X.iloc[i,c])
is implemented above with x_between, which checks that mid is between top and bottom, as shown in the left plot.
If possible, get all your tuples into an iterable:
# Ranges for the three elements of each condition tuple.
a_rng = range(3)
b_rng = range(1, 3)
c_rng = range(1, 4)

# Every (a, b, c) condition as an actual tuple.
all_my_tuples = []
for a in a_rng:
    for b in b_rng:
        for c in c_rng:
            all_my_tuples.append((a, b, c))

# Rebuild the column labels from the tuples; str(tuple) yields e.g. '(0, 1, 1)'.
df_R_cols = [f"{t}_mean_{axis}" for t in all_my_tuples for axis in ("X", "Z")]
df_H_cols = [f"{t}_{pos}_{axis}"
             for t in all_my_tuples
             for axis in ("X", "Z")
             for pos in ("top", "bottom")]

# Side-by-side copy of both dataframes, joined on the shared positional index.
df_R_H = df_R.merge(df_H, left_index=True, right_index=True)
Use pandas.query() to create and execute dynamic query strings:
df_fin = pd.DataFrame(index = map(str, all_my_tuples), columns=["n_found",])
# Count, for every condition tuple, how many rows have the mean value strictly
# between bottom and top on BOTH axes, then store the count in df_fin.
for t in all_my_tuples:
    # Build one range check per axis.
    qry_ = []
    for xz in ["X", "Z"]:
        # BUG FIX: in query strings `&` binds tighter than `<`/`>` (Python
        # operator precedence), so each comparison must be parenthesised
        # individually; the original string parsed as a chained comparison
        # around a bitwise AND of the two middle columns.
        qry_.append(
            f"((`{t}_mean_{xz}` < `{t}_top_{xz}`) & (`{t}_mean_{xz}` > `{t}_bottom_{xz}`))"
        )
    # Join to create the full query and execute it into a filtered dataframe.
    qry = " & ".join(qry_)
    dft = df_R_H.query(qry)
    # DataFrame.query always returns a DataFrame (never None), so the original
    # `if not (dft) is None` branch was dead code; record the row count directly.
    df_fin.loc[f"{t}", "n_found"] = dft.shape[0]

# Fraction of the points (rows of df_R) that satisfied both range checks.
df_fin["n_mean"] = df_fin.loc[:, "n_found"].apply(lambda q: q / df_R.shape[0])
Output is something like the following:
n_found n_mean
(0, 1, 1) 27 0.027
(0, 1, 2) 34 0.034
(0, 1, 3) 25 0.025
(0, 2, 1) 23 0.023
(0, 2, 2) 31 0.031
(0, 2, 3) 29 0.029
(1, 1, 1) 22 0.022
(1, 1, 2) 23 0.023
(1, 1, 3) 22 0.022
(1, 2, 1) 21 0.021
(1, 2, 2) 22 0.022
(1, 2, 3) 27 0.027
(2, 1, 1) 29 0.029
(2, 1, 2) 35 0.035
(2, 1, 3) 25 0.025
(2, 2, 1) 29 0.029
(2, 2, 2) 23 0.023
(2, 2, 3) 32 0.032
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With