I've tried to apply the solution provided in this question to my real data: Selecting rows in a MultiIndexed dataframe. Somehow I cannot get the results it should give. I've attached both the dataframe to select from, as well as the result.
What I need;
Rows 3, 11 AND 12 should be returned (when you add the 4 columns consecutively, 12 should be selected as well. It isn't now).
df_test = pd.read_csv('df_test.csv')
def find_window(df):
v = df.values
s = np.vstack([np.zeros((1, v.shape[1])), v.cumsum(0)])
threshold = 0
r, c = np.triu_indices(s.shape[0], 1)
d = (c - r)[:, None]
e = s[c] - s[r]
mask = (e / d < threshold).all(1)
rng = np.arange(mask.shape[0])
if mask.any():
idx = rng[mask][d[mask].argmax()]
i0, i1 = r[idx], c[idx]
return pd.DataFrame(
v[i0:i1],
df.loc[df.name].index[i0:i1],
df.columns
)
cols = ['2012', '2013', '2014', '2015']
df_test.groupby(level=0)[cols].apply(find_window)
csv_file is here: https://docs.google.com/spreadsheets/d/19oOoBdAs3xRBWq6HReizlqrkWoQR2159nk8GWoR_4-g/edit?usp=sharing
EDIT: Correct dataframes added.
Note: Blue frame = rows which should be returned, yellow frames is consecutive column values which are < 0 (threshold).
I couldn't figure out the way to modify the original question you were linking to, since your solution looked like it should work. However, this is an iterative way to solve what you're looking for.
import pandas as pd
df_test = pd.read_csv('df_test.csv')
print(df_test.head())
"""
bins_DO L T2011 2011 T2012 2012 T2013 2013 T2014 2014 T2015 2015 Ttotal total
0 0 IR1 6 -6.06 13 -3.22 12 -1.60 7 14.64 12 -18.20 50 -14.44
1 1 IR1 14 -16.32 12 -6.74 14 -1.22 5 1.58 8 -0.42 53 -23.12
2 2 IR1 10 -9.14 10 -0.42 10 11.84 13 -5.74 7 -3.10 50 -6.56
3 3 IR1 9 -13.78 14 -3.14 10 -2.48 6 -0.02 5 -4.78 44 -24.20
4 4 IR1 6 0.54 9 -9.40 15 -11.20 7 0.68 9 12.04 46 -7.34
"""
cols = ['2012', '2013', '2014', '2015']
def process_df(df: pd.DataFrame, cols: list, threshold: float):
# initialize the benchmark
# this gets reset any time the newest row fails the threshold test
base_vals = [0 for _ in cols]
keep_col = []
for row in df[cols].values:
# by default, keep the row
keep_row = True
for x in range(len(cols)):
# if it fails on the row, then make keep row false
if row[x] + base_vals[x] > threshold:
keep_row = False
keep_col.append(keep_row)
if keep_row:
# if we were happy with those results, then keep adding the column values to the base_vals
for x in range(len(cols)):
base_vals[x] += row[x]
else:
# otherwise, reset the base vals
base_vals = [0 for _ in cols]
# only keep rows that we want
df = df.loc[keep_col, :]
return df
new_df = process_df(df = df_test, cols = cols, threshold = 0)
print(new_df)
"""
bins_DO L T2011 2011 T2012 2012 T2013 2013 T2014 2014 T2015 2015 Ttotal total
3 3 IR1 9 -13.78 14 -3.14 10 -2.48 6 -0.02 5 -4.78 44 -24.20
11 11 IR1 7 7.10 10 -10.04 7 -10.60 17 -5.56 11 -8.44 52 -27.54
12 12 IR1 10 -0.28 7 -7.30 8 5.96 8 -12.58 10 -6.86 43 -21.06
"""
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With