In the example below, I wish to return, for each row, the most recent previous index at which the "lower" value was >= the current row's "upper" value. I am able to do this with the expected results, but it is not truly vectorized and is very inefficient for larger dataframes.
import pandas as pd

# Sample frame: one row per day with a 'lower' and an 'upper' column.
sample = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
          'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
df = pd.DataFrame(data=sample)
df['DATE'] = pd.to_datetime(pd.date_range('2020-01-01', periods=len(sample['lower'])))
df.set_index('DATE', inplace=True)


def get_most_recent_index(row):
    """Latest earlier index whose 'lower' >= this row's 'upper' (NaT if none)."""
    # Slice strictly before the current timestamp (one minute back excludes it).
    earlier = df.loc[:row.name - pd.Timedelta(minutes=1)]
    return earlier[earlier['lower'] >= row['upper']].index.max()


df['prev'] = df.apply(get_most_recent_index, axis=1)
print(df)
How would I rewrite this to be most efficient?
EDIT:
Firstly, thank you to you all for your responses.
On the topic of performance between the four viable solutions, the clear winner is bisect, proposed by Andrej Kesely. I have excluded pyjanitor because, with any data volume approaching my set, we quickly run into memory allocation errors.
baseline: 1min 35s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
bisect: 1.76 s ± 82.5 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
enumerate: 1min 13s ± 2.17 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
import pandas as pd
import numpy as np
from bisect import bisect_left
import janitor
def get_sample_df(rows=100_000):
    """Build a reproducible sample frame: integer 'lower'/'upper' columns
    indexed by a minute-frequency DatetimeIndex named 'DATE'."""
    # Seeded generators keep every benchmark run on identical data.
    frame = pd.DataFrame({
        'lower': np.random.default_rng(seed=1).uniform(1, 100, rows),
        'upper': np.random.default_rng(seed=2).uniform(1, 100, rows),
    }).astype(int)
    frame['DATE'] = pd.to_datetime(pd.date_range('2020-01-01', periods=rows, freq="min"))
    frame.set_index('DATE', inplace=True)
    return frame
def get_baseline():
    """Reference implementation: per-row apply over the frame (slow for large data)."""
    frame = get_sample_df()

    def most_recent(row):
        # Rows strictly before the current one; the index is minute-frequency,
        # so stepping one minute back excludes the current row.
        earlier = frame.loc[:row.name - pd.Timedelta(minutes=1)]
        return earlier[earlier['lower'] >= row['upper']].index.max()

    frame['prev'] = frame.apply(most_recent, axis=1)
    return frame
def get_pyjanitor():
    """Non-equi-join approach using the third-party pyjanitor package.

    NOTE(review): requires `import janitor` (pyjanitor); the join materializes
    every qualifying pair, which is why it hits memory-allocation errors on
    large inputs.
    """
    df = get_sample_df()
    df.reset_index(inplace=True)
    # set the DATE column as an index
    # after the operation you can set the original DATE
    # column as an index
    left_df = df.assign(index_prev=df.index)
    right_df = df.assign(index_next=df.index)
    # Join every earlier row (index_prev < index_next) whose 'lower' is >=
    # the current row's 'upper'.
    out=(left_df
        .conditional_join(
            right_df,
            ('lower','upper','>='),
            ('index_prev','index_next','<'),
            df_columns='index_prev',
            right_columns=['index_next','lower','upper'])
        )
    # based on the matches, we may have multiple returns
    # what we need is the closest to the current row
    closest=out.index_next-out.index_prev
    grouper=[out.index_next, out.lower,out.upper]
    min_closest=closest.groupby(grouper).transform('min')
    closest=closest==min_closest
    # we have out matches, which is defined by `index_prev`
    # use index_prev to get the relevant DATE
    prev=out.loc[closest,'index_prev']
    prev=df.loc[prev,'DATE'].array # avoid index alignment here
    index_next=out.loc[closest,'index_next']
    # now assign back to df, based on index_next and prev
    prev=pd.Series(prev,index=index_next)
    df = df.assign(prev=prev)
    return df
def get_bisect():
    """Binary-search approach: for each row, locate previously seen 'lower'
    values that are >= the row's 'upper' and take the latest date among them."""
    frame = get_sample_df()

    def prev_dates(lowers, uppers, dates):
        sorted_uniq = sorted(set(lowers))
        latest_date_for = {}
        for low, up, when in zip(lowers, uppers, dates):
            # Every candidate 'lower' value >= up starts at this position.
            start = bisect_left(sorted_uniq, up)
            best = None
            for candidate in sorted_uniq[start:]:
                seen = latest_date_for.get(candidate)
                if seen is not None and (best is None or seen > best):
                    best = seen
            yield best
            # Record this row only after yielding: rows see strictly earlier data.
            latest_date_for[low] = when

    frame["prev"] = list(prev_dates(frame["lower"], frame["upper"], frame.index))
    return frame
def get_enumerate():
    """Plain-Python approach: iterate lists extracted from the frame."""
    frame = get_sample_df()
    frame.reset_index(inplace=True)
    dates = frame["DATE"].values.tolist()
    lowers = frame["lower"].values.tolist()
    uppers = frame["upper"].values.tolist()
    result = []
    for pos, (_, up) in enumerate(zip(lowers, uppers)):
        # First row, or no earlier 'lower' qualifies at all.
        if pos == 0 or not any(v >= up for v in lowers[:pos]):
            result.append(None)
            continue
        # Scan backwards for the most recent qualifying row.
        for low, when in zip(reversed(lowers[:pos]), reversed(dates[:pos])):
            if low >= up:
                result.append(when)
                break
    frame['prev'] = result
    frame['prev'] = pd.to_datetime(frame['prev'])
    return frame
print("baseline:")
%timeit -n 2 -r 2 get_baseline()
# Unable to allocate 37.2 GiB for an array with shape (4994299505,) and data type int64
# print("pyjanitor:")
# %timeit -n 2 get_pyjanitor()
print("bisect:")
%timeit -n 2 -r 2 get_bisect()
print("enumerate:")
%timeit -n 2 -r 2 get_enumerate()
I'm not sure if this can be vectorized (because you have variables that are dependent on past state). But you can try to speed up the computation using binary search, e.g.:
from bisect import bisect_left
def get_prev(lower, upper, _date):
    """For each (lower, upper, date) row, yield the latest earlier date whose
    'lower' value was >= the current row's 'upper' (None when none exists)."""
    ordered_values = sorted(set(lower))
    latest = {}
    for low, up, when in zip(lower, upper, _date):
        # First position in the sorted uniques whose value is >= up.
        start = bisect_left(ordered_values, up)
        best = None
        for value in ordered_values[start:]:
            if value in latest and (best is None or latest[value] > best):
                best = latest[value]
        yield best
        # Record after yielding so each row only sees strictly earlier rows.
        latest[low] = when
# Apply the generator to the sample frame; 'prev_new' should match the
# baseline 'prev' column computed earlier.
df["prev_new"] = list(get_prev(df["lower"], df["upper"], df.index))
print(df)
Prints:
lower upper prev prev_new
DATE
2020-01-01 7 2 NaT NaT
2020-01-02 1 3 2020-01-01 2020-01-01
2020-01-03 6 4 2020-01-01 2020-01-01
2020-01-04 1 5 2020-01-03 2020-01-03
2020-01-05 1 6 2020-01-03 2020-01-03
2020-01-06 1 7 2020-01-01 2020-01-01
2020-01-07 1 8 NaT NaT
2020-01-08 11 9 NaT NaT
2020-01-09 1 10 2020-01-08 2020-01-08
2020-01-10 1 11 2020-01-08 2020-01-08
In my understanding, it is faster to loop through Python objects such as lists and dictionaries than through pandas DataFrame rows (I could be wrong). Therefore, below is what I have tried; it works for your input df:
# Pull the columns out as plain Python lists once; looping over lists is
# much cheaper than iterating DataFrame rows.
# NOTE: assumes 'DATE' is a regular column (reset_index first if it is the index).
date_list = df["DATE"].values.tolist()
lower_list = df["lower"].values.tolist()
upper_list = df["upper"].values.tolist()
new_list = []
for pos, (_, up) in enumerate(zip(lower_list, upper_list)):
    # First row, or no earlier 'lower' qualifies at all.
    if pos == 0 or not any(v >= up for v in lower_list[:pos]):
        new_list.append(None)
        continue
    # Walk backwards to find the most recent qualifying row.
    for low, when in zip(reversed(lower_list[:pos]), reversed(date_list[:pos])):
        if low >= up:
            new_list.append(when)
            break
df['prev'] = new_list
df['prev'] = pd.to_datetime(df['prev'])
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With