In the example below, I wish to return, for each row, the most recent previous index at which the "lower" value was >= the current row's "upper" value. I am able to do this with the expected results, but it is not truly vectorized and is very inefficient for larger dataframes.
import pandas as pd

# Sample frame: one row per day with a 'lower' and an 'upper' column.
sample = {'lower': [7, 1, 6, 1, 1, 1, 1, 11, 1, 1],
          'upper': [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]}
df = pd.DataFrame(data=sample)
df['DATE'] = pd.to_datetime(pd.date_range('2020-01-01', periods=len(sample['lower'])))
df.set_index('DATE', inplace=True)


def get_most_recent_index(row):
    """Latest earlier index whose 'lower' >= this row's 'upper' (NaT if none)."""
    # Slice strictly before the current timestamp (one minute back excludes it).
    earlier = df.loc[:row.name - pd.Timedelta(minutes=1)]
    return earlier[earlier['lower'] >= row['upper']].index.max()


df['prev'] = df.apply(get_most_recent_index, axis=1)
print(df)
How would I rewrite this to be most efficient?
EDIT:
Firstly, thank you to you all for your responses.
On the topic of performance between the four viable solutions, the clear winner is bisect, proposed by Andrej Kesely. I have excluded pyjanitor because, with any data volume approaching my set, we quickly run into memory allocation errors.
baseline: 1min 35s ± 5.15 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
bisect: 1.76 s ± 82.5 ms per loop (mean ± std. dev. of 2 runs, 2 loops each)
enumerate: 1min 13s ± 2.17 s per loop (mean ± std. dev. of 2 runs, 2 loops each)
import pandas as pd
import numpy as np
from bisect import bisect_left
import janitor
def get_sample_df(rows=100_000):
    """Build a reproducible sample frame: integer 'lower'/'upper' columns
    indexed by a minute-frequency DatetimeIndex named 'DATE'."""
    # Seeded generators keep every benchmark run on identical data.
    frame = pd.DataFrame({
        'lower': np.random.default_rng(seed=1).uniform(1, 100, rows),
        'upper': np.random.default_rng(seed=2).uniform(1, 100, rows),
    }).astype(int)
    frame['DATE'] = pd.to_datetime(pd.date_range('2020-01-01', periods=rows, freq="min"))
    frame.set_index('DATE', inplace=True)
    return frame
def get_baseline():
    """Reference implementation: per-row apply over the frame (slow for large data)."""
    frame = get_sample_df()

    def most_recent(row):
        # Rows strictly before the current one; the index is minute-frequency,
        # so stepping one minute back excludes the current row.
        earlier = frame.loc[:row.name - pd.Timedelta(minutes=1)]
        return earlier[earlier['lower'] >= row['upper']].index.max()

    frame['prev'] = frame.apply(most_recent, axis=1)
    return frame
def get_pyjanitor():
    """Non-equi-join approach using the third-party pyjanitor package.

    NOTE(review): requires `import janitor` (pyjanitor); the join materializes
    every qualifying pair, which is why it hits memory-allocation errors on
    large inputs.
    """
    df = get_sample_df()
    df.reset_index(inplace=True)
    # set the DATE column as an index
    # after the operation you can set the original DATE
    # column as an index
    left_df = df.assign(index_prev=df.index)
    right_df = df.assign(index_next=df.index)
    # Join every earlier row (index_prev < index_next) whose 'lower' is >=
    # the current row's 'upper'.
    out=(left_df
        .conditional_join(
            right_df,
            ('lower','upper','>='),
            ('index_prev','index_next','<'),
            df_columns='index_prev',
            right_columns=['index_next','lower','upper'])
        )
    # based on the matches, we may have multiple returns
    # what we need is the closest to the current row
    closest=out.index_next-out.index_prev
    grouper=[out.index_next, out.lower,out.upper]
    min_closest=closest.groupby(grouper).transform('min')
    closest=closest==min_closest
    # we have out matches, which is defined by `index_prev`
    # use index_prev to get the relevant DATE
    prev=out.loc[closest,'index_prev']
    prev=df.loc[prev,'DATE'].array # avoid index alignment here
    index_next=out.loc[closest,'index_next']
    # now assign back to df, based on index_next and prev
    prev=pd.Series(prev,index=index_next)
    df = df.assign(prev=prev)
    return df
def get_bisect():
    """Binary-search approach: for each row, locate previously seen 'lower'
    values that are >= the row's 'upper' and take the latest date among them."""
    frame = get_sample_df()

    def prev_dates(lowers, uppers, dates):
        sorted_uniq = sorted(set(lowers))
        latest_date_for = {}
        for low, up, when in zip(lowers, uppers, dates):
            # Every candidate 'lower' value >= up starts at this position.
            start = bisect_left(sorted_uniq, up)
            best = None
            for candidate in sorted_uniq[start:]:
                seen = latest_date_for.get(candidate)
                if seen is not None and (best is None or seen > best):
                    best = seen
            yield best
            # Record this row only after yielding: rows see strictly earlier data.
            latest_date_for[low] = when

    frame["prev"] = list(prev_dates(frame["lower"], frame["upper"], frame.index))
    return frame
def get_enumerate():
    """Plain-Python approach: iterate lists extracted from the frame."""
    frame = get_sample_df()
    frame.reset_index(inplace=True)
    dates = frame["DATE"].values.tolist()
    lowers = frame["lower"].values.tolist()
    uppers = frame["upper"].values.tolist()
    result = []
    for pos, (_, up) in enumerate(zip(lowers, uppers)):
        # First row, or no earlier 'lower' qualifies at all.
        if pos == 0 or not any(v >= up for v in lowers[:pos]):
            result.append(None)
            continue
        # Scan backwards for the most recent qualifying row.
        for low, when in zip(reversed(lowers[:pos]), reversed(dates[:pos])):
            if low >= up:
                result.append(when)
                break
    frame['prev'] = result
    frame['prev'] = pd.to_datetime(frame['prev'])
    return frame
print("baseline:")
%timeit -n 2 -r 2 get_baseline()
# Unable to allocate 37.2 GiB for an array with shape (4994299505,) and data type int64
# print("pyjanitor:")
# %timeit -n 2 get_pyjanitor()
print("bisect:")
%timeit -n 2 -r 2 get_bisect()
print("enumerate:")
%timeit -n 2 -r 2 get_enumerate()
I'm not sure if this can be vectorized (because you have variables that are dependent on past state). But you can try to speed up the computation using binary search, e.g.:
from bisect import bisect_left
def get_prev(lower, upper, _date):
    """For each (lower, upper, date) row, yield the latest earlier date whose
    'lower' value was >= the current row's 'upper' (None when none exists)."""
    ordered_values = sorted(set(lower))
    latest = {}
    for low, up, when in zip(lower, upper, _date):
        # First position in the sorted uniques whose value is >= up.
        start = bisect_left(ordered_values, up)
        best = None
        for value in ordered_values[start:]:
            if value in latest and (best is None or latest[value] > best):
                best = latest[value]
        yield best
        # Record after yielding so each row only sees strictly earlier rows.
        latest[low] = when
# Apply the generator to the sample frame; 'prev_new' should match the
# baseline 'prev' column computed earlier.
df["prev_new"] = list(get_prev(df["lower"], df["upper"], df.index))
print(df)
Prints:
lower upper prev prev_new
DATE
2020-01-01 7 2 NaT NaT
2020-01-02 1 3 2020-01-01 2020-01-01
2020-01-03 6 4 2020-01-01 2020-01-01
2020-01-04 1 5 2020-01-03 2020-01-03
2020-01-05 1 6 2020-01-03 2020-01-03
2020-01-06 1 7 2020-01-01 2020-01-01
2020-01-07 1 8 NaT NaT
2020-01-08 11 9 NaT NaT
2020-01-09 1 10 2020-01-08 2020-01-08
2020-01-10 1 11 2020-01-08 2020-01-08
In my understanding, it is faster to loop through Python objects such as lists and dictionaries than through pandas DataFrame rows (I could be wrong). Therefore, below is what I have tried; it works for your input df:
# Pull the columns out as plain Python lists once; looping over lists is
# much cheaper than iterating DataFrame rows.
# NOTE: assumes 'DATE' is a regular column (reset_index first if it is the index).
date_list = df["DATE"].values.tolist()
lower_list = df["lower"].values.tolist()
upper_list = df["upper"].values.tolist()
new_list = []
for pos, (_, up) in enumerate(zip(lower_list, upper_list)):
    # First row, or no earlier 'lower' qualifies at all.
    if pos == 0 or not any(v >= up for v in lower_list[:pos]):
        new_list.append(None)
        continue
    # Walk backwards to find the most recent qualifying row.
    for low, when in zip(reversed(lower_list[:pos]), reversed(date_list[:pos])):
        if low >= up:
            new_list.append(when)
            break
df['prev'] = new_list
df['prev'] = pd.to_datetime(df['prev'])
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With