Ways to speed up rolling weighted mean on a pandas dataframe

Question

I have a large DataFrame, on which I need to calculate the rolling row-wise weighted average.

I know I can do the following:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(20000, 50))
weights = [1/9, 2/9, 1/3, 2/9, 1/9]  
rolling_mean = df.rolling(5, axis=1).apply(lambda seq: np.average(seq, weights=weights))

The issue is that this takes around 40 seconds on my PC. Is there any way to speed up this computation?

Panda Kim · Accepted Answer

Code

Creating a new dataframe of multiplying df by weights[0], then shifting df by one and multiplying by weights[1], then shifting df by two and multiplying by weights[2], and repeating this process, then adding all of the created dataframes together, will speed up the process.

sum([df.shift(num, axis=1) * w for num, w in enumerate(weights)])

it take 0.05986 sec

Andrej Kesely · Answer

Quick benchmark using the above answers:

from timeit import timeit

import numba as nb
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.rand(20000, 50))

weights = [1 / 9, 2 / 9, 1 / 3, 2 / 9, 1 / 9]


def rolling_normal(df):
    return df.rolling(5, axis=1).apply(lambda seq: np.average(seq, weights=weights))


def rolling_numba(df):
    return df.rolling(5, axis=1).apply(weighted_mean, engine="numba", raw=True)


def weighted_mean(seq):
    weights = [1 / 9, 2 / 9, 1 / 3, 2 / 9, 1 / 9]
    return np.average(seq, weights=weights)


def rollin_panda_kim(df):
    return sum(df.shift(num, axis=1) * w for num, w in enumerate(weights))


def rolling_safffh(df):
    @nb.njit
    def weighted_average(arr, weights):
        n = len(arr)
        result = np.empty(n - 4)  # The size of the resulting rolling window
        for i in range(n - 4):
            result[i] = np.average(arr[i : i + 5], weights=weights)
        return result

    # Apply the weighted_average function to the DataFrame
    return pd.DataFrame(
        np.apply_along_axis(weighted_average, axis=1, arr=df.values, weights=weights),
        columns=df.columns[4:],  # Adjust the columns to match the rolling window size
    )


# warm-up numba
rolling_numba(df)
rolling_safffh(df)


t1 = timeit("rolling_normal(x)", setup="x=df.copy()", number=1, globals=globals())
t2 = timeit("rolling_numba(x)", setup="x=df.copy()", number=1, globals=globals())
t3 = timeit("rollin_panda_kim(x)", setup="x=df.copy()", number=1, globals=globals())
t4 = timeit("rolling_safffh(x)", setup="x=df.copy()", number=1, globals=globals())

print(t1)
print(t2)
print(t3)
print(t4)

Prints on my machine (AMD 5700X/Python 3.11):

21.627028748000157
0.3747533499990823
0.008139017998473719
0.40484421200017096

@PandaKim solution is fastest.

Ways to speed up rolling weighted mean on a pandas dataframe

Tags:

python

pandas

numpy

younggotti

2 Answers

Panda Kim

Andrej Kesely

Recent Activity

Donate For Us

Ways to speed up rolling weighted mean on a pandas dataframe

Tags:

python

pandas

numpy

younggotti

2 Answers

Panda Kim

Andrej Kesely

Related questions

Recent Activity

Donate For Us