I have a few functions that are using pandas.DataFrame.update method, and I'm trying to move into using Dask instead for the datasets, but the Dask Pandas API doesn't have the update method implemented. Is there an alternative way to get the same result in Dask?
Here are the methods I have using update:
df.update(df.filter(like='/').mask(lambda x: x == 0).ffill(1))
input
id .. .. ..(some cols) 1/1/20 1/2/20 1/3/20 1/4/20 1/5/20 1/6/20 ....
1 10 20 0 40 0 50
2 10 30 30 0 0 50
.
.
output
id .. .. ..(some cols) 1/1/20 1/2/20 1/3/20 1/4/20 1/5/20 1/6/20 ....
1 10 20 20 40 40 50
2 10 30 30 30 30 50
.
.
def replace_names(df1, df2, idxCol = 'id', srcCol = 'name', dstCol = 'name'):
df1 = df1.set_index(idxCol)
df1[dstCol].update(df2.set_index(idxCol)[srcCol])
return df1.reset_index()
df_new = replace_names(df1, df2)
input
df1
id name ...
123 city a
456 city b
789 city c
789 city c
456 city b
123 city a
.
.
.
df2
id name ...
123 City A
456 City B
789 City C
.
.
.
output
id name ...
123 City A
456 City B
789 City C
789 City C
456 City B
123 City A
.
.
.
There is a way to partially solve this. I'm assuming that df2 is much smaller than df1 and it actually fit in memory so we can read as pandas dataframe. If this is the case the following function work if df1 is a pandas or a dask dataframe but df2should be a pandas one.
import pandas as pd
import dask.dataframe as dd
def replace_names(df1, # can be pandas or dask dataframe
df2, # this should be pandas.
idxCol='id',
srcCol='name',
dstCol='name'):
diz = df2[[idxCol, srcCol]].set_index(idxCol).to_dict()[srcCol]
out = df1.copy()
out[dstCol] = out[idxCol].map(diz)
return out
Regarding the first problem the following code works in pandas and dask
df = pd.DataFrame({'a': {0: 1, 1: 2},
'b': {0: 3, 1: 4},
'1/1/20': {0: 10, 1: 10},
'1/2/20': {0: 20, 1: 30},
'1/3/20': {0: 0, 1: 30},
'1/4/20': {0: 40, 1: 0},
'1/5/20': {0: 0, 1: 0},
'1/6/20': {0: 50, 1: 50}})
# if you want to try with dask
# df = dd.from_pandas(df, npartitions=2)
cols = [col for col in df.columns if "/" in col]
df[cols] = df[cols].mask(lambda x: x==0).ffill(1) #.astype(int)
Remove the comment in the last line if you want the output to be integer.
UPDATE Question 2
In case you want a dask only solution you could try the following.
import numpy as np
import pandas as pd
import dask.dataframe as dd
df1 = pd.DataFrame({'id': {0: 123, 1: 456, 2: 789, 3: 789, 4: 456, 5: 123},
'name': {0: 'city a',
1: 'city b',
2: 'city c',
3: 'city c',
4: 'city b',
5: 'city a'}})
df2 = pd.DataFrame({'id': {0: 123, 1: 456, 2: 789},
'name': {0: 'City A', 1: 'City B', 2: 'City C'}})
df1 = dd.from_pandas(df1, npartitions=2)
df2 = dd.from_pandas(df2, npartitions=2)
In this case if one id is present in df1 but not in df2 you keep the name in df1.
def replace_names_dask(df1, df2,
idxCol='id',
srcCol='name',
dstCol='name'):
if srcCol == dstCol:
df2 = df2.rename(columns={srcCol:f"{srcCol}_new"})
srcCol = f"{srcCol}_new"
def map_replace(x, srcCol, dstCol):
x[dstCol] = np.where(x[srcCol].notnull(),
x[srcCol],
x[dstCol])
return x
df = dd.merge(df1, df2, on=idxCol, how="left")
df = df.map_partitions(lambda x: map_replace(x, srcCol, dstCol))
df = df.drop(srcCol, axis=1)
return df
df = replace_names_dask(df1, df2)
In this case if one id is present in df1 but not in df2 then name in the output df will be NaN (as in a standard left join)
def replace_names_dask(df1, df2,
idxCol='id',
srcCol='name',
dstCol='name'):
df1 = df1.drop(dstCol, axis=1)
df2 = df2.rename(columns={srcCol: dstCol})
df = dd.merge(df1, df2, on=idxCol, how="left")
return df
df = replace_names_dask(df1, df2)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With