Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Modify pandas dataframe in python based on multiple rows

I am working with a DataFrame in Pandas / Python, each row has an ID (that is not unique), I would like to modify the dataframe to add a column with the secondname for each row that has multiple matching ID's.

Starting with:

   ID Name  Rate
0   1    A  65.5
1   2    B  67.3
2   2    C  78.8
3   3    D  65.0
4   4    E  45.3
5   5    F  52.0
6   5    G  66.0
7   6    H  34.0
8   7    I   2.0

Trying to get to:

   ID Name  Rate Secondname
0   1    A  65.5       None
1   2    B  67.3       C
2   2    C  78.8       B
3   3    D  65.0       None
4   4    E  45.3       None
5   5    F  52.0       G
6   5    G  66.0       F
7   6    H  34.0       None
8   7    I   2.0       None

My code:

import numpy as np
import pandas as pd


mydict = {'ID':[1,2,2,3,4,5,5,6,7],
             'Name':['A','B','C','D','E','F','G','H','I'],
             'Rate':[65.5,67.3,78.8,65,45.3,52,66,34,2]}

df=pd.DataFrame(mydict)

df['Newname']='None'

for i in range(0, df.shape[0]-1):
    if df.irow(i)['ID']==df.irow(i+1)['ID']:       
        df.irow(i)['Newname']=df.irow(i+1)['Name']

Which results in the following error:

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
df.irow(i)['Newname']=df.irow(i+1)['Secondname']
C:\Users\L\Anaconda3\lib\site-packages\pandas\core\series.py:664:     SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas- docs/stable/indexing.html#indexing-view-versus-copy
self.loc[key] = value

Any help would be much appreciated.

like image 218
LJH11 Avatar asked Apr 14 '26 16:04

LJH11


1 Answers

You can use groupby with custom function f, which use shift and combine_first:

def f(x):
    #print x
    x['Secondname'] = x['Name'].shift(1).combine_first(x['Name'].shift(-1))
    return x

print df.groupby('ID').apply(f)
   ID Name  Rate Secondname
0   1    A  65.5        NaN
1   2    B  67.3          C
2   2    C  78.8          B
3   3    D  65.0        NaN
4   4    E  45.3        NaN
5   5    F  52.0          G
6   5    G  66.0          F
7   6    H  34.0        NaN
8   7    I   2.0        NaN

You can avoid groupby and find duplicated, then fill helper columns by loc with column Name, then shift and combine_first and last drop helper columns:

print df.duplicated('ID', keep='first')
0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
8    False
dtype: bool   
print df.duplicated('ID', keep='last')
0    False
1     True
2    False
3    False
4    False
5     True
6    False
7    False
8    False
dtype: bool  
df.loc[ df.duplicated('ID', keep='first'), 'first'] = df['Name']
df.loc[ df.duplicated('ID', keep='last'), 'last'] = df['Name']
print df
   ID Name  Rate   first   last
0   1    A  65.5  NaN  NaN
1   2    B  67.3  NaN    B
2   2    C  78.8    C  NaN
3   3    D  65.0  NaN  NaN
4   4    E  45.3  NaN  NaN
5   5    F  52.0  NaN    F
6   5    G  66.0    G  NaN
7   6    H  34.0  NaN  NaN
8   7    I   2.0  NaN  NaN
df['SecondName'] = df['first'].shift(-1).combine_first(df['last'].shift(1))
df = df.drop(['first', 'l1'], axis=1)
print df
   ID Name  Rate SecondName
0   1    A  65.5        NaN
1   2    B  67.3          C
2   2    C  78.8          B
3   3    D  65.0        NaN
4   4    E  45.3        NaN
5   5    F  52.0          G
6   5    G  66.0          F
7   6    H  34.0        NaN
8   7    I   2.0        NaN

TESTING: (in time of testing solution of Roman Kh has wrong output)

len(df) = 9:

In [154]: %timeit jez(df1)
100 loops, best of 3: 15 ms per loop

In [155]: %timeit jez2(df2)
100 loops, best of 3: 3.45 ms per loop

In [156]: %timeit rom(df)
100 loops, best of 3: 3.55 ms per loop    

len(df) = 90k:

In [158]: %timeit jez(df1)
10 loops, best of 3: 57.1 ms per loop

In [159]: %timeit jez2(df2)
10 loops, best of 3: 36.4 ms per loop

In [160]: %timeit rom(df)
10 loops, best of 3: 40.4 ms per loop
import pandas as pd

mydict = {'ID':[1,2,2,3,4,5,5,6,7],
             'Name':['A','B','C','D','E','F','G','H','I'],
             'Rate':[65.5,67.3,78.8,65,45.3,52,66,34,2]}

df=pd.DataFrame(mydict)
print df


df =  pd.concat([df]*10000).reset_index(drop=True)

df1 = df.copy()
df2 = df.copy()

def jez(df):
    def f(x):
        #print x
        x['Secondname'] = x['Name'].shift(1).combine_first(x['Name'].shift(-1))
        return x

    return df.groupby('ID').apply(f)


def jez2(df): 
    #print df.duplicated('ID', keep='first')
    #print df.duplicated('ID', keep='last')
    df.loc[ df.duplicated('ID', keep='first'), 'first'] = df['Name']
    df.loc[ df.duplicated('ID', keep='last'), 'last'] = df['Name']
    #print df

    df['SecondName'] = df['first'].shift(-1).combine_first(df['last'].shift(1))
    df = df.drop(['first', 'last'], axis=1)
    return df



def rom(df):

    # cpIDs = True if the next row has the same ID
    df['cpIDs'] = df['ID'][:-1] == df['ID'][1:]
    # fill in the last row (get rid of NaN)
    df.iloc[-1,df.columns.get_loc('cpIDs')] = False
    # ShiftName == Name of the next row
    df['ShiftName'] = df['Name'].shift(-1)
    # fill in SecondName
    df.loc[df['cpIDs'], 'SecondName'] = df.loc[df['cpIDs'], 'ShiftName']
    # remove columns
    del df['cpIDs']
    del df['ShiftName']
    return df


print jez(df1)  
print jez2(df2)
print rom(df) 
print jez(df1)  
   ID Name  Rate Secondname
0   1    A  65.5        NaN
1   2    B  67.3          C
2   2    C  78.8          B
3   3    D  65.0        NaN
4   4    E  45.3        NaN
5   5    F  52.0          G
6   5    G  66.0          F
7   6    H  34.0        NaN
8   7    I   2.0        NaN
print jez2(df2)
   ID Name  Rate SecondName
0   1    A  65.5        NaN
1   2    B  67.3          C
2   2    C  78.8          B
3   3    D  65.0        NaN
4   4    E  45.3        NaN
5   5    F  52.0          G
6   5    G  66.0          F
7   6    H  34.0        NaN
8   7    I   2.0        NaN
print rom(df) 
   ID Name  Rate SecondName
0   1    A  65.5        NaN
1   2    B  67.3          C
2   2    C  78.8        NaN
3   3    D  65.0        NaN
4   4    E  45.3        NaN
5   5    F  52.0          G
6   5    G  66.0        NaN
7   6    H  34.0        NaN
8   7    I   2.0        NaN

EDIT:

If there is more duplicated pairs with same names, use shift for creating first and last columns:

df.loc[ df['ID'] == df['ID'].shift(), 'first'] = df['Name']
df.loc[ df['ID'] == df['ID'].shift(-1), 'last'] = df['Name']
like image 114
jezrael Avatar answered Apr 16 '26 05:04

jezrael