I have following two dataframes:
>>> df1
c1 c2 v1 v2
0 A NaN 9 2
1 B NaN 2 5
2 C NaN 3 5
3 D NaN 4 2
>>> df2
c1 c2 v1 v2
0 A P 4 1
1 A T 3 1
2 A Y 2 0
3 B P 0 1
4 B T 2 2
5 B Y 0 2
6 C P 1 2
7 C T 1 2
8 C Y 1 1
9 D P 1 1
10 D T 2 0
11 D Y 1 1
I need to concatenate the dataframes and sort them (or vice versa). The first dataframe needs to be sorted on its `v1` column; then the second dataframe needs to be sorted based on the order of the `c1` values obtained after sorting the first dataframe, and on the `v2` column of the second dataframe.
A working version is something like this: sort the first dataframe on `v1`, then iterate its rows; for each row, filter the second dataframe for the matching value of the `c1` column, sort that filtered frame on `v2`, and finally concatenate all the frames.
# Baseline: walk the sorted header frame, emitting each header row
# followed by its matching detail rows ordered on v2.
result = []
for _, header in df1.sort_values('v1').iterrows():
    result.append(header.to_frame().T)
    matching = df2[df2['c1'] == header['c1']]
    result.append(matching.sort_values('v2'))
The resulting dataframe after sorting:
>>> pd.concat(result, ignore_index=True)
c1 c2 v1 v2
0 B NaN 2 5
1 B P 0 1
2 B T 2 2
3 B Y 0 2
4 C NaN 3 5
5 C Y 1 1
6 C P 1 2
7 C T 1 2
8 D NaN 4 2
9 D T 2 0
10 D P 1 1
11 D Y 1 1
12 A NaN 9 2
13 A Y 2 0
14 A P 4 1
15 A T 3 1
The problem with the above approach is that it's iterative, and not so efficient when the number of dataframes and/or the number of rows in them increases. The real use-case scenario has from 2 to 6 dataframes, where the number of rows ranges from a few thousand to hundreds of thousands.
Either sorting the dataframes first and then concatenating them, or concatenating the dataframes first and then sorting, will be fine; that is why I included both dataframes instead of concatenating them up front and presenting a single dataframe.
Here are 4 dataframes from the actual use-case scenario:
from math import nan
import pandas as pd
df4 = pd.DataFrame({'c1': ['BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT'], 'c2': ['D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 
'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2', 'w2'], 'c3': ['BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 
'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH', 'BAF', 'BAF', 'BAF', 'BAF', 'BAF', 'WH', 'WH', 'WH', 'WH', 'WH'], 'c4': ['001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss', '001', '002', '003', '004', 'mss'], 'v1': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 1, 0, 2, 3, 6, 2, 0, 2, 2, 0, 1, 0, 1, 3, 5, 
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 1, 0, 2, 3, 6, 2, 0, 2, 2, 0, 1, 0, 1, 3, 5, 1, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 2, 0, 2, 4, 6, 4, 0, 2, 2, 0, 1, 0, 2, 3, 6, 2, 0, 2, 2, 0, 1, 0, 1, 3, 5, 1, 0], 'v2': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 0, 0, 1, 0, 2, 3, 5, 4, 0, 0, 0, 0, 1, 0, 1, 3, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 0, 0, 1, 0, 2, 3, 5, 4, 0, 0, 0, 0, 1, 0, 1, 3, 5, 3, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 1, 0, 1, 0, 2, 4, 6, 5, 0, 0, 0, 0, 1, 0, 2, 3, 5, 4, 0, 0, 0, 0, 1, 0, 1, 3, 5, 3, 0], 'v3': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 0, 4, 6, 4, 0, 1, 2, 1, 0, 0, 0, 2, 6, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 0, 4, 6, 4, 0, 1, 2, 1, 0, 0, 0, 2, 6, 3, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 1, 5, 9, 7, 0, 1, 2, 1, 0, 0, 0, 4, 6, 4, 0, 1, 2, 1, 0, 0, 0, 2, 6, 3, 0], 'v4': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 3, 4, 1, 2, 0, 4, 10, 17, 10, 0, 3, 4, 1, 2, 0, 2, 8, 16, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 
3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 3, 4, 1, 2, 0, 4, 10, 17, 10, 0, 3, 4, 1, 2, 0, 2, 8, 16, 7, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 3, 5, 1, 3, 0, 5, 13, 21, 16, 0, 3, 4, 1, 2, 0, 4, 10, 17, 10, 0, 3, 4, 1, 2, 0, 2, 8, 16, 7, 0]})
df3 = pd.DataFrame({'c1': ['BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT'], 'c2': ['D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2', 'D1', 'D1', 'D1', 'Sc', 'Sc', 'Sc', 'w1', 'w1', 'w1', 'w2', 'w2', 'w2'], 'c3': ['BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss', 'BAF', 'WH', 'mss'], 'c4': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'v1': [0, 0, 0, 6, 16, 0, 0, 0, 0, 0, 0, 0, 6, 16, 0, 6, 16, 0, 5, 13, 0, 5, 10, 0, 0, 0, 0, 6, 16, 0, 0, 0, 0, 0, 0, 0, 6, 16, 0, 6, 16, 0, 5, 13, 0, 5, 10, 0, 6, 16, 0, 6, 16, 0, 5, 13, 0, 5, 10, 0], 'v2': [0, 0, 0, 2, 17, 0, 0, 0, 0, 0, 0, 0, 2, 17, 0, 2, 17, 0, 1, 14, 0, 1, 12, 0, 0, 0, 0, 2, 17, 0, 0, 0, 0, 0, 0, 0, 2, 17, 0, 2, 17, 0, 1, 
14, 0, 1, 12, 0, 2, 17, 0, 2, 17, 0, 1, 14, 0, 1, 12, 0], 'v3': [0, 0, 0, 4, 22, 0, 0, 0, 0, 0, 0, 0, 4, 22, 0, 4, 22, 0, 4, 14, 0, 4, 11, 0, 0, 0, 0, 4, 22, 0, 0, 0, 0, 0, 0, 0, 4, 22, 0, 4, 22, 0, 4, 14, 0, 4, 11, 0, 4, 22, 0, 4, 22, 0, 4, 14, 0, 4, 11, 0], 'v4': [0, 0, 0, 12, 55, 0, 0, 0, 0, 0, 0, 0, 12, 55, 0, 12, 55, 0, 10, 41, 0, 10, 33, 0, 0, 0, 0, 12, 55, 0, 0, 0, 0, 0, 0, 0, 12, 55, 0, 12, 55, 0, 10, 41, 0, 10, 33, 0, 12, 55, 0, 12, 55, 0, 10, 41, 0, 10, 33, 0]})
df2 = pd.DataFrame({'c1': ['BMI', 'BMI', 'BMI', 'BMI', 'BMI', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'DIABP', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'HEIGHT', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'SYSBP', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT', 'WEIGHT'], 'c2': ['D1', 'Sc', 'w1', 'w2', 'mss', 'D1', 'Sc', 'w1', 'w2', 'mss', 'D1', 'Sc', 'w1', 'w2', 'mss', 'D1', 'Sc', 'w1', 'w2', 'mss', 'D1', 'Sc', 'w1', 'w2', 'mss'], 'c3': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'c4': [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan], 'v1': [0, 22, 0, 0, 0, 22, 22, 18, 15, 0, 0, 22, 0, 0, 0, 22, 22, 18, 15, 0, 22, 22, 18, 15, 0], 'v2': [0, 19, 0, 0, 0, 19, 19, 15, 13, 0, 0, 19, 0, 0, 0, 19, 19, 15, 13, 0, 19, 19, 15, 13, 0], 'v3': [0, 26, 0, 0, 0, 26, 26, 18, 15, 0, 0, 26, 0, 0, 0, 26, 26, 18, 15, 0, 26, 26, 18, 15, 0], 'v4': [0, 67, 0, 0, 0, 67, 67, 51, 43, 0, 0, 67, 0, 0, 0, 67, 67, 51, 43, 0, 67, 67, 51, 43, 0]})
df1 = pd.DataFrame({'c1': ['BMI', 'DIABP', 'HEIGHT', 'SYSBP', 'WEIGHT', 'mss'], 'c2': [nan, nan, nan, nan, nan, nan], 'c3': [nan, nan, nan, nan, nan, nan], 'c4': [nan, nan, nan, nan, nan, nan], 'v1': [22, 22, 22, 22, 22, 0], 'v2': [19, 19, 19, 19, 19, 0], 'v3': [26, 26, 26, 26, 26, 0], 'v4': [67, 67, 67, 67, 67, 0]})
# Comment for easy code selection
Even for the above four dataframes, the sorting and merging criteria are still the same.
And in such cases, when the number of dataframes to sort and merge grows, the solution I have used above becomes really inefficient.
When we concatenate DataFrames, we need to specify the axis. axis=0 tells pandas to stack the second DataFrame UNDER the first one. It will automatically detect whether the column names are the same and will stack accordingly. axis=1 will stack the columns in the second DataFrame to the RIGHT of the first DataFrame.
To sort the DataFrame based on the values in a single column, you'll use .sort_values(). By default, this returns a new DataFrame sorted in ascending order; it does not modify the original DataFrame.
Another solution using groupby, without sorting the groups:
import itertools

# Concatenate the pre-sorted frames: the header frame (df1) sorted on v1
# fixes the group order, the detail frame (df2) is pre-sorted on v2.
out = pd.concat([df1.sort_values('v1'),
                 df2.sort_values('v2')],
                ignore_index=True)

# Original answer:
# out = out.reindex(out.groupby('c1', sort=False)
#                      .apply(lambda x: x.index)
#                      .explode())

# Faster alternative: collect the row labels group by group, in
# first-appearance order, and re-select them with a single .loc call.
out = out.loc[itertools.chain.from_iterable(
    out.groupby('c1', sort=False).groups.values())]
>>> out
c1 c2 v1 v2
0 B NaN 2 5
8 B P 0 1
12 B T 2 2
13 B Y 0 2
1 C NaN 3 5
9 C Y 1 1
14 C P 1 2
15 C T 1 2
2 D NaN 4 2
5 D T 2 0
10 D P 1 1
11 D Y 1 1
3 A NaN 9 2
4 A Y 2 0
6 A P 4 1
7 A T 3 1
Here is another way which might help you: build a mapping with `c1` as keys and the rank of `v1` as values.
Method 1:
def mysort(df1, df2):
    """Order headers (df2) and details (df1) together.

    Groups follow the rank of df2's v1; within a group the header row
    comes first (Key=1), then the df1 rows sorted on v2 (Key=2).
    Returns only the columns of df1.
    """
    rank_map = dict(zip(df2['c1'], df2['v1'].rank()))
    combined = pd.concat((df2, df1), keys=[1, 2], names=['Key'])
    combined = combined.assign(k=combined['c1'].map(rank_map))
    ordered = combined.sort_values(['k', 'Key', 'v2'])
    return ordered.loc[:, list(df1)]
Method 2:
I think it should work faster using the same logic but sorting with np.lexsort:
def mysort_two(df1, df2):
    """lexsort-based variant of mysort with identical ordering.

    The row permutation is computed once with np.lexsort; note lexsort
    sorts by its LAST key first: group rank, then frame of origin
    (df2 header before df1 rows), then v2.
    """
    rank_map = dict(zip(df2['c1'], df2['v1'].rank()))
    combined = pd.concat((df2, df1), keys=[1, 2], names=['Key'])
    order = np.lexsort((combined['v2'],
                        combined.index.get_level_values('Key'),
                        combined['c1'].map(rank_map)))
    return pd.DataFrame(combined.to_numpy()[order], columns=df2.columns)
# The same can be written without MultiIndex keys, building the frame
# indicator explicitly with numpy:
# def mysort_two(df1, df2):
#     d = dict(zip(df2['c1'], df2['v1'].rank()))
#     o = pd.concat((df2, df1))
#     a = o.to_numpy()[np.lexsort((o['v2'],
#                                  np.append(np.ones(len(df2)),
#                                            np.ones(len(df1)) * 2),
#                                  o['c1'].map(d)))]
#     return pd.DataFrame(a, columns=df2.columns)

print(mysort_two(df1, df2))   # method 2
# print(mysort(df1, df2))     # method 1
c1 c2 v1 v2
0 B NaN 2 5
1 B P 0 1
2 B T 2 2
3 B Y 0 2
4 C NaN 3 5
5 C Y 1 1
6 C P 1 2
7 C T 1 2
8 D NaN 4 2
9 D T 2 0
10 D P 1 1
11 D Y 1 1
12 A NaN 9 2
13 A Y 2 0
14 A P 4 1
15 A T 3 1
Assuming df2's `c1` column contains unique values (as in the OP), we can try establishing a categorical ordering on `c1` based on the sorted values of `v1` in df2. We add an indicator value to each DataFrame, then concatenate and sort based on the new categorical type (`c1`), `indicator`, and `v2`.
# Establish the group ordering: categories listed in df2's v1-sorted order.
cat_type = pd.CategoricalDtype(df2.sort_values('v1')['c1'], ordered=True)

new_df = (
    # Tag each frame so the header frame (df2) can be placed first
    pd.concat([df.assign(indicator=i) for (i, df) in enumerate([df1, df2])])
    # Let c1 sort according to the categorical order built above
    .astype({'c1': cat_type})
    # Group order, then df2 before df1, then v2 within each frame
    .sort_values(['c1', 'indicator', 'v2'],
                 ascending=(True, False, True),
                 ignore_index=True)
    # Drop the helper column
    .drop(columns='indicator')
)
The resulting `new_df`:
c1 c2 v1 v2
0 B NaN 2 5
1 B P 0 1
2 B T 2 2
3 B Y 0 2
4 C NaN 3 5
5 C Y 1 1
6 C P 1 2
7 C T 1 2
8 D NaN 4 2
9 D T 2 0
10 D P 1 1
11 D Y 1 1
12 A NaN 9 2
13 A Y 2 0
14 A P 4 1
15 A T 3 1
Setup:
import pandas as pd
from numpy import nan
# df1: detail rows — three (c1, c2) entries per c1 group.
df1 = pd.DataFrame({
'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'],
'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'],
'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1],
'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1]
})
# df2: one header row per c1 (c2 is NaN); its v1 drives the group order.
df2 = pd.DataFrame({
'c1': ['A', 'B', 'C', 'D'],
'c2': [nan, nan, nan, nan],
'v1': [9, 2, 3, 4],
'v2': [2, 5, 5, 2]
})
Some timing information:
OP
%timeit op_fn(df1, df2)
4.93 ms ± 376 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
This answer
%timeit cat_ordered(df1, df2)
3.72 ms ± 490 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@Andrej Kesely's answer
%timeit groupby_fn(df1, df2)
7.79 ms ± 140 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@Corralien's answer
%timeit concat_itertools(df1, df2)
1.48 ms ± 82.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
@Scott Boston's answer
%timeit helpcol(df1, df2)
3.05 ms ± 44.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@U12-Forward's answer
%timeit U11(df1, df2)
5.41 ms ± 255 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
@Anky's answers
%timeit mysort_anky(df1, df2)
3.05 ms ± 180 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit mysort_two_anky(df1, df2)
1.69 ms ± 52.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Setup:
import itertools
import numpy as np
import pandas as pd
from numpy import nan
# Benchmark fixtures: df1 holds the detail rows, df2 holds one header
# row per c1 value (c2 is NaN there).
df1 = pd.DataFrame({
'c1': ['A', 'A', 'A', 'B', 'B', 'B', 'C', 'C', 'C', 'D', 'D', 'D'],
'c2': ['P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y', 'P', 'T', 'Y'],
'v1': [4, 3, 2, 0, 2, 0, 1, 1, 1, 1, 2, 1],
'v2': [1, 1, 0, 1, 2, 2, 2, 2, 1, 1, 0, 1]
})
df2 = pd.DataFrame({
'c1': ['A', 'B', 'C', 'D'],
'c2': [nan, nan, nan, nan],
'v1': [9, 2, 3, 4],
'v2': [2, 5, 5, 2]
})
def op_fn(df1, df2):
    """OP's iterative baseline.

    Walk df2 (headers) in v1 order; after each header append the df1
    rows of the same c1 group, sorted on v2, and concat once at the end.
    """
    pieces = []
    for _, header in df2.sort_values('v1').iterrows():
        pieces.append(header.to_frame().T)
        group = df1[df1['c1'] == header['c1']]
        pieces.append(group.sort_values('v2'))
    return pd.concat(pieces, ignore_index=True)
def cat_ordered(df1, df2):
    """Categorical-ordering approach.

    c1 becomes an ordered categorical whose category order is df2's
    v1-sorted c1 values; then one multi-key sort places each header
    (df2) first, followed by its df1 rows ordered on v2.
    """
    order = pd.CategoricalDtype(df2.sort_values('v1')['c1'], ordered=True)
    # Tag each frame so headers (indicator=1) sort before details (0)
    tagged = pd.concat(
        [frame.assign(indicator=tag) for tag, frame in enumerate([df1, df2])]
    )
    result = (tagged.astype({'c1': order})
                    .sort_values(['c1', 'indicator', 'v2'],
                                 ascending=(True, False, True),
                                 ignore_index=True))
    return result.drop(columns='indicator')
def groupby_fn(df1, df2):
y = df1.assign(ind=df1['v1'])
x = df2.groupby("c1").apply(
lambda v: pd.concat(
[y[y["c1"].eq(v["c1"].iat[0])], v.sort_values("v2")]
)
)
x.loc[:, "ind"] = x.loc[:, "ind"].ffill()
return x.sort_values("ind").drop(columns="ind").reset_index(drop=True)
def concat_itertools(df1, df2):
    """Concat the two pre-sorted frames, then regroup the rows by the
    first appearance of each c1 value (groupby with sort=False keeps
    first-appearance order)."""
    combined = pd.concat(
        [df1.sort_values('v1'), df2.sort_values('v2')],
        ignore_index=True,
    )
    positions = itertools.chain.from_iterable(
        combined.groupby('c1', sort=False).groups.values()
    )
    return combined.loc[list(positions)]
def helpcol(df1, df2):
    """Single concat + one sort over two helper columns.

    'c1sort' is c1 as an ordered categorical (df2's v1 order) and
    'c2sort' flags detail rows (c2 present) so the NaN-c2 header rows
    sort first within each group; ties then break on v2.
    """
    order = pd.CategoricalDtype(df2.sort_values('v1')['c1'], ordered=True)
    both = pd.concat([df1, df2])
    both["c2sort"] = both["c2"].notna()   # False (header) sorts first
    both["c1sort"] = both["c1"].astype(order)
    ranked = both.sort_values(["c1sort", "c2sort", "v2"], ignore_index=True)
    return ranked.drop(["c2sort", "c1sort"], axis=1)
def U11(df1, df2):
    """Order whole groups by each group's minimum v1 after concatenation."""
    merged = pd.concat([df1, df2], ignore_index=True)
    # Per-row group minimum of v1, computed on a c1-sorted view; the
    # original row labels are preserved by transform.
    mins = (merged.sort_values('c1')
                  .groupby('c1', as_index=False)['v1']
                  .transform('min')
                  .squeeze())
    # Sorting the minima yields the row labels group-by-group.
    new_order = mins.sort_values().index
    return merged.reindex(new_order).reset_index(drop=True)
def mysort_anky(df1, df2):
    """Method 1: stack df2 (headers, Key=1) over df1 (details, Key=2)
    and sort once by group rank of v1, frame of origin, then v2.
    Returns only the columns of df1."""
    group_rank = dict(zip(df2['c1'], df2['v1'].rank()))
    stacked = pd.concat((df2, df1), keys=[1, 2], names=['Key'])
    with_key = stacked.assign(k=stacked['c1'].map(group_rank))
    return with_key.sort_values(['k', 'Key', 'v2']).loc[:, list(df1)]
def mysort_two_anky(df1, df2):
    """Method 2: one np.lexsort call instead of DataFrame.sort_values."""
    ranks = dict(zip(df2['c1'], df2['v1'].rank()))
    both = pd.concat((df2, df1), keys=[1, 2], names=['Key'])
    # np.lexsort sorts by its LAST key first.
    keys = (both['v2'],                           # tertiary
            both.index.get_level_values('Key'),   # secondary: df2 before df1
            both['c1'].map(ranks))                # primary: group rank
    return pd.DataFrame(both.to_numpy()[np.lexsort(keys)],
                        columns=df2.columns)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With