Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Merging dataframes with multi indexes and column value

Tags:

I have two dataframes with multi indexes and dates as a columns:

df1

df1 = pd.DataFrame.from_dict({('group', ''): {0: 'A',
  1: 'A',
  2: 'A',
  3: 'A',
  4: 'A',
  5: 'A',
  6: 'A',
  7: 'A',
  8: 'B',
  9: 'B',
  10: 'B',
  11: 'B',
  12: 'B',
  13: 'B',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'C',
  19: 'C',
  20: 'C',
  21: 'C',
  22: 'C',
  23: 'C',
  24: 'D',
  25: 'D',
  26: 'D',
  27: 'D',
  28: 'D',
  29: 'D',
  30: 'D'},
 ('category', ''): {0: 'Apple',
  1: 'Amazon',
  2: 'Google',
  3: 'Netflix',
  4: 'Facebook',
  5: 'Uber',
  6: 'Tesla',
  7: 'total',
  8: 'Apple',
  9: 'Amazon',
  10: 'Google',
  11: 'Netflix',
  12: 'Facebook',
  13: 'Uber',
  14: 'Tesla',
  15: 'total',
  16: 'Apple',
  17: 'Amazon',
  18: 'Google',
  19: 'Netflix',
  20: 'Facebook',
  21: 'Uber',
  22: 'Tesla',
  23: 'total',
  24: 'Apple',
  25: 'Amazon',
  26: 'Google',
  27: 'Netflix',
  28: 'Uber',
  29: 'Tesla',
  30: 'total'},
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_orders'): {0: 88.0,
  1: 66.0,
  2: 191.0,
  3: 558.0,
  4: 12.0,
  5: 4.0,
  6: 56.0,
  7: 975.0,
  8: 90.0,
  9: 26.0,
  10: 49.0,
  11: 250.0,
  12: 7.0,
  13: 2.0,
  14: 44.0,
  15: 468.0,
  16: 36.0,
  17: 52.0,
  18: 94.0,
  19: 750.0,
  20: 10.0,
  21: 0.0,
  22: 52.0,
  23: 994.0,
  24: 16.0,
  25: 22.0,
  26: 5.0,
  27: 57.0,
  28: 3.0,
  29: 33.0,
  30: 136.0},
 (pd.Timestamp('2021-06-28 00:00:00'), 'total_sales'): {0: 4603.209999999999,
  1: 2485.059999999998,
  2: 4919.39999999998,
  3: 6097.77,
  4: 31.22,
  5: 155.71,
  6: 3484.99,
  7: 17237.35999999996,
  8: 561.54,
  9: 698.75,
  10: 1290.13,
  11: 4292.68000000001,
  12: 947.65,
  13: 329.0,
  14: 2889.65,
  15: 9989.4,
  16: 330.8899999999994,
  17: 2076.26,
  18: 2982.270000000004,
  19: 11978.62000000002,
  20: 683.0,
  21: 0.0,
  22: 3812.16999999999,
  23: 20963.21000000002,
  24: 234.4900000000002,
  25: 896.1,
  26: 231.0,
  27: 893.810000000001,
  28: 129.0,
  29: 1712.329999999998,
  30: 4106.729999999996},
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_orders'): {0: 109.0,
  1: 48.0,
  2: 174.0,
  3: 592.0,
  4: 13.0,
  5: 5.0,
  6: 43.0,
  7: 984.0,
  8: 62.0,
  9: 13.0,
  10: 37.0,
  11: 196.0,
  12: 8.0,
  13: 1.0,
  14: 3.0,
  15: 30.0,
  16: 76.0,
  17: 5.0,
  18: 147.0,
  19: 88.0,
  20: 8.0,
  21: 1.0,
  22: 78.0,
  23: 1248.0,
  24: 1.0,
  25: 18.0,
  26: 23.0,
  27: 83.0,
  28: 0.0,
  29: 29.0,
  30: 154.0},
 (pd.Timestamp('2021-07-05 00:00:00'), 'total_sales'): {0: 3453.02,
  1: 17868.730000000003,
  2: 44707.82999999999,
  3: 61425.99,
  4: 1261.0,
  5: 1914.6000000000001,
  6: 24146.09,
  7: 154777.25999999998,
  8: 6201.489999999999,
  9: 5513.960000000001,
  10: 9645.87,
  11: 25086.785,
  12: 663.0,
  13: 448.61,
  14: 26332.7,
  15: 73892.415,
  16: 556.749999999999,
  17: 1746.859999999997,
  18: 4103.219999999994,
  19: 15571.52000000008,
  20: 86.0,
  21: 69.0,
  22: 5882.759999999995,
  23: 26476.11000000004,
  24: 53.0,
  25: 801.220000000001,
  26: 684.56,
  27: 1232.600000000002,
  28: 0.0,
  29: 15902.1,
  30: 43943.48},
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_orders'): {0: 32.0,
  1: 15.0,
  2: 89.0,
  3: 239.0,
  4: 2.0,
  5: 3.0,
  6: 20.0,
  7: 400.0,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 21.0,
  17: 14.0,
  18: 58.0,
  19: 281.0,
  20: 3.0,
  21: 3.0,
  22: 33.0,
  23: 413.0,
  24: 7.0,
  25: 6.0,
  26: 4.0,
  27: 13.0,
  28: 0.0,
  29: 18.0,
  30: 48.0},
 (pd.Timestamp('2021-07-12 00:00:00'), 'total_sales'): {0: 2147.7000000000003,
  1: 4767.3,
  2: 2399.300000000003,
  3: 3137.440000000002,
  4: 178.0,
  5: 866.61,
  6: 10639.03,
  7: 73235.38,
  8: 0.0,
  9: 0.0,
  10: 0.0,
  11: 0.0,
  12: 0.0,
  13: 0.0,
  14: 0.0,
  15: 0.0,
  16: 220.94,
  17: 727.5199999999995,
  18: 2500.96999999999,
  19: 4414.00999999998,
  20: 15.0,
  21: 196.71,
  22: 2170.1,
  23: 9745.24999999997,
  24: 126.55,
  25: 290.2,
  26: 146.01,
  27: 233.0,
  28: 0.0,
  29: 973.18,
  30: 1658.940000000002}}).set_index(['group','category'])

df2

df2 = pd.DataFrame.from_dict({'group': {0: 'total_full',
  1: 'total_full',
  2: 'A',
  3: 'A',
  4: 'B',
  5: 'B',
  6: 'C',
  7: 'C',
  8: 'D',
  9: 'D',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'A',
  13: 'A',
  14: 'B',
  15: 'B',
  16: 'C',
  17: 'C',
  18: 'D',
  19: 'D',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'A',
  23: 'A',
  24: 'B',
  25: 'B',
  26: 'C',
  27: 'C',
  28: 'D',
  29: 'D',
  30: 'Google_total',
  31: 'Google_total',
  32: 'A',
  33: 'A',
  34: 'B',
  35: 'B',
  36: 'C',
  37: 'C',
  38: 'D',
  39: 'D',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'A',
  43: 'A',
  44: 'B',
  45: 'B',
  46: 'C',
  47: 'C',
  48: 'D',
  49: 'D',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'A',
  53: 'A',
  54: 'B',
  55: 'B',
  56: 'C',
  57: 'C',
  58: 'D',
  59: 'D',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'A',
  63: 'A',
  64: 'B',
  65: 'B',
  66: 'C',
  67: 'C',
  68: 'D',
  69: 'D',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'A',
  73: 'A',
  74: 'B',
  75: 'B',
  76: 'C',
  77: 'C',
  78: 'D',
  79: 'D'},
 'category': {0: 'total_full',
  1: 'total_full',
  2: 'group_total',
  3: 'group_total',
  4: 'group_total',
  5: 'group_total',
  6: 'group_total',
  7: 'group_total',
  8: 'group_total',
  9: 'group_total',
  10: 'Apple_total',
  11: 'Apple_total',
  12: 'Apple',
  13: 'Apple',
  14: 'Apple',
  15: 'Apple',
  16: 'Apple',
  17: 'Apple',
  18: 'Apple',
  19: 'Apple',
  20: 'Amazon_total',
  21: 'Amazon_total',
  22: 'Amazon',
  23: 'Amazon',
  24: 'Amazon',
  25: 'Amazon',
  26: 'Amazon',
  27: 'Amazon',
  28: 'Amazon',
  29: 'Amazon',
  30: 'Google_total',
  31: 'Google_total',
  32: 'Google',
  33: 'Google',
  34: 'Google',
  35: 'Google',
  36: 'Google',
  37: 'Google',
  38: 'Google',
  39: 'Google',
  40: 'Facebook_total',
  41: 'Facebook_total',
  42: 'Facebook',
  43: 'Facebook',
  44: 'Facebook',
  45: 'Facebook',
  46: 'Facebook',
  47: 'Facebook',
  48: 'Facebook',
  49: 'Facebook',
  50: 'Netflix_total',
  51: 'Netflix_total',
  52: 'Netflix',
  53: 'Netflix',
  54: 'Netflix',
  55: 'Netflix',
  56: 'Netflix',
  57: 'Netflix',
  58: 'Netflix',
  59: 'Netflix',
  60: 'Tesla_total',
  61: 'Tesla_total',
  62: 'Tesla',
  63: 'Tesla',
  64: 'Tesla',
  65: 'Tesla',
  66: 'Tesla',
  67: 'Tesla',
  68: 'Tesla',
  69: 'Tesla',
  70: 'Uber_total',
  71: 'Uber_total',
  72: 'Uber',
  73: 'Uber',
  74: 'Uber',
  75: 'Uber',
  76: 'Uber',
  77: 'Uber',
  78: 'Uber',
  79: 'Uber'},
 'type': {0: 'Sales_1',
  1: 'Sales_2',
  2: 'Sales_1',
  3: 'Sales_2',
  4: 'Sales_1',
  5: 'Sales_2',
  6: 'Sales_1',
  7: 'Sales_2',
  8: 'Sales_1',
  9: 'Sales_2',
  10: 'Sales_1',
  11: 'Sales_2',
  12: 'Sales_1',
  13: 'Sales_2',
  14: 'Sales_1',
  15: 'Sales_2',
  16: 'Sales_1',
  17: 'Sales_2',
  18: 'Sales_1',
  19: 'Sales_2',
  20: 'Sales_1',
  21: 'Sales_2',
  22: 'Sales_1',
  23: 'Sales_2',
  24: 'Sales_1',
  25: 'Sales_2',
  26: 'Sales_1',
  27: 'Sales_2',
  28: 'Sales_1',
  29: 'Sales_2',
  30: 'Sales_1',
  31: 'Sales_2',
  32: 'Sales_1',
  33: 'Sales_2',
  34: 'Sales_1',
  35: 'Sales_2',
  36: 'Sales_1',
  37: 'Sales_2',
  38: 'Sales_1',
  39: 'Sales_2',
  40: 'Sales_1',
  41: 'Sales_2',
  42: 'Sales_1',
  43: 'Sales_2',
  44: 'Sales_1',
  45: 'Sales_2',
  46: 'Sales_1',
  47: 'Sales_2',
  48: 'Sales_1',
  49: 'Sales_2',
  50: 'Sales_1',
  51: 'Sales_2',
  52: 'Sales_1',
  53: 'Sales_2',
  54: 'Sales_1',
  55: 'Sales_2',
  56: 'Sales_1',
  57: 'Sales_2',
  58: 'Sales_1',
  59: 'Sales_2',
  60: 'Sales_1',
  61: 'Sales_2',
  62: 'Sales_1',
  63: 'Sales_2',
  64: 'Sales_1',
  65: 'Sales_2',
  66: 'Sales_1',
  67: 'Sales_2',
  68: 'Sales_1',
  69: 'Sales_2',
  70: 'Sales_1',
  71: 'Sales_2',
  72: 'Sales_1',
  73: 'Sales_2',
  74: 'Sales_1',
  75: 'Sales_2',
  76: 'Sales_1',
  77: 'Sales_2',
  78: 'Sales_1',
  79: 'Sales_2'},
 '2021-06-28': {0: 67.5277641202152,
  1: 82.7854700135998,
  2: 21.50082266792856,
  3: 22.03644997199996,
  4: 64.460440147,
  5: 10.1060499896,
  6: 65.1530371974946,
  7: 50.6429700519999,
  8: 56.413464107792045,
  9: 0,
  10: 17.48074540313092,
  11: 26.8376199976,
  12: 52.172,
  13: 61.16600000040001,
  14: 20.9447844,
  15: 40.69122000000001,
  16: 83.55718929717925,
  17: 14.98039999719995,
  18: 20.806771705951697,
  19: np.nan,
  20: 18.3766353690825,
  21: 12.82565001479992,
  22: 52.425508769690694,
  23: 25.661999978399994,
  24: 17.88071596,
  25: 24.384659998799997,
  26: 91.10086982794643,
  27: 12.77899003759993,
  28: 16.969540811445366,
  29: np.nan,
  30: 18.8795397517309,
  31: 26.73017999840005,
  32: 53.52039700062155,
  33: 58.81199999639999,
  34: 12.1243325,
  35: 24.0544100028,
  36: 55.94068246571674,
  37: 133.86376999920006,
  38: 7.294127785392621,
  39: np.nan,
  40: 6.07807089184563,
  41: 7.27483001599998,
  42: 2.300470581874837,
  43: 30.71300000639998,
  44: 5.810764652,
  45: 12.333119997600003,
  46: 25.475930745418292,
  47: 64.228710012,
  48: 9.490904912552498,
  49: np.nan,
  50: 8.184780211399392,
  51: 24.59321999400001,
  52: 6.807138946302334,
  53: 12.0879999972,
  54: 0.869207661,
  55: 0.324,
  56: 0.5084336040970575,
  57: 12.181219996800007,
  58: 0,
  59: np.nan,
  60: 9.293956915067886,
  61: 11.171379993599999,
  62: 6.384936971649232,
  63: 3.657999996,
  64: 0.913782413,
  65: 1.9992000012000002,
  66: 1.5322078073061867,
  67: 5.514179996399999,
  68: 0.4630297231124678,
  69: np.nan,
  70: 36.23403557795798,
  71: 53.35258999919999,
  72: 21.890370397789923,
  73: 9.937449997200002,
  74: 5.916852561,
  75: 6.319439989199998,
  76: 7.03772344983066,
  77: 37.095700012799995,
  78: 1.3890891693374032,
  79: np.nan},
 '2021-07-05': {0: 65.4690491915759,
  1: 98.5235100112003,
  2: 21.4573181155924,
  3: 241.06741999679997,
  4: 67.481716829,
  5: 11.60325000040002,
  6: 27.5807099999998,
  7: 65.8528400140003,
  8: 58.949304246983736,
  9: 0.0,
  10: 185.65887577993723,
  11: 318.9965699964001,
  12: 54.517,
  13: 66.55265999039996,
  14: 21.92632044,
  15: 43.67116000320002,
  16: 87.47349898707688,
  17: 208.7727500028001,
  18: 21.742056352860352,
  19: np.nan,
  20: 16.6038963173654,
  21: 25.28952001920013,
  22: 54.7820864335212,
  23: 36.75802000560001,
  24: 18.71872129,
  25: 30.1634600016,
  26: 95.37075040035738,
  27: 138.3680400120001,
  28: 17.73233819348684,
  29: np.nan,
  30: 14.80302342121337,
  31: 251.83851001200003,
  32: 55.926190956481534,
  33: 72.4443400032,
  34: 12.69221484,
  35: 26.032340003999998,
  36: 58.56261169338368,
  37: 153.36183000480003,
  38: 7.622005931348156,
  39: np.nan,
  40: 72.24367956241771,
  41: 14.83083001279999,
  42: 29.5726042895728,
  43: 38.723000005199985,
  44: 6.083562133,
  45: 12.845630001599998,
  46: 26.66998281055652,
  47: 63.26220000600001,
  48: 9.917530329288393,
  49: np.nan,
  50: 8.555606693927,
  51: 23.802009994800002,
  52: 7.113126469779095,
  53: 7.206999998399999,
  54: 0.910216433,
  55: 1.4089999991999997,
  56: 0.5322637911479053,
  57: 15.186009997200001,
  58: 0.0,
  59: np.nan,
  60: 9.716385738295367,
  61: 14.7327399948,
  62: 6.671946105284065,
  63: 5.691999996,
  64: 0.956574175,
  65: 1.0203399996,
  66: 1.6040220980113027,
  67: 8.020399999199999,
  68: 0.4838433599999999,
  69: np.nan,
  70: 37.88758167841983,
  71: 59.03332998119994,
  72: 22.874363860953647,
  73: 13.690399997999998,
  74: 6.194107518,
  75: 6.4613199911999954,
  76: 7.367580219466185,
  77: 38.881609991999944,
  78: 1.4515300799999995,
  79: np.nan},
 '2021-07-12': {0: 607.2971827405001,
  1: 88.9671100664001,
  2: 21.26749278974862,
  3: 17.1524199804,
  4: 64.471138092,
  5: 89.84481002279999,
  6: 26.2044999999998,
  7: 51.9698800632001,
  8: 5.354051858751745,
  9: 0.0,
  10: 177.42361595891452,
  11: 287.5395700032,
  12: 52.117,
  13: 47.388199995600004,
  14: 20.94835038,
  15: 41.4250800048,
  16: 83.57340667555117,
  17: 198.72629000280003,
  18: 20.784858903363354,
  19: np.nan,
  20: 178.323907459086,
  21: 185.83897002839998,
  22: 52.37029646474982,
  23: 27.87144997800001,
  24: 17.88339044,
  25: 23.645340010799984,
  26: 91.11855133792106,
  27: 134.3221800396,
  28: 16.95166921641509,
  29: np.nan,
  30: 128.82813286243115,
  31: 192.6867300156,
  32: 53.46403160619618,
  33: 41.412320006399995,
  34: 12.1261155,
  35: 11.840830002000002,
  36: 55.95153983444301,
  37: 139.43358000720002,
  38: 7.286445921791947,
  39: np.nan,
  40: 69.04410667683521,
  41: 93.877410018,
  42: 28.270665735943805,
  43: 27.512680004399986,
  44: 5.811656147,
  45: 5.2319800032,
  46: 25.480875296710053,
  47: 61.132750010400024,
  48: 9.480909497181356,
  49: np.nan,
  50: 8.178601399067174,
  51: 17.6743199976,
  52: 6.7999699585309585,
  53: 6.131999998799999,
  54: 0.870099156,
  55: 0.6185600004,
  56: 0.5085322845362154,
  57: 10.923759998400003,
  58: 0.0,
  59: np.nan,
  60: 9.287042311133577,
  61: 19.966500000000007,
  62: 6.378212628950804,
  63: 6.524999997600001,
  64: 0.913782413,
  65: 1.9303400016,
  66: 1.5325051891827732,
  67: 11.511160000800006,
  68: 0.4625420799999998,
  69: np.nan,
  70: 36.21177607303267,
  71: 51.3836100036,
  72: 21.86731639537707,
  73: 10.310769999600003,
  74: 5.917744056,
  75: 5.152679999999999,
  76: 7.039089381655591,
  77: 35.920160003999996,
  78: 1.3876262399999995,
  79: np.nan}}).set_index(['group','category','type'])

I am trying to merge df2 on df1 by group, category, date (date is a column) so that my output would look like this:

I omitted the values from df2 of sales_1 & sales_2 in my desired output example, but those rows should be filled with the corresponding group and category values from df2.

                    2021-06-28                                                               2021-07-05                                                      2021-07-12
                    total_orders    total_sales         sales_1        sales_2                  total_orders    total_sales         sales_1        sales_2            total_orders    total_sales         sales_1        sales_2
group    category
A        Apple        88.000          4,603.210
         Amazon       66.000          2,485.060
         Google      191.000          4,919.400
         Netflix     558.000          6,097.770
         Facebook     12.000             31.220
         Uber          4.000            155.710
         Tesla        56.000          3,484.990
         total       975.000         17,237.360
B        Apple        90.000            561.540
         Amazon       26.000            698.750
         Google       49.000          1,290.130
         Netflix     250.000          4,292.680
         Facebook      7.000            947.650
         Uber          2.000            329.000
         Tesla        44.000          2,889.650
         total       468.000          9,989.400
C        Apple        36.000            330.890
         Amazon       52.000          2,076.260
         Google       94.000          2,982.270
         Netflix     750.000         11,978.620
         Facebook     10.000            683.000
         Uber          0.000              0.000
         Tesla        52.000          3,812.170
         total       994.000         20,963.210
D        Apple        16.000            234.490
         Amazon       22.000            896.100
         Google        5.000            231.000
         Netflix      57.000            893.810
         Uber          3.000            129.000
         Tesla        33.000          1,712.330
         total       136.000          4,106.730

So that sales_1 & sales_2 are merged on group & category and are on the same date column. The total_x from df2 can be ignored as it can be calculated from the fields.

The total_values are not used in merge, only the ones after it.

Enter image description here

What I've tried:

df1.reset_index().merge(df2.reset_index(), left_on=['group', 'category'], right_on=['group', 'category'])

Which throws a warning:

UserWarning: merging between different levels can give an unintended result (2 levels on the left,1 on the right)

And is not how I expect it to merge. How could I achieve my desired output?

Using

df = df1.merge(df2.unstack(), left_index=True, right_index=True)

Produces:

Enter image description here

Is it then just reordering of columns as I want to have a unique date and 4 columns for it? Or it might be that one date has 00:00:00 to it?

like image 329
Jonas Palačionis Avatar asked Jul 16 '21 10:07

Jonas Palačionis


People also ask

How do I merge multiple DataFrames based on index?

Merging Dataframes by index of both the dataframes As both the dataframe contains similar IDs on the index. So, to merge the dataframe on indices pass the left_index & right_index arguments as True i.e. Both the dataframes are merged on index using default Inner Join.

Can a DataFrame have multiple indexes?

A multi-index (also known as hierarchical index) dataframe uses more than one column as the index of the dataframe. A multi-index dataframe allows you to store your data in multi-dimension format, and opens up a lot of exciting to represent your data.


1 Answers

Create DatetimeIndex in column in df2 first, then unstack and merge by MultiIndexes:

f = lambda x: pd.to_datetime(x)
df = (df1.merge(df2.rename(columns=f).unstack(), left_index=True, right_index=True)
        .sort_index(axis=1))

print (df.head())

               2021-06-28                                  2021-07-05  \
                  Sales_1 Sales_2 total_orders total_sales    Sales_1   
group category                                                          
A     Apple     52.172000  61.166         88.0     4603.21  54.517000   
      Amazon    52.425509  25.662         66.0     2485.06  54.782086   
      Google    53.520397  58.812        191.0     4919.40  55.926191   
      Netflix    6.807139  12.088        558.0     6097.77   7.113126   
      Facebook   2.300471  30.713         12.0       31.22  29.572604   

                                                  2021-07-12            \
                 Sales_2 total_orders total_sales    Sales_1   Sales_2   
group category                                                           
A     Apple     66.55266        109.0     3453.02  52.117000  47.38820   
      Amazon    36.75802         48.0    17868.73  52.370296  27.87145   
      Google    72.44434        174.0    44707.83  53.464032  41.41232   
      Netflix    7.20700        592.0    61425.99   6.799970   6.13200   
      Facebook  38.72300         13.0     1261.00  28.270666  27.51268   

                                         
               total_orders total_sales  
group category                           
A     Apple            32.0     2147.70  
      Amazon           15.0     4767.30  
      Google           89.0     2399.30  
      Netflix         239.0     3137.44  
      Facebook          2.0      178.00  
like image 70
jezrael Avatar answered Nov 15 '22 06:11

jezrael