Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Visualizing the difference between two numeric arrays

I have two numeric arrays of equal length, where every element of one array is always greater than or equal to the corresponding (same-index) element of the other array.

I am trying to visualize in a single graph:

i) difference between the corresponding elements,

ii) values of the corresponding elements in the two arrays.

I have tried plotting the CDF as below:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Two equal-length samples; arr2 is arr1 plus a positive random offset,
# so arr2 is element-wise larger than arr1.
arr1 = np.random.uniform(1, 20, [25, 1])
arr2 = arr1 + np.random.uniform(1, 10, [25, 1])
df1 = pd.DataFrame(arr1)
df2 = pd.DataFrame(arr2)

# Draw all three cumulative KDE curves on the same, explicitly named axes.
fig, ax = plt.subplots()
sns.kdeplot(df1[0], cumulative=True, color='orange', label='arr1', ax=ax)
sns.kdeplot(df2[0], cumulative=True, color='b', label='arr2', ax=ax)
sns.kdeplot(df2[0] - df1[0], cumulative=True, color='r', label='difference', ax=ax)
# The labels above are only shown once a legend is requested.
ax.legend()
plt.show()

which gives the following output:

CDF of arrays

However, it does not capture the difference and the values of the corresponding elements together. For example, suppose the difference between two elements is 3. The two numbers could be 2 and 5, but they could also be 15 and 18, and this cannot be determined from the CDF.

Which kind of plot can visualize both the difference between the elements and the values of the elements?

I do not wish to use a line plot as below, because not much statistical insight can be derived from that visualization.

# Plain line plot of both arrays and their element-wise difference,
# drawn in the same order on the existing axes.
for series in (df1[0], df2[0], df2[0] - df1[0]):
    ax.plot(series)
like image 438
Saad Avatar asked Sep 16 '25 17:09

Saad


1 Answer

There are lots of ways to show the difference between two sets of values. It really depends on your goal for the chart, how quantitative or qualitative you want to be, and whether you want to show the raw data. Here are a few ideas that come to mind that do not involve simple line plots or density functions. I strongly recommend the book Better Data Visualizations by Jonathan Schwabish, which discusses interesting considerations regarding data presentation.

enter image description here

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import ticker

def style_func(ax, title, xlim=(-25, 25), n_rows=25):
    """Apply the shared look used by the pyramid chart and the dot plots.

    Hides the left, top and right spines, sets a bold title, fixes the
    x-range, places an x tick every 5 units and a y tick on every other
    row, and thickens the bottom spine and the x ticks.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to style in place.
    title : str
        Title text, drawn in bold.
    xlim : tuple of float, optional
        ``(left, right)`` limits of the x-axis. Defaults to ``(-25, 25)``.
    n_rows : int, optional
        Number of rows drawn at y = 1..n_rows; y ticks are placed at
        1, 3, 5, ... up to that value. Defaults to 25.
    """
    for side in ('left', 'top', 'right'):
        ax.spines[side].set_visible(False)
    ax.set_title(title, fontweight='bold')
    ax.set_xlim(*xlim)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(5))
    ax.yaxis.set_major_locator(ticker.FixedLocator(np.arange(1, n_rows + 1, 2)))
    ax.spines['bottom'].set_linewidth(1.5)
    ax.tick_params(axis='x', length=8, width=1.5)
    ax.tick_params(axis='y', left=False)


# Sample data: col2 is col1 plus a positive random offset.
n_points = 25
arr1 = np.random.uniform(1, 20, size=n_points)
arr2 = arr1 + np.random.uniform(1, 10, size=n_points)

df = pd.DataFrame({
    'col1': arr1,
    'col2': arr2,
})

df['diff'] = df.col2 - df.col1
df['sum'] = df.col1 + df.col2

# One y position per data row, shared by every row-based chart.
rows = np.arange(1, n_points + 1)

# col2 can reach 30, so round its maximum up to the next multiple of 5
# (the x tick spacing) instead of clipping the data at a fixed 25.
x_max = 5 * np.ceil(df.col2.max() / 5)

fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(15, 15))
axes = axes.flatten()

# Pyramid chart: col1 grows to the left of zero, col2 to the right.
df_sorted = df.sort_values(by='sum', ascending=True)
axes[0].barh(y=rows, width=-df_sorted.col1)
axes[0].barh(y=rows, width=df_sorted.col2)
style_func(axes[0], 'Pyramid Chart', xlim=(-x_max, x_max), n_rows=n_points)

# Dot Plot: rows in original order, each pair joined by a line.
axes[1].scatter(df.col1, rows, label='col1')
axes[1].scatter(df.col2, rows, label='col2')
axes[1].hlines(
    y=rows,
    xmin=df.col1, xmax=df.col2,
    zorder=0, linewidth=1.5, color='k'
)
axes[1].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[1], 'Dot Plot', xlim=(0, x_max), n_rows=n_points)

# Dot Plot 2: same as above, rows sorted by col1 (then diff), descending.
df_sorted = df.sort_values(by=['col1', 'diff'], ascending=False)
axes[2].scatter(df_sorted.col1, rows, label='col1')
axes[2].scatter(df_sorted.col2, rows, label='col2')
axes[2].hlines(
    y=rows,
    xmin=df_sorted.col1, xmax=df_sorted.col2,
    zorder=0, linewidth=1.5, color='k'
)
axes[2].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[2], 'Dot Plot', xlim=(0, x_max), n_rows=n_points)

# Dot Plot 3: mirrored around zero like the pyramid chart, sorted by sum.
df_sorted = df.sort_values(by='sum', ascending=True)
axes[3].scatter(-df_sorted.col1, rows, label='col1')
axes[3].scatter(df_sorted.col2, rows, label='col2')
axes[3].vlines(x=0, ymin=-1, ymax=n_points + 2, linewidth=2.5, color='k')
axes[3].hlines(
    y=rows,
    xmin=-df_sorted.col1, xmax=df_sorted.col2,
    zorder=0, linewidth=2
)
axes[3].legend(ncol=2, loc='center', bbox_to_anchor=(0.14, 1.025), edgecolor='w')
style_func(axes[3], 'Dot Plot', xlim=(-x_max, x_max), n_rows=n_points)

# Strip plot: each column on its own horizontal band, with a tick at its mean.
axes[4].scatter(df.col1, [4] * n_points)
axes[4].scatter(df.col2, [6] * n_points)
axes[4].set_ylim(0, 10)
axes[4].vlines(
    x=[df.col1.mean(), df.col2.mean()],
    ymin=[3.5, 5.5], ymax=[4.5, 6.5],
    color='black', linewidth=2
)
axes[4].yaxis.set_major_locator(ticker.FixedLocator([4, 6]))
axes[4].yaxis.set_major_formatter(ticker.FixedFormatter(['col1', 'col2']))
for side in ('left', 'top', 'right'):
    axes[4].spines[side].set_visible(False)
axes[4].set_title('Strip Plot', fontweight='bold')
axes[4].tick_params(axis='y', left=False)
axes[4].grid(axis='y', dashes=(8, 3), alpha=0.3, color='gray')

# Slope chart: one line per row from its col1 value (x=0) to its col2 value (x=1).
# Iterating the values directly avoids depending on the DataFrame's index labels.
for start, end in zip(df.col1, df.col2):
    axes[5].plot([0, 1], [start, end], color='k')
for x_pos, (label, h_align) in enumerate([('col1', 'left'), ('col2', 'right')]):
    axes[5].text(x=x_pos, y=0, s=label,
                 fontsize=14, fontweight='bold', ha=h_align)
axes[5].set_title('Slope chart', fontweight='bold')
axes[5].axis('off')

# Needed when run as a plain script; notebooks render the figure regardless.
plt.show()
like image 57
Coup Avatar answered Sep 19 '25 06:09

Coup