How do I align the nodes in a Sankey Diagram in Python?

Question

Need some help aligning my Sankey diagram properly.

I've been working on a Sankey diagram project but can't seem to get the nodes aligned properly; they seem to be a bit off. I've tried to set the x/y coordinates manually, but the result is still not what I want. Here is a bit of the generated data I am working with, along with the full script for the diagram.

import pandas as pd
import plotly.graph_objects as go
from io import StringIO

# Load data: an inline CSV with one row per flow between two categories.
# The columns are from_cat, to_cat and percent; percent is later passed as
# the link value of the Sankey trace.
csv = StringIO("""
from_cat,to_cat,percent
rpf,bp,3.55314197051978
rpf,cc,6.19084561675718
rpf,es,1.21024049650892
rpf,ic,2.46702870442203
rpf,rpf,2.26532195500388
rpf,sc,6.54771140418929
bp,bp,0.977501939487975
bp,cc,0.403413498836307
bp,es,0.108611326609775
bp,ic,4.7944142746315
bp,rpf,0.387897595034911
bp,sc,1.81536074476338
ic,bp,0.124127230411171
ic,cc,0.21722265321955
ic,es,0.0155159038013964
ic,ic,0.170674941815361
ic,rpf,0.0155159038013964
ic,sc,0.294802172226532
cc,bp,1.25678820791311
cc,cc,7.50969743987587
cc,es,9.41815360744763
cc,ic,0.775795190069822
cc,rpf,1.05508145849496
cc,sc,20.8068269976726
cc,sr,0.0465477114041893
sc,bp,0.0155159038013964
sc,cc,0.325833979829325
sc,es,1.92397207137316
sc,rpf,0.0155159038013964
sc,sc,4.43754848719938
sr,bp,0.0620636152055857
sr,cc,1.55159038013964
sr,es,5.10473235065943
sr,ic,0.0155159038013964
sr,rpf,0.0155159038013964
sr,sc,9.71295577967417
sr,sr,0.0775795190069822
es,bp,0.108611326609775
es,cc,0.574088440651668
es,es,1.48952676493406
es,ic,0.0310318076027929
es,rpf,0.0620636152055857
es,sc,2.00155159038014
es,sr,0.0465477114041893
""")
# skipinitialspace=True makes pandas skip spaces that follow a delimiter
df = pd.read_csv(csv, skipinitialspace=True)

# Order in which the categories are listed as nodes on each side
cat_order = ["es", "sr", "sc", "cc", "ic", "bp", "rpf"]

# Turn both category columns into ordered categoricals so that sorting
# follows cat_order instead of alphabetical order
for column in ("from_cat", "to_cat"):
    df[column] = pd.Categorical(df[column], categories=cat_order, ordered=True)

# Sort by (source, target) so the link order is deterministic
df = df.sort_values(by=["from_cat", "to_cat"])
df = df.reset_index(drop=True)

# Both sides of the diagram list the categories in the same order
left_order = right_order = cat_order
n_left, n_right = len(left_order), len(right_order)

# Node labels: all left-hand nodes first, then all right-hand nodes
labels = [f"{name} (L)" for name in left_order]
labels += [f"{name} (R)" for name in right_order]
label_to_index = dict(zip(labels, range(len(labels))))


def _node_index(category, side):
    """Return the node index of `category` on `side` ("L" or "R"), or -1 if it has no label."""
    return label_to_index.get(f"{category} ({side})", -1)


# Look up the node index of every link's source and target
df["source_idx"] = df["from_cat"].map(lambda category: _node_index(category, "L"))
df["target_idx"] = df["to_cat"].map(lambda category: _node_index(category, "R"))

# Coerce the mapped columns to plain integers; anything non-numeric becomes -1
for idx_col in ("source_idx", "target_idx"):
    as_number = pd.to_numeric(df[idx_col], downcast="integer", errors="coerce")
    df[idx_col] = as_number.fillna(-1).astype(int)

# Fill color per category ("Unknown" has no matching entry in cat_order)
CATEGORY_COLORS = dict(
    es="#F6C57A",
    sr="#A6D8F0",
    sc="#7BDCB5",
    cc="#FFC20A",
    ic="#88BDE6",
    bp="#F4A582",
    rpf="#DDA0DD",
    Unknown="#D3D3D3",
)

# One color per node, in label order (left nodes, then right nodes)
node_colors = [CATEGORY_COLORS[name] for name in [*left_order, *right_order]]
# Each link is drawn in the color of the node it starts from
link_colors = [node_colors[idx] for idx in df["source_idx"].tolist()]

# Manual node positions, one entry per node in label order:
# seven left-hand nodes followed by seven right-hand nodes
x = [0.001] * 7 + [0.999] * 7

# The same evenly spaced vertical positions are used for both columns
y = [0.05, 0.18, 0.31, 0.44, 0.57, 0.70, 0.83] * 2

# Node appearance and the manually chosen positions
node_spec = dict(
    pad=40,
    thickness=25,
    line=dict(color="black", width=0.5),
    label=labels,
    x=x,
    y=y,
    color=node_colors,
)

# One link per row of df, colored like its source node
link_spec = dict(
    source=df["source_idx"].tolist(),
    target=df["target_idx"].tolist(),
    value=df["percent"].tolist(),
    color=link_colors,
    hovertemplate="%{source.label} → %{target.label}<br><b>%{value:.2f}%</b><extra></extra>",
)

# Build the Sankey trace with the supplied node coordinates
sankey = go.Sankey(
    arrangement="fixed",
    node=node_spec,
    link=link_spec,
    valueformat=".2f",
    valuesuffix="%",
)
fig = go.Figure(sankey)

# Page-level styling
layout_options = dict(
    title="Flow",
    font_size=12,
    paper_bgcolor="#f7f7f7",
    plot_bgcolor="#f7f7f7",
    margin=dict(l=30, r=30, t=60, b=30),
    width=1000,
    height=800,
)
fig.update_layout(**layout_options)
fig.show()

strawdog · Accepted Answer

The trickiest part is calculating the Y coordinates for your nodes. Y designates the position of the center of a node. To get the appropriate coordinates, we have to sum the widths of all previous nodes and add half of the current node's width. I used an additional DataFrame to save time, but you might find a more elegant way to do this.

import pandas as pd
import plotly.graph_objects as go
from io import StringIO

# Load data: an inline CSV with one row per flow between two categories.
# The columns are from_cat, to_cat and percent; percent is summed per
# category below and passed as the link value of the Sankey trace.
csv = StringIO("""
from_cat,to_cat,percent
rpf,bp,3.55314197051978
rpf,cc,6.19084561675718
rpf,es,1.21024049650892
rpf,ic,2.46702870442203
rpf,rpf,2.26532195500388
rpf,sc,6.54771140418929
bp,bp,0.977501939487975
bp,cc,0.403413498836307
bp,es,0.108611326609775
bp,ic,4.7944142746315
bp,rpf,0.387897595034911
bp,sc,1.81536074476338
ic,bp,0.124127230411171
ic,cc,0.21722265321955
ic,es,0.0155159038013964
ic,ic,0.170674941815361
ic,rpf,0.0155159038013964
ic,sc,0.294802172226532
cc,bp,1.25678820791311
cc,cc,7.50969743987587
cc,es,9.41815360744763
cc,ic,0.775795190069822
cc,rpf,1.05508145849496
cc,sc,20.8068269976726
cc,sr,0.0465477114041893
sc,bp,0.0155159038013964
sc,cc,0.325833979829325
sc,es,1.92397207137316
sc,rpf,0.0155159038013964
sc,sc,4.43754848719938
sr,bp,0.0620636152055857
sr,cc,1.55159038013964
sr,es,5.10473235065943
sr,ic,0.0155159038013964
sr,rpf,0.0155159038013964
sr,sc,9.71295577967417
sr,sr,0.0775795190069822
es,bp,0.108611326609775
es,cc,0.574088440651668
es,es,1.48952676493406
es,ic,0.0310318076027929
es,rpf,0.0620636152055857
es,sc,2.00155159038014
es,sr,0.0465477114041893
""")
# skipinitialspace=True makes pandas skip spaces that follow a delimiter
df = pd.read_csv(csv, skipinitialspace=True)

# Category order used for the nodes on both sides of the diagram
cat_order = ["es", "sr", "sc", "cc", "ic", "bp", "rpf"]

# Ordered categoricals make sorting and grouping follow cat_order
for col_name in ("from_cat", "to_cat"):
    df[col_name] = pd.Categorical(df[col_name], categories=cat_order, ordered=True)

# Deterministic row order: by source category, then by target category
df = df.sort_values(by=["from_cat", "to_cat"])
df = df.reset_index(drop=True)

# The left and right columns share the same category order
left_order = right_order = cat_order
n_left, n_right = len(left_order), len(right_order)

# Labels for the left-hand nodes come first, followed by the right-hand nodes
labels = [f"{cat} (L)" for cat in left_order]
labels.extend(f"{cat} (R)" for cat in right_order)
label_to_index = dict(zip(labels, range(len(labels))))


def _node_index(category, side):
    """Return the node index of `category` on `side` ("L" or "R"), or -1 if it has no label."""
    return label_to_index.get(f"{category} ({side})", -1)


# Resolve each link's source and target category to a node index
df["source_idx"] = df["from_cat"].map(lambda category: _node_index(category, "L"))
df["target_idx"] = df["to_cat"].map(lambda category: _node_index(category, "R"))

# Force both index columns to plain integers; non-numeric entries become -1
for idx_col in ("source_idx", "target_idx"):
    numeric = pd.to_numeric(df[idx_col], downcast="integer", errors="coerce")
    df[idx_col] = numeric.fillna(-1).astype(int)

# Fill color per category ("Unknown" has no matching entry in cat_order)
CATEGORY_COLORS = dict(
    es="#F6C57A",
    sr="#A6D8F0",
    sc="#7BDCB5",
    cc="#FFC20A",
    ic="#88BDE6",
    bp="#F4A582",
    rpf="#DDA0DD",
    Unknown="#D3D3D3",
)

# Node colors follow label order: left column first, then right column
node_colors = [CATEGORY_COLORS[cat] for cat in [*left_order, *right_order]]
# A link inherits the color of its source node
link_colors = [node_colors[source] for source in df["source_idx"].tolist()]

# x/y coordinates to center Sankey
# One x per node in label order: every left node, then every right node.
# Sizing the lists from n_left/n_right keeps them the same length as `labels`
# even when a category has no rows in df.
x = [0.001] * n_left + [0.999] * n_right


def _stacked_centers(sizes):
    """Return the normalized center of each node in one column.

    `sizes` is a Series of node sizes in display order. Y designates the
    _center_ of a node, so each center is the sum of the sizes of all
    preceding nodes plus half of the node's own size. Dividing by the
    column total (rather than a hard-coded 100) expresses the result as a
    fraction of the column whatever the values add up to.
    """
    return sizes.cumsum().sub(sizes / 2).div(sizes.sum())


# Calculating Y positions for left and right nodes.
# observed=False keeps one row per category of the categorical column, in
# category order, with a total of 0 for categories that never occur. Both
# groupings therefore share the same index, so the right-hand totals are not
# dropped or shifted when they are aligned to the left-hand ones in `res`.
res = pd.DataFrame()
res["L1"] = df.groupby("from_cat", observed=False)["percent"].sum()
res["Y1"] = _stacked_centers(res["L1"])
res["L2"] = df.groupby("to_cat", observed=False)["percent"].sum()
res["Y2"] = _stacked_centers(res["L2"])
y = res["Y1"].tolist() + res["Y2"].tolist()
# NOTE(review): the node `pad` setting is not part of this calculation -
# confirm the rendered spacing visually.

# Node appearance plus the computed positions
node_spec = dict(
    pad=40,
    thickness=25,
    line=dict(color="black", width=0.5),
    label=labels,
    x=x,
    y=y,
    color=node_colors,
)

# One link per row of df, colored like its source node
link_spec = dict(
    source=df["source_idx"].tolist(),
    target=df["target_idx"].tolist(),
    value=df["percent"].tolist(),
    color=link_colors,
    hovertemplate="%{source.label} → %{target.label}<br><b>%{value:.2f}%</b><extra></extra>",
)

# Build the Sankey trace with the supplied node coordinates
sankey = go.Sankey(
    arrangement="fixed",
    node=node_spec,
    link=link_spec,
    valueformat=".2f",
    valuesuffix="%",
)
fig = go.Figure(sankey)

# Page-level styling
layout_options = dict(
    title="Flow",
    font_size=12,
    paper_bgcolor="#f7f7f7",
    plot_bgcolor="#f7f7f7",
    margin=dict(l=30, r=30, t=60, b=30),
    width=1000,
    height=800,
)
fig.update_layout(**layout_options)
fig.show()

This will produce something like the following:

enter image description here

How do I align the nodes in a Sankey Diagram in Python?

Tags:

python

sankey-diagram

GSA

1 Answers

strawdog

Recent Activity

Donate For Us

How do I align the nodes in a Sankey Diagram in Python?

Tags:

python

sankey-diagram

GSA

1 Answers

strawdog

Related questions

Recent Activity

Donate For Us