In pandas, I can use the from_dummies method to reverse one-hot encoding. There doesn't seem to be a built-in method for this in polars. Here is a basic example:
# One-hot encoded example input: column names follow "<prefix>_<category>".
pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)
┌─────────┬──────────┬─────────┬──────────┬─────────┐
│ col1_hi ┆ col1_med ┆ col1_lo ┆ col2_yes ┆ col2_no │
│ ---     ┆ ---      ┆ ---     ┆ ---      ┆ ---     │
│ i64     ┆ i64      ┆ i64     ┆ i64      ┆ i64     │
╞═════════╪══════════╪═════════╪══════════╪═════════╡
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 0        ┆ 1       ┆ 1        ┆ 0       │
│ 0       ┆ 1        ┆ 0       ┆ 0        ┆ 1       │
│ 1       ┆ 0        ┆ 0       ┆ 1        ┆ 0       │
│ 1       ┆ 0        ┆ 0       ┆ 0        ┆ 1       │
└─────────┴──────────┴─────────┴──────────┴─────────┘
Reversing the to_dummies operation should result in something like this:
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
My first thought was to use a pivot. How could I go about implementing this functionality?
You could use pl.coalesce:
(df
 .with_columns([
    # Where a dummy column equals 1, substitute the column's own suffix
    # (the text after the last "_"); all other rows become null because
    # no `otherwise` branch is given.
    pl.when(pl.col(name) == 1)
      .then(pl.lit(name).str.extract(r"([^_]+$)"))
      .alias(name)
    for name in df.columns
 ])
 .select([
    # For every prefix, keep the first non-null suffix among its columns.
    pl.coalesce(pl.col(f"^{stem}_.+$")).alias(stem)
    # dict.fromkeys de-duplicates the prefixes while preserving their order.
    for stem in dict.fromkeys(
       name.rsplit("_", maxsplit=1)[0]
       for name in df.columns
    )
 ])
)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
Update: @Rodalm's approach is much neater:
def from_dummies(df, separator="_"):
    """Reverse a one-hot (dummy) encoding into one string column per prefix.

    Every column name in ``df`` is split at the *last* occurrence of
    ``separator`` into ``(name, value)``. All columns sharing the same
    ``name`` are collapsed into a single output column called ``name`` whose
    rows hold the ``value`` of the first dummy column (in ``df.columns``
    order) that equals 1, or null when none of them does.

    Args:
        df: DataFrame whose columns are all dummy columns named
            ``"<name><separator><value>"``.
        separator: String that separates the prefix from the category value.

    Returns:
        A DataFrame with one column per distinct prefix.

    Raises:
        ValueError: If a column name does not contain ``separator`` (the
            two-way unpacking of the split result fails).
    """
    col_exprs = {}

    for col in df.columns:
        name, value = col.rsplit(separator, maxsplit=1)
        # Wrap the category in pl.lit: polars parses a bare string passed to
        # `then` as a column name, which would look up a column called
        # e.g. "lo" instead of producing the literal text.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
        col_exprs.setdefault(name, []).append(expr)

    return df.select(
        pl.coalesce(exprs)  # keep the first non-null expression value by row
          .alias(name)
        for name, exprs in col_exprs.items()
    )
A similar approach to @jqurious's answer using pl.coalesce:
from collections import defaultdict
import polars as pl
# One-hot encoded example input: column names follow "<prefix>_<category>".
df = pl.DataFrame(
    {
        "col1_hi": [0, 0, 0, 1, 1],
        "col1_med": [0, 0, 1, 0, 0],
        "col1_lo": [1, 1, 0, 0, 0],
        "col2_yes": [1, 1, 0, 1, 0],
        "col2_no": [0, 0, 1, 0, 1],
    }
)
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding using ``pl.coalesce``.

    Every column name in ``df`` is split at the *last* occurrence of ``sep``
    into ``(name, value)``, so prefixes that themselves contain ``sep``
    (e.g. ``"my_col_hi"``) are handled. Columns sharing a ``name`` are merged
    into one output column holding the ``value`` of the first dummy column
    (in ``df.columns`` order) that equals 1, or null when none does.

    Args:
        df: DataFrame whose columns are all dummy columns named
            ``"<name><sep><value>"``.
        sep: String that separates the prefix from the category value.

    Returns:
        A DataFrame with one column per distinct prefix.

    Raises:
        ValueError: If a column name does not contain ``sep`` (the two-way
            unpacking of the split result fails).
    """
    col_exprs = defaultdict(list)
    for col in df.columns:
        # Split on the last separator only; a plain split() would yield more
        # than two parts for names with several separators and fail to unpack.
        name, value = col.rsplit(sep, maxsplit=1)
        # pl.lit is required: polars parses a bare string passed to `then`
        # as a column name rather than as literal text. Rows that are not 1
        # become null because no `otherwise` branch is given.
        expr = pl.when(pl.col(col) == 1).then(pl.lit(value))
        col_exprs[name].append(expr)
    res = df.select(**{
        name: pl.coalesce(exprs)  # keep the first non-null expression value by row
        for name, exprs in col_exprs.items()
    })
    return res
Or, generalizing @warwick12's approach by chaining multiple when/then expressions:
def from_dummies(df, sep="_"):
    """Reverse a one-hot (dummy) encoding with one chained when/then per prefix.

    Every column name in ``df`` is split at the *last* occurrence of ``sep``
    into ``(name, value)``, so prefixes that themselves contain ``sep`` are
    handled. For each ``name`` a single ``when(...).then(...)`` chain is
    built with one branch per dummy column, in ``df.columns`` order; the
    first branch whose column equals 1 supplies the row's value, and rows
    matching no branch are null because the chain has no ``otherwise``.

    Args:
        df: DataFrame whose columns are all dummy columns named
            ``"<name><sep><value>"``.
        sep: String that separates the prefix from the category value.

    Returns:
        A DataFrame with one column per distinct prefix.

    Raises:
        ValueError: If a column name does not contain ``sep`` (the two-way
            unpacking of the split result fails).
    """
    col_exprs = {}
    for col in df.columns:
        # Split on the last separator only; a plain split() would yield more
        # than two parts for names with several separators and fail to unpack.
        name, value = col.rsplit(sep, maxsplit=1)
        is_set = pl.col(col) == 1
        # pl.lit is required: polars parses a bare string passed to `then`
        # as a column name rather than as literal text.
        category = pl.lit(value)
        if name not in col_exprs:
            # First dummy column of this prefix opens the chain.
            col_exprs[name] = pl.when(is_set).then(category)
        else:
            # Later dummy columns add another branch to the existing chain.
            col_exprs[name] = col_exprs[name].when(is_set).then(category)

    return df.select(**col_exprs)
Output:
>>> from_dummies(df)
shape: (5, 2)
┌──────┬──────┐
│ col1 ┆ col2 │
│ ---  ┆ ---  │
│ str  ┆ str  │
╞══════╪══════╡
│ lo   ┆ yes  │
│ lo   ┆ yes  │
│ med  ┆ no   │
│ hi   ┆ yes  │
│ hi   ┆ no   │
└──────┴──────┘
If you love us, you can donate via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With