Sklearn Pipeline: Get feature names after OneHotEncode In ColumnTransformer

I want to get feature names after I fit the pipeline.

# Columns routed to each branch of the preprocessor.
numeric_features = ['num1', 'num2', 'num3', 'num4']
categorical_features = ['brand', 'category_name', 'sub_category']

# Numeric branch: median imputation followed by standard scaling.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'missing', then one-hot
# encode with handle_unknown='ignore'.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Order matters for positional lookups later: 'num' is entry 0, 'cat' is
# entry 1.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])


# Full model: the preprocessor feeds a gradient-boosting regressor.
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor()),
])

After fitting with a pandas DataFrame, I can get feature importances from clf.steps[1][1].feature_importances_


and I tried clf.steps[0][1].get_feature_names(), but I got an error:

AttributeError: Transformer num (type Pipeline) does not provide get_feature_names. 

How can I get feature names from this?

You can access the feature_names using the following snippet!

# Entry 1 of the fitted ColumnTransformer is the ('cat', pipeline, columns)
# tuple; its second element is the fitted categorical pipeline.
fitted_cat_pipeline = clf.named_steps['preprocessor'].transformers_[1][1]

# Ask the one-hot encoder inside that pipeline for its output column names.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out
# instead - confirm against the installed version.
fitted_cat_pipeline.named_steps['onehot'].get_feature_names(
    categorical_features)

With sklearn >= 0.21, we can make it simpler:

# Same lookup as before, using Pipeline's item access instead of named_steps.
# NOTE(review): newer scikit-learn releases expose get_feature_names_out
# instead - confirm against the installed version.
(
    clf['preprocessor']
    .transformers_[1][1]['onehot']
    .get_feature_names(categorical_features)
)

Reproducible example:

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Tiny frame with two categorical columns, one numeric column and a target.
# Note that 'NaN' here is an ordinary string category, not a missing value.
df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [0.2, 0.11, 1.34, 1.123]})

numeric_features = ['num1']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['brand', 'category']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('regressor', LinearRegression())])

# Drop the target by keyword: the positional-axis form df.drop('target', 1)
# was deprecated in pandas 1.3 and removed in pandas 2.0.
clf.fit(df.drop(columns='target'), df['target'])

# transformers_[1] is the fitted ('cat', pipeline, columns) entry.
onehot = (clf.named_steps['preprocessor'].transformers_[1][1]
          .named_steps['onehot'])

# get_feature_names was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out when the installed version provides it and fall
# back to the old name otherwise.
get_names = (getattr(onehot, 'get_feature_names_out', None)
             or onehot.get_feature_names)
get_names(categorical_features)

# ['brand_NaN' 'brand_aaaa' 'brand_asdfasdf' 'brand_sadfds' 'category_as'
#  'category_asdf' 'category_asdfas' 'category_asfa']
Scikit-Learn 1.0 now has new features to keep track of feature names.

import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# SimpleImputer does not have get_feature_names_out in Scikit-Learn 1.0.0, so
# we need to add it manually. This should be fixed in Scikit-Learn 1.0.1: all
# transformers will have this method. Only patch when the method is missing so
# a real implementation is never overwritten.
if not hasattr(SimpleImputer, "get_feature_names_out"):
    SimpleImputer.get_feature_names_out = (
        lambda self, names=None: self.feature_names_in_)

# Numeric columns are imputed and scaled; the city column is one-hot encoded.
num_pipeline = make_pipeline(SimpleImputer(), StandardScaler())
transformer = make_column_transformer(
    (num_pipeline, ["age", "height"]),
    (OneHotEncoder(), ["city"]))
pipeline = make_pipeline(transformer, LinearRegression())

df = pd.DataFrame({"city": ["Rabat", "Tokyo", "Paris", "Auckland"],
                   "age": [32, 65, 18, 24],
                   "height": [172, 163, 169, 190],
                   "weight": [65, 62, 54, 95]},
                  index=["Alice", "Bunji", "Cécile", "Dave"])

pipeline.fit(df, df["weight"])

## get pipeline feature names
# pipeline[:-1] is the pipeline without its final regressor step.
pipeline[:-1].get_feature_names_out()

## specify feature names as your columns
pd.DataFrame(pipeline[:-1].transform(df),
             columns=pipeline[:-1].get_feature_names_out(),
             index=df.index)
