import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression
# Toy data set: two string-valued columns, one integer column and a float
# target column.
df = pd.DataFrame(
    dict(
        brand=['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
        category=['asdf', 'asfa', 'asdfas', 'as'],
        num1=[1, 1, 0, 0],
        target=[0.2, 0.11, 1.34, 1.123],
    )
)

# Split the column names by dtype: object columns go to the encoder,
# int64/float64 columns go to the scaler.
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
train_continuous_cols = df.select_dtypes(
    include=["int64", "float64"]
).columns.tolist()

# Scale the numeric columns and one-hot encode the object columns; the scaled
# columns are listed first, so they come first in the transformed output.
preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols),
)

# NOTE: this rebinds ``df`` to the transformed matrix, replacing the frame.
df = preprocess.fit_transform(df)
Just trying to get all the feature names:
# Ask the fitted column transformer for its output feature names. This is the
# call that fails with the error quoted below, because the StandardScaler step
# does not provide get_feature_names.
preprocess.get_feature_names()
Getting this error:
Transformer standardscaler (type StandardScaler) does not provide get_feature_names
How can I solve it? The examples online use pipeline
and I'm trying to avoid that.
Applies transformers to columns of an array or pandas DataFrame. This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space.
Column Transformer is a scikit-learn class used to create and apply separate transformers for numerical and categorical data. To create a transformer, specify the transformer object together with the columns it should be applied to, and pass them as a tuple.
To use the ColumnTransformer, you must specify a list of transformers. Each transformer is a three-element tuple that defines the name of the transformer, the transform to apply, and the column indices to apply it to. For example: (Name, Object, Columns)
The following re-implementation of the ColumnTransformer returns a pandas DataFrame. Note that it should only be used if you input a pandas DataFrame to your pipeline.
All kudos go to Johannes Haupt, who provided the get_feature_names() function that is resilient to transformers that don't have this method (see the blog post "Extracting Column Names from the ColumnTransformer").
I commented out the warnings because I did not want them, and I also commented out the code that prepends the transformation step name to the column name; both are easy to un-comment if you want them.
#import warnings
import sklearn
import pandas as pd
class ColumnTransformerWithNames(ColumnTransformer):
    """ColumnTransformer whose transform output is a pandas DataFrame.

    The output columns are labelled with the names returned by
    ``get_feature_names``, which tolerates transformers that do not provide a
    ``get_feature_names`` method by falling back to their input column names.

    The input ``X`` must be a pandas DataFrame, because its index is reused
    for the returned frame.
    """

    def get_feature_names(column_transformer):
        """Get feature names from all transformers.

        Parameters
        ----------
        column_transformer : ColumnTransformerWithNames or Pipeline
            Plays the role of ``self``. During recursion this is the nested
            ``Pipeline`` whose steps are being inspected.

        Returns
        -------
        feature_names : list of strings
            Names of the features produced by transform.
        """

        def get_names(name, trans, column):
            """Return the output names of one (name, transformer, columns) entry."""
            if trans == 'drop' or (
                    hasattr(column, '__len__') and not len(column)):
                return []
            if trans == 'passthrough':
                if hasattr(column_transformer, '_df_columns'):
                    if ((not isinstance(column, slice))
                            and all(isinstance(col, str) for col in column)):
                        return column
                    return column_transformer._df_columns[column]
                indices = np.arange(column_transformer._n_features)
                return ['x%d' % i for i in indices[column]]
            if not hasattr(trans, 'get_feature_names'):
                # The transformer cannot name its outputs, so reuse the input
                # column names instead of raising. To be told when this
                # happens, emit a warning here, e.g.:
                # warnings.warn("Transformer %s (type %s) does not "
                #               "provide get_feature_names. "
                #               "Will return input column names if available"
                #               % (str(name), type(trans).__name__))
                if column is None:
                    return []
                # To prefix each name with its step, use: name + "__" + f
                return [f for f in column]
            # To prefix each name with its step, use: name + "__" + f
            return [f for f in trans.get_feature_names()]

        feature_names = []

        # Pipeline steps are (index, name, transformer) triples, so pad them
        # to the 4-tuples that a column transformer's _iter() produces.
        if isinstance(column_transformer, sklearn.pipeline.Pipeline):
            l_transformers = [
                (name, trans, None, None)
                for _, name, trans in column_transformer._iter()
            ]
        else:
            # For column transformers, follow the original method.
            l_transformers = list(column_transformer._iter(fitted=True))

        for name, trans, column, _ in l_transformers:
            if isinstance(trans, sklearn.pipeline.Pipeline):
                # Recurse through the class rather than the bound method: the
                # bound method already receives ``column_transformer`` as its
                # only argument, so passing ``trans`` to it raises TypeError.
                _names = ColumnTransformerWithNames.get_feature_names(trans)
                # No step of the pipeline could name its outputs: fall back
                # to the input columns when they are known.
                if not _names and column is not None:
                    _names = [f for f in column]
                feature_names.extend(_names)
            else:
                feature_names.extend(get_names(name, trans, column))
        return feature_names

    def _as_named_frame(self, X_mat, X):
        """Wrap a transformed matrix in a DataFrame that reuses the index of X."""
        # The parent class may hand back either a sparse matrix or a dense
        # array; only the sparse result has (and needs) toarray().
        if hasattr(X_mat, 'toarray'):
            X_mat = X_mat.toarray()
        return pd.DataFrame(
            X_mat,
            index=X.index.values.tolist(),
            columns=self.get_feature_names(),
        )

    def transform(self, X):
        """Transform the DataFrame X and return the result as a DataFrame."""
        return self._as_named_frame(super().transform(X), X)

    def fit_transform(self, X, y=None):
        """Fit on the DataFrame X and return the transformed DataFrame."""
        # Reuse the matrix produced while fitting instead of transforming X a
        # second time.
        return self._as_named_frame(super().fit_transform(X, y), X)
Then you can replace the calls to ColumnTransformer with ColumnTransformerWithNames. The output is a DataFrame, and this step now has a working get_feature_names().
I am assuming you are looking for ways to access the result of the transformer, which yields a numpy array.
ColumnTransformer has an attribute called transformers_:
From the documentation:
transformers_ : list The collection of fitted transformers as tuples of (name, fitted_transformer, column). `fitted_transformer` can be an estimator, 'drop', or 'passthrough'. In case there were no columns selected, this will be the unfitted transformer. If there are remaining columns, the final element is a tuple of the form: ('remainder', transformer, remaining_columns) corresponding to the ``remainder`` parameter. If there are remaining columns, then ``len(transformers_)==len(transformers)+1``, otherwise ``len(transformers_)==len(transformers)``.
Unfortunately, that only provides information about the transformer itself and the columns it has been applied to, not about the location of the resulting data, except for the following:
Notes: The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list.
So we know that the order of the output columns is the same as the order in which the columns are specified in the transformers list. We also know how many columns each of our transformer steps yields: a StandardScaler() yields the same number of columns as the original data, and a OneHotEncoder() yields a number of columns equal to the number of categories.
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
# Toy data set: two string-valued columns, one integer column and a float
# target column.
df = pd.DataFrame({
    'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
    'category': ['asdf', 'asfa', 'asdfas', 'asd'],
    'num1': [1, 1, 0, 0],
    'target': [0.2, 0.11, 1.34, 1.123],
})

train_continuous_cols = df.select_dtypes(
    include=["int64", "float64"]
).columns.tolist()
train_categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()

# get n_categories for categorical features
# NOTE: nunique() skips missing values by default; this assumes the
# categorical columns contain none.
n_categories = [df[x].nunique() for x in train_categorical_cols]

preprocess = make_column_transformer(
    (StandardScaler(), train_continuous_cols),
    (OneHotEncoder(), train_categorical_cols),
)
preprocessed_df = preprocess.fit_transform(df)

# the scaler yields 1 column each
indexes_scaler = list(range(len(train_continuous_cols)))

# the encoder yields a number of columns equal to the number of categories
# in the data
cum_index_encoder = [0] + list(np.cumsum(n_categories))

# the encoder indexes come after the scaler indexes; counting the scaler
# columns (rather than reading indexes_scaler[-1]) also works when there are
# no continuous columns at all
start_index_encoder = len(indexes_scaler)
indexes_encoder = [x + start_index_encoder for x in cum_index_encoder]

# get both lower and upper bound of index for every categorical column;
# stored as a list so the pairs can be iterated more than once
index_pairs = list(zip(indexes_encoder[:-1], indexes_encoder[1:]))
This results in the following output:
# Report how many continuous columns were scaled, then the shape of the slice
# of the transformed matrix that holds them.
scaler_headline = 'Transformed {} continious cols resulting in a df with shape:'.format(
    len(train_continuous_cols)
)
scaler_shape = preprocessed_df[:, indexes_scaler].shape
print(scaler_headline, scaler_shape, sep='\n')
Transformed 2 continious cols resulting in a df with shape: (4, 2)
# For every categorical column, print the shape of the slice of the
# transformed matrix that holds its one-hot encoded columns.
for column, (start_id, end_id) in zip(train_categorical_cols, index_pairs):
    print('Transformed column {} resulted in a df with shape:'.format(column))
    print(preprocessed_df[:, start_id:end_id].shape)
Transformed column brand resulted in a df with shape: (4, 4)
Transformed column category resulted in a df with shape: (4, 4)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With