I want to create a stable pipeline in scikit-learn for preprocessing the data. The first step that I am trying to complete is the imputation of None
values applied with different strategies (i.e. replacing with mean, median or other descriptive statistics) for different columns in the dataframe. However I
I started using SimpleImputer
transformer together with ColumnTransformer
. Because SimpleImputer
returns numpy array instead of pandas dataframe, I wrote a new transformer which uses SimpleImputer
under the hood but adds pandas columns and indices back to the numpy array. Why I need exactly pandas dataframe back? Because I see my pipeline like this:
pipeline = Pipeline([
('imputation', ImputationColumnTransformer),
('feature_encoding', EncodingColumnTransformer),
('model', MLModel)
])
Without column access, the second step of feature encoding simply wouldn't be able to proceed.
The problem is that when I am using customized transformers I always get some errors from internal scikit-learn validation code.
I created a simple example to show the type of errors I get:
# Creating a toy dataset
m = np.random.randn(3, 3)
m[0, 1] = np.nan
m[2, 2] = np.nan
df = pd.DataFrame(m, columns=['a', 'b', 'c'])
class Imputer(BaseEstimator, TransformerMixin):
# This transformer returns dataframe instead of default ndarray
def __init__(self, ImputerCls, strategy):
self.imputer = ImputerCls(strategy=strategy)
def fit(self, X, y=None):
self.imputer.fit(X, y)
return self
def transform(self, X):
res = self.imputer.transform(X)
res = pd.DataFrame(res)
res.columns = X.columns
res.index = X.index
return res
imputation = ColumnTransformer([
('categorial_imputer', Imputer(SimpleImputer, strategy='most_frequent'), ['a']),
('numeric_imputer', Imputer(SimpleImputer, strategy='mean'), ['b', 'c'])
])
imputation.fit_transform(df)
I expect pandas dataframe with all the columns preserved, however I am getting a long traceback log which I can not fully understand to find the problem. It seems that at some stage ImputerCls
is None.
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-70-0ea27e638c36> in <module>
3 ('numeric_imputer', Imputer(SimpleImputer, strategy='most_frequent'), ['b', 'c'])
4 ])
----> 5 imputation.fit_transform(df)
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
466 self._validate_remainder(X)
467
--> 468 result = self._fit_transform(X, y, _fit_transform_one)
469
470 if not result:
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _fit_transform(self, X, y, func, fitted)
410 message=self._log_message(name, idx, len(transformers)))
411 for idx, (name, trans, column, weight) in enumerate(
--> 412 self._iter(fitted=fitted, replace_strings=True), 1))
413 except ValueError as e:
414 if "Expected 2D array, got 1D array instead" in str(e):
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
752 tasks = BatchedCalls(itertools.islice(iterator, batch_size),
753 self._backend.get_nested_backend(),
--> 754 self._pickle_cache)
755 if len(tasks) == 0:
756 # No more tasks available in the iterator: tell caller to stop.
~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __init__(self, iterator_slice, backend_and_jobs, pickle_cache)
208
209 def __init__(self, iterator_slice, backend_and_jobs, pickle_cache=None):
--> 210 self.items = list(iterator_slice)
211 self._size = len(self.items)
212 if isinstance(backend_and_jobs, tuple):
~/anaconda3/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in <genexpr>(.0)
409 message_clsname='ColumnTransformer',
410 message=self._log_message(name, idx, len(transformers)))
--> 411 for idx, (name, trans, column, weight) in enumerate(
412 self._iter(fitted=fitted, replace_strings=True), 1))
413 except ValueError as e:
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in clone(estimator, safe)
63 for name, param in new_object_params.items():
64 new_object_params[name] = clone(param, safe=False)
---> 65 new_object = klass(**new_object_params)
66 params_set = new_object.get_params(deep=False)
67
<ipython-input-57-a319579eaf68> in __init__(self, ImputerCls, strategy)
2 # This class returns dataframe instead of default ndarray
3 def __init__(self, ImputerCls, strategy):
----> 4 self.imputer = ImputerCls(strategy=strategy)
5
6 def fit(self, X, y=None):
TypeError: 'NoneType' object is not callable
I got it working this way. I think the Imputer was not being instantiated:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
import pandas as pd
import numpy as np
# Creating a toy dataset
m = np.random.randn(3, 3)
m[0, 1] = np.nan
m[2, 2] = np.nan
df = pd.DataFrame(m, columns=['a', 'b', 'c'])
class Imputer(BaseEstimator, TransformerMixin):
# This transformer returns dataframe instead of default ndarray
def __init__(self, imputer, strategy):
self.imputer = imputer
self.strategy = strategy
def fit(self, X, y=None):
self.imputer = self.imputer(strategy=self.strategy)
self.imputer.fit(X, y)
return self
def transform(self, X, *_):
return self.imputer.transform(X)
imputation = ColumnTransformer([
('categorial_imputer', Imputer(SimpleImputer, strategy='most_frequent'), ['a']),
('numeric_imputer', Imputer(SimpleImputer, strategy='mean'), ['b', 'c'])
])
df = pd.DataFrame(imputation.fit_transform(df), columns=df.columns, index=df.index)
And that's it!
I think the problem is that ColumnTransformer returns a numpy darray. The above solution still converts the ColumnTransformer result to pandas dataframe outside the pipeline. Thus, the solution cannot be used as a step in sklearn Pipeline as the original poster desires.
I used inheritance to create a solution which can be used in a Pipeline. It works if the ColumnTransformer does not change the shape or order of columns or rows of input.
class PandasColumnTransformer(ColumnTransformer):
def transform(self, X: pd.DataFrame) -> pd.DataFrame:
return pd.DataFrame(super().transform(X), columns=X.columns, index=X.index)
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
return pd.DataFrame(super().fit_transform(X), columns=X.columns, index=X.index)
imputation = PandasColumnTransformer([
('categorial_imputer', SimpleImputer(strategy='most_frequent'), ['a']),
('numeric_imputer', SimpleImputer(strategy='mean'), ['b', 'c'])
])
df = imputation.fit_transform(df)
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With