I am trying to remove rows that are labeled as outliers. This works when I call the transformer on its own, but not inside a pipeline, and I am not sure why.
from sklearn.datasets import make_classification
# Toy 3-class dataset: 100 samples, 10 features of which 5 are informative.
X1, y1 = make_classification(
    n_samples=100,
    n_features=10,
    n_informative=5,
    n_classes=3,
)
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest
import numpy as np
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` predicts to be outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Forwarded to ``IsolationForest(contamination=...)``.

    Attributes
    ----------
    isolation_forest : IsolationForest
        The wrapped outlier detector.
    mask : boolean array
        Set by ``fit``; one entry per row passed to ``fit``, ``True`` where
        the detector predicted an inlier (``predict(X) == 1``).

    Notes
    -----
    When ``y`` is given, ``transform`` and ``fit_transform`` return an
    ``(X, y)`` tuple instead of a single array.
    NOTE(review): ``Pipeline.fit`` hands a step's output to the next step as
    ``Xt`` together with the *original* ``y``; it does not unpack this tuple,
    which is what produces the "inhomogeneous shape" ``ValueError`` when this
    class is used as a pipeline step.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def _inlier_mask(self, X):
        """Return a boolean array that is ``True`` for rows predicted as inliers."""
        return self.isolation_forest.predict(X) == 1

    @staticmethod
    def _select(mask, X, y):
        """Apply ``mask`` to ``X`` and, when it is not ``None``, to ``y`` as well."""
        if y is None:
            return X[mask]
        return X[mask], y[mask]

    def fit(self, X, y=None):
        """Fit the detector on ``X`` and remember which training rows are inliers.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        # The detector was built in __init__; re-sync it so that a later
        # set_params(contamination=...) is honoured instead of silently ignored.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        self.mask = self._inlier_mask(X)
        return self

    def transform(self, X, y=None):
        """Return the inlier rows of ``X`` (and of ``y`` when it is given).

        The mask is computed for the ``X`` passed in rather than reusing the
        fit-time mask, which only lines up with the training matrix.
        """
        return self._select(self._inlier_mask(X), X, y)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its inlier rows (and those of ``y`` if given)."""
        self.fit(X, y)
        # fit() has just computed the mask for this very X, so reuse it.
        return self._select(self.mask, X, y)
# Used on its own (outside a Pipeline) the remover works: because y is given,
# fit_transform returns an (X, y) tuple holding only the rows that were kept.
working = IsolationForestOutlierRemover().fit_transform(X1, y1)
working[0].shape
# (95, 10) -- 95 of the 100 rows are kept
working
# %%
# Imported here so the cell runs on its own; nothing earlier in this snippet
# brings Pipeline or RandomForestClassifier into scope.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

pipelinet = Pipeline(
    [
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ]
)
# This is the call that fails: the first step returns an (X, y) tuple, which
# the pipeline passes on unchanged as X to RandomForestClassifier.fit.
notworking = pipelinet.fit(X1, y1)
notworking
Getting the following error:
ValueError Traceback (most recent call last)
/home/mmann1123/Documents/github/YM_TZ_crop_classifier/4_model.py in line 10
349 # %%
351 pipelinet = Pipeline(
352 [
353 ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
354 ("random_forest", RandomForestClassifier()),
355 ]
356 )
---> 358 notworking = pipelinet.fit(X1, y1)
359 notworking
File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/pipeline.py:406, in Pipeline.fit(self, X, y, **fit_params)
404 if self._final_estimator != "passthrough":
405 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 406 self._final_estimator.fit(Xt, y, **fit_params_last_step)
408 return self
File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:346, in BaseForest.fit(self, X, y, sample_weight)
344 if issparse(y):
345 raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 346 X, y = self._validate_data(
347 X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
348 )
...
--> 185 array = numpy.asarray(array, order=order, dtype=dtype)
186 return xp.asarray(array, copy=copy)
187 else:
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 95) + inhomogeneous part.
I don't have your specific package versions, and I am not using conda, but I was able to replicate your problem and fix it.
RandomForestClassifier expects two arrays X and y for the fit method. After the outlier removal, the transformed X and y need to be passed to the next step in the pipeline, but your current transform method in the IsolationForestOutlierRemover class returns a single tuple when y is not None, which is causing the issue.
To fix this, you need to make sure the filtered X and y are both passed on to the RandomForestClassifier. There are a couple of ways to do this; I did it by overriding fit_transform in the transformer.
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` predicts to be outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Forwarded to ``IsolationForest(contamination=...)``.

    Attributes
    ----------
    isolation_forest : IsolationForest
        The wrapped outlier detector.
    mask : boolean array
        Set by ``fit``; one entry per row passed to ``fit``, ``True`` where
        the detector predicted an inlier (``predict(X) == 1``).

    Notes
    -----
    When ``y`` is given, ``transform`` and ``fit_transform`` return an
    ``(X, y)`` tuple instead of a single array.
    NOTE(review): ``Pipeline.fit`` forwards a step's output to the final
    estimator as ``Xt`` together with the *original* ``y`` and does not unpack
    a tuple, so this class is still expected to fail as a pipeline step until
    the row filtering of ``y`` is moved elsewhere -- confirm by running.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def _inlier_mask(self, X):
        """Return a boolean array that is ``True`` for rows predicted as inliers."""
        return self.isolation_forest.predict(X) == 1

    @staticmethod
    def _select(mask, X, y):
        """Apply ``mask`` to ``X`` and, when it is not ``None``, to ``y`` as well."""
        if y is None:
            return X[mask]
        return X[mask], y[mask]

    def fit(self, X, y=None):
        """Fit the detector on ``X`` and remember which training rows are inliers.

        ``y`` is accepted for API compatibility and ignored. Returns ``self``.
        """
        # The detector was built in __init__; re-sync it so that a later
        # set_params(contamination=...) is honoured instead of silently ignored.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        self.mask = self._inlier_mask(X)
        return self

    def transform(self, X, y=None):
        """Return the inlier rows of ``X`` (and of ``y`` when it is given).

        The mask is computed for the ``X`` passed in rather than reusing the
        fit-time mask, which only lines up with the training matrix.
        """
        return self._select(self._inlier_mask(X), X, y)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its inlier rows (and those of ``y`` if given).

        ``fit_params`` are forwarded to ``fit`` unchanged.
        """
        self.fit(X, y, **fit_params)
        # fit() has just computed the mask for this very X, so reuse it.
        return self._select(self.mask, X, y)
# Same two-step pipeline as in the question, built from the class defined above.
# NOTE(review): the first step still returns an (X, y) tuple when y is given,
# so this fit is expected to raise the same ValueError -- confirm by running.
pipeline = Pipeline(
    steps=[
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ],
)
pipeline.fit(X1, y1)
One thing to note: when you call fit on a scikit-learn Pipeline, it calls fit_transform on each intermediate step; fit() itself is only called on the final estimator.
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow! Thank you!
Donate Us With