sklearn transformer for outlier removal - returning xy?

Question

I am trying to remove rows that are labeled outliers. I have this partially working, but not in the context of a pipeline and I am not sure why.

from sklearn.datasets import make_classification

# Synthetic 3-class problem: 100 samples, 10 features, 5 of them informative.
X1, y1 = make_classification(
    n_samples=100,
    n_features=10,
    n_informative=5,
    n_classes=3,
)

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import IsolationForest
import numpy as np


class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Drop the rows that an ``IsolationForest`` labels as outliers.

    Parameters
    ----------
    contamination : float, default=0.05
        Forwarded to ``IsolationForest(contamination=...)``.

    Attributes
    ----------
    isolation_forest : IsolationForest
        The wrapped detector; (re)fitted by ``fit``.
    mask : boolean array
        Set by ``fit``: True for the training rows the forest predicted as
        inliers (``predict(X) == 1``).

    Notes
    -----
    When ``y`` is supplied, ``transform`` and ``fit_transform`` return an
    ``(X, y)`` tuple instead of a single array. ``sklearn.pipeline.Pipeline``
    passes whatever an intermediate step returns to the next step as ``X``,
    so this class cannot shrink ``y`` from inside a ``Pipeline``; apply it to
    the data before building the pipeline instead.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and record which of its rows are inliers.

        ``y`` is accepted for API compatibility and is not used.
        Returns ``self``.
        """
        # set_params() only updates self.contamination, so push the current
        # value into the wrapped forest before every fit; otherwise a changed
        # contamination would silently have no effect.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        self.mask = self.isolation_forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Return the rows of ``X`` (and of ``y``, if given) predicted as inliers.

        The fitted forest scores the rows that are actually passed in, so the
        result is correct for data other than the training set as well.
        Returns ``X[mask]`` when ``y`` is None, else ``(X[mask], y[mask])``.
        """
        # Do not reuse self.mask here: it describes the rows seen in fit() and
        # would select the wrong rows (or fail on a length mismatch) for any
        # other X.
        inliers = self.isolation_forest.predict(X) == 1
        return self._select(X, y, inliers)

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its inlier rows (with matching ``y`` rows)."""
        self.fit(X, y)
        # fit() already computed the mask for exactly these rows.
        return self._select(X, y, self.mask)

    @staticmethod
    def _select(X, y, mask):
        """Apply a boolean row mask to ``X`` and, when present, to ``y``."""
        # Assumes X and y support boolean-mask indexing (e.g. NumPy arrays).
        if y is None:
            return X[mask]
        return X[mask], y[mask]


# Called directly (outside a Pipeline) the remover works: because y is passed,
# fit_transform returns an (X, y) tuple with the outlier rows dropped from both.
working = IsolationForestOutlierRemover().fit_transform(X1, y1)
working[0].shape
# (95, 10) here: 95 of the 100 rows were kept
working

# %%

# The same remover used as the first step of a Pipeline, followed by a classifier.
pipelinet = Pipeline(
    [
        ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
        ("random_forest", RandomForestClassifier()),
    ]
)

# This call raises the ValueError shown below: the remover's fit_transform
# returns an (X, y) tuple, and the final estimator's fit receives that tuple
# as its X (hence the detected shape "(2, 95) + inhomogeneous part").
notworking = pipelinet.fit(X1, y1)
notworking

Getting the following error:

ValueError                                Traceback (most recent call last)
/home/mmann1123/Documents/github/YM_TZ_crop_classifier/4_model.py in line 10
      349 # %%
      351 pipelinet = Pipeline(
      352     [
      353         ("outlier_removal", IsolationForestOutlierRemover(contamination=0.05)),
      354         ("random_forest", RandomForestClassifier()),
      355     ]
      356 )
---> 358 notworking = pipelinet.fit(X1, y1)
     359 notworking

File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/pipeline.py:406, in Pipeline.fit(self, X, y, **fit_params)
    404     if self._final_estimator != "passthrough":
    405         fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 406         self._final_estimator.fit(Xt, y, **fit_params_last_step)
    408 return self

File ~/miniconda3/envs/crop_class/lib/python3.8/site-packages/sklearn/ensemble/_forest.py:346, in BaseForest.fit(self, X, y, sample_weight)
    344 if issparse(y):
    345     raise ValueError("sparse multilabel-indicator for y is not supported.")
--> 346 X, y = self._validate_data(
    347     X, y, multi_output=True, accept_sparse="csc", dtype=DTYPE
    348 )
...
--> 185     array = numpy.asarray(array, order=order, dtype=dtype)
    186     return xp.asarray(array, copy=copy)
    187 else:

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 2 dimensions. The detected shape was (2, 95) + inhomogeneous part.

artemis · Accepted Answer

I don't have your specific package versions, and I am not using conda, but I was able to replicate your problem and fix it.

RandomForestClassifier expects two arrays, X and y, in its fit method. A Pipeline passes whatever an intermediate step's fit_transform returns to the next step as X, and keeps passing the original y. Your transform method in the IsolationForestOutlierRemover class returns a single (X, y) tuple when y is not None, so the classifier receives that tuple as its X, which is what causes the error.

To fix this, you need to update the Pipeline to correctly pass the X and y to the RandomForestClassifier. There are a couple of ways to do this; I did it with overriding.

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

class IsolationForestOutlierRemover(BaseEstimator, TransformerMixin):
    """Remove rows flagged as outliers by an ``IsolationForest``.

    Parameters
    ----------
    contamination : float, default=0.05
        Forwarded to ``IsolationForest(contamination=...)``.

    Attributes
    ----------
    isolation_forest : IsolationForest
        The wrapped detector; (re)fitted by ``fit``.
    mask : boolean array
        Set by ``fit``: True for the training rows predicted as inliers
        (``predict(X) == 1``).

    Notes
    -----
    With ``y`` given, ``transform`` and ``fit_transform`` return an ``(X, y)``
    tuple. A ``sklearn.pipeline.Pipeline`` treats the return value of an
    intermediate step as the next step's ``X`` and keeps the original ``y``,
    so the tuple form is only usable when this class is called directly.
    """

    def __init__(self, contamination=0.05):
        self.contamination = contamination
        self.isolation_forest = IsolationForest(contamination=self.contamination)

    def fit(self, X, y=None):
        """Fit the forest on ``X`` and store the inlier mask of its rows.

        ``y`` is accepted for API compatibility and is not used.
        Returns ``self``.
        """
        # Keep the wrapped forest in sync with self.contamination, which is
        # the only thing set_params() changes.
        self.isolation_forest.set_params(contamination=self.contamination)
        self.isolation_forest.fit(X)
        self.mask = self.isolation_forest.predict(X) == 1
        return self

    def transform(self, X, y=None):
        """Keep only the rows of ``X`` (and ``y``) that are predicted as inliers.

        Returns ``X[mask]`` when ``y`` is None, else ``(X[mask], y[mask])``.
        """
        # Score the rows passed in rather than reusing the mask from fit(),
        # which only describes the training rows.
        inliers = self.isolation_forest.predict(X) == 1
        return self._select(X, y, inliers)

    def fit_transform(self, X, y=None, **fit_params):
        """Fit on ``X`` and return its inlier rows (with matching ``y`` rows).

        ``fit_params`` are forwarded to ``fit`` unchanged.
        """
        # fit() returns self, so there is no need to rebind it.
        self.fit(X, y, **fit_params)
        # The mask stored by fit() belongs to exactly these rows.
        return self._select(X, y, self.mask)

    @staticmethod
    def _select(X, y, mask):
        """Apply a boolean row mask to ``X`` and, when present, to ``y``."""
        # Assumes X and y support boolean-mask indexing (e.g. NumPy arrays).
        if y is None:
            return X[mask]
        return X[mask], y[mask]

# A Pipeline step can only replace X: the return value of an intermediate
# step's fit_transform is passed on as X while the original y is kept. Rows
# therefore cannot be dropped from y inside the pipeline, so remove the
# outliers from X and y together first and fit the pipeline on the result.
X_inliers, y_inliers = IsolationForestOutlierRemover(
    contamination=0.05
).fit_transform(X1, y1)

pipeline = Pipeline(
    [
        ("random_forest", RandomForestClassifier()),
    ]
)

pipeline.fit(X_inliers, y_inliers)

One thing to note: during Pipeline.fit, sklearn calls fit_transform on every intermediate step; fit() is called only on the final estimator.

sklearn transformer for outlier removal - returning xy?

Tags:

python

scikit-learn

pipeline

mmann1123

1 Answers

artemis

Recent Activity

Donate For Us

sklearn transformer for outlier removal - returning xy?

Tags:

python

scikit-learn

pipeline

mmann1123

1 Answers

artemis

Related questions

Recent Activity

Donate For Us