I have a dataset with 20 features and 840 rows. I have already tuned the classifier (a random forest); my parameters are n_estimators=100 and max_features=5. I want to run a separate classification for each feature, i.e. train on one feature at a time and find the prediction accuracy that each feature gives on its own. But when I run my code I get an error. I am using scikit-learn version 0.18.
How can I fix the problem?
# Per-feature accuracy: fit the pipeline on ONE column of x at a time, score it
# with 10 repetitions of a 10-split stratified shuffle split, and plot the mean
# accuracy of every column as a bar chart.
# NOTE(review): result_list is not defined in this snippet; it is assumed to be
# created earlier (e.g. result_list = []) -- confirm.
for name in ["AWA"]:
    # Parse the .mat file once and take both arrays from the same dict.
    mat = sio.loadmat('/home/TrainVal/{}_Fp1.mat'.format(name))
    x = mat['x']
    y = np.ravel(mat['y'])
    print(name, x.shape, y.shape)
    print("")
    # max_features=5 is what makes this snippet fail with
    # "ValueError: max_features must be in (0, n_features]": every fit below
    # receives a single column, so n_features is 1 and an integer
    # max_features of 5 is out of range.
    clf = make_pipeline(preprocessing.RobustScaler(),
                        RandomForestClassifier(n_estimators=100,
                                               max_features=5, n_jobs=-1))
    #########10x10 SSS##############
    print("10x10")
    n_columns = x.shape[1]
    for i in range(n_columns):
        # Column i as a 2-D (n_samples, 1) array, as the estimator expects.
        xA = x[:, i].reshape(-1, 1)
        xSSSmean = []
        for j in range(10):
            # A different random_state per repetition gives 10 distinct
            # sets of 10 splits.
            sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=j)
            scoresSSS = cross_val_score(clf, xA, y, cv=sss)
            xSSSmean.append(scoresSSS.mean())
        feature_mean = np.mean(xSSSmean)
        result_list.append(feature_mean)
        plt.bar(i, feature_mean*100, align='center')
    plt.ylabel('Accuracy')
    plt.xlabel('Features')
    plt.title('Accuracy per feature: {}_RF_Fp1(20)'.format(name))
    # One tick per feature column (no reliance on the leaked loop variable).
    xticks = np.arange(n_columns)
    plt.xticks(xticks, rotation='vertical')
    plt.show()
#THE ERROR
ValueError Traceback (most recent call last)
<ipython-input-2-a5faae7f83a2> in <module>()
24
25 sss = StratifiedShuffleSplit(n_splits=10, test_size=0.1, random_state=j)#ver18
---> 26 scoresSSS = cross_val_score(clf, xA, y, cv=sss)
27 xSSSmean.append(scoresSSS.mean())
28 #print(scoresSSS)
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
138 train, test, verbose, None,
139 fit_params)
--> 140 for train, test in cv_iter)
141 return np.array(scores)[:, 0]
142
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
--> 270 self._final_estimator.fit(Xt, y, **fit_params)
271 return self
272
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/ensemble/forest.py in fit(self, X, y, sample_weight)
324 t, self, X, y, sample_weight, i, len(trees),
325 verbose=self.verbose, class_weight=self.class_weight)
--> 326 for i, t in enumerate(trees))
327
328 # Collect newly grown trees
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/ensemble/forest.py in _parallel_build_trees(tree, forest, X, y, sample_weight, tree_idx, n_trees, verbose, class_weight)
118 curr_sample_weight *= compute_sample_weight('balanced', y, indices)
119
--> 120 tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
121 else:
122 tree.fit(X, y, sample_weight=sample_weight, check_input=False)
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
737 sample_weight=sample_weight,
738 check_input=check_input,
--> 739 X_idx_sorted=X_idx_sorted)
740 return self
741
/home/karim/anaconda2/envs/scikit18/lib/python3.5/site-packages/sklearn/tree/tree.py in fit(self, X, y, sample_weight, check_input, X_idx_sorted)
244 raise ValueError("max_depth must be greater than zero. ")
245 if not (0 < max_features <= self.n_features_):
--> 246 raise ValueError("max_features must be in (0, n_features]")
247 if not isinstance(max_leaf_nodes, (numbers.Integral, np.integer)):
248 raise ValueError("max_leaf_nodes must be integral number but was "
ValueError: max_features must be in (0, n_features]
max_features: This is the maximum number of features Random Forest is allowed to try in an individual tree. There are multiple options available in Python to assign the maximum number of features.
max_features: The number of features to consider when looking for the best split. If this value is not set, the decision tree will consider all features available to make the best split. Depending on your application, it's often a good idea to tune this parameter.
max_features: Random forest takes random subsets of features and tries to find the best split. max_features sets the number of features to take into account when looking for the best split. Besides an int (an absolute number of features) or a float (a fraction of the features), it can take the values “auto”, “sqrt”, “log2” and None.
So I managed to solve the problem! :) The scikit-learn documentation says:
*If float, then max_features is a percentage and int(max_features * n_features) features are considered at each split.*
My value:
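n_features=20. This is an int: the number of features that I have in my dataset.
max_features: this is the number of features that I want to use. I had given it as an int, but an int must not be larger than the number of columns actually passed to fit. Here that is only 1, because I train on one feature at a time, so max_features=5 is out of range. I therefore have to give it as a float instead.
To turn it into a float I use the formula from the scikit-learn documentation: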
int(max_features * n_features)
int(x * 20)=2
x=0.1
Here we assume that I want to use only 2 of the 20 features.
x is the fraction of the features, given as a float.
I changed the value of max_features from an int to a float, like this:
max_features:
(int) (float)
20 = 1.0
15 = 0.75
10 = 0.5
5 = 0.25
2 = 0.1
EXAMPLE
# Instead of an integer max_features, which raised
# "ValueError: max_features must be in (0, n_features]":
clf = make_pipeline(preprocessing.RobustScaler(), RandomForestClassifier(n_estimators = 100,
max_features=5, n_jobs=-1))
# I passed max_features as a float, i.e. a fraction of n_features:
clf = make_pipeline(preprocessing.RobustScaler(), RandomForestClassifier(n_estimators = 100,
max_features=0.25, n_jobs=-1))
If you love us, you can donate to us via PayPal or buy us a coffee so we can maintain and grow. Thank you!
Donate Us With