I want to use GridSearchCV to find the best parameters and use f1 as the scoring metric.
If I remove the scoring argument, everything works fine and I get no errors.
Here is my code:
from sklearn import grid_search
parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
train_classifier(reg, X_train, y_train)
train_f1_score = predict_labels(reg, X_train, y_train)
print reg.best_params_
print "F1 score for training set: {}".format(train_f1_score)
print "F1 score for test set: {}".format(predict_labels(reg, X_test, y_test))
When I execute it, I get pages upon pages of errors, and I cannot make heads or tails of them :(
ValueError Traceback (most recent call last)
<ipython-input-17-3083ff8a20ea> in <module>()
3 parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
4 reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
----> 5 train_classifier(reg, X_train, y_train)
6 train_f1_score = predict_labels(reg, X_train, y_train)
7 print reg.best_params_
<ipython-input-9-b56ce25fd90b> in train_classifier(clf, X_train, y_train)
5 print "Training {}...".format(clf.__class__.__name__)
6 start = time.time()
----> 7 clf.fit(X_train, y_train)
8 end = time.time()
9 print "Done!\nTraining time (secs): {:.3f}".format(end - start)
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
802 self._iterating = True
803
--> 804 while self.dispatch_one_batch(iterator):
805 pass
806
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
660 return False
661 else:
--> 662 self._dispatch(tasks)
663 return True
664
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
568
569 if self._pool is None:
--> 570 job = ImmediateComputeBatch(batch)
571 self._jobs.append(job)
572 self.n_dispatched_batches += 1
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
181 # Don't delay the application, to avoid keeping the input
182 # arguments in memory
--> 183 self.results = batch()
184
185 def get(self):
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1604 score = scorer(estimator, X_test)
1605 else:
-> 1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
//anaconda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, estimator, X, y_true, sample_weight)
88 else:
89 return self._sign * self._score_func(y_true, y_pred,
---> 90 **self._kwargs)
91
92
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
637 return fbeta_score(y_true, y_pred, 1, labels=labels,
638 pos_label=pos_label, average=average,
--> 639 sample_weight=sample_weight)
640
641
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
754 average=average,
755 warn_for=('f-score',),
--> 756 sample_weight=sample_weight)
757 return f
758
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
982 else:
983 raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984 (pos_label, present_labels))
985 labels = [pos_label]
986 if labels is None:
ValueError: pos_label=1 is not a valid label: array(['no', 'yes'],
dtype='|S3')
It seems that your label array contains the string values 'no' and 'yes'. One fix is to convert them to a binary 0/1 numerical representation, because the error says the default "f1" scorer looks for the positive label 1 in your label array and cannot find it.
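For example, a minimal sketch of that conversion using scikit-learn's LabelEncoder (assuming y_train and y_test are the string label arrays from your code):

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# classes are sorted alphabetically, so 'no' -> 0 and 'yes' -> 1
y_train_num = le.fit_transform(y_train)
y_test_num = le.transform(y_test)
# with 0/1 labels, scoring="f1" works, since pos_label defaults to 1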
Another way to solve it, without modifying your label array, is to build a scorer that knows which string is the positive class:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="yes")
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring=f1_scorer)
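Here make_scorer wraps f1_score and forwards pos_label="yes" on every call, so GridSearchCV can compute F1 directly against your string labels. (Note that in scikit-learn 0.18+ the grid_search module was replaced by model_selection, so newer code would import GridSearchCV from sklearn.model_selection instead.)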