Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Grid search with f1 as scoring function, several pages of error message

Want to use Gridsearch to find best parameters and use f1 as the scoring metric.

If i remove the scoring function, all works well and i get no errors.

Here is my code:

from sklearn import grid_search 
parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
train_classifier(reg, X_train, y_train)
train_f1_score = predict_labels(reg, X_train, y_train)
print reg.best_params_
print "F1 score for training set: {}".format(train_f1_score)
print "F1 score for test set: {}".format(predict_labels(reg, X_test, y_test))

When i execute i get pages upon pages as errors, and i cannot make heads or tails of it :(

ValueError                                Traceback (most recent call last)
<ipython-input-17-3083ff8a20ea> in <module>()
      3 parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
      4 reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
----> 5 train_classifier(reg, X_train, y_train)
      6 train_f1_score = predict_labels(reg, X_train, y_train)
      7 print reg.best_params_

<ipython-input-9-b56ce25fd90b> in train_classifier(clf, X_train, y_train)
      5     print "Training {}...".format(clf.__class__.__name__)
      6     start = time.time()
----> 7     clf.fit(X_train, y_train)
      8     end = time.time()
      9     print "Done!\nTraining time (secs): {:.3f}".format(end - start)

//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
    802 
    803         """
--> 804         return self._fit(X, y, ParameterGrid(self.param_grid))
    805 
    806 

//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
    551                                     self.fit_params, return_parameters=True,
    552                                     error_score=self.error_score)
--> 553                 for parameters in parameter_iterable
    554                 for train, test in cv)
    555 

//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
    802             self._iterating = True
    803 
--> 804             while self.dispatch_one_batch(iterator):
    805                 pass
    806 

//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
    660                 return False
    661             else:
--> 662                 self._dispatch(tasks)
    663                 return True
    664 

//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
    568 
    569         if self._pool is None:
--> 570             job = ImmediateComputeBatch(batch)
    571             self._jobs.append(job)
    572             self.n_dispatched_batches += 1

//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
    181         # Don't delay the application, to avoid keeping the input
    182         # arguments in memory
--> 183         self.results = batch()
    184 
    185     def get(self):

//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):

//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
   1548 
   1549     else:
-> 1550         test_score = _score(estimator, X_test, y_test, scorer)
   1551         if return_train_score:
   1552             train_score = _score(estimator, X_train, y_train, scorer)

//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
   1604         score = scorer(estimator, X_test)
   1605     else:
-> 1606         score = scorer(estimator, X_test, y_test)
   1607     if not isinstance(score, numbers.Number):
   1608         raise ValueError("scoring must return a number, got %s (%s) instead."

//anaconda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, estimator, X, y_true, sample_weight)
     88         else:
     89             return self._sign * self._score_func(y_true, y_pred,
---> 90                                                  **self._kwargs)
     91 
     92 

//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
    637     return fbeta_score(y_true, y_pred, 1, labels=labels,
    638                        pos_label=pos_label, average=average,
--> 639                        sample_weight=sample_weight)
    640 
    641 

//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
    754                                                  average=average,
    755                                                  warn_for=('f-score',),
--> 756                                                  sample_weight=sample_weight)
    757     return f
    758 

//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
    982                 else:
    983                     raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984                                      (pos_label, present_labels))
    985             labels = [pos_label]
    986     if labels is None:

ValueError: pos_label=1 is not a valid label: array(['no', 'yes'], 
      dtype='|S3')
like image 369
hmmmbob Avatar asked Dec 09 '22 00:12

hmmmbob


1 Answers

Seems that you have label array with values 'no' and 'yes', you should convert them to binary 1-0 numerical representation, because your error states that scoring function cannot understand where 0's and 1's are in your label array.

Other possible way to solve it without modifying your label array:

from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer

f1_scorer = make_scorer(f1_score, pos_label="yes")
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring=f1_scorer)
like image 121
Ibraim Ganiev Avatar answered Dec 10 '22 13:12

Ibraim Ganiev