Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

Load pickled classifier data : Vocabulary not fitted Error

I have read all related questions here but couldn't find a working solution :

My classifier creation :

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that analyzes a document and stems the result.

        The parent class's analyzer is built first; whatever it returns for
        a document is then passed through ``english_stemmer.stemWords``.
        """
        # super() has to name *this* class so the lookup starts at its direct
        # parent; naming TfidfVectorizer would skip any build_analyzer that
        # TfidfVectorizer itself defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

# Word-level stemming vectorizer over unigrams and bigrams, with English stop
# words removed and at most 200000 features kept.
tf = StemmedTfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    max_features=200000,
    stop_words='english',
)


def create_tfidf(f):
    """Fit the module-level vectorizer on a CSV file and return the matrix.

    The file is read as ';'-delimited CSV. Its first row is skipped as a
    header; for every other row, column 0 is the target label and column 1
    is the document text. The documents are then passed to
    ``tf.fit_transform``, which fits the module-level ``tf`` in place.

    Args:
        f: Path of the CSV file to read.

    Returns:
        A ``(tfidf_matrix, targets)`` tuple: the result of
        ``tf.fit_transform`` and the list of labels in row order.
    """
    docs = []
    targets = []
    with open(f, "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=';')
        # Built-in next() works on Python 2.6+ and 3 alike, whereas
        # reader.next() only exists on Python 2.
        next(reader, None)
        for row in reader:
            docs.append(row[1])
            targets.append(row[0])

    tfidf_matrix = tf.fit_transform(docs)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(tfidf_matrix.shape)
    # print tf.get_feature_names()
    return tfidf_matrix, targets


# Vectorise the training file, then fit a linear SVM on the resulting matrix.
X, y = create_tfidf("l0.csv")
clf = LinearSVC()
clf.fit(X, y)

# Only the classifier is written to disk here; the fitted vectorizer `tf` is
# not part of the dump.
_ = joblib.dump(clf, 'linearL0_3gram_100K.pkl', compress=9)

This bit works, and generates the .pkl, which I then try to use as such in a different script:

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that analyzes a document and stems the result.

        The parent class's analyzer is built first; whatever it returns for
        a document is then passed through ``english_stemmer.stemWords``.
        """
        # super() has to name *this* class so the lookup starts at its direct
        # parent; naming TfidfVectorizer would skip any build_analyzer that
        # TfidfVectorizer itself defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))

# A brand-new vectorizer is constructed here; nothing in this script ever
# fits it.
tf = StemmedTfidfVectorizer(analyzer='word', ngram_range=(1,2), min_df = 0, max_features=200000, stop_words = 'english')


# Load the object that the training script dumped (the classifier only).
clf = joblib.load('linearL0_3gram_100K.pkl')

print clf
test = "My super elaborate test string to test predictions"
# tf.transform() is called on the unfitted vectorizer above; this is the call
# that raises the "Vocabulary wasn't fitted" error.
print test + clf.predict(tf.transform([test]))[0]

And I get ValueError: Vocabulary wasn't fitted or is empty!

Edit : Error Traceback as requested

 File "classifier.py", line 27, in <module>
    print test + clf.predict(tf.transform([test]))[0]
  File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 1313, in transform
    X = super(TfidfVectorizer, self).transform(raw_documents)
  File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 850, in transform
    self._check_vocabulary()
  File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/feature_extraction/text.py", line 271, in _check_vocabulary
    check_is_fitted(self, 'vocabulary_', msg=msg),
  File "/home/ec2-user/.local/lib/python2.7/site-packages/sklearn/utils/validation.py", line 627, in check_is_fitted
    raise NotFittedError(msg % {'name': type(estimator).__name__})
sklearn.utils.validation.NotFittedError: StemmedTfidfVectorizer - Vocabulary wasn't fitted.
like image 477
xShirase Avatar asked Jul 31 '15 10:07

xShirase


1 Answers

OK, I solved the issue by using a pipeline, so that my vectorizer is saved inside the .pkl together with the classifier.

Here's how it looks (also, way simpler) :

import csv
import pickle

import Stemmer
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# Module-level stemmer created with the 'en' setting; its stemWords() method
# is what StemmedTfidfVectorizer.build_analyzer applies to the analyzed tokens.
english_stemmer = Stemmer.Stemmer('en')


class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that analyzes a document and stems the result.

        The parent class's analyzer is built first; whatever it returns for
        a document is then passed through ``english_stemmer.stemWords``.
        """
        # super() has to name *this* class so the lookup starts at its direct
        # parent; naming TfidfVectorizer would skip any build_analyzer that
        # TfidfVectorizer itself defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


def create_tfidf(f):
    """Read a ';'-delimited CSV file and return its documents and labels.

    The first row is treated as a header and skipped. For every remaining
    row, column 0 is taken as the target label and column 1 as the document
    text. No vectorisation happens here; the raw text is returned and
    vectorising is left to the caller.

    Args:
        f: Path of the CSV file to read.

    Returns:
        A ``(docs, targets)`` tuple of two parallel lists of strings, in row
        order. Both lists are empty when the file has no data rows.
    """
    docs = []
    targets = []
    with open(f, "r") as sentences_file:
        reader = csv.reader(sentences_file, delimiter=';')
        # Built-in next() works on Python 2.6+ and 3 alike (reader.next() is
        # Python 2 only); the default keeps a completely empty file from
        # raising StopIteration.
        next(reader, None)
        for row in reader:
            docs.append(row[1])
            targets.append(row[0])
    return docs, targets


# Raw documents and their labels, straight from the CSV file.
docs, y = create_tfidf("l1.csv")

tf = StemmedTfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=0,
    max_features=200000,
    stop_words='english',
)
clf = LinearSVC()

# Chain the vectorizer and the classifier so that both are fitted, and then
# dumped, as one object.
vec_clf = Pipeline(steps=[('tfvec', tf), ('svm', clf)])
vec_clf.fit(docs, y)

_ = joblib.dump(vec_clf, 'linearL0_3gram_100K.pkl', compress=9)

And on the other side :

from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.externals import joblib
import Stemmer
import pickle

# Module-level stemmer created with the 'en' setting; its stemWords() method
# is what StemmedTfidfVectorizer.build_analyzer applies to the analyzed tokens.
english_stemmer = Stemmer.Stemmer('en')

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable that analyzes a document and stems the result.

        The parent class's analyzer is built first; whatever it returns for
        a document is then passed through ``english_stemmer.stemWords``.
        """
        # super() has to name *this* class so the lookup starts at its direct
        # parent; naming TfidfVectorizer would skip any build_analyzer that
        # TfidfVectorizer itself defines.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: english_stemmer.stemWords(analyzer(doc))


# Load the dumped object from disk. joblib.load unpickles the file, so it
# should only ever be pointed at a file from a trusted source.
clf = joblib.load('linearL0_3gram_100K.pkl')

# predict() is given a list of raw documents, hence the one-element list.
test = ["My super elaborate test string to test predictions"]

# `test` is a list, so concatenate its single string (not the list itself)
# with the predicted label. The parenthesised single-argument print behaves
# the same on Python 2 and 3.
print(test[0] + clf.predict(test)[0])

Important things to mention :

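The fitted vectorizer (tf) is now part of the pipeline, alongside the classifier, so both are saved and loaded together. There is therefore no need either to declare a new vectorizer in the loading script (which was the failing point earlier, since a fresh vectorizer lacks the vocabulary learned from the training data) or to call .transform() on the test string yourself. The custom StemmedTfidfVectorizer class still has to be defined in the loading script so that the pickled pipeline can be restored.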

like image 124
xShirase Avatar answered Nov 01 '22 04:11

xShirase