add stemming support to CountVectorizer (sklearn)


I'm trying to add stemming to my NLP pipeline with sklearn.

from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# French stop-word list from NLTK (the NLTK "stopwords" corpus must be
# downloaded). NOTE: it is not passed to the vectorizer below, so it
# currently has no effect on the extracted features.
stop = stopwords.words('french')
stemmer = FrenchStemmer()


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems every token it produces.

    Parameters
    ----------
    stemmer : object
        Any object exposing a ``stem(word)`` method, e.g. ``FrenchStemmer()``.
    """

    def __init__(self, stemmer):
        # Only `stemmer` is configurable here: no other option is forwarded,
        # so the parent CountVectorizer is built with its defaults.
        super(StemmedCountVectorizer, self).__init__()
        self.stemmer = stemmer

    def build_analyzer(self):
        """Return a callable mapping a document to a generator of stems."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (self.stemmer.stem(w) for w in analyzer(doc))


stem_vectorizer = StemmedCountVectorizer(stemmer)
text_clf = Pipeline([
    ('vect', stem_vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC(kernel='linear', C=1)),
])

With sklearn's plain CountVectorizer, this pipeline works. It also works if I create the features manually, like this:

# Build the stemmed term-count matrix by hand, then re-weight it with TF-IDF.
vectorizer = StemmedCountVectorizer(stemmer)
# Keep the counts: the TF-IDF step below consumes them.
X_counts = vectorizer.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)


If I run this pipeline in my IPython Notebook, the cell just displays [*] and nothing happens. When I look at my terminal, it shows this error:

Process PoolWorker-12:
Traceback (most recent call last):
  File "C:\Anaconda2\lib\multiprocessing\process.py", line 258, in _bootstrap
    self.run()
  File "C:\Anaconda2\lib\multiprocessing\process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Anaconda2\lib\multiprocessing\pool.py", line 102, in worker
    task = get()
  File "C:\Anaconda2\lib\site-packages\sklearn\externals\joblib\pool.py", line 360, in get
    return recv()
AttributeError: 'module' object has no attribute 'StemmedCountVectorizer'


Here is the complete example:

from nltk.stem.snowball import FrenchStemmer
from sklearn import grid_search
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

stemmer = FrenchStemmer()
# Default analyzer of CountVectorizer; its tokens are stemmed below.
analyzer = CountVectorizer().build_analyzer()


def stemming(doc):
    """Return a generator over the stemmed tokens of one document."""
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)


X = [
    'le chat est beau',
    'le ciel est nuageux',
    'les gens sont gentils',
    'Paris est magique',
    'Marseille est tragique',
    'JCVD est fou',
]
Y = [1, 0, 1, 1, 0, 0]

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SVC()),
])

# Search over the built-in 'word' analyzer and the custom stemming callable.
parameters = {'vect__analyzer': ['word', stemming]}

# n_jobs=-1 asks the grid search to run its fits in parallel.
gs_clf = grid_search.GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(X, Y)

If you remove stemming from the parameters, it works; otherwise it doesn't.


The problem seems to be in the parallelization, because when I remove n_jobs=-1 the problem disappears.

2 Answers

You can pass a callable as analyzer to the CountVectorizer constructor to provide a custom analyzer. This appears to work for me.

from nltk.stem.snowball import FrenchStemmer
from sklearn.feature_extraction.text import CountVectorizer

# Default analyzer of CountVectorizer; stemmed_words wraps it.
analyzer = CountVectorizer().build_analyzer()
stemmer = FrenchStemmer()


def stemmed_words(doc):
    """Return a generator over the stemmed tokens of one document."""
    tokens = analyzer(doc)
    return (stemmer.stem(token) for token in tokens)


# A callable passed as `analyzer` replaces the vectorizer's own analysis step.
stem_vectorizer = CountVectorizer(analyzer=stemmed_words)

counts = stem_vectorizer.fit_transform(['Tu marches dans la rue'])
print(counts)

feature_names = stem_vectorizer.get_feature_names()
print(feature_names)

Prints out:

  (0, 4)    1
  (0, 2)    1
  (0, 0)    1
  (0, 1)    1
  (0, 3)    1
[u'dan', u'la', u'march', u'ru', u'tu']
I know I am a little late in posting my answer, but here it is in case someone still needs help.

The following is the cleanest approach to adding a language stemmer to the count vectorizer: override build_analyzer().

import nltk.stem
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

french_stemmer = nltk.stem.SnowballStemmer('french')


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer that stems each token with the French Snowball stemmer.

    All regular CountVectorizer constructor options (min_df, stop_words, ...)
    remain available because __init__ is not overridden.
    """

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens."""
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: [french_stemmer.stem(w) for w in analyzer(doc)]


# scikit-learn only ships a built-in stop list for 'english'; any other string
# (such as 'french') is rejected with a ValueError when fitting. Pass NLTK's
# French stop words as an explicit list instead (this needs the NLTK
# "stopwords" corpus to be downloaded).
vectorizer_s = StemmedCountVectorizer(
    min_df=3,
    analyzer="word",
    stop_words=stopwords.words('french'),
)

You can then freely call the fit and transform methods of the CountVectorizer class on your vectorizer_s object.

