Is there any way to implement skip-grams in the scikit-learn library?
I have manually generated a list of k-skip-n-grams and pass it as the vocabulary to CountVectorizer().
Unfortunately, its prediction performance is very poor: only 63% accuracy.
However, I get 77-80% accuracy from the default CountVectorizer() with ngram_range=(min, max).
Is there a better way to implement skip-grams in scikit-learn?
Here is the relevant part of my code:
corpus = GetCorpus()  # reads the text from a file into a list of documents
vocabulary = list(GetVocabulary(corpus, k, n))  # returns the k-skip-n-grams

vec = CountVectorizer(
    tokenizer=lambda x: x.split(),
    ngram_range=(2, 2),
    stop_words=stopWords,
    vocabulary=vocabulary)
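For context, here is a simplified sketch of what GetVocabulary does, using nltk.util.skipgrams (the real helper is omitted here, so the details are an assumption):

from nltk.util import skipgrams

def GetVocabulary(corpus, k, n):
    # sketch only: collect the distinct k-skip-n-grams over all documents,
    # joined with spaces so they match CountVectorizer's vocabulary entries
    vocab = set()
    for doc in corpus:
        for gram in skipgrams(doc.split(), n, k):
            vocab.add(' '.join(gram))
    return vocab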
To vectorize text with skip-grams in scikit-learn, simply passing the skip-gram tokens as the vocabulary to CountVectorizer will not work. You need to modify the way tokens are processed, which can be done with a custom analyzer. Below is an example vectorizer that produces 1-skip-2-grams:
from toolz import compose
from toolz.curried import map as cmap, sliding_window, pluck
from sklearn.feature_extraction.text import CountVectorizer

class SkipGramVectorizer(CountVectorizer):
    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        return lambda doc: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(doc),
            stop_words)

    def _word_skip_grams(self, tokens, stop_words=None):
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]
        # slide a 3-token window over the text and keep positions 0 and 2,
        # skipping the middle token: this yields the 1-skip-2-grams
        return compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
For instance, on this Wikipedia example,
text = ['the rain in Spain falls mainly on the plain']
vect = SkipGramVectorizer()
vect.fit(text)
vect.get_feature_names()
this vectorizer would yield the following tokens,
['falls on', 'in falls', 'mainly the', 'on plain',
'rain spain', 'spain mainly', 'the in']
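Note that this analyzer emits only the pairs that skip exactly one word; adjacent (0-skip) bigrams such as 'the rain' are not included. If you want both, one possible extension, sketched here with the same toolz primitives (this function is my own illustration, not part of the vectorizer above), is to concatenate the two window pluckings:

from toolz import compose, concat
from toolz.curried import map as cmap, sliding_window, pluck

def skip_bigrams_k1(tokens):
    # adjacent bigrams: positions 0 and 1 of every 2-token window
    adjacent = compose(cmap(' '.join), pluck([0, 1]), sliding_window(2))(tokens)
    # 1-skip bigrams: positions 0 and 2 of every 3-token window
    skipped = compose(cmap(' '.join), pluck([0, 2]), sliding_window(3))(tokens)
    return list(concat([adjacent, skipped]))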
I came up with my own implementation of a skip-gram vectorizer. It is inspired by this post. I also limited skip-grams so that they do not cross sentence boundaries (using nltk.sent_tokenize), which limits the feature space. Here is my code:
import nltk
from itertools import combinations
from toolz import compose
from sklearn.feature_extraction.text import CountVectorizer


class SkipGramVectorizer(CountVectorizer):
    def __init__(self, k=1, **kwds):
        super(SkipGramVectorizer, self).__init__(**kwds)
        self.k = k

    def build_sent_analyzer(self, preprocess, stop_words, tokenize):
        return lambda sent: self._word_skip_grams(
            compose(tokenize, preprocess, self.decode)(sent),
            stop_words)

    def build_analyzer(self):
        preprocess = self.build_preprocessor()
        stop_words = self.get_stop_words()
        tokenize = self.build_tokenizer()
        sent_analyze = self.build_sent_analyzer(preprocess, stop_words, tokenize)
        return lambda doc: self._sent_skip_grams(doc, sent_analyze)

    def _sent_skip_grams(self, doc, sent_analyze):
        # split the document into sentences first, so that skip-grams
        # never cross sentence boundaries (needs nltk's 'punkt' data)
        skip_grams = []
        for sent in nltk.sent_tokenize(doc):
            skip_grams.extend(sent_analyze(sent))
        return skip_grams

    def _word_skip_grams(self, tokens, stop_words=None):
        """Turn tokens into a sequence of k-skip-n-grams after stop words filtering"""
        # handle stop words
        if stop_words is not None:
            tokens = [w for w in tokens if w not in stop_words]

        # handle token n-grams
        min_n, max_n = self.ngram_range
        k = self.k
        if max_n != 1:
            original_tokens = tokens
            if min_n == 1:
                # no need to do any slicing for unigrams
                # just iterate through the original tokens
                tokens = list(original_tokens)
                min_n += 1
            else:
                tokens = []

            n_original_tokens = len(original_tokens)

            # bind method outside of loop to reduce overhead
            tokens_append = tokens.append
            space_join = " ".join

            for n in range(min_n,
                           min(max_n + 1, n_original_tokens + 1)):
                for i in range(n_original_tokens - n + 1):
                    # k-skip-n-grams: fix the head token at position i and
                    # draw the remaining n-1 tokens from the next n-1+k slots
                    head = [original_tokens[i]]
                    for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
                        tokens_append(space_join(head + list(skip_tail)))
        return tokens


def test(text, ngram_range, k):
    vectorizer = SkipGramVectorizer(ngram_range=ngram_range, k=k)
    vectorizer.fit_transform(text)
    print(vectorizer.get_feature_names())


def main():
    text = ['Insurgents killed in ongoing fighting.']
    # 2-skip-bi-grams
    test(text, (2, 2), 2)
    # 2-skip-tri-grams
    test(text, (3, 3), 2)


if __name__ == '__main__':
    main()
This would generate the following feature names:
['in fighting', 'in ongoing', 'insurgents in', 'insurgents killed', 'insurgents ongoing', 'killed fighting', 'killed in', 'killed ongoing', 'ongoing fighting']
['in ongoing fighting', 'insurgents in fighting', 'insurgents in ongoing', 'insurgents killed fighting', 'insurgents killed in', 'insurgents killed ongoing', 'insurgents ongoing fighting', 'killed in fighting', 'killed in ongoing', 'killed ongoing fighting']
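For completeness, here is how one might plug this vectorizer into a classification pipeline to test it against the accuracy numbers from the question. This is only a sketch: the classifier choice and the train/test variables are placeholders, not part of my setup above.

from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression

# hypothetical pipeline: vectorize with skip-grams, then classify
pipeline = make_pipeline(
    SkipGramVectorizer(ngram_range=(2, 2), k=2),
    LogisticRegression())

# train_texts/train_labels and test_texts/test_labels are placeholders
# pipeline.fit(train_texts, train_labels)
# print(pipeline.score(test_texts, test_labels))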
Notice that I basically took the _word_ngrams function from the VectorizerMixin class and replaced the line

tokens_append(space_join(original_tokens[i: i + n]))

with the following:
head = [original_tokens[i]]
for skip_tail in combinations(original_tokens[i+1:i+n+k], n-1):
    tokens_append(space_join(head + list(skip_tail)))
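To see what this replacement does, here is a small trace for the first head token of the example sentence, with n=2 and k=2:

from itertools import combinations

tokens = ['insurgents', 'killed', 'in', 'ongoing', 'fighting']
n, k, i = 2, 2, 0
head = [tokens[i]]
# the tail token is drawn from the next n - 1 + k = 3 positions
for skip_tail in combinations(tokens[i+1:i+n+k], n-1):
    print(' '.join(head + list(skip_tail)))
# insurgents killed
# insurgents in
# insurgents ongoing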