Hey, I am trying to use a Naive Bayes classifier from NLTK to classify some text. Whenever I test the classifier with the classify() method, it returns the correct classification for the first item and then that same classification for every other line of text I classify. The following is my code:
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
import nltk
import random
import nltk.data
# Build one (token list, category) pair per review in the movie_reviews corpus.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
# The comprehension emits reviews grouped by category; shuffle so that the
# slices taken from this list later contain a mix of categories.
random.shuffle(documents)

# Feature vocabulary: the 2000 most frequent lower-cased corpus words.
# most_common() states the frequency ordering explicitly. Slicing keys()
# relied on an ordering that current NLTK releases do not provide, and it
# fails on Python 3, where keys() returns a view that cannot be sliced.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [word for word, _count in all_words.most_common(2000)]
def bag_of_words(words):
    """Return a feature dict that maps every token in *words* to ``True``.

    Repeated tokens collapse into a single key, so the result only records
    which tokens are present, not how often they occur.
    """
    return {word: True for word in words}
def document_features(document):
    """Describe *document* by which ``word_features`` entries it contains.

    Returns a dict with one ``'contains(<word>)'`` key per entry of the
    module-level ``word_features``; the value is ``True`` when that word
    occurs in *document* and ``False`` otherwise.
    """
    # Build the set once so each vocabulary word is checked against it
    # instead of against the raw token sequence.
    present = set(document)
    return dict(
        ('contains(%s)' % candidate, candidate in present)
        for candidate in word_features
    )
# Featurize every labelled review, then set the first 100 aside as test_set
# and train on the remainder.
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

# Classify two ad-hoc sentences; their features are built by bag_of_words().
text1 = "i love this city"
text2 = "i hate this city"
feats1 = bag_of_words(word_tokenize(text1))
feats2 = bag_of_words(word_tokenize(text2))
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(classifier.classify(feats1))
print(classifier.classify(feats2))
This code prints pos twice, whereas if I swap the last two lines of the code it prints neg twice. Can anyone help?
Change
features['contains(%s)' % word] = (word in document_words)
to
features[word] = (word in document)
Otherwise the classifier only knows about features ("words") of the form "contains(...)", and is therefore clueless about the plain words in "i love this city".
import nltk.tokenize as tokenize
import nltk
import random
# Seed the RNG so the random.shuffle() of the documents is repeatable.
random.seed(3)
def bag_of_words(words):
    """Map each token in *words* to ``True`` and return the resulting dict.

    The keys are the bare tokens; a token that appears several times still
    yields a single key.
    """
    return {word: True for word in words}
def document_features(document):
    """Return a ``{word: bool}`` feature dict for *document*.

    There is one key per entry of the module-level ``word_features``; the
    value says whether that word occurs in *document*.
    """
    # Keys are the bare words (not 'contains(word)') so that they line up with
    # the keys bag_of_words() produces for the sentences being classified.
    return {word: (word in document) for word in word_features}
movie_reviews = nltk.corpus.movie_reviews
# Keep each review as a set of tokens so the `word in document` test in
# document_features() is a set lookup.
documents = [
    (set(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
random.shuffle(documents)

# Feature vocabulary: the 2000 most frequent lower-cased corpus words.
# most_common() states the frequency ordering explicitly. Slicing keys()
# relied on an ordering that current NLTK releases do not provide, and it
# fails on Python 3, where keys() returns a view that cannot be sliced.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [word for word, _count in all_words.most_common(2000)]

# Train on the first 200 shuffled reviews only.
train_set = [(document_features(d), c) for (d, c) in documents[:200]]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features()
for word in ('love', 'hate'):
    # A probe word that is missing from word_features was never a training
    # feature, so stop right away instead of printing a meaningless number.
    assert word in word_features
    pos_probability = classifier.prob_classify({word: True}).prob('pos')
    print('probability {w!r} is positive: {p:.2%}'.format(
        w=word, p=pos_probability))
tests = [
    "i love this city",
    "i hate this city",
]
for test in tests:
    # Tokenize the sentence, turn the tokens into bare-word features, and
    # report the label the classifier assigns.
    words = tokenize.word_tokenize(test)
    feats = bag_of_words(words)
    label = classifier.classify(feats)
    print('{s} => {c}'.format(s=test, c=label))
yields
Most Informative Features
worst = True neg : pos = 15.5 : 1.0
ridiculous = True neg : pos = 11.5 : 1.0
batman = True neg : pos = 7.6 : 1.0
drive = True neg : pos = 7.6 : 1.0
blame = True neg : pos = 7.6 : 1.0
terrible = True neg : pos = 6.9 : 1.0
rarely = True pos : neg = 6.4 : 1.0
cliches = True neg : pos = 6.0 : 1.0
$ = True pos : neg = 5.9 : 1.0
perfectly = True pos : neg = 5.5 : 1.0
probability 'love' is positive: 61.52%
probability 'hate' is positive: 36.71%
i love this city => pos
i hate this city => neg
If you love us, you can donate to us via PayPal or buy me a coffee so we can maintain and grow. Thank you!
Donate Us With