import nltk
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize
# Pair each review's token list with its category label, one entry per
# file in the movie_reviews corpus.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]

# Frequency distribution over every lowercased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Feature vocabulary: the 3000 most frequent words. Slicing keys() is not
# a substitute, because key order is not guaranteed to follow frequency.
word_features = [word for word, _count in all_words.most_common(3000)]
def find_features(document, vocabulary=None):
    """Build the bag-of-words feature dict for one document.

    Args:
        document: Iterable of word tokens (e.g. a tokenized review).
        vocabulary: Optional iterable of lowercase words to test for.
            Defaults to the module-level ``word_features`` list.

    Returns:
        A dict mapping every vocabulary word to ``True`` when the word
        occurs in ``document`` (compared case-insensitively) and to
        ``False`` otherwise.
    """
    if vocabulary is None:
        vocabulary = word_features
    # The vocabulary is built from lowercased tokens, so normalise the
    # document the same way; otherwise "Great" would never match "great".
    words = {token.lower() for token in document}
    return {w: (w in words) for w in vocabulary}
# Turn every labelled document into a (feature dict, label) pair.
featuresets = [(find_features(rev), category) for (rev, category) in documents]

# Train on the middle slice and evaluate only on the examples outside it.
# Evaluating on featuresets[:1500] would include the whole training slice
# and inflate the reported accuracy.
training_set = featuresets[500:1500]
testing_set = featuresets[:500] + featuresets[1500:]

classifier = nltk.DecisionTreeClassifier.train(training_set)

accuracy = nltk.classify.accuracy(classifier, testing_set)
# A single formatted argument prints identically on Python 2 and Python 3.
print("Classifier accuracy percent: %s %%" % (accuracy * 100))

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

string = read_line("Enter the string: ")
print(classifier.classify(find_features(word_tokenize(string))))
This code displays the accuracy of the classifier, then reads a string from the user and prints the predicted polarity of that string.
But here's my question: since I can obtain the accuracy by using nltk.classify.accuracy(), is it possible to get the classifier's precision and recall as well?
If you're using the nltk package, you can use the recall and precision functions from nltk.metrics.scores (see the docs).
The functions should be available after invoking
from nltk.metrics.scores import (precision, recall)
Then you need to call them with two sets: a reference set (the known labels) and a test set (the output of your classifier on the test data).
Something like the code below should produce these sets as refsets and testsets:
import collections

# Map each label to the set of test-example indices carrying that label:
# refsets is keyed by the known label, testsets by the classifier's output.
refsets = collections.defaultdict(set)
testsets = collections.defaultdict(set)

for i, (feats, label) in enumerate(testing_set):
    refsets[label].add(i)
    observed = classifier.classify(feats)
    testsets[observed].add(i)
Then, you can see the precision and recall for positive predictions with something like
# 'pos' is the "positive" (as opposed to "negative") label.
# Each print gets one pre-formatted string so the output is the same on
# Python 2 (where a comma-separated print(...) would show a tuple) and
# Python 3.
print('Precision: %s' % nltk.metrics.precision(refsets['pos'], testsets['pos']))
print('Recall: %s' % nltk.metrics.recall(refsets['pos'], testsets['pos']))
If you love us, you can donate via PayPal or buy us a coffee so we can maintain and grow the site. Thank you!
Donate Us With