Introduction to Machine Learning

Sentiment Analysis with NLTK

http://www.nltk.org/api/nltk.sentiment.html

https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis

In [2]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

In [3]:
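# use only the first 100 sentences from each category, to keep the demo small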
n_instances = 100
In [4]:
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
In [6]:
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
In [7]:
len(subj_docs), len(obj_docs)
Out[7]:
(100, 100)
In [9]:
subj_docs[1]
Out[9]:
(['color',
  ',',
  'musical',
  'bounce',
  'and',
  'warm',
  'seas',
  'lapping',
  'on',
  'island',
  'shores',
  '.',
  'and',
  'just',
  'enough',
  'science',
  'to',
  'send',
  'you',
  'home',
  'thinking',
  '.'],
 'subj')
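Each document is a pair of (tokenized sentence, label), which is the format the feature extractors and the trainer below expect.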
In [10]:
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:]
In [11]:
train_docs = train_subj_docs + train_obj_docs
test_docs = test_obj_docs + test_subj_docs
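This gives an 80/20 split within each category: 160 training documents and 40 test documents.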
In [12]:
clf = SentimentAnalyzer()
In [13]:
all_words_neg = clf.all_words([mark_negation(doc) for doc in train_docs])
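mark_negation handles negation scope: it appends _NEG to each token that follows a negation word, up to the next punctuation mark, so that "good" and "good_NEG" become distinct features. A quick illustration (not part of the original notebook; the exact output may vary by NLTK version):

mark_negation("I did not like this movie .".split())
# roughly: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'movie_NEG', '.']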
In [14]:
unigram_features = clf.unigram_word_feats(all_words_neg, min_freq=4)
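Setting min_freq=4 keeps only tokens that appear at least four times in the training documents, pruning the vocabulary down to the 83 features reported below. To eyeball a few of the selected features (a hypothetical inspection step; output omitted):

print(unigram_features[:10])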
In [15]:
len(unigram_features)
Out[15]:
83
In [16]:
clf.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)
In [17]:
train_set = clf.apply_features(train_docs)
test_set = clf.apply_features(test_docs)
In [18]:
trainer = NaiveBayesClassifier.train
In [21]:
classifier = clf.train(trainer, train_set)
Training classifier
In [23]:
for key, value in sorted(clf.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8
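Once trained, the SentimentAnalyzer can label new sentences directly: its classify method applies the stored feature extractors and then the trained classifier. A minimal sketch (not from the original notebook); the instance must be tokenized like the training data:

print(clf.classify("it was a boring , predictable movie .".split()))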

Basic Example

Below, the same Naive Bayes approach is applied to a small, hand-labeled set of food reviews.

In [40]:
from nltk.tokenize import word_tokenize
In [41]:
train = [("Great place to be when you are in Bangalore.", "pos"),
  ("The place was being renovated when I visited so the seating was limited.", "neg"),
  ("Loved the ambience, loved the food", "pos"),
  ("The food is delicious but not over the top.", "neg"),
  ("Service - Little slow, probably because too many people.", "neg"),
  ("The place is not easy to locate", "neg"),
  ("Mushroom fried rice was spicy", "pos"),
]
In [42]:
dictionary = set(word.lower() for passage in train for word in word_tokenize(passage[0]))
In [44]:
t = [({word: (word in word_tokenize(x[0].lower())) for word in dictionary}, x[1]) for x in train]
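Each review is now a bag-of-words feature dict mapping every vocabulary word to True or False, depending on whether it occurs in that review. A hypothetical way to peek at the words actually present in the first training instance:

print({w: present for w, present in t[0][0].items() if present})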
In [45]:
classifier = NaiveBayesClassifier.train(t)
In [46]:
test_data = "Manchurian was hot and spicy"
test_data_features = {word: (word in word_tokenize(test_data.lower())) for word in dictionary}
print(classifier.classify(test_data_features))
pos
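NLTK's Naive Bayes classifier can also report which features carry the most weight via show_most_informative_features (output omitted here, since it depends on the training data):

classifier.show_most_informative_features(5)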

Using Vader

NLTK also ships with VADER, a rule-based sentiment intensity analyzer, which provides an alternative approach to sentiment analysis.

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [25]:
paragraph = "It was one of the worst movies I've seen, despite good reviews. Unbelievably bad acting!! Poor direction. VERY poor production. The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

In [26]:
from nltk import tokenize
In [27]:
lines_list = tokenize.sent_tokenize(paragraph)
In [28]:
lines_list
Out[28]:
["It was one of the worst movies I've seen, despite good reviews.",
 'Unbelievably bad acting!!',
 'Poor direction.',
 'VERY poor production.',
 'The movie was bad.',
 'Very bad movie.',
 'VERY bad movie.',
 'VERY BAD movie.',
 'VERY BAD movie!']
In [29]:
sid = SentimentIntensityAnalyzer()
for sent in lines_list:
    print(sent)
    ss = sid.polarity_scores(sent)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()
It was one of the worst movies I've seen, despite good reviews.
compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0,
Unbelievably bad acting!!
compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0,
Poor direction.
compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0,
VERY poor production.
compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0,
The movie was bad.
compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0,
Very bad movie.
compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0,
VERY bad movie.
compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0,
VERY BAD movie.
compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0,
VERY BAD movie!
compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0,
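The compound score is a normalized summary of each sentence in [-1, 1]. A common convention, following the cutoffs suggested by the VADER authors (the exact thresholds are a modeling choice), is to call a sentence positive when compound >= 0.05, negative when compound <= -0.05, and neutral otherwise. A small hypothetical helper built on that convention:

def label_sentence(sent, pos_cut=0.05, neg_cut=-0.05):
    # Conventional VADER cutoffs; adjust for your application.
    score = sid.polarity_scores(sent)['compound']
    if score >= pos_cut:
        return 'pos'
    if score <= neg_cut:
        return 'neg'
    return 'neu'

print([label_sentence(s) for s in lines_list])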