Introduction to Machine Learning

Sentiment Analysis with NLTK

http://www.nltk.org/api/nltk.sentiment.html

https://www.kaggle.com/ngyptr/python-nltk-sentiment-analysis

In [2]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

In [3]:
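# use only the first 100 sentences from each category, to keep the demo small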
n_instances = 100
In [4]:
subj_docs = [(sent, 'subj') for sent in subjectivity.sents(categories='subj')[:n_instances]]
In [6]:
obj_docs = [(sent, 'obj') for sent in subjectivity.sents(categories='obj')[:n_instances]]
In [7]:
len(subj_docs), len(obj_docs)
Out[7]:
(100, 100)
In [9]:
subj_docs[1]
Out[9]:
(['color',
  ',',
  'musical',
  'bounce',
  'and',
  'warm',
  'seas',
  'lapping',
  'on',
  'island',
  'shores',
  '.',
  'and',
  'just',
  'enough',
  'science',
  'to',
  'send',
  'you',
  'home',
  'thinking',
  '.'],
 'subj')
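Each document is a pair of (tokenized sentence, label), which is the format the feature extractors and the trainer below expect.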
In [10]:
train_subj_docs = subj_docs[:80]
test_subj_docs = subj_docs[80:]
train_obj_docs = obj_docs[:80]
test_obj_docs = obj_docs[80:]
In [11]:
train_docs = train_subj_docs + train_obj_docs
test_docs = test_obj_docs + test_subj_docs
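This gives an 80/20 split within each category: 160 training documents and 40 test documents.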
In [12]:
clf = SentimentAnalyzer()
In [13]:
all_words_neg = clf.all_words([mark_negation(doc) for doc in train_docs])
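mark_negation handles negation scope: it appends _NEG to each token that follows a negation word, up to the next punctuation mark, so that "good" and "good_NEG" become distinct features. A quick illustration (not part of the original notebook; the exact output may vary by NLTK version):

mark_negation("I did not like this movie .".split())
# roughly: ['I', 'did', 'not', 'like_NEG', 'this_NEG', 'movie_NEG', '.']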
In [14]:
unigram_features = clf.unigram_word_feats(all_words_neg, min_freq=4)
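Setting min_freq=4 keeps only tokens that appear at least four times in the training documents, pruning the vocabulary down to the 83 features reported below. To eyeball a few of the selected features (a hypothetical inspection step; output omitted):

print(unigram_features[:10])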
In [15]:
len(unigram_features)
Out[15]:
83
In [16]:
clf.add_feat_extractor(extract_unigram_feats, unigrams=unigram_features)
In [17]:
train_set = clf.apply_features(train_docs)
test_set = clf.apply_features(test_docs)
In [18]:
trainer = NaiveBayesClassifier.train
In [21]:
classifier = clf.train(trainer, train_set)
Training classifier
In [23]:
for key, value in sorted(clf.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8
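Once trained, the SentimentAnalyzer can label new sentences directly: its classify method applies the stored feature extractors and then the trained classifier. A minimal sketch (not from the original notebook); the instance must be tokenized like the training data:

print(clf.classify("it was a boring , predictable movie .".split()))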

Basic Example

Below, the same Naive Bayes approach is applied to a small, hand-labeled set of food reviews.

In [40]:
from nltk.tokenize import word_tokenize
In [41]:
train = [("Great place to be when you are in Bangalore.", "pos"),
  ("The place was being renovated when I visited so the seating was limited.", "neg"),
  ("Loved the ambience, loved the food", "pos"),
  ("The food is delicious but not over the top.", "neg"),
  ("Service - Little slow, probably because too many people.", "neg"),
  ("The place is not easy to locate", "neg"),
  ("Mushroom fried rice was spicy", "pos"),
]
In [42]:
dictionary = set(word.lower() for passage in train for word in word_tokenize(passage[0]))
In [44]:
t = [({word: (word in word_tokenize(x[0].lower())) for word in dictionary}, x[1]) for x in train]
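Each review is now a bag-of-words feature dict mapping every vocabulary word to True or False, depending on whether it occurs in that review. A hypothetical way to peek at the words actually present in the first training instance:

print({w: present for w, present in t[0][0].items() if present})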
In [45]:
classifier = NaiveBayesClassifier.train(t)
In [46]:
test_data = "Manchurian was hot and spicy"
test_data_features = {word: (word in word_tokenize(test_data.lower())) for word in dictionary}
print(classifier.classify(test_data_features))
pos
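NLTK's Naive Bayes classifier can also report which features carry the most weight via show_most_informative_features (output omitted here, since it depends on the training data):

classifier.show_most_informative_features(5)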

Using Vader

NLTK also ships with VADER, a rule-based sentiment intensity analyzer, which provides an alternative approach to sentiment analysis.

In [24]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [25]:
paragraph = "It was one of the worst movies I've seen, despite good reviews. Unbelievably bad acting!! Poor direction. VERY poor production. The movie was bad. Very bad movie. VERY bad movie. VERY BAD movie. VERY BAD movie!"

In [26]:
from nltk import tokenize
In [27]:
lines_list = tokenize.sent_tokenize(paragraph)
In [28]:
lines_list
Out[28]:
["It was one of the worst movies I've seen, despite good reviews.",
 'Unbelievably bad acting!!',
 'Poor direction.',
 'VERY poor production.',
 'The movie was bad.',
 'Very bad movie.',
 'VERY bad movie.',
 'VERY BAD movie.',
 'VERY BAD movie!']
In [29]:
sid = SentimentIntensityAnalyzer()
for sent in lines_list:
    print(sent)
    ss = sid.polarity_scores(sent)
    for k in sorted(ss):
        print('{0}: {1}, '.format(k, ss[k]), end='')
    print()
It was one of the worst movies I've seen, despite good reviews.
compound: -0.7584, neg: 0.394, neu: 0.606, pos: 0.0,
Unbelievably bad acting!!
compound: -0.6572, neg: 0.686, neu: 0.314, pos: 0.0,
Poor direction.
compound: -0.4767, neg: 0.756, neu: 0.244, pos: 0.0,
VERY poor production.
compound: -0.6281, neg: 0.674, neu: 0.326, pos: 0.0,
The movie was bad.
compound: -0.5423, neg: 0.538, neu: 0.462, pos: 0.0,
Very bad movie.
compound: -0.5849, neg: 0.655, neu: 0.345, pos: 0.0,
VERY bad movie.
compound: -0.6732, neg: 0.694, neu: 0.306, pos: 0.0,
VERY BAD movie.
compound: -0.7398, neg: 0.724, neu: 0.276, pos: 0.0,
VERY BAD movie!
compound: -0.7616, neg: 0.735, neu: 0.265, pos: 0.0,
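The compound score is a normalized summary of each sentence in [-1, 1]. A common convention, following the cutoffs suggested by the VADER authors (the exact thresholds are a modeling choice), is to call a sentence positive when compound >= 0.05, negative when compound <= -0.05, and neutral otherwise. A small hypothetical helper built on that convention:

def label_sentence(sent, pos_cut=0.05, neg_cut=-0.05):
    # Conventional VADER cutoffs; adjust for your application.
    score = sid.polarity_scores(sent)['compound']
    if score >= pos_cut:
        return 'pos'
    if score <= neg_cut:
        return 'neg'
    return 'neu'

print([label_sentence(s) for s in lines_list])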