Assign a sentiment score to each review (0.0 to 1.0, where a score of 0.0 means fully negative sentiment).
For this task, we will use a reviews file that contains a snapshot of reviews from recent months for a travel company (the first line of the file is a header line).
We would like to do the following:
We will take the following steps to create unsupervised sentiment labels for the review data.
# Load libraries and some settings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sys
import nltk
import math
# print graphs inline
# NOTE: the next line is an IPython magic; it is only valid inside a Jupyter/IPython session.
%matplotlib inline
# Default figure size (width 20, height 3) used by the plots drawn later in this file.
plt.rcParams["figure.figsize"] = (20,3)
# Turn off pandas' chained-assignment warning for the in-place column additions below.
pd.options.mode.chained_assignment = None # default='warn'
# download Punkt sentence tokenizer and pos tagger.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# load sentiment dictionary
# AFINN file: one row per entry, read without a header as (word, valence).
sentiment_dict=pd.read_table('data/sentiment/AFINN-111.txt', header=None)
sentiment_dict.columns=['word','valence']
sentiment_dict['word_type']='sentiment'
#sentiment_dict.head()
# load negation words
# Single-column file; these rows get no valence here (filled with 0 after the concat).
negative_words=pd.read_table('data/sentiment/negate.txt', header=None)
negative_words.columns=['word']
# negative_words['valence']=0
negative_words['word_type']='negation'
# load degree words
# Two-column file read as (word, degree); the scoring code later checks degree == 'incr'.
degree_words=pd.read_table('data/sentiment/degree_words.txt', header=None)
degree_words.columns=['word','degree']
degree_words['word_type']='degree'
# degree_words['valence']=0
degree_words.head()
#also load 1 more source of positive negative sentiment words
# Lines starting with ';' are skipped as comments; every word gets a fixed valence of -1 / +1.
ptrckprry_negative_words=pd.read_table('data/sentiment/negative-words.txt', header=None,comment=';')
ptrckprry_negative_words.columns=['word']
ptrckprry_negative_words['word_type']='sentiment'
ptrckprry_negative_words['valence']=-1
ptrckprry_positive_words=pd.read_table('data/sentiment/positive-words.txt', header=None,comment=';')
ptrckprry_positive_words.columns=['word']
ptrckprry_positive_words['word_type']='sentiment'
ptrckprry_positive_words['valence']=1
# concat to create final sentiment lexicon
# The AFINN frame comes first, so for a word present in several sources the AFINN row
# is the first match when the lexicon is filtered by word.
sentiment_lexicon=pd.concat([sentiment_dict,ptrckprry_positive_words,ptrckprry_negative_words,negative_words,degree_words], ignore_index=True)
# Rows from sources without a valence / degree column get neutral placeholders.
sentiment_lexicon['valence']=sentiment_lexicon['valence'].fillna(0)
sentiment_lexicon['degree']=sentiment_lexicon['degree'].fillna('none')
sentiment_lexicon.sample(n=10)
Some information about the data in the sentiment lexicon. Because we rely on lexicon information, a better and richer lexicon will lead to better results.
print 'total word types in sentiment frame : ' , set(sentiment_lexicon['word_type'])
print 'total sentiment words : ' , len(sentiment_lexicon[sentiment_lexicon['word_type']=='sentiment'])
print 'total negation words : ' , len(sentiment_lexicon[sentiment_lexicon['word_type']=='negation'])
print 'total positive sentiment words : ' , len(sentiment_lexicon[sentiment_lexicon['valence'] > 0])
print 'total negative sentiment words : ' , len(sentiment_lexicon[sentiment_lexicon['valence'] < 0])
print 'total degree words : ' , len(sentiment_lexicon[sentiment_lexicon['degree']!='none'])
df = pd.read_csv('data/reviews.csv')
print 'Reviews dataframe shape ' , df.shape
print 'Total unique review titles: ' , len(set(df['review_title']))
# print df head
df.head()
We will define many functions to help with the data analysis and data wrangling part
# Tokenizer objects shared by split_text() below.
# Sentence splitter loaded through nltk's resource loader.
# NOTE(review): confirm this resource path resolves against the nltk data search
# path in the target environment (the 'punkt' package is downloaded earlier in this file).
nltk_splitter=nltk.data.load('data/nltk/tokenizers/punkt/english.pickle')
# Word tokenizer applied to each sentence returned by the splitter.
nltk_tokenizer = nltk.tokenize.TreebankWordTokenizer()
# function to split paragraph and then tokenize a sentence to words
def isnan(value):
    """Return True when *value* converts to a float NaN, otherwise False.

    Values that cannot be converted to float at all (ordinary text, None,
    lists, integers too large for a float) are reported as "not NaN".

    Only conversion errors are caught; the previous bare ``except`` also
    swallowed KeyboardInterrupt and SystemExit.
    """
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError, OverflowError):
        return False
def split_text(data):
    """Split a paragraph into sentences and flatten them into one list of word tokens.

    Returns an empty list when *data* is NaN (as judged by ``isnan``).
    Uses the module-level ``nltk_splitter`` and ``nltk_tokenizer``.
    """
    if isnan(data):
        return []
    return [
        token
        for sentence in nltk_splitter.sentences_from_text(data)
        for token in nltk_tokenizer.tokenize(sentence)
    ]
# function to tag pos and create data for word in format : [word, pos]
def tag_pos(sentence):
    """Return the result of ``nltk.pos_tag`` for the given list of tokens."""
    return nltk.pos_tag(sentence)
# adds sentiment info for the word using the sentiment lexicon
def getSentimentWordDictionary(word, pos):
    """Build the sentiment record for *word* from the global ``sentiment_lexicon``.

    Args:
        word: token to look up (matched exactly; no case folding is done here).
        pos: part-of-speech value stored unchanged in the record.

    Returns:
        dict with keys 'pos', 'word', 'type', 'valence' and 'degree'.
        For a word missing from the lexicon: type 'none', valence 0 and
        degree 'none'. When a word occurs in several lexicon rows, the
        first row is used.
    """
    # Defaults describe a word that is absent from the lexicon. 'degree' is
    # included so that found and not-found records share the same keys.
    dictionary = {
        'pos': pos,
        'word': word,
        'type': 'none',
        'valence': 0,
        'degree': 'none',
    }
    # A single boolean-mask filter replaces the separate membership scan
    # followed by a second filter over the whole lexicon.
    matches = sentiment_lexicon[sentiment_lexicon['word'] == word]
    if len(matches) > 0:
        dictionary['type'] = matches['word_type'].values[0]
        dictionary['valence'] = float(matches['valence'].values[0])
        dictionary['degree'] = matches['degree'].values[0]
    return dictionary
We will also create functions to build the sentiment dictionary from the reviews data. It is also possible to extend the sentiment lexicon using the reviews data, but we won't do that for now.
def review_sentiment_dict(dataframe):
    """Create the word -> sentiment-record dictionary for the reviews data.

    Every token found in the 'tokenized' and 'review_title_tokenized'
    columns is looked up once with ``getSentimentWordDictionary``. The first
    token of each token list is lowercased before the lookup; all other
    tokens are used as they are.

    Args:
        dataframe: frame with the list-valued columns 'tokenized' and
            'review_title_tokenized'.

    Returns:
        dict mapping each distinct token to its sentiment record.
    """
    review_sent_dict = {}
    pos = ''  # no part-of-speech information is available at this stage
    # Process both review bodies and review titles. Previously the combined
    # list was built but only the review bodies were iterated, so words that
    # occur only in titles never received a lexicon entry.
    token_lists = dataframe['tokenized'].tolist()
    token_lists.extend(dataframe['review_title_tokenized'].tolist())
    for tokens in token_lists:
        for index, word in enumerate(tokens):
            if index == 0:
                word = word.lower()
            if word not in review_sent_dict:
                review_sent_dict[word] = getSentimentWordDictionary(word, pos)
    return review_sent_dict
We will create a copy of the data frame and add more useful information to it.
# Work on a copy of the raw frame; `sub` and `df_copy` are two names for that same copy.
sub = df.copy()
df_copy = sub
Lets add two columns for tokenized version of review and review_title.
# Tokenize the review text: each row becomes a flat list of word tokens (see split_text).
sub['tokenized']=sub['review_comments'].apply(split_text)
# Same tokenization for the review title.
sub['review_title_tokenized']=sub['review_title'].apply(split_text)
Lets also create the review sentiment dictionary at this stage based on tokenized version of reviews.
review_sent_dict=review_sentiment_dict(sub)
print 'total entries in the review sentiment dict : ' , len(review_sent_dict.keys())
#review_sent_dict
We will now add following more columns to the data:
1. Sentiment words present in review title
2. Sentiment words present in reviews
3. Positive sentiment words
4. Negative sentiment words
5. Degree words
6. Columns for the total number of words and the total number of sentiment words
The following codes will add many more columns to the dataframe
# Function to create more information columns
def collect(sentence, word_type):
    """Return the tokens of *sentence* whose record in ``review_sent_dict`` has type *word_type*.

    Tokens missing from ``review_sent_dict`` are skipped; order and
    duplicates are preserved.
    """
    return [
        token
        for token in sentence
        if token in review_sent_dict
        and review_sent_dict[token]['type'] == word_type
    ]
def sentiment_polarity(sentence, polarity):
    """Return the tokens of *sentence* with the requested polarity.

    With ``polarity == 'pos'`` a token is kept when its valence in
    ``review_sent_dict`` is >= 1; with any other value it is kept when its
    valence is < 0. Tokens missing from ``review_sent_dict`` are skipped.
    """
    want_positive = polarity == 'pos'
    selected = []
    for token in sentence:
        if token not in review_sent_dict:
            continue
        valence = review_sent_dict[token]['valence']
        matches = valence >= 1 if want_positive else valence < 0
        if matches:
            selected.append(token)
    return selected
# (new column, source column, extractor, extra positional args), applied in this order.
_derived_columns = [
    ('title_sentiment_words', 'review_title_tokenized', collect, ['sentiment']),
    ('sentiment_words', 'tokenized', collect, ['sentiment']),
    ('pos_sentiment', 'tokenized', sentiment_polarity, ['pos']),
    ('neg_sentiment', 'tokenized', sentiment_polarity, ['neg']),
    ('negation_words', 'tokenized', collect, ['negation']),
    ('degree_words', 'tokenized', collect, ['degree']),
]
for _target, _source, _extractor, _extra in _derived_columns:
    sub[_target] = sub[_source].apply(_extractor, args=_extra)

# Length columns: tokens per review and sentiment-bearing tokens per review.
sub['total_words'] = sub['tokenized'].apply(len)
sub['total_sentiment_words'] = sub['sentiment_words'].apply(len)
Let's look at how the dataframe looks now:
sub.head()
# histogram of total words per review
# Figure 1: distribution of review length in tokens (left half of a 1x2 grid).
plt.figure(1)
x=sub.total_words
plt.subplot(1, 2, 1)
plt.hist(x, bins=50, facecolor='blue')
plt.xlabel('total words in a review')
plt.ylabel('count')
plt.title('total words in a review')
# Figure 2: number of sentiment words per review.
plt.figure(2)
arr=[len(row) for row in sub.sentiment_words]
plt.subplot(1, 2, 1)
plt.hist(arr, bins=20, facecolor='green')
plt.xlabel('sentiment words in a review')
plt.ylabel('count')
plt.title('sentiment words in a review')
# Figure 3: positive (left) and negative (right) sentiment word counts side by side.
plt.figure(3)
arr=[len(row) for row in sub.pos_sentiment]
plt.subplot(1, 2, 1)
plt.hist(arr, bins=20, facecolor='green')
plt.xlabel('positive sentiment words in a review')
plt.ylabel('count')
plt.title('+ve sentiment words in a review')
arr=[len(row) for row in sub.neg_sentiment]
plt.subplot(1, 2, 2)
plt.hist(arr, bins=20, facecolor='red')
plt.xlabel('-ve sentiment words in a review')
plt.ylabel('count')
plt.title('-ve sentiment words in a review')
# Figure 4: number of negation words per review.
plt.figure(4)
plt.subplot(1, 2, 1)
arr=[len(row) for row in sub.negation_words]
plt.hist(arr, bins=30, facecolor='orange')
plt.xlabel('negation words in a review')
plt.ylabel('count')
plt.title('negation words in a review')
# tight_layout() is called once, while figure 4 is the current figure.
plt.tight_layout()
plt
We will now add code to assign sentiment scores to reviews. We use the following ideas:
There are many more things to cover in the scoring system that are not done in this report for now.
# Valence given to a degree word whose lexicon 'degree' is 'incr'.
pos_degree=0.3
# Valence given to every other degree word.
# NOTE(review): this value is positive, so non-'incr' degree words also add +0.3 — confirm the sign is intended.
neg_degree=0.3
def normalize(sentiments, alpha=15):
    """Map a list of word valences to one score between 0.0 and 1.0.

    The valences are summed and squashed with ``s / sqrt(s*s + alpha)``
    (the normalization described in "VADER: A Parsimonious Rule-based Model
    for Sentiment Analysis of Social Media Text"), which yields a value
    between -1 and 1. That value is then rescaled linearly so that 0.0 is
    fully negative, 0.5 is neutral and 1.0 is fully positive.

    Previously negative totals were all clamped to 0.0, a mixed review whose
    valences cancelled out scored 0.0, and mildly positive totals scored
    below the 0.5 returned for neutral text. An empty list also raised
    ValueError in ``min()``.

    Args:
        sentiments: iterable of numeric valences; may be empty.
        alpha: smoothing constant of the squashing function.

    Returns:
        float in [0.0, 1.0]; 0.5 when the valences sum to zero.
    """
    score = float(sum(sentiments))
    if score == 0:
        # Neutral (or empty) input. Returning early also avoids 0/0 when alpha is 0.
        return 0.5
    squashed = score / math.sqrt((score * score) + alpha)
    # Rescale [-1, 1] -> [0, 1] so the ordering of scores is preserved.
    norm_score = (squashed + 1.0) / 2.0
    return min(1.0, max(0.0, norm_score))
def calculate_polarity_scores(sentence):
    """Compute polarity scores for one annotated token sequence.

    Args:
        sentence: list of ``(word, lexicon)`` pairs, where ``lexicon`` is a
            dict with at least the key 'type' (plus 'valence' for sentiment
            words and 'degree' for degree words), as produced by
            ``addWordSentiment``.

    Returns:
        dict with the keys 'neg', 'neu', 'pos' (shares rounded to 3 places)
        and 'compound' (the value of ``normalize`` rounded to 4 places).
        An empty dict is returned when *sentence* is empty.
    """

    def create_sentiment_list(tokens):
        """Return the raw valence of every token, in order."""
        valences = []
        for elem in tokens:
            lexicon = elem[1]
            valence = 0
            if lexicon['type'] == 'sentiment':
                valence = lexicon['valence']
            elif lexicon['type'] == 'degree':
                # Degree words contribute one of the module-level constants.
                if lexicon['degree'] == 'incr':
                    valence = pos_degree
                else:
                    valence = neg_degree
            valences.append(valence)
        return valences

    def check_for_negation_case(tokens, valences):
        """Flip the sign of non-zero valences close to each negation word."""
        for index, elem in enumerate(tokens):
            if elem[1]['type'] != 'negation':
                continue
            # Window: one token before the negation up to two tokens after it.
            # The upper bound of range() is exclusive, so it is capped at
            # len(tokens); the previous cap of len(tokens) - 1 left the last
            # token out (e.g. "good" in "not good" was never negated).
            window = range(max(index - 1, 0), min(index + 3, len(tokens)))
            for i in window:
                if valences[i] != 0:
                    valences[i] = -1.0 * valences[i]
            # NOTE(review): the original also contained a fallback that set
            # the negation word itself to -1.0 when nothing in the window was
            # flipped, but its guard flag was always True, so it never ran.
            # It stays disabled here to keep scores unchanged.
        return valences

    def update_sentiment_list(tokens, valences):
        """Apply the rule-based adjustments (currently negation only)."""
        return check_for_negation_case(tokens, valences)

    def organize_sentiment_scores(valences):
        """Return (positive sum, negative sum, neutral count)."""
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in valences:
            if sentiment_score > 0:
                # +1 compensates for neutral words, which are counted as 1
                pos_sum += float(sentiment_score) + 1
            if sentiment_score < 0:
                # -1 plays the same role once math.fabs() is applied
                neg_sum += float(sentiment_score) - 1
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count

    def score_valence(valences):
        """Turn the adjusted valences into the result dictionary."""
        if not valences:
            return {}
        pos_sum, neg_sum, neu_count = organize_sentiment_scores(valences)
        # Every valence adds at least 1 to this total, so it is never zero here.
        total = pos_sum + math.fabs(neg_sum) + neu_count
        return {
            "neg": round(math.fabs(neg_sum / total), 3),
            "neu": round(math.fabs(neu_count / total), 3),
            "pos": round(math.fabs(pos_sum / total), 3),
            "compound": round(normalize(valences), 4),
        }

    # create the sentiment list on word tokens in sentence
    sentiments = create_sentiment_list(sentence)
    # update the sentiment list based on rules
    sentiments = update_sentiment_list(sentence, sentiments)
    # calculate the final sentiment scores
    return score_valence(sentiments)
# function to word level sentiment
def addWordSentiment(sentence, lower_first_word=True):
    """Pair every word of a POS-tagged sentence with its sentiment record.

    Args:
        sentence: list of ``(word, postag)`` pairs.
        lower_first_word: when true, the first word is lowercased before
            the lookup and in the returned pair.

    Returns:
        list of ``(word, record)`` pairs, where ``record`` comes from
        ``getSentimentInfo``.
    """
    annotated = []
    for position, pair in enumerate(sentence):
        token, tag = pair
        if position == 0 and lower_first_word:
            token = token.lower()
        annotated.append((token, getSentimentInfo(token, tag)))
    return annotated
# adds sentiment info for the word using the sentiment lexicon
def getSentimentInfo(word, pos):
    """Return the sentiment record for *word*.

    A word present in ``review_sent_dict`` gets the stored record itself
    (the *pos* argument is not used in that case). Any other word gets a
    new record with type 'none' and valence 0.
    """
    try:
        return review_sent_dict[word]
    except KeyError:
        return {'pos': pos, 'word': word, 'type': 'none', 'valence': 0}
def calculate_sentiment(review, review_sentiment_dict):
    """Score one tokenized review.

    The tokens are POS-tagged, annotated with sentiment records (first word
    lowercased) and passed to ``calculate_polarity_scores``. The result has
    the form {'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.5}.

    The *review_sentiment_dict* argument is accepted but not read.
    """
    tagged_tokens = tag_pos(review)
    annotated_tokens = addWordSentiment(tagged_tokens, True)
    return calculate_polarity_scores(annotated_tokens)
def calculate_one_sentiment(review, review_sentiment_dict):
    """Return only the 'compound' score of a tokenized review.

    A review without any tokens is not scored and yields 0.
    """
    if len(review) < 1:
        return 0
    return calculate_sentiment(review, review_sentiment_dict)['compound']
Let's predict sentiment scores for the first 10 reviews.
reviews=sub['tokenized'][1:10].tolist()
#string='My stay at hotel was fantastic !'
#reviews=[string.split()]
#print reviews
for review in reviews:
score =calculate_sentiment(review,review_sentiment_dict)
sent= ' '.join(elem for elem in review)
print 'review: ' , sent
print 'score: ', score
print ''
Let's now tag the review_title and the reviews with the sentiment scores. We also tag review_title to see if we can find some insight.
%%time
reviews_with_scores=sub
reviews_with_scores=reviews_with_scores
reviews_with_scores['review_score']= reviews_with_scores['tokenized'].apply(calculate_one_sentiment,args=[review_sentiment_dict])
Lets also tag the review titles with sentiment scores
%%time
# Compound sentiment score of every tokenized review title, stored in a new column.
reviews_with_scores['review_title_score']= reviews_with_scores['review_title_tokenized'].apply(calculate_one_sentiment,args=[review_sent_dict])
# Write the id, the texts and both scores to CSV, then read the file back for the checks below.
save_reviews_with_scores = './data/reviews_with_scores.csv'
output_columns = [
    'hotel_review_id',
    'review_title',
    'review_title_score',
    'review_comments',
    'review_score',
]
save_df = reviews_with_scores[output_columns]
save_df.to_csv(save_reviews_with_scores, index=False)
result = pd.read_csv(save_reviews_with_scores)
result.head()
A=result['review_score'].tolist()
B=result['review_title_score'].tolist()
A=np.asarray(A)
B=np.asarray(B)
print "difference:", A - B
print "SAD:", np.sum(np.abs(A - B))
print "SSD:", np.sum(np.square(A - B))
print "correlation:", np.corrcoef(np.array((A, B)))[0, 1]
import scipy
scipy.stats.pearsonr(A, B)
plt.plot(A-B)
#plt.plot(x, y)
plt.show()