Wednesday, December 23, 2015

Gensim: "Modern Methods for Sentiment Analysis by Michael Czerny"

This is based purely on https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis#disqus_thread and the comments on that page. A few small changes were needed, so I captured the updates here.

1. Preparation

If you don't have 7z yet, download it from http://www.7-zip.org/download.html ; you will need it to decompress the downloads below.

Download GoogleNews-vectors-negative300.bin.gz from https://code.google.com/p/word2vec/ and decompress it to GoogleNews-vectors-negative300.bin (Test 1 below loads the uncompressed file).
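You can do the decompression with 7z, or directly in Python; a minimal sketch using only the standard library:

import gzip, shutil

#one-time decompression; the output .bin file is several GB
with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:
    with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)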

Download IMDB review data from http://bit.ly/1FizNyc .

As suggested in the original webpage, go to http://www.enchantedlearning.com/wordlist/ and collect words for food, sports, and weather, and put the words in food_words.txt, sports_words.txt, and weather_words.txt.
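The scripts below assume one word per line in each of these files; a quick check (my illustration, not from the original post):

with open('food_words.txt') as f:
    words = [w.strip() for w in f]
print(len(words), words[:5])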

On Ubuntu, you may need a C compiler for gensim's optimized routines; install one with:
sudo apt-get install build-essential


2. Test 1

In a Spyder IPython console, paste the following:

from gensim.models.word2vec import Word2Vec
model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)

and you should get:

[('queen', 0.7118191719055176),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321243286133)]

You will need roughly 8GB of memory. I also tried with 4GB of RAM; it returned the result after more than an hour, which is too slow to be practical.

3. Test 2 (a continuation of Test 1)

import numpy as np

with open('food_words.txt', 'r') as infile:
    food_words = infile.readlines()

with open('sports_words.txt', 'r') as infile:
    sports_words = infile.readlines()

with open('weather_words.txt', 'r') as infile:
    weather_words = infile.readlines()

def getWordVecs(words):
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word].reshape((1,300)))
        except KeyError:
            #skip words that are not in the Google News vocabulary
            continue
    vecs = np.concatenate(vecs)
    return np.array(vecs, dtype='float') #TSNE expects float type values

food_vecs = getWordVecs(food_words)
sports_vecs = getWordVecs(sports_words)
weather_vecs = getWordVecs(weather_words)
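As a quick sanity check (illustrative, not in the original post), each array should have shape (N, 300), where N is however many of your words were found in the Google News vocabulary:

print(food_vecs.shape, sports_vecs.shape, weather_vecs.shape)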

If you run into an error reading the text files (which I encountered on some systems but not others), add an explicit encoding:


import numpy as np

with open('food_words.txt', 'r', encoding='utf8') as infile:
    food_words = infile.readlines()

with open('sports_words.txt', 'r', encoding='utf8') as infile:
    sports_words = infile.readlines()

with open('weather_words.txt', 'r', encoding='utf8') as infile:
    weather_words = infile.readlines()

Then:

from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((food_vecs, sports_vecs, weather_vecs)))

#color points by word group to see if Word2Vec can separate them
for i in range(len(reduced_vecs)):
    if i < len(food_vecs):
        #food words colored blue
        color = 'b'
    elif i < (len(food_vecs) + len(sports_vecs)):
        #sports words colored red
        color = 'r'
    else:
        #weather words colored green
        color = 'g'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)

plt.show()

You should then see a plot with three colored clusters of dots, one per word group.


4. Test 3

This is modified from the original Twitter-data-based test. However, as we don't have the Twitter data, we substitute the pos.txt and neg.txt from the IMDB review data, so this is just for the sake of testing the code.

from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec

with open('pos.txt', 'r', encoding='utf8') as infile:
    pos_tweets = infile.readlines()

with open('neg.txt', 'r', encoding='utf8') as infile:
    neg_tweets = infile.readlines()

#use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)

#Do some very minor text preprocessing
def cleanText(corpus):
    corpus = [z.lower().replace('\n','').split() for z in corpus]
    return corpus

x_train = cleanText(x_train)
x_test = cleanText(x_test)

n_dim = 300
#Initialize model and build vocab
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)

#Train the model over train_reviews (this may take several minutes)
imdb_w2v.train(x_train)

I got the output 8684307 (gensim's train() returns the number of words processed).

#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec

from sklearn.preprocessing import scale
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

#Train word2vec on test tweets
imdb_w2v.train(x_test)


I got:
WARNING:gensim.models.word2vec:supplied example count (10000) did not equal expected count (40000)
Out[11]: 2172554


#Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)
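A side note: scale() here standardizes the test vectors independently of the training vectors. Strictly speaking, you would fit the scaling on the training set only and reuse it for the test set; a minimal sketch with scikit-learn's StandardScaler:

from sklearn.preprocessing import StandardScaler

#fit mean/variance on the training vectors only, then apply the same transform to both
scaler = StandardScaler().fit(train_vecs)
train_vecs = scaler.transform(train_vecs)
test_vecs = scaler.transform(test_vecs)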

#Use a classification algorithm on the training set, then assess model performance on the test set.
#SGDClassifier with loss='log' is logistic regression trained with stochastic gradient descent;
#the log loss is also what makes predict_proba available for the ROC curve below.
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)

print( 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test))


I got:
Test Accuracy: 0.72
Note that I needed to add parentheses to the print call for it to run correctly under Python 3.




#Create ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

pred_probas = lr.predict_proba(test_vecs)[:,1]

fpr,tpr,_ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')

plt.show()



5. Test 4

import gensim

LabeledSentence = gensim.models.doc2vec.LabeledSentence

from sklearn.cross_validation import train_test_split
import numpy as np

with open('pos.txt','r') as infile:
    pos_reviews = infile.readlines()

with open('neg.txt','r') as infile:
    neg_reviews = infile.readlines()

with open('unsup.txt','r') as infile:
    unsup_reviews = infile.readlines()

#use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)

#Do some very minor text preprocessing
def cleanText(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]

    #treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus
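For illustration (this example is mine, not from the original post), here is what cleanText does to a single made-up review:

cleanText(["Great movie!<br />Loved it."])
#returns [['great', 'movie', '!', 'loved', 'it', '.']]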

x_train = cleanText(x_train)
x_test = cleanText(x_test)
unsup_reviews = cleanText(unsup_reviews)

#Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
#We do this with the LabeledSentence class. The label will be "TRAIN_i", "TEST_i", or "UNSUP_i", where "i"
#is a dummy index of the review.
def labelizeReviews(reviews, label_type):
    labelized = []
    for i,v in enumerate(reviews):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

x_train = labelizeReviews(x_train, 'TRAIN')
x_test = labelizeReviews(x_test, 'TEST')
unsup_reviews = labelizeReviews(unsup_reviews, 'UNSUP')
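You can inspect one labeled review to check the structure (the label attribute is .labels in older gensim and .tags in newer versions, which matters below):

print(x_train[0])
#e.g. LabeledSentence(['great', 'movie', ...], ['TRAIN_0'])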


import random

size = 400

#instantiate our DM and DBOW models (dm=0 selects the distributed bag-of-words architecture; the default dm=1 is distributed memory)
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3)




#build vocab over all reviews
model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))
#you may run into error here: "Python int too large to convert to C long." If this occurs, change the hashfxn in Word2Vec constructor __init__: from 
        self.cbow_mean = int(cbow_mean)
        self.hashfxn = hashfxn
        self.iter = iter
to
        self.cbow_mean = int(cbow_mean)
        #self.hashfxn = hashfxn
        def hash32(value):
     return hash(value) & 0xffffffff
        self.hashfxn = hash32
        self.iter = iter


https://www.kaggle.com/c/word2vec-nlp-tutorial/forums/t/11197/gensim-word2vec-cython-on-windows/93787 
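Alternatively, since hashfxn is a constructor parameter (the snippet above shows __init__ assigning self.hashfxn = hashfxn), you should be able to pass the 32-bit hash function in directly instead of editing gensim's source; a sketch:

def hash32(value):
    #constrain Python's hash to 32 bits to avoid the C long overflow
    return hash(value) & 0xffffffff

model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3, hashfxn=hash32)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3, hashfxn=hash32)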


Then build the vocabulary for both models (plain list concatenation also works here):

X = x_train + x_test + unsup_reviews
model_dm.build_vocab(X)
model_dbow.build_vocab(X)

On one system (Ubuntu 12.04), this code:
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [np.array(model[z.labels[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)

train_vecs_dm = getVecs(model_dm, x_train, size)
failed with the error: AttributeError: 'LabeledSentence' object has no attribute 'labels'
On the system where it worked, x_train[0].labels == ['TRAIN_0'], which is why it ran; on this system the attribute is instead tags=['TRAIN_0'] (it was renamed in newer gensim versions). So I changed the function to:
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [np.array(model[z.tags[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)

train_vecs_dm = getVecs(model_dm, x_train, size)

However, this generated another error:
 model_dm[x_train[0]]

  File "/home/anaconda3/lib/python3.5/site-packages/gensim-0.12.3-py3.5-linux-x86_64.egg/gensim/models/word2vec.py", line 1293, in <listcomp>
    return vstack([self.syn0[self.vocab[word].index] for word in words])

Reverting to gensim 0.10.3 seems to resolve this problem temporarily.
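If you need the same script to run across gensim versions, a version-agnostic variant of getVecs (my sketch, not from the original post) can probe for the old and new attribute/lookup styles:

def getVecs(model, corpus, size):
    vecs = []
    for z in corpus:
        #old LabeledSentence exposes .labels; newer gensim renamed it to .tags
        label = z.labels[0] if hasattr(z, 'labels') else z.tags[0]
        #old gensim indexes doc vectors on the model itself; newer gensim keeps them in model.docvecs
        vec = model.docvecs[label] if hasattr(model, 'docvecs') else model[label]
        vecs.append(np.array(vec).reshape((1, size)))
    return np.concatenate(vecs)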


#We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
all_train_reviews = np.concatenate((x_train, unsup_reviews))
#if this is too slow, you may reduce the number of passes, e.g. range(1), at the cost of accuracy (range(0) would skip this extra training entirely)
for epoch in range(10):  
    perm = np.random.permutation(all_train_reviews.shape[0])
    model_dm.train(all_train_reviews[perm])
    model_dbow.train(all_train_reviews[perm])

#Get training set vectors from our models
def getVecs(model, corpus, size):
    #on newer gensim versions, use z.tags[0] instead of z.labels[0] (see above)
    vecs = [np.array(model[z.labels[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)

train_vecs_dm = getVecs(model_dm, x_train, size)
train_vecs_dbow = getVecs(model_dbow, x_train, size)

train_vecs = np.hstack((train_vecs_dm, train_vecs_dbow))
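As a quick check (illustrative), the combined matrix should have one row per training review and 2*size columns:

print(train_vecs.shape)  #rows = len(x_train), columns = 2*size = 800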

#train over test set
x_test = np.array(x_test)

for epoch in range(10):
    perm = np.random.permutation(x_test.shape[0])
    model_dm.train(x_test[perm])
    model_dbow.train(x_test[perm])

#Construct vectors for test reviews
test_vecs_dm = getVecs(model_dm, x_test, size)
test_vecs_dbow = getVecs(model_dbow, x_test, size)

test_vecs = np.hstack((test_vecs_dm, test_vecs_dbow))

from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)

print( 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test))

#Create ROC curve
from sklearn.metrics import roc_curve, auc
%matplotlib inline
import matplotlib.pyplot as plt

pred_probas = lr.predict_proba(test_vecs)[:,1]

fpr,tpr,_ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')

plt.show()

