This is based purely on https://districtdatalabs.silvrback.com/modern-methods-for-sentiment-analysis#disqus_thread and the comments on that page. A few small changes were needed, so I captured the updates here.
Download 7z if you don't have it yet from http://www.7-zip.org/download.html . Download the pre-trained Google News word2vec model from https://code.google.com/p/word2vec/ and extract it with 7z.
As suggested on the original webpage, go to http://www.enchantedlearning.com/wordlist/ and collect words for food, sports, and weather, and put the words in food_words.txt, sports_words.txt, and weather_words.txt.
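The code below reads these files with readlines(), so each file should contain one word per line; for example, food_words.txt might start like this (my own illustration):

apple
bacon
bread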
2. Test 1
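The original Test 1 loads the pre-trained Google News model, and the later code assumes a variable named model. A minimal sketch, assuming the archive was extracted to GoogleNews-vectors-negative300.bin and the load_word2vec_format API of these older gensim versions:

from gensim.models.word2vec import Word2Vec
# loading the 300-dimensional Google News vectors takes a few minutes and several GB of RAM
model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
# quick sanity check: nearest neighbors of a common word
print(model.most_similar('good'))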
3. Test 2 (a continuation of Test 1)
import numpy as np
with open('food_words.txt', 'r') as infile:
    food_words = infile.readlines()
with open('sports_words.txt', 'r') as infile:
    sports_words = infile.readlines()
with open('weather_words.txt', 'r') as infile:
    weather_words = infile.readlines()
def getWordVecs(words):
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word].reshape((1,300)))
        except KeyError:
            continue
    vecs = np.concatenate(vecs)
    return np.array(vecs, dtype='float') #TSNE expects float type values
food_vecs = getWordVecs(food_words)
sports_vecs = getWordVecs(sports_words)
weather_vecs = getWordVecs(weather_words)
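As a quick sanity check (my addition), each array should have shape (number of words found in the model's vocabulary, 300):

print(food_vecs.shape, sports_vecs.shape, weather_vecs.shape)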
If you run into errors reading the text files (which I encountered on some systems but not others), add an explicit encoding:
import numpy as np
with open('food_words.txt', 'r', encoding='utf8') as infile:
    food_words = infile.readlines()
with open('sports_words.txt', 'r', encoding='utf8') as infile:
    sports_words = infile.readlines()
with open('weather_words.txt', 'r', encoding='utf8') as infile:
    weather_words = infile.readlines()
Then:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((food_vecs, sports_vecs, weather_vecs)))
#color points by word group to see if Word2Vec can separate them
for i in range(len(reduced_vecs)):
    if i < len(food_vecs):
        #food words colored blue
        color = 'b'
    elif i >= len(food_vecs) and i < (len(food_vecs) + len(sports_vecs)):
        #sports words colored red
        color = 'r'
    else:
        #weather words colored green
        color = 'g'
    plt.plot(reduced_vecs[i,0], reduced_vecs[i,1], marker='o', color=color, markersize=8)
plt.show()
You should then see a plot with three clusters of colored dots.
4. Test 3
This is modified from the original Twitter-data-based test. However, since we don't have the Twitter data, we substitute the pos.txt and neg.txt from the IMDB review data, so this is just for the sake of testing the code.
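If you need to build pos.txt, neg.txt (and unsup.txt for Test 4) yourself, here is a hypothetical sketch assuming the Stanford aclImdb directory layout, in which each review is a separate .txt file; it writes one review per line so that readlines() below works:

import glob, os

def combine(folder, outfile):
    with open(outfile, 'w', encoding='utf8') as out:
        for path in sorted(glob.glob(os.path.join(folder, '*.txt'))):
            with open(path, 'r', encoding='utf8') as f:
                # strip embedded newlines so each review stays on one line
                out.write(f.read().replace('\n', ' ') + '\n')

combine('aclImdb/train/pos', 'pos.txt')
combine('aclImdb/train/neg', 'neg.txt')
combine('aclImdb/train/unsup', 'unsup.txt')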
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec
with open('pos.txt', 'r', encoding='utf8') as infile:
    pos_tweets = infile.readlines()
with open('neg.txt', 'r', encoding='utf8') as infile:
    neg_tweets = infile.readlines()
#use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))
x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)
#Do some very minor text preprocessing
def cleanText(corpus):
    corpus = [z.lower().replace('\n','').split() for z in corpus]
    return corpus
x_train = cleanText(x_train)
x_test = cleanText(x_test)
n_dim = 300
#Initialize model and build vocab
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)
#Train the model over train_reviews (this may take several minutes)
imdb_w2v.train(x_train)
I got an output of 8684307, presumably the number of words processed during training.
#Build word vector for training set by using the average value of all word vectors in the tweet, then scale
def buildWordVector(text, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec
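A quick check of the averaging (my own example): each review should produce a single 300-dimensional row vector.

print(buildWordVector(x_train[0], n_dim).shape)   # (1, 300)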
from sklearn.preprocessing import scale
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)
#Train word2vec on test tweets
imdb_w2v.train(x_test)
I got:
WARNING:gensim.models.word2vec:supplied example count (10000) did not equal expected count (40000)
Out[11]: 2172554
The warning is expected here: the model's expected example count was set over the 40000 training reviews, while train() is now being called on the 10000 test reviews.
#Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)
#Use classification algorithm (i.e. Stochastic Logistic Regression) on training set, then assess model performance on test set
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)
print( 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test))
I got
Test Accuracy: 0.72
Note that I needed to add parentheses to the print statement (Python 3) for the last line to run correctly.
#Create ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
pred_probas = lr.predict_proba(test_vecs)[:,1]
fpr,tpr,_ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
5. Test 4
import gensim
LabeledSentence = gensim.models.doc2vec.LabeledSentence
from sklearn.cross_validation import train_test_split
import numpy as np
with open('pos.txt','r') as infile:
    pos_reviews = infile.readlines()
with open('neg.txt','r') as infile:
    neg_reviews = infile.readlines()
with open('unsup.txt','r') as infile:
    unsup_reviews = infile.readlines()
#use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))
x_train, x_test, y_train, y_test = train_test_split(np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)
#Do some very minor text preprocessing
def cleanText(corpus):
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n','') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    #treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s '%c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus
x_train = cleanText(x_train)
x_test = cleanText(x_test)
unsup_reviews = cleanText(unsup_reviews)
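To see what cleanText does, a quick example of my own: punctuation becomes separate tokens and <br /> tags disappear.

print(cleanText(['Great movie!<br />Loved it.\n']))
# [['great', 'movie', '!', 'loved', 'it', '.']]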
#Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
#We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
#a dummy index of the review.
def labelizeReviews(reviews, label_type):
    labelized = []
    for i,v in enumerate(reviews):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
x_train = labelizeReviews(x_train, 'TRAIN')
x_test = labelizeReviews(x_test, 'TEST')
unsup_reviews = labelizeReviews(unsup_reviews, 'UNSUP')
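To see the structure of a labeled review (my own check), note that the attribute name depends on the gensim version, which matters later in this test:

# older gensim (e.g. 0.10.3):
print(x_train[0].words[:5], x_train[0].labels)   # first five tokens, ['TRAIN_0']
# newer gensim (e.g. 0.12.3) renamed labels to tags:
# print(x_train[0].words[:5], x_train[0].tags)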
import random
size = 400
#instantiate our DM and DBOW models
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3)
#build vocab over all reviews
model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))
#you may run into an error here: "Python int too large to convert to C long." If this occurs, change the hashfxn in the Word2Vec constructor __init__ from
self.cbow_mean = int(cbow_mean)
self.hashfxn = hashfxn
self.iter = iter
to
self.cbow_mean = int(cbow_mean)
#self.hashfxn = hashfxn
def hash32(value):
    return hash(value) & 0xffffffff
self.hashfxn = hash32
self.iter = iter
This fix is from https://www.kaggle.com/c/word2vec-nlp-tutorial/forums/t/11197/gensim-word2vec-cython-on-windows/93787 .
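Instead of patching the library source, the same workaround can, I believe, be passed in from user code, since the Word2Vec/Doc2Vec constructors in these gensim versions accept a hashfxn argument; a sketch:

def hash32(value):
    return hash(value) & 0xffffffff

model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3, hashfxn=hash32)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3, hashfxn=hash32)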
#build the vocab for both models over all reviews, this time as a plain list
X = x_train + x_test + unsup_reviews
model_dm.build_vocab(X)
model_dbow.build_vocab(X)
On one system (Ubuntu 12.04), I hit an error running
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [np.array(model[z.labels[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_vecs_dm = getVecs(model_dm, x_train, size)
The error was: AttributeError: 'LabeledSentence' object has no attribute 'labels'
Checking the other system, on which it worked: there, x_train[0].labels = ['TRAIN_0'], which is why it worked. But on this system the object has tags=['TRAIN_0'] instead (newer gensim renamed labels to tags). So I changed to
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [np.array(model[z.tags[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_vecs_dm = getVecs(model_dm, x_train, size)
However, this generated another error:
model_dm[x_train[0]]
File "/home/anaconda3/lib/python3.5/site-packages/gensim-0.12.3-py3.5-linux-x86_64.egg/gensim/models/word2vec.py", line 1293, in <listcomp>
return vstack([self.syn0[self.vocab[word].index] for word in words])
Reverting to gensim 0.10.3 seems to resolve this problem temporarily.
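For example, one way to pin the version (assuming pip manages your gensim install):

pip install gensim==0.10.3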
#We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
all_train_reviews = np.concatenate((x_train, unsup_reviews))
#if this is too slow, you may need to change it to range(1) or even range(0), but accuracy would be reduced
for epoch in range(10):
    perm = np.random.permutation(all_train_reviews.shape[0])
    model_dm.train(all_train_reviews[perm])
    model_dbow.train(all_train_reviews[perm])
#Get training set vectors from our models
def getVecs(model, corpus, size):
    vecs = [np.array(model[z.labels[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_vecs_dm = getVecs(model_dm, x_train, size)
train_vecs_dbow = getVecs(model_dbow, x_train, size)
train_vecs = np.hstack((train_vecs_dm, train_vecs_dbow))
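As a sanity check (my addition), the combined vectors should be 800-dimensional, 400 from the DM model plus 400 from the DBOW model:

print(train_vecs.shape)   # (number of training reviews, 800)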
#train over test set
x_test = np.array(x_test)
for epoch in range(10):
    perm = np.random.permutation(x_test.shape[0])
    model_dm.train(x_test[perm])
    model_dbow.train(x_test[perm])
#Construct vectors for test reviews
test_vecs_dm = getVecs(model_dm, x_test, size)
test_vecs_dbow = getVecs(model_dbow, x_test, size)
test_vecs = np.hstack((test_vecs_dm, test_vecs_dbow))
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)
print( 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test))
#Create ROC curve
from sklearn.metrics import roc_curve, auc
%matplotlib inline
import matplotlib.pyplot as plt
pred_probas = lr.predict_proba(test_vecs)[:,1]
fpr,tpr,_ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()