1. Preparation
Download 7z if you don't have it yet from http://www.7-zip.org/download.html .
Download GoogleNews-vectors-negative300.bin.gz from https://code.google.com/p/word2vec/ .
Download IMDB review data from http://bit.ly/1FizNyc .
As suggested in the original webpage, go to http://www.enchantedlearning.com/wordlist/ and collect words for food, sports, and weather, and put the words in food_words.txt, sports_words.txt, and weather_words.txt.
In Ubuntu, you may need to install a C compiler by running:
sudo apt-get install build-essential
2. Test 1
In Spyder IPython window, paste the following
from gensim.models.word2vec import Word2Vec
# Loads the unpacked .bin file (the download in step 1 is a .bin.gz archive);
# binary=True is passed because the file is given in binary form.
model = Word2Vec.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
Then run
model.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
and you should get
[('queen', 0.7118191719055176),
('monarch', 0.6189674139022827),
('princess', 0.5902431011199951),
('crown_prince', 0.5499460697174072),
('prince', 0.5377321243286133)]
You will need at least about 8GB memory. I tried also with 4GB RAM, and it gave the result after more than one hour, which is too slow.
3. Test 2 (a continuation of Test 1)
import numpy as np

with open('food_words.txt', 'r') as infile:
    food_words = infile.readlines()

with open('sports_words.txt', 'r') as infile:
    sports_words = infile.readlines()

with open('weather_words.txt', 'r') as infile:
    weather_words = infile.readlines()


def getWordVecs(words):
    """Return the Word2Vec vectors of the words found in the loaded ``model``.

    Each entry of ``words`` has its newline removed and is looked up in the
    global ``model``; entries that raise ``KeyError`` are skipped.

    Returns a float array with one 300-dimensional row per word found
    (zero rows when none of the words is found).
    """
    vecs = []
    for word in words:
        word = word.replace('\n', '')
        try:
            vecs.append(model[word].reshape((1, 300)))
        except KeyError:
            continue
    if not vecs:
        # np.concatenate raises ValueError on an empty list
        return np.empty((0, 300), dtype='float')
    vecs = np.concatenate(vecs)
    return np.array(vecs, dtype='float')  # TSNE expects float type values


food_vecs = getWordVecs(food_words)
sports_vecs = getWordVecs(sports_words)
weather_vecs = getWordVecs(weather_words)
If you run into an error when reading the text files (which I encountered on some systems, but not always), change the file loading to:
import numpy as np

# Same loading code as above, but with the file encoding stated explicitly.
with open('food_words.txt', 'r', encoding='utf8') as infile:
    food_words = infile.readlines()

with open('sports_words.txt', 'r', encoding='utf8') as infile:
    sports_words = infile.readlines()

with open('weather_words.txt', 'r', encoding='utf8') as infile:
    weather_words = infile.readlines()
Then
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Project all word vectors down to two dimensions for plotting.
ts = TSNE(2)
reduced_vecs = ts.fit_transform(np.concatenate((food_vecs, sports_vecs, weather_vecs)))

# color points by word group to see if Word2Vec can separate them
# (rows keep the concatenation order: food, then sports, then weather)
for i in range(len(reduced_vecs)):
    if i < len(food_vecs):
        # food words colored blue
        color = 'b'
    elif i < (len(food_vecs) + len(sports_vecs)):
        # sports words colored red
        color = 'r'
    else:
        # weather words colored green
        color = 'g'
    plt.plot(reduced_vecs[i, 0], reduced_vecs[i, 1], marker='o', color=color, markersize=8)
Then you should see a plot of 3 clustered colored dots.
3. Test 3
This is modified from the original test, which was based on Twitter data. However, as we don't have the Twitter data, we substitute pos.txt and neg.txt from the IMDB review data (the variable names below still say "tweets"). So this is just for the sake of testing the code.
import numpy as np  # already imported in Test 2; repeated so this test runs on its own
from sklearn.cross_validation import train_test_split
from gensim.models.word2vec import Word2Vec

with open('pos.txt', 'r', encoding='utf8') as infile:
    pos_tweets = infile.readlines()

with open('neg.txt', 'r', encoding='utf8') as infile:
    neg_tweets = infile.readlines()

# use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_tweets)), np.zeros(len(neg_tweets))))

x_train, x_test, y_train, y_test = train_test_split(
    np.concatenate((pos_tweets, neg_tweets)), y, test_size=0.2)


# Do some very minor text preprocessing
def cleanText(corpus):
    """Lower-case each text, drop newlines and split it on whitespace."""
    corpus = [z.lower().replace('\n', '').split() for z in corpus]
    return corpus


x_train = cleanText(x_train)
x_test = cleanText(x_test)

n_dim = 300
# Initialize model and build vocab
imdb_w2v = Word2Vec(size=n_dim, min_count=10)
imdb_w2v.build_vocab(x_train)

# Train the model over train_reviews (this may take several minutes)
imdb_w2v.train(x_train)
I got an output 8684307.
# Build word vector for training set by using the average value of all word
# vectors in the tweet, then scale
def buildWordVector(text, size):
    """Average the ``imdb_w2v`` vectors of the words in ``text``.

    text: iterable of words (one tokenized document).
    size: dimensionality of the word vectors.

    Words that raise ``KeyError`` on lookup are ignored. Returns a
    ``(1, size)`` array; it stays all zeros when no word is found.
    """
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in text:
        try:
            vec += imdb_w2v[word].reshape((1, size))
            count += 1.
        except KeyError:
            continue
    if count != 0:
        vec /= count
    return vec
from sklearn.preprocessing import scale

# One averaged vector per training document, stacked row-wise, then scaled.
train_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs = scale(train_vecs)

# Train word2vec on test tweets
imdb_w2v.train(x_test)
I got:
WARNING:gensim.models.word2vec:supplied example count (10000) did not equal expected count (40000) Out[11]: 2172554
# Build test tweet vectors then scale
test_vecs = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs = scale(test_vecs)
# Use classification algorithm (i.e. Stochastic Logistic Regression) on
# training set, then assess model performance on test set
from sklearn.linear_model import SGDClassifier

lr = SGDClassifier(loss='log', penalty='l1')
lr.fit(train_vecs, y_train)

print('Test Accuracy: %.2f' % lr.score(test_vecs, y_test))
I got
Test Accuracy: 0.72
Note that I needed to add parentheses for the last statement to run correctly.
# Create ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Column 1 of predict_proba is used as the score for every test vector.
pred_probas = lr.predict_proba(test_vecs)[:, 1]

fpr, tpr, _ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr, tpr)

plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # dashed black diagonal for reference
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.05)
plt.legend(loc='lower right')
plt.show()
4. Test 4
import gensim
LabeledSentence = gensim.models.doc2vec.LabeledSentence
from sklearn.cross_validation import train_test_split
import numpy as np
# The encoding is given explicitly, as in Test 3, to avoid the read errors
# seen on some systems.
with open('pos.txt', 'r', encoding='utf8') as infile:
    pos_reviews = infile.readlines()

with open('neg.txt', 'r', encoding='utf8') as infile:
    neg_reviews = infile.readlines()

with open('unsup.txt', 'r', encoding='utf8') as infile:
    unsup_reviews = infile.readlines()

# use 1 for positive sentiment, 0 for negative
y = np.concatenate((np.ones(len(pos_reviews)), np.zeros(len(neg_reviews))))

# Hold out 20% of the labelled reviews for testing.
x_train, x_test, y_train, y_test = train_test_split(
    np.concatenate((pos_reviews, neg_reviews)), y, test_size=0.2)
#Do some very minor text preprocessing
def cleanText(corpus):
    """Tokenize every review in ``corpus``.

    Each review is lower-cased, has its newlines removed and ``<br />``
    tags replaced by a space; the punctuation marks ``.,?!:;(){}[]`` are
    then padded with spaces so that they become tokens of their own.

    Returns a list with one list of tokens per review.
    """
    punctuation = """.,?!:;(){}[]"""
    corpus = [z.lower().replace('\n', '') for z in corpus]
    corpus = [z.replace('<br />', ' ') for z in corpus]
    # treat punctuation as individual words
    for c in punctuation:
        corpus = [z.replace(c, ' %s ' % c) for z in corpus]
    corpus = [z.split() for z in corpus]
    return corpus
# Replace each raw corpus by the token lists that cleanText returns for it.
x_train = cleanText(x_train)
x_test = cleanText(x_test)
unsup_reviews = cleanText(unsup_reviews)
#Gensim's Doc2Vec implementation requires each document/paragraph to have a label associated with it.
#We do this by using the LabeledSentence method. The format will be "TRAIN_i" or "TEST_i" where "i" is
#a dummy index of the review.
def labelizeReviews(reviews, label_type):
    """Wrap every review in a ``LabeledSentence`` with a unique label.

    reviews: sequence of tokenized reviews.
    label_type: label prefix, e.g. 'TRAIN'; the label of review ``i`` is
        ``'<label_type>_<i>'``.

    Returns a list of ``LabeledSentence`` objects in input order.
    """
    labelized = []
    for i, v in enumerate(reviews):
        label = '%s_%s' % (label_type, i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized
# Wrap each tokenized review in a LabeledSentence; the prefix records which
# corpus the review came from.
x_train = labelizeReviews(x_train, 'TRAIN')
x_test = labelizeReviews(x_test, 'TEST')
unsup_reviews = labelizeReviews(unsup_reviews, 'UNSUP')
import random
# vector size passed to both Doc2Vec models below
size = 400
#instantiate our DM and DBOW models
# (the two calls differ only in dm=0, which is passed to model_dbow)
model_dm = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, workers=3)
model_dbow = gensim.models.Doc2Vec(min_count=1, window=10, size=size, sample=1e-3, negative=5, dm=0, workers=3)
#build vocab over all reviews
model_dm.build_vocab(np.concatenate((x_train, x_test, unsup_reviews)))
#You may run into an error here: "Python int too large to convert to C long." If this occurs, change how hashfxn is set in the Word2Vec constructor __init__, from
self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn self.iter = iter
to
self.cbow_mean = int(cbow_mean) #self.hashfxn = hashfxn
def hash32(value):
return hash(value) & 0xffffffff
self.hashfxn = hash32
self.iter = iter
https://www.kaggle.com/c/word2vec-nlp-tutorial/forums/t/11197/gensim-word2vec-cython-on-windows/93787
# Join the three labelled corpora (plain lists) into one list and build the
# vocabulary of both models from it.
X=x_train + x_test + unsup_reviews
model_dm.build_vocab(X)
model_dbow.build_vocab(X)
On one system (Ubuntu 12.04, 12), the following code raised an error:
#Get training set vectors from our models
def getVecs(model, corpus, size):
    """Collect the model vector of every document in ``corpus``.

    The vector is looked up by the first entry of each document's
    ``labels`` attribute and reshaped to ``(1, size)``.

    Returns one array with a row per document.
    """
    vecs = [np.array(model[z.labels[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_vecs_dm = getVecs(model_dm, x_train, size)
with the following error: AttributeError: 'LabeledSentence' object has no attribute 'labels'
Checking the other system, on which it worked, x_train[0].labels = ['TRAIN_0'], which is why it worked there. But on this system the attribute is called tags instead: tags=['TRAIN_0']. So I changed the code to
#Get training set vectors from our models
def getVecs(model, corpus, size):
    """Collect the model vector of every document in ``corpus``.

    The vector is looked up by the first entry of each document's
    ``tags`` attribute and reshaped to ``(1, size)``.

    Returns one array with a row per document.
    """
    vecs = [np.array(model[z.tags[0]]).reshape((1, size)) for z in corpus]
    return np.concatenate(vecs)
train_vecs_dm = getVecs(model_dm, x_train, size)
However, this generated another error:
model_dm[x_train[0]]
File "/home/anaconda3/lib/python3.5/site-packages/gensim-0.12.3-py3.5-linux-x86_64.egg/gensim/models/word2vec.py", line 1293, in <listcomp>
return vstack([self.syn0[self.vocab[word].index] for word in words])
Reverting to gensim 0.10.3 seems to resolve this problem temporarily.
#We pass through the data set multiple times, shuffling the training reviews each time to improve accuracy.
all_train_reviews = np.concatenate((x_train, unsup_reviews))
# If this is too slow, reduce the number of passes, e.g. range(1) (range(0)
# would skip this training entirely); accuracy would be reduced.
for epoch in range(10):
    # The same shuffled order is fed to both models within a pass.
    perm = np.random.permutation(all_train_reviews.shape[0])
    model_dm.train(all_train_reviews[perm])
    model_dbow.train(all_train_reviews[perm])
#Get training set vectors from our models
def getVecs(model, corpus, size):
    """Collect the model vector of every document in ``corpus``.

    model: object indexable by document label (e.g. a trained Doc2Vec model).
    corpus: labelled documents; each exposes its label list as ``labels``
        or, on gensim versions that renamed the attribute, as ``tags``.
    size: dimensionality of the document vectors.

    Returns one array with a ``size``-wide row per document.
    """
    vecs = []
    for z in corpus:
        # Accept either attribute name so the code runs on both gensim versions.
        doc_labels = z.labels if hasattr(z, 'labels') else z.tags
        vecs.append(np.array(model[doc_labels[0]]).reshape((1, size)))
    return np.concatenate(vecs)
# Look up the training-set vectors in each model, then place the DM and DBOW
# vectors of each review side by side.
train_vecs_dm = getVecs(model_dm, x_train, size)
train_vecs_dbow = getVecs(model_dbow, x_train, size)
train_vecs = np.hstack((train_vecs_dm, train_vecs_dbow))
#train over test set
# Converted to an array so it can be indexed with a permutation below.
x_test = np.array(x_test)
for epoch in range(10):
    # The same shuffled order is fed to both models within a pass.
    perm = np.random.permutation(x_test.shape[0])
    model_dm.train(x_test[perm])
    model_dbow.train(x_test[perm])
#Construct vectors for test reviews
test_vecs_dm = getVecs(model_dm, x_test, size)
test_vecs_dbow = getVecs(model_dbow, x_test, size)
# place the DM and DBOW vectors of each review side by side
test_vecs = np.hstack((test_vecs_dm, test_vecs_dbow))
from sklearn.linear_model import SGDClassifier
lr = SGDClassifier(loss='log', penalty='l1')
# fit on the training vectors, then print the score on the test vectors
lr.fit(train_vecs, y_train)
print( 'Test Accuracy: %.2f'%lr.score(test_vecs, y_test))
#Create ROC curve
from sklearn.metrics import roc_curve, auc
# IPython magic, not plain Python: only valid inside an IPython/Jupyter session
%matplotlib inline
import matplotlib.pyplot as plt
# column 1 of predict_proba is used as the score passed to roc_curve
pred_probas = lr.predict_proba(test_vecs)[:,1]
fpr,tpr,_ = roc_curve(y_test, pred_probas)
roc_auc = auc(fpr,tpr)
plt.plot(fpr,tpr,label='area = %.2f' %roc_auc)
# dashed black diagonal from (0, 0) to (1, 1) for reference
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.legend(loc='lower right')
plt.show()
No comments:
Post a Comment