import numpy as np
import pandas as pd
vocabulary = ['I','like','to','play','football','rome','paris','mango','apple']
# build a one-hot vector for every word in the vocabulary
one_hot_matrix = {}
for i in range(len(vocabulary)):
    l = [0] * len(vocabulary)
    l[i] = 1
    one_hot_matrix[vocabulary[i]] = l
one_hot_matrix
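Since pandas is already imported above, the dictionary of one-hot vectors can also be displayed as a small table for inspection (each column is the one-hot vector of one word); this view is optional:

# optional: view the one-hot vectors as a table, one column per word
pd.DataFrame(one_hot_matrix, index=vocabulary)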
The Word2Vec algorithm uses a neural network model to learn word associations from a large corpus of text. Once trained, such a model can detect synonymous words or suggest additional words for a partial sentence. During training, these models take into account the context in which each word occurs in the corpus.
In Skip-gram: each target word enters the network as a [1 x V] one-hot input, where V is the vocabulary size; it is multiplied by W1: [V x E] and then W2: [E x V], and the resulting [1 x V] vector is softmaxed, giving the probability of each context word given the target word.
In CBOW: just as in Skip-gram, two weight matrices are used, although here the target word is predicted from the aggregation (typically the average) of the context-word vectors.
W1 and W2 are also known as the word-vector lookup tables.
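To make the shapes above concrete, here is a minimal NumPy sketch of a single Skip-gram forward pass, reusing the vocabulary and one_hot_matrix built earlier; the weight matrices are randomly initialised purely for illustration (the real model learns them by backpropagation):

import numpy as np

V, E = len(vocabulary), 3                       # vocabulary size and an (illustrative) embedding size
W1 = np.random.rand(V, E)                       # [V x E] input-to-hidden weights (the lookup table)
W2 = np.random.rand(E, V)                       # [E x V] hidden-to-output weights

x = np.array(one_hot_matrix['play'])            # [1 x V] one-hot input for the target word
h = x @ W1                                      # [1 x E] hidden layer = the row of W1 for 'play'
scores = h @ W2                                 # [1 x V] score for every vocabulary word
probs = np.exp(scores) / np.exp(scores).sum()   # softmax: probability of each context word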
According to [1], Skip-gram works well with small datasets and represents less frequent words better, whereas CBOW trains faster and represents more frequent words better.
[1] Mikolov et al., "Efficient Estimation of Word Representations in Vector Space", 2013.
Gensim is an efficient suite of NLP tools for topic modeling. Most notably for this tutorial, it provides an implementation of the Word2Vec word embedding algorithm for learning new word vectors from text.
Some important parameters of gensim's Word2Vec, as used in the code below: size (the dimensionality of the word vectors; renamed vector_size in gensim 4.0+), window (the maximum distance between the target word and its context words), min_count (words with a lower frequency are ignored), sg (1 for Skip-gram, 0 for CBOW, the default), and workers (the number of training threads).
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
common_texts
model = Word2Vec(sentences=common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
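The saved model can be reloaded later and used (or trained further); a minimal sketch:

# reload the full model saved above; use model.wv if only the word vectors are needed
model = Word2Vec.load("word2vec.model")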
# access the word vector of a word and find its most similar words
vector = model.wv['computer']                       # get the numpy vector of a word
sims = model.wv.most_similar('computer', topn=10)   # get the most similar words
vector
sims
# Create a Skip-gram model (sg=1; the default sg=0 gives CBOW)
model_sg = Word2Vec(sentences=common_texts, size=100, window=5, min_count=1,
                    workers=4, sg=1)
sims = model_sg.wv.most_similar('survey', topn=10)
sims
FastText is an extension of Word2Vec proposed by Facebook in 2016. Instead of feeding individual words into the neural network, FastText breaks words into several character n-grams (sub-words). For instance, the tri-grams for the word apple are app, ppl, and ple (ignoring the special word-boundary markers). The word embedding vector for apple will be the sum of all these n-grams. After training the neural network, we have embeddings for all the n-grams seen in the training dataset. Rare words can now be properly represented, since it is highly likely that some of their n-grams also appear in other words. I will show you how to use FastText with Gensim in the following section.
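As a quick illustration of the sub-word idea, this tiny snippet lists the character tri-grams of apple exactly as described above (the real FastText implementation additionally adds < and > boundary markers):

# character tri-grams of 'apple', ignoring boundary markers
word = "apple"
n = 3
ngrams = [word[i:i + n] for i in range(len(word) - n + 1)]
print(ngrams)   # ['app', 'ppl', 'ple']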
from gensim.models import FastText
from gensim.test.utils import common_texts
model_FastText = FastText(size=4, window=3, min_count=1)
model_FastText.build_vocab(sentences=common_texts)
model_FastText.train(sentences=common_texts, total_examples=len(common_texts), epochs=10)
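Once trained, the FastText vectors are queried through the same wv interface as Word2Vec; thanks to the sub-word n-grams, the model can even return a vector for a word that never appeared in the tiny common_texts corpus (the exact numbers vary from run to run):

# in-vocabulary query, same interface as Word2Vec
print(model_FastText.wv.most_similar('computer', topn=5))
# out-of-vocabulary query: the vector for 'computing' is assembled from its character n-grams
print(model_FastText.wv['computing'])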
import lxml.etree
import re

doc = lxml.etree.parse('/content/drive/MyDrive/ted_en-20160408.xml')
input_text = '\n'.join(doc.xpath('//content/text()'))
# remove parenthesis
input_text_noparens = re.sub(r'\([^)]*\)', '', input_text)
# store as list of sentences
sentences_strings_ted = []
for line in input_text_noparens.split('\n'):
    m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
    sentences_strings_ted.extend(sent for sent in m.groupdict()['postcolon'].split('.') if sent)
# store as list of lists of words
sentences_ted = []
for sent_str in sentences_strings_ted:
    tokens = re.sub(r"[^a-z0-9]+", " ", sent_str.lower()).split()
    sentences_ted.append(tokens)
from gensim.models import FastText
model_ted = FastText(sentences_ted, size=100, window=5, min_count=5, workers=4,sg=1)
model_ted.wv.most_similar("Gastroenteritis")
from gensim.models import Word2Vec
model_ted_w2v = Word2Vec(sentences=sentences_ted, size=100, window=5, min_count=5, workers=4, sg=0)
model_ted_w2v.wv.most_similar("Gastroenteritis")  # raises KeyError: the word is not in the Word2Vec vocabulary
Even though the word Gastroenteritis does not exist in the training dataset, FastText is still capable of figuring out that it is closely related to some medical terms. If we try the same query on the Word2Vec model defined previously, it raises a KeyError because the word does not exist in its vocabulary. Although a FastText model takes longer to train (the number of n-grams is far larger than the number of words), it often performs better than Word2Vec and allows rare words to be represented appropriately.
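A simple way to see the difference side by side is to guard the Word2Vec lookup, as sketched below; FastText answers the same query by assembling a vector from the word's character n-grams:

query = "Gastroenteritis"
try:
    print(model_ted_w2v.wv.most_similar(query)[:3])   # plain Word2Vec: no sub-word fallback
except KeyError:
    print(f"'{query}' is not in the Word2Vec vocabulary")
print(model_ted.wv.most_similar(query)[:3])           # FastText: vector built from character n-grams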
# t-SNE visualisation of a slice of the GloVe vectors
# (uses emmbed_dict, TSNE and plt, which are set up in the GloVe section below)
distri = TSNE(n_components=2)
words = list(emmbed_dict.keys())[100:250]
vectors = [emmbed_dict[word] for word in words]
y = distri.fit_transform(vectors)
plt.figure(figsize=(14, 8))
plt.scatter(y[:, 0], y[:, 1])
for label, x_coord, y_coord in zip(words, y[:, 0], y[:, 1]):
    plt.annotate(label, xy=(x_coord, y_coord), xytext=(0, 0), textcoords='offset points')
plt.show()
The advantage of GloVe is that, unlike Word2Vec, it does not rely only on local statistics (the local context window around each word) but also incorporates global statistics (corpus-wide word co-occurrence counts) to obtain word vectors.
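To make "global word co-occurrence statistics" concrete, here is a tiny sketch of the counting step GloVe starts from (not GloVe itself): a symmetric co-occurrence matrix over a toy corpus with a context window of 1:

import numpy as np

toy_corpus = [["i", "like", "football"], ["i", "like", "mango"]]
vocab = sorted({w for sent in toy_corpus for w in sent})
idx = {w: i for i, w in enumerate(vocab)}

cooc = np.zeros((len(vocab), len(vocab)))
window = 1
for sent in toy_corpus:
    for i, w in enumerate(sent):
        for j in range(max(0, i - window), min(len(sent), i + window + 1)):
            if j != i:
                cooc[idx[w], idx[sent[j]]] += 1   # count word pairs within the window
print(vocab)
print(cooc)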
import os
import re
import urllib.request
from random import shuffle

import numpy as np
import matplotlib.pyplot as plt
from scipy import spatial
from sklearn.manifold import TSNE
GloVe project page: https://nlp.stanford.edu/projects/glove/
GloVe is an unsupervised learning algorithm for obtaining vector representations for words. Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and the resulting representations showcase interesting linear substructures of the word vector space.
urllib.request.urlretrieve('https://nlp.stanford.edu/data/glove.6B.zip','glove.6B.zip')
!unzip "/content/glove.6B.zip" -d "/content/"
# the 100d, 200d, etc. suffixes of the extracted files indicate the dimensionality of the embeddings
emmbed_dict = {}
with open('/content/glove.6B.200d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], 'float32')
        emmbed_dict[word] = vector
def find_similar_word(embedding):
    # rank every word in the dictionary by Euclidean distance to the given embedding
    nearest = sorted(emmbed_dict.keys(), key=lambda word: spatial.distance.euclidean(emmbed_dict[word], embedding))
    return nearest
find_similar_word(emmbed_dict['river'])[0:10]
# the embedding space supports analogy-style vector arithmetic, not just similarity lookups
find_similar_word(emmbed_dict['king'] + emmbed_dict['queen'] + emmbed_dict['prince'])[0:10]
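The helper above ranks neighbours by Euclidean distance; cosine distance, which is what gensim's most_similar uses, is a common alternative and only a one-line change:

from scipy import spatial

def find_similar_word_cosine(embedding):
    # same idea as above, but ranked by cosine distance instead of Euclidean distance
    return sorted(emmbed_dict.keys(), key=lambda word: spatial.distance.cosine(emmbed_dict[word], embedding))

find_similar_word_cosine(emmbed_dict['river'])[0:10]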
# convert the GloVe text format into the word2vec text format so gensim can load it
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
# Now we can load it and perform the same (king - man) + woman = ? query
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
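The same interface supports other analogy-style queries, for instance France : Paris :: Italy : ? (the commonly expected answer is rome, though the exact output depends on the vectors used):

# calculate: (paris - france) + italy = ?
result = model.most_similar(positive=['paris', 'italy'], negative=['france'], topn=1)
print(result)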