# CS224n Assignment 1: Exploring Word Vectors


# All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need can be installed up front (the exact install command is missing from this note).
# ----------------

import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]

import nltk
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA


# ----------------


# read_corpus: wrap each document in START and END tokens and lower-case every word.
def read_corpus(category="crude"):
    """Read files from the specified Reuters category.

    Each document is lower-cased word by word and wrapped in the
    ``START_TOKEN`` / ``END_TOKEN`` markers, which must be defined at
    module level before this function is called.

    Params:
        category (string): category name
    Return:
        list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)
    return [
        [START_TOKEN] + [w.lower() for w in reuters.words(f)] + [END_TOKEN]
        for f in files
    ]

# Question 1.1: Implement distinct_words

def distinct_words(corpus):
    """Determine a list of distinct words for the corpus.

    Params:
        corpus (list of list of strings): corpus of documents
    Return:
        corpus_words (list of strings): list of distinct words across the
            corpus, sorted (using python 'sorted' function)
        num_corpus_words (integer): number of distinct words across the corpus
    """
    # A set comprehension de-duplicates while flattening the documents;
    # sorting gives the deterministic ordering the co-occurrence matrix uses.
    corpus_words = sorted({word for document in corpus for word in document})
    return corpus_words, len(corpus_words)

# Question 1.2: Implement compute_co_occurrence_matrix

def compute_co_occurrence_matrix(corpus, window_size=4):
    """Compute co-occurrence matrix for the given corpus and window_size (default of 4).

    Note: Each word in a document should be at the center of a window. Words
          near edges will have a smaller number of co-occurring words.

          For example, if we take the document "START All that glitters is
          not gold END" with window size of 4, "All" will co-occur with
          "START", "that", "glitters", "is", and "not".

    Params:
        corpus (list of list of strings): corpus of documents
        window_size (int): size of context window
    Return:
        M (numpy array of shape (number of corpus words, number of corpus words)):
            Co-occurrence matrix of word counts (dtype int32).
            The ordering of the words in the rows/columns is the same as the
            ordering of the words given by the distinct_words function.
        word2Ind (dict): dictionary that maps word to index (i.e. row/column
            number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    word2Ind = {word: index for index, word in enumerate(words)}

    # Start from an all-zero square matrix: one row and one column per
    # distinct word (the resulting counts are symmetric).
    M = np.zeros((num_words, num_words), dtype=np.int32)

    for document in corpus:
        # Translate the document to indices once instead of per window.
        indices = [word2Ind[word] for word in document]
        for position, center in enumerate(indices):
            left = indices[max(0, position - window_size):position]
            right = indices[position + 1:position + 1 + window_size]
            for context in left + right:
                M[center, context] += 1

    return M, word2Ind

# Question 1.3: Implement reduce_to_k_dim



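# Key parameters and attributes of sklearn.decomposition.TruncatedSVD:
#   • n_components : int, default = 2 — desired dimensionality of the output data
#   • algorithm : string, default = "randomized" — SVD solver to use ("arpack" or "randomized")
#   • n_iter : int, optional (default 5) — number of iterations for the randomized SVD solver
#   • random_state : int, RandomState instance or None, optional, default = None — seed for reproducible results
#   • tol : float, optional — tolerance for the ARPACK solver; ignored by the randomized solver
#   • components_ : array, shape (n_components, n_features) — right singular vectors of the input data (available after fitting)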

#   • explained_variance_ : array, shape (n_components,)
#     The variance of the training samples transformed by a projection to each component.

#   • explained_variance_ratio_ : array, shape (n_components,)
#     Percentage of variance explained by each of the selected components.

#   • singular_values_ : array, shape (n_components,)
#     The singular values corresponding to each of the selected components. The singular values are equal to the 2-norms of the n_components variables in the lower-dimensional space.

def reduce_to_k_dim(M, k=2):
    """Reduce a co-occurrence count matrix of dimensionality
    (num_corpus_words, num_corpus_words) to a matrix of dimensionality
    (num_corpus_words, k) using the following SVD function from Scikit-Learn:
        - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

    Params:
        M (numpy matrix of shape (number of corpus words, number of corpus words)):
            co-occurrence matrix of word counts
        k (int): embedding size of each word after dimension reduction
    Return:
        M_reduced (numpy matrix of shape (number of corpus words, k)):
            matrix of k-dimensional word embeddings.
            In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10     # Number of iterations passed to `TruncatedSVD`
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    # fit_transform both fits the decomposition and projects M onto the top-k
    # components, yielding one k-dimensional row per word (U * S).
    # Reading `components_` on an unfitted estimator would raise, and its
    # transpose is V (n_features x k), not the embedding matrix.
    M_reduced = svd.fit_transform(M)

    return M_reduced

# Question 1.4: Implement plot_embeddings

def plot_embeddings(M_reduced, word2Ind, words):
    """Plot in a scatterplot the embeddings of the words specified in the list "words".

    NOTE: do not plot all the words listed in M_reduced / word2Ind.
    Include a label next to each point.

    The points are drawn with pyplot's implicit current figure; this function
    does not call ``plt.show()`` itself.

    Params:
        M_reduced (numpy matrix of shape (number of unique words in the corpus, k)):
            matrix of k-dimensional word embeddings
        word2Ind (dict): dictionary that maps word to indices for matrix M
        words (list of strings): words whose embeddings we want to visualize
    """
    for word in words:
        row = M_reduced[word2Ind[word]]
        # Only the first two embedding dimensions are plotted.
        x, y = row[0], row[1]
        plt.scatter(x, y, marker='x')
        # Label the point so individual words can be told apart.
        plt.text(x, y, word, fontsize=9)

# Question 2.1: Word2Vec Plot Analysis

# Words whose embeddings are plotted for this question.
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
# NOTE(review): M_reduced and word2Ind are not defined in this excerpt —
# presumably computed in an earlier cell; confirm before running.
plot_embeddings(M_reduced, word2Ind, words)

# Question 2.2: Polysemous Words


# Question 2.3: Synonyms & Antonyms

# Compare the distance from w1 to a synonym (w2) with the distance from w1
# to an antonym (w3).
w1 = "happy"
w2 = "cheerful"
w3 = "sad"
# NOTE(review): wv_from_bin is not defined in this excerpt — presumably the
# word-vector model loaded in an earlier cell; confirm.
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)

# Report both values; the output labels them as cosine distances.
print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))

# Question 2.4: Finding Analogies

# Analogy query: 'woman' and 'king' are passed as positive words and 'man' as
# the negative word; the returned result is pretty-printed.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'king'], negative=['man']))

# Question 2.5: Incorrect Analogy

# Analogy query used as the "incorrect analogy" example: 'woman' and 'him'
# are positive words, 'man' is the negative word.
pprint.pprint(wv_from_bin.most_similar(positive=['woman','him'], negative=['man']))

# Question 2.6: Guided Analysis of Bias in Word Vectors

# Run the same 'boss' query in both directions: first with 'woman' positive
# and 'man' negative, then with the two gender terms swapped, so the two
# result lists can be compared.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))

# Question 2.7: Independent Analysis of Bias in Word Vectors

# Run the 'doctor' query in both directions so the two result lists can be
# compared: first with 'woman' positive and 'man' negative, then with the
# gender terms swapped. The second call previously duplicated the first.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'doctor'], negative=['man']))
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'doctor'], negative=['woman']))
