In [1]:
import pandas as pd
import numpy as np
import gensim
from fuzzywuzzy import fuzz
from nltk.corpus import stopwords
from nltk import word_tokenize
from tqdm import tqdm_notebook
from scipy.stats import skew, kurtosis
from scipy.spatial.distance import cosine, cityblock, jaccard, canberra, euclidean, minkowski, braycurtis
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

stop_words = stopwords.words('english')
C:\Users\ljt\Anaconda3\lib\site-packages\fuzzywuzzy\fuzz.py:35: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
  warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
In [2]:
df = pd.read_csv('quora_train.csv')
df = df.dropna(how="any").reset_index(drop=True)
df.head(5)
Out[2]:
| | id | qid1 | qid2 | question1 | question2 | is_duplicate |
|---|---|---|---|---|---|---|
| 0 | 0 | 1 | 2 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 |
| 1 | 1 | 3 | 4 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 |
| 2 | 2 | 5 | 6 | How can I increase the speed of my internet co... | How can Internet speed be increased by hacking... | 0 |
| 3 | 3 | 7 | 8 | Why am I mentally very lonely? How can I solve... | Find the remainder when [math]23^{24}[/math] i... | 0 |
| 4 | 4 | 9 | 10 | Which one dissolve in water quikly sugar, salt... | Which fish would survive in salt water? | 0 |
In [3]:
a = 0
for i in range(a, a+10):
    print(df.question1[i])
    print(df.question2[i])
    print()
What is the step by step guide to invest in share market in india?
What is the step by step guide to invest in share market?

What is the story of Kohinoor (Koh-i-Noor) Diamond?
What would happen if the Indian government stole the Kohinoor (Koh-i-Noor) diamond back?

How can I increase the speed of my internet connection while using a VPN?
How can Internet speed be increased by hacking through DNS?

Why am I mentally very lonely? How can I solve it?
Find the remainder when [math]23^{24}[/math] is divided by 24,23?

Which one dissolve in water quikly sugar, salt, methane and carbon di oxide?
Which fish would survive in salt water?

Astrology: I am a Capricorn Sun Cap moon and cap rising...what does that say about me?
I'm a triple Capricorn (Sun, Moon and ascendant in Capricorn) What does this say about me?

Should I buy tiago?
What keeps childern active and far from phone and video games?

How can I be a good geologist?
What should I do to be a great geologist?

When do you use シ instead of し?
When do you use "&" instead of "and"?

Motorola (company): Can I hack my Charter Motorolla DCX3400?
How do I hack Motorola DCX3400 for free internet?
Word Mover's Distance (WMD) measures the dissimilarity between two text documents as the minimum cumulative distance that the embedded words of one document need to "travel" to reach the embedded words of the other document.
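Formally (in the formulation of Kusner et al., 2015, which Gensim's `wmdistance` implements), WMD solves a minimum-cost transportation problem between the two documents' normalized bag-of-words weights $d$ and $d'$, where the cost of moving mass from word $i$ to word $j$ is the Euclidean distance between their embeddings $x_i$ and $x_j$:

$$
\mathrm{WMD}(d, d') = \min_{T \ge 0} \sum_{i,j} T_{ij}\,\lVert x_i - x_j \rVert_2
\quad \text{s.t.} \quad \sum_j T_{ij} = d_i,\;\; \sum_i T_{ij} = d'_j
$$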
The process of obtaining word vectors with Word2Vec
In [12]:
question1 = 'What would a Trump presidency mean for current international master’s students on an F1 visa?'
question2 = 'How will a Trump presidency affect the students presently in US or planning to study in US?'

question1 = question1.lower().split()
question2 = question2.lower().split()

question1 = [w for w in question1 if w not in stop_words]
question2 = [w for w in question2 if w not in stop_words]
print(question1)
['would', 'trump', 'presidency', 'mean', 'current', 'international', 'master’s', 'students', 'f1', 'visa?']
We will be using word2vec embeddings pre-trained on the Google News corpus. We load these into a Gensim KeyedVectors model.
In [7]:
import gensim
from gensim.models import Word2Vec

model = gensim.models.KeyedVectors.load_word2vec_format('../../word2vec-GoogleNews-vectors/GoogleNews-vectors-negative300.bin.gz', binary=True)
Let's compute the WMD of these two sentences using the wmdistance method. These two sentences express the same meaning and are labeled as duplicates.
In [9]:
distance = model.wmdistance(question1, question2)
print('distance = %.4f' % distance)
distance = 1.8293
This question pair is labeled as duplicate, but the distance between the two sentences is fairly large. This brings us to normalized WMD.
When using the wmdistance method, it is beneficial to normalize the word2vec vectors first, so they all have unit length. To do this, simply call model.init_sims(replace=True) and Gensim will take care of that for you.
Usually, one measures the distance between two word2vec vectors using the cosine distance (see cosine similarity), which measures the angle between vectors. WMD, on the other hand, uses the Euclidean distance. The Euclidean distance between two vectors might be large because their lengths differ, but the cosine distance is small because the angle between them is small; we can mitigate some of this by normalizing the vectors.
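A minimal sketch of this effect, using made-up vectors rather than actual word2vec embeddings: two vectors pointing in the same direction but with different magnitudes have a large Euclidean distance yet near-zero cosine distance, and L2 normalization removes the magnitude component.

```python
import numpy as np
from scipy.spatial.distance import cosine, euclidean

u = np.array([1.0, 2.0, 3.0])
v = 10 * u  # same direction, 10x the magnitude

print(euclidean(u, v))  # large (~33.67), driven purely by the length difference
print(cosine(u, v))     # ~0.0, since the angle between u and v is zero

# After normalizing to unit length, the Euclidean distance collapses too.
u_n = u / np.linalg.norm(u)
v_n = v / np.linalg.norm(v)
print(euclidean(u_n, v_n))  # 0.0
```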
In [10]:
model.init_sims(replace=True)
distance = model.wmdistance(question1, question2)
print('normalized distance = %.4f' % distance)
normalized distance = 0.7589
After normalization, the distance became much smaller.
To put it in perspective, let's try one more pair. This time, the two questions are not duplicates.
In [14]:
question3 = 'Why am I mentally very lonely? How can I solve it?'
question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

question3 = question3.lower().split()
question4 = question4.lower().split()

question3 = [w for w in question3 if w not in stop_words]
question4 = [w for w in question4 if w not in stop_words]

distance = model.wmdistance(question3, question4)
print('distance = %.4f' % distance)
distance = 1.2637
In [15]:
model.init_sims(replace=True)
distance = model.wmdistance(question3, question4)
print('normalized distance = %.4f' % distance)
normalized distance = 1.2637
After normalization, the distance remains the same: init_sims(replace=True) already replaced the vectors with their normalized versions earlier, so the second call is a no-op and this pair's distance was computed on normalized vectors from the start. Either way, WMD rates the 2nd pair as far less similar than the 1st pair. It worked!
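A quick way to confirm this (a hypothetical snippet, using an arbitrary in-vocabulary word): after init_sims(replace=True), every stored vector has unit L2 norm, so re-running the normalization changes nothing.

```python
import numpy as np

# Any in-vocabulary word will do; 'computer' is assumed to be in Google News.
print(np.linalg.norm(model['computer']))  # ~1.0 after init_sims(replace=True)
```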
In [16]:
from gensim import corpora

documents = [question1, question2, question3, question4]
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(document) for document in documents]

# Convert the sentences into bag-of-words vectors: each (token_id, count)
# pair refers to a token in the dictionary built above.
question1 = dictionary.doc2bow(question1)
question2 = dictionary.doc2bow(question2)
question3 = dictionary.doc2bow(question3)
question4 = dictionary.doc2bow(question4)
question4
Out[16]:
[(20, 1), (21, 1), (22, 1), (23, 1), (24, 1)]
In [22]:
import gensim.downloader as api

w2v_model = api.load("glove-wiki-gigaword-50")
In [23]:
similarity_matrix = w2v_model.similarity_matrix(dictionary)
similarity_matrix
C:\Users\ljt\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: DeprecationWarning: Call to deprecated `similarity_matrix` (Method will be removed in 4.0.0, use gensim.models.keyedvectors.WordEmbeddingSimilarityIndex instead).
  if __name__ == '__main__':
Out[23]:
<25x25 sparse matrix of type '' with 47 stored elements in Compressed Sparse Column format>
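Soft cosine generalizes cosine similarity by letting the term-similarity matrix $S$ (computed above from the GloVe vectors) credit partial matches between different but related words. For bag-of-words vectors $a$ and $b$:

$$
\mathrm{soft\_cossim}(a, b) = \frac{a^\top S\, b}{\sqrt{a^\top S\, a}\,\sqrt{b^\top S\, b}}
$$

When $S$ is the identity matrix, this reduces to ordinary cosine similarity.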
In [24]:
from gensim.matutils import softcossim

similarity = softcossim(question1, question2, similarity_matrix)
print('similarity = %.4f' % similarity)
C:\Users\ljt\Anaconda3\lib\site-packages\ipykernel\__main__.py:3: DeprecationWarning: Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).
  app.launch_new_instance()
similarity = 0.4756
The similarity for the 1st pair is relatively large, which means soft cosine considers these two sentences very similar.
In [25]:
similarity = softcossim(question3, question4, similarity_matrix)
print('similarity = %.4f' % similarity)
C:\Users\ljt\Anaconda3\lib\site-packages\ipykernel\__main__.py:1: DeprecationWarning: Call to deprecated `softcossim` (Function will be removed in 4.0.0, use gensim.similarities.termsim.SparseTermSimilarityMatrix.inner_product instead).
  if __name__ == '__main__':
similarity = 0.0942
On the other hand, the similarity for the 2nd pair is very small, which means soft cosine considers this pair dissimilar.
We have covered some basics of fuzzy string matching in Python; let's take a quick peek at whether FuzzyWuzzy can help with our question dedupe problem.
In [26]:
from fuzzywuzzy import fuzz

question1 = 'What would a Trump presidency mean for current international master’s students on an F1 visa?'
question2 = 'How will a Trump presidency affect the students presently in US or planning to study in US?'

fuzz.ratio(question1, question2)
Out[26]:
51
In [27]:
fuzz.partial_token_set_ratio(question1, question2)
Out[27]:
100
In [28]:
question3 = 'Why am I mentally very lonely? How can I solve it?'
question4 = 'Find the remainder when [math]23^{24}[/math] is divided by 24,23?'

fuzz.ratio(question3, question4)
Out[28]:
7
In [29]:
fuzz.partial_token_set_ratio(question3, question4)
Out[29]:
32
FuzzyWuzzy does not think these two sentences have similar meaning. That's good.
The other features will be the word counts, the character counts, the number of words common to question1 and question2, and the length difference between question1 and question2.
In [6]:
def wmd(q1, q2):
    # Lowercase, tokenize on whitespace, and drop English stopwords.
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    stop_words = stopwords.words('english')
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return model.wmdistance(q1, q2)
In [7]:
def norm_wmd(q1, q2):
    # Same preprocessing as wmd(), but uses the L2-normalized model.
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    stop_words = stopwords.words('english')
    q1 = [w for w in q1 if w not in stop_words]
    q2 = [w for w in q2 if w not in stop_words]
    return norm_model.wmdistance(q1, q2)
In [8]:
def sent2vec(s):
    # Tokenize, then drop stopwords and non-alphabetic tokens.
    words = str(s).lower()
    words = word_tokenize(words)
    words = [w for w in words if w not in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(model[w])
        except KeyError:
            # Skip words missing from the word2vec vocabulary.
            continue
    M = np.array(M)
    # Sum the word vectors and L2-normalize. If no word had a vector,
    # the division is 0/0 and yields nan (hence the RuntimeWarning later).
    v = M.sum(axis=0)
    return v / np.sqrt((v ** 2).sum())
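As a quick sanity check (a hypothetical snippet, assuming the Google News model has been loaded as `model`, as in the cells below), sent2vec should return a unit-length 300-dimensional vector for any question with at least one in-vocabulary word:

```python
v = sent2vec('What is the step by step guide to invest in share market?')
print(v.shape)                  # (300,)
print(np.sqrt((v ** 2).sum()))  # ~1.0 -- the summed vector is L2-normalized
```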
In [9]:
df.drop(['id', 'qid1', 'qid2'], axis=1, inplace=True)
In [10]:
df['len_q1'] = df.question1.apply(lambda x: len(str(x)))
df['len_q2'] = df.question2.apply(lambda x: len(str(x)))
df['diff_len'] = df.len_q1 - df.len_q2
# len_char_q* counts distinct non-space characters, not total length.
df['len_char_q1'] = df.question1.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_char_q2'] = df.question2.apply(lambda x: len(''.join(set(str(x).replace(' ', '')))))
df['len_word_q1'] = df.question1.apply(lambda x: len(str(x).split()))
df['len_word_q2'] = df.question2.apply(lambda x: len(str(x).split()))
df['common_words'] = df.apply(lambda x: len(set(str(x['question1']).lower().split()).intersection(set(str(x['question2']).lower().split()))), axis=1)
df['fuzz_ratio'] = df.apply(lambda x: fuzz.ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_ratio'] = df.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_set_ratio'] = df.apply(lambda x: fuzz.partial_token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_partial_token_sort_ratio'] = df.apply(lambda x: fuzz.partial_token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_set_ratio'] = df.apply(lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1)
df['fuzz_token_sort_ratio'] = df.apply(lambda x: fuzz.token_sort_ratio(str(x['question1']), str(x['question2'])), axis=1)
In [11]:
df.head(2)
Out[11]:
| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 |
In [12]:
model = gensim.models.KeyedVectors.load_word2vec_format('./word2Vec_models/GoogleNews-vectors-negative300.bin.gz', binary=True)
df['wmd'] = df.apply(lambda x: wmd(x['question1'], x['question2']), axis=1)
In [13]:
df.head(2)
Out[13]:
| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio | wmd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 | 0.564615 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 | 3.772346 |
In [14]:
norm_model = gensim.models.KeyedVectors.load_word2vec_format('./word2Vec_models/GoogleNews-vectors-negative300.bin.gz', binary=True)
norm_model.init_sims(replace=True)
df['norm_wmd'] = df.apply(lambda x: norm_wmd(x['question1'], x['question2']), axis=1)
In [15]:
df.head(2)
Out[15]:
| | question1 | question2 | is_duplicate | len_q1 | len_q2 | diff_len | len_char_q1 | len_char_q2 | len_word_q1 | len_word_q2 | common_words | fuzz_ratio | fuzz_partial_ratio | fuzz_partial_token_set_ratio | fuzz_partial_token_sort_ratio | fuzz_token_set_ratio | fuzz_token_sort_ratio | wmd | norm_wmd |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | What is the step by step guide to invest in sh... | What is the step by step guide to invest in sh... | 0 | 66 | 57 | 9 | 20 | 20 | 14 | 12 | 10 | 93 | 98 | 100 | 89 | 100 | 93 | 0.564615 | 0.217555 |
| 1 | What is the story of Kohinoor (Koh-i-Noor) Dia... | What would happen if the Indian government sto... | 0 | 51 | 88 | -37 | 21 | 29 | 8 | 13 | 4 | 65 | 73 | 100 | 75 | 86 | 63 | 3.772346 | 1.368796 |
In [16]:
question1_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question1.values)):
    question1_vectors[i, :] = sent2vec(q)

question2_vectors = np.zeros((df.shape[0], 300))
for i, q in enumerate(tqdm_notebook(df.question2.values)):
    question2_vectors[i, :] = sent2vec(q)
C:\Users\SusanLi\AppData\Local\Continuum\anaconda3\lib\site-packages\ipykernel_launcher.py:14: RuntimeWarning: invalid value encountered in double_scalars
In [17]:
# nan vectors from sent2vec are replaced with zeros before computing distances.
df['cosine_distance'] = [cosine(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['cityblock_distance'] = [cityblock(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['jaccard_distance'] = [jaccard(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['canberra_distance'] = [canberra(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['euclidean_distance'] = [euclidean(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['minkowski_distance'] = [minkowski(x, y, 3) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]
df['braycurtis_distance'] = [braycurtis(x, y) for (x, y) in zip(np.nan_to_num(question1_vectors), np.nan_to_num(question2_vectors))]

df['skew_q1vec'] = [skew(x) for x in np.nan_to_num(question1_vectors)]
df['skew_q2vec'] = [skew(x) for x in np.nan_to_num(question2_vectors)]
df['kur_q1vec'] = [kurtosis(x) for x in np.nan_to_num(question1_vectors)]
df['kur_q2vec'] = [kurtosis(x) for x in np.nan_to_num(question2_vectors)]
C:\Users\SusanLi\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\spatial\distance.py:698: RuntimeWarning: invalid value encountered in double_scalars
  dist = 1.0 - uv / np.sqrt(uu * vv)
C:\Users\SusanLi\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\spatial\distance.py:853: RuntimeWarning: invalid value encountered in double_scalars
  dist = np.double(unequal_nonzero.sum()) / np.double(nonzero.sum())
C:\Users\SusanLi\AppData\Local\Continuum\anaconda3\lib\site-packages\scipy\spatial\distance.py:1138: RuntimeWarning: invalid value encountered in double_scalars
  return l1_diff.sum() / l1_sum.sum()
In [18]:
df['is_duplicate'].value_counts()
Out[18]:
0    255024
1    149263
Name: is_duplicate, dtype: int64
In [19]:
df.isnull().sum()
Out[19]:
question1                           0
question2                           0
is_duplicate                        0
len_q1                              0
len_q2                              0
diff_len                            0
len_char_q1                         0
len_char_q2                         0
len_word_q1                         0
len_word_q2                         0
common_words                        0
fuzz_ratio                          0
fuzz_partial_ratio                  0
fuzz_partial_token_set_ratio        0
fuzz_partial_token_sort_ratio       0
fuzz_token_set_ratio                0
fuzz_token_sort_ratio               0
wmd                                 0
norm_wmd                            0
cosine_distance                  1775
cityblock_distance                  0
jaccard_distance                  522
canberra_distance                   0
euclidean_distance                  0
minkowski_distance                  0
braycurtis_distance               522
skew_q1vec                          0
skew_q2vec                          0
kur_q1vec                           0
kur_q2vec                           0
dtype: int64
In [20]:
df.drop(['question1', 'question2'], axis=1, inplace=True)

# The null cosine/jaccard/braycurtis distances come from all-zero sentence
# vectors (questions whose tokens were all stopwords, non-alphabetic, or out
# of vocabulary), where these distances are undefined (0/0). Drop those rows.
df = df[pd.notnull(df['cosine_distance'])]
df = df[pd.notnull(df['jaccard_distance'])]
In [22]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
In [23]:
X = df.loc[:, df.columns != 'is_duplicate']
y = df.loc[:, df.columns == 'is_duplicate']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
In [24]:
import xgboost as xgb

# Note: eta is an alias for learning_rate, so specifying both is redundant.
model = xgb.XGBClassifier(max_depth=50, n_estimators=80, learning_rate=0.1,
                          colsample_bytree=.7, gamma=0, reg_alpha=4,
                          objective='binary:logistic', eta=0.3, silent=1,
                          subsample=0.8).fit(X_train, y_train.values.ravel())

prediction = model.predict(X_test)

cm = confusion_matrix(y_test, prediction)
print(cm)
print('Accuracy', accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))
[[60757 15121]
 [12054 32822]]
Accuracy 0.7749556950494394
              precision    recall  f1-score   support

           0       0.83      0.80      0.82     75878
           1       0.68      0.73      0.71     44876

   micro avg       0.77      0.77      0.77    120754
   macro avg       0.76      0.77      0.76    120754
weighted avg       0.78      0.77      0.78    120754