import pandas as pd
import os
import distance
import Levenshtein
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier
from numba import jit
from sklearn import metrics
from sklearn.model_selection import KFold
train = pd.read_csv('data/train.csv', sep='\t', header=None)
train.columns = ['q1', 'q2', 'label']
test = pd.read_csv('data/test.csv', sep='\t', header=None)
test.columns = ['q1', 'q2']
test['label'] = 1  # placeholder label so train and test can be concatenated
sample_submit = pd.read_csv('data/sample_submit.csv')  # load the submission template CSV
train.head()
train.info()
test.info()
train['label'].value_counts(normalize=True)
normalize : boolean, default False. If True, return proportions instead of raw counts.
sort : boolean, default True. Sort the result by frequency.
ascending : boolean, default False. Descending order by default.
bins : integer. Instead of counting exact values, group them into half-open bins (numeric data only).
dropna : boolean, default True. Exclude NaN values.
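For example, on a toy Series (not the competition data), normalize=True turns counts into proportions:
s = pd.Series([1, 1, 0, 1, 0])
s.value_counts()                 # counts: 1 -> 3, 0 -> 2
s.value_counts(normalize=True)   # proportions: 1 -> 0.6, 0 -> 0.4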
data = pd.concat([train, test], axis=0).reset_index(drop=True)
train_size = len(train)  # number of training rows
reset_index produces a new DataFrame or Series with the index reset to the default integer index. It is useful when the index should be treated as a column, or when it carries no meaning and must be reset before another operation; in machine-learning pipelines this is where you decide whether to keep the original index.
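A minimal illustration of what drop=True changes:
df = pd.DataFrame({'a': [10, 20]}, index=[7, 3])
df.reset_index()           # keeps the old index as a new 'index' column
df.reset_index(drop=True)  # discards the old index; rows get the default 0..n-1 index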
## map: applying a function to every element of a sequence
data['q1_len'] = data['q1'].astype(str).map(len)
data['q2_len'] = data['q2'].astype(str).map(len)
map() is a Python built-in higher-order function: it takes a function f and a sequence, applies f to every element in turn, and returns the results as a new sequence. Series.map does the same element-wise mapping on a pandas Series.
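A toy comparison of the built-in map and the Series.map method used above:
words = ['cat', 'horse']
list(map(len, words))               # [3, 5], built-in map
pd.Series(words).map(len).tolist()  # [3, 5], same idea on a pandas Series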
data['q1_len'].describe()  # descriptive statistics
data['q1q2_len_diff'] = data['q1_len'] - data['q2_len']  # signed length difference
data['q1q2_len_diff_abs'] = np.abs(data['q1_len'] - data['q2_len'])  # absolute length difference
data['q1q2_rate'] = data['q1_len'] / data['q2_len']  # length ratio q1/q2
data['q2q1_rate'] = data['q2_len'] / data['q1_len']  # length ratio q2/q1
data['q1_end_special'] = data['q1'].str.endswith('?').astype(int)  # does q1 end with '?'
data['q2_end_special'] = data['q2'].str.endswith('?').astype(int)  # does q2 end with '?'
data['comm_q1q2char_nums'] = data.apply(lambda row: len(set(row['q1']) & set(row['q2'])), axis=1)  # number of distinct characters shared by q1 and q2
lambda creates an anonymous function. set builds an unordered collection of unique elements; it supports membership tests, removes duplicates, and provides intersection, difference, and union operations.
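A small demonstration of these set operations on two short strings:
a, b = set('abcd'), set('cdef')
a & b  # intersection: {'c', 'd'}
a - b  # difference:   {'a', 'b'}
a | b  # union:        {'a', 'b', 'c', 'd', 'e', 'f'}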
def char_match_pos(q1, q2, pos_i):
    """Position (1-based) at which q1[pos_i] first occurs in the first 25 chars of q2."""
    q1 = list(q1)
    q2 = list(q2)
    if pos_i < len(q1):
        q_pos = 0  # default: no match found
        q2_len = min(len(q2), 25)  # only search the first 25 characters of q2
        for pos_j in range(q2_len):
            if q1[pos_i] == q2[pos_j]:
                q_pos = pos_j + 1  # matched: record the 1-based match position
                break
    else:
        q_pos = -1  # q1 has no character at this position
    return q_pos
for pos_i in range(8):
    data['q1_pos_' + str(pos_i + 1)] = data.apply(
        lambda row: char_match_pos(row['q1'], row['q2'], pos_i), axis=1).astype(np.int8)
data["q1_pos_1"]
print("===========距离特征 =============")
sim_func_dict = {"jaccard": distance.jaccard,
"sorensen": distance.sorensen,
"levenshtein": distance.levenshtein,
"ratio": Levenshtein.ratio
}
for sim_func in tqdm(sim_func_dict, desc="距离特征"):
data[sim_func] = data.apply(lambda row: sim_func_dict[sim_func](row["q1"],row["q2"]), axis=1)
qt = [[3, 3], [3, 5], [5, 5], [5, 10], [10, 10], [10, 15], [15, 15], [15, 25]]
for qt_len in qt:
if qt_len[0] == 3 and sim_func == "levenshtein":
pass
else:
data[sim_func + '_q' + str(qt_len[0]) + '_t' + str(qt_len[1])] = data.apply(
lambda row: sim_func_dict[sim_func](row["q1"][:qt_len[0]],
row["q2"][:qt_len[1]]),
axis=1)
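For reference, what the four functions return on a toy English pair (the pipeline itself runs them on the Chinese questions):
s1, s2 = 'hello', 'hallo'
distance.jaccard(s1, s2)      # Jaccard distance over character sets: 0.4
distance.sorensen(s1, s2)     # Sørensen-Dice distance over character sets: 0.25
distance.levenshtein(s1, s2)  # edit distance: 1
Levenshtein.ratio(s1, s2)     # normalized similarity: 0.8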
tqdm returns an iterable whose elements are those of the wrapped iterable; the returned object can also be used to update the progress-bar text.
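A minimal sketch matching how tqdm is used here:
pbar = tqdm(range(3), desc="distance features")
for i in pbar:
    pbar.set_description("step %d" % i)  # the returned object also controls the bar text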
import gensim
import jieba
from gensim.models import KeyedVectors
from gensim.models import word2vec
data['q1_words_list'] = data['q1'].apply(lambda x: [w for w in jieba.cut(x) if w])  # jieba word segmentation
data['q2_words_list'] = data['q2'].apply(lambda x: [w for w in jieba.cut(x) if w])
data["q1_words_list"]
sentences = data['q1_words_list'].values.tolist() + data['q2_words_list'].values.tolist()  # word2vec training corpus
len(sentences)
sentences[:3]
if not os.path.exists('models'):  # create the model directory if needed
    os.mkdir('models')
w2v_model = word2vec.Word2Vec(sentences,
                              vector_size=100, window=10, min_count=1, workers=4,
                              sg=1)  # skip-gram, 100-dimensional vectors
w2v_model.save('models/' + 'word2vec.model')
w2v_model.wv.save_word2vec_format('models/' + 'word2vec.txt', binary=False)
len(w2v_model.wv.index_to_key)  # vocabulary size
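Once trained, the embedding can be inspected directly; a sketch, assuming the token '天气' made it into the vocabulary:
w2v_model.wv['天气']                       # 100-dim vector for one token (hypothetical token)
w2v_model.wv.most_similar('天气', topn=3)  # nearest neighbours by cosine similarity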
from scipy.spatial.distance import cosine, cityblock, canberra, euclidean, \
minkowski, braycurtis, correlation, chebyshev, jensenshannon, mahalanobis, \
seuclidean, sqeuclidean
tqdm.pandas()  # enables progress_apply on pandas objects
# Compute a similarity/distance measure between the mean word vectors of two
# sentences; `num` selects the measure. Measures that need extra parameters
# (mahalanobis, seuclidean) raise inside the try and fall back to 0.
def get_w2v(query, title, num):
    def mean_vec(words):
        v = np.zeros(100)
        count = 0
        for w in words:
            if w in w2v_model.wv:
                v += w2v_model.wv[w]
                count += 1
        # if no word is in the vocabulary, return the zero vector instead of dividing by 0
        return v.tolist() if count == 0 else (v / count).tolist()

    query_vec = mean_vec(query)
    title_vec = mean_vec(title)
    try:
        if num == 1:
            return cosine(query_vec, title_vec)
        if num == 2:
            return canberra(query_vec, title_vec) / len(query_vec)
        if num == 3:
            return cityblock(query_vec, title_vec) / len(query_vec)
        if num == 4:
            return euclidean(query_vec, title_vec)    # Euclidean distance
        if num == 5:
            return braycurtis(query_vec, title_vec)
        if num == 6:
            return minkowski(query_vec, title_vec)    # Minkowski distance
        if num == 7:
            return correlation(query_vec, title_vec)  # correlation distance
        if num == 8:
            return chebyshev(query_vec, title_vec)    # Chebyshev distance
        if num == 9:
            return jensenshannon(query_vec, title_vec)
        if num == 10:
            return mahalanobis(query_vec, title_vec)  # needs an inverse covariance matrix
        if num == 11:
            return seuclidean(query_vec, title_vec)   # needs a variance vector
        if num == 12:
            return sqeuclidean(query_vec, title_vec)
    except Exception:
        return 0
# Word-vector similarity features: one column per measure (num 1..12 in get_w2v)
vec_feats = ['vec_cosine', 'vec_canberra', 'vec_cityblock', 'vec_euclidean',
             'vec_braycurtis', 'vec_minkowski', 'vec_correlation', 'vec_chebyshev',
             'vec_jensenshannon', 'vec_mahalanobis', 'vec_seuclidean', 'vec_sqeuclidean']
for num, feat in enumerate(vec_feats, start=1):
    data[feat] = data.progress_apply(
        lambda row, n=num: get_w2v(row['q1_words_list'], row['q2_words_list'], n), axis=1)
for feat in ['vec_cosine', 'vec_canberra', 'vec_cityblock', 'vec_euclidean',
             'vec_braycurtis', 'vec_correlation']:
    data[feat] = data[feat].astype('float32')
data['vec_cosine']
def w2v_sent2vec(words):
    """Sum the word vectors of a sentence and L2-normalize the result."""
    M = []
    for word in words:
        try:
            M.append(w2v_model.wv[word])
        except KeyError:
            continue
    if not M:  # no word is in the vocabulary: return a zero vector
        return np.zeros(100, dtype=np.float32).tolist()
    M = np.array(M)
    v = M.sum(axis=0)
    return (v / np.sqrt((v ** 2).sum())).astype(np.float32).tolist()
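Because the summed vector is divided by its L2 norm, every sentence vector has unit length; a quick check with toy 2-d vectors (not from the model):
M = np.array([[3.0, 0.0], [0.0, 4.0]])  # two toy word vectors
v = M.sum(axis=0)                       # [3, 4]
v / np.sqrt((v ** 2).sum())             # [0.6, 0.8], norm 1.0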
fea_names = ['q1_vec_{}'.format(i) for i in range(100)]
data[fea_names] = data.progress_apply(lambda row: w2v_sent2vec(row['q1_words_list']), result_type='expand', axis=1)
fea_names = ['q2_vec_{}'.format(i) for i in range(100)]
data[fea_names] = data.progress_apply(lambda row: w2v_sent2vec(row['q2_words_list']), result_type='expand', axis=1)
data.columns
no_feas = ['q1', 'q2', 'label', 'q1_words_list', 'q2_words_list']  # columns that are not model features
features = [col for col in data.columns if col not in no_feas]
train, test = data[:train_size], data[train_size:]
print(len(features))
print(features)
X = train[features]       # training inputs
y = train['label']        # training labels
X_test = test[features]   # test inputs
n_fold = 5
folds = KFold(n_splits=n_fold, shuffle=True, random_state=1314)
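A minimal sketch of what the KFold splitter yields: disjoint validation folds that together cover every sample exactly once.
kf = KFold(n_splits=3, shuffle=True, random_state=0)
for tr_idx, va_idx in kf.split(np.arange(6)):
    print(tr_idx, va_idx)  # each sample appears in exactly one validation fold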
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'num_leaves': 5,
    'max_depth': 6,
    'min_data_in_leaf': 450,
    'learning_rate': 0.1,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'lambda_l1': 1,
    'lambda_l2': 0.001,  # larger values mean stronger L2 regularization
    'min_gain_to_split': 0.2,
}
oof = np.zeros(len(X))              # out-of-fold predictions on the training set
prediction = np.zeros(len(X_test))  # test predictions, averaged over folds
for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
    X_train, X_valid = X[features].iloc[train_index], X[features].iloc[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]
    # the 'binary' objective in params overrides the regressor's default, so
    # predictions behave like probabilities that are thresholded below
    model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
    model.fit(X_train, y_train,
              eval_set=[(X_train, y_train), (X_valid, y_valid)],
              eval_metric='binary_logloss',
              verbose=50, early_stopping_rounds=200)
    y_pred_valid = model.predict(X_valid)
    y_pred = model.predict(X_test, num_iteration=model.best_iteration_)
    oof[valid_index] = y_pred_valid.reshape(-1, )
    prediction += y_pred
prediction /= n_fold
from sklearn.metrics import accuracy_score
y_pred = (oof > 0.5)  # threshold the out-of-fold scores at 0.5
# score = accuracy_score(np.round(abs(oof)), train['label'].values)
score = accuracy_score(y_pred, train['label'].values)
score
sub_pred = (prediction > 0.5).astype(int)
sample_submit['label'] = sub_pred
sample_submit[['label']].to_csv('lgb.csv', index=None)
sample_submit['label'].value_counts()
Notes to self: adapt this code as a template for similar tasks, work through the gensim word2vec library in full, and keep practicing with this notebook as a starting point (contribution matrix, gensim word2vec usage, lgb.LGBMRegressor).