This solution mainly uses the lightgbm library:
# coding: utf-8
# In[1]:
# -*- coding: utf-8 -*-
import sys
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# Merge the two CSV files into one; pairs with label == 1 are also written to true_value.txt
def combine(combine_file, filename1, filename2):
f = open("true_value.txt",'w')
len_merge_sum = 0
with open(combine_file, 'w') as fout:
with open(filename1, 'r') as f1:
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
fout.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
if int(label) == 1:
f.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
len_merge_sum += 1
with open(filename2, 'r') as f1:
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
fout.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
if int(label) == 1:
f.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
len_merge_sum += 1
fout.close()
f.close()
return combine_file, len_merge_sum
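# Both input files are expected to be tab-separated with four columns per line:
# "lineno<TAB>sentence1<TAB>sentence2<TAB>label". A usage sketch with the competition
# files referenced later in this script (kept commented out):
# combine_file, len_merge_sum = combine('merge_sum.csv',
#                                       'atec_nlp_sim_train.csv',
#                                       'atec_nlp_sim_train_add.csv')
# print combine_file, len_merge_sum # merged file name and number of positive pairs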
# In[2]:
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import pandas as pd
import numpy as np
import sys
import time
import re
import jieba
import io
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
def process_simi_stop(simiwords, stopwords, line):
for word, subword in simiwords.iteritems():
if word in line:
# print line
#line = re.sub(word, subword, line)
line = line.replace(word,subword)
# print subword
words1 = [w for w in jieba.cut(line) if w.strip()]
word1 = []
for i in words1:
if i not in stopwords:
word1.append(i)
return word1,line
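# A minimal usage sketch of process_simi_stop (the synonym pair and stop words below
# are made-up illustrations, not entries from simiwords.txt / chinese_stopwords.txt):
# simiwords_demo = {u'为什么': u'为何'} # rewrite a surface form to its canonical form
# stopwords_demo = [u'的', u'了'] # tokens to drop after segmentation
# tokens, normalized = process_simi_stop(simiwords_demo, stopwords_demo, u'为什么我的花呗不能用了')
# print '/'.join(tokens) # segmented tokens with stop words removed
# print normalized # the input line after synonym replacement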
def splitSentence(inputFile, inpath, segment, submit):
print u'word segmentation started:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
start = time.clock()
jieba.load_userdict("jieba_dict.txt")
corpus = []
simiwords = {}
with io.open("simiwords.txt", encoding='utf-8') as fr:
for line in fr:
words = re.split(",", line.strip())
simiwords[words[0]] = words[1]
stopwords = [] # stop words
fstop = open('chinese_stopwords.txt', 'r')
for eachWord in fstop:
stopwords.append(eachWord.strip())
# print eachWord.strip()
fin = open(inputFile, 'r') # open inputFile for reading
fin1 = open('sentences_1.txt','w')
for eachLine in fin:
# line = eachLine.strip() # 去除每行首尾可能出现的空格
# line = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", "", eachLine)
eachLine = re.sub("\*", " ", eachLine)
# jieba.del_word('年')
lineno, sen1, sen2, label = eachLine.strip().split('\t')
word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
fin1.write(sen_1)
fin1.write("\n")
fin1.write(sen_2)
fin1.write("\n")
# sen_11 = ' '.join(sen_1.decode('utf8'))
# sen_12 = sen_11.split(" ")
# for s in sen_12:
# if s != sen_12[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
# sen_21 = ' '.join(sen_2.decode('utf8'))
# sen_22 = sen_21.split(" ")
# for s in sen_22:
# if s != sen_22[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
corpus.append(word1)
corpus.append(word2)
print len(corpus)
with open(inpath, 'r') as fin2: # inpath
for eachLine in fin2:
eachLine = re.sub("\*", " ", eachLine)
if submit:
lineno, sen1, sen2 = eachLine.strip().split('\t')
#print "ceshijieshisha:", sen1, sen2
else:
lineno, sen1, sen2, label = eachLine.strip().split('\t') # in offline mode the test file still carries a label column
#print "ceshijieshisha:", sen1, sen2
word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
fin1.write(sen_1)
fin1.write("\n")
fin1.write(sen_2)
fin1.write("\n")
# sen_11 = ' '.join(sen_1.decode('utf8'))
# sen_12 = sen_11.split(" ")
# for s in sen_12:
# if s != sen_12[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
# sen_21 = ' '.join(sen_2.decode('utf8'))
# sen_22 = sen_21.split(" ")
# for s in sen_22:
# if s != sen_22[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
corpus.append(word1)
corpus.append(word2)
print len(corpus) # 204954
fin1.close()
with open(segment, 'w') as fs:
for word in corpus:
# print type(word)
for w in word:
# print w
fs.write(w) # 将分词好的结果写入到输出文件
fs.writelines(' ')
fs.write('\n')
end = time.clock()
print u'segmentation took (seconds):', end - start
return corpus
def filter_word_in_model(model, filename):
a = []
with open(filename, 'r') as file_to_read:
for line in file_to_read:
a.append(line)
sentences = [] # token lists read from the sentences file
for i in range(len(a)):
b = a[i].strip().split()
sentences.append(b)
print 'sentences length:', len(sentences)
new_sentences = [] # keep only the tokens that have a vector in the trained model
for i in range(len(sentences)):
new_sentence = []
for j in range(len(sentences[i])):
if sentences[i][j].decode('utf8') in model:
new_sentence.append(sentences[i][j])
new_sentences.append(new_sentence)
print 'new_sentences length: ', len(new_sentences)
# print(np.array(new_sentences).shape)
print u'new_sentences built at', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
with open('new_sentences.txt', 'w') as fs: # write new_sentences to disk
for word in new_sentences:
for w in word:
fs.write(w) # 将分词好的结果写入到输出文件
fs.writelines(' ')
fs.write('\n')
return new_sentences
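# Note: "token in model" and "model[token]" above use the older gensim lookup API that
# this Python 2 script was written against. A small helper (an assumption about newer
# gensim versions, not part of the original code) that works on both old and new gensim:
def in_vocab(model, token):
    try:
        return token in model.wv.key_to_index # gensim >= 4.0
    except AttributeError:
        return token in model # older gensim, as used in this script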
def eval_file(label1, pre):
tp, tn, fp, fn = 1e-7, 1e-7, 1e-7, 1e-7 # small smoothing terms so precision/recall never divide by zero
for la, pr in zip(label1, pre):
if la == 1 and pr == 1:
tp += 1
elif la == 1 and pr == 0:
fn += 1
elif la == 0 and pr == 0:
tn += 1
elif la == 0 and pr == 1:
fp += 1
recall = float(tp)/float(tp+fn)
precision = float(tp)/float(tp+fp)
f11 = 2*recall*precision/(recall+precision)
return f11
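# A quick sanity check for eval_file (a sketch; assumes scikit-learn is available):
# up to the tiny smoothing constants it should agree with sklearn's f1_score.
# from sklearn.metrics import f1_score
# y_demo = [1, 0, 1, 1, 0]
# p_demo = [1, 0, 0, 1, 1]
# print eval_file(y_demo, p_demo), f1_score(y_demo, p_demo) # both ~0.667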
def cos_Vector(x, y): # cosine similarity, rescaled to [0, 1]
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
num = (x * y.T)
num = float(num.sum())
if num == 0:
return 0
denom = np.linalg.norm(x) * np.linalg.norm(y)
if denom == 0:
return 0
cos = num / denom # cosine value
sim = 0.5 + 0.5 * cos # rescale from [-1, 1] to [0, 1]
return sim
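# Worked example of the rescaling to [0, 1] (note that the early "num == 0" branch
# returns 0 rather than 0.5 when the dot product is zero):
# print cos_Vector([1, 2], [2, 4]) # 1.0: parallel vectors, raw cosine 1
# print cos_Vector([1, 0], [1, 1]) # ~0.85: raw cosine ~0.707 -> 0.5 + 0.5*0.707
# print cos_Vector([1, 0], [0, 1]) # 0: dot product is 0, early return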
def vec_minus(x, y): # element-wise absolute difference
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
sim = abs(x-y)
return sim
def vec_multi(x, y): # element-wise product
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
sim1 = x * y
return sim1
def calEuclideanDistance(x, y):
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
dist = np.sqrt(np.sum(np.square(x - y)))
return dist
# overlap between two token lists: |A ∩ B| / min(|A|, |B|)
def cal_jaccard(list1, list2):
set1 = set(list1)
set2 = set(list2)
avg_len = (len(set1) + len(set2)) / 2
min_len = min(len(set1), len(set2))
# return len(set1 & set2) * 1.0 / (len(set1) + len(set2) - len(set1 & set2))
if min_len == 0:
return 0
else:
return len(set1 & set2) * 1.0 / min_len
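# Note: dividing by min_len makes this the overlap coefficient rather than the classic
# Jaccard index (the commented-out return above). Worked example:
# print cal_jaccard([u'a', u'b', u'c'], [u'b', u'c']) # 2 / 2 = 1.0 (overlap coefficient)
# # the classic Jaccard for the same pair would be 2 / 3 ~ 0.67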
def zishu(X,useStatus):
if useStatus:
return 1.0 / (1 + np.exp(-(X)));
else:
return (X);
# In[3]:
def fenge(input_file,out_file,out_file1):
f = open(input_file,'r')
f_1 = open(out_file,'w')
f_2 = open(out_file1,'w')
lines = f.readlines()
Row = len(lines)
D = int(Row*0.85)
for i in range(Row):
if i < D:
lineno, sen1, sen2, label = lines[i].strip().split('\t')
f_1.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
else:
lineno, sen1, sen2, label = lines[i].strip().split('\t')
f_2.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
f.close()
f_1.close()
f_2.close()
return D
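# fenge takes the first 85% of the merged file as training data and the rest as the
# offline test split, preserving file order. A shuffled variant (a sketch, closer in
# spirit to the train_test_split call used later; fenge_shuffled is a new helper name,
# not part of the original pipeline):
import random
def fenge_shuffled(input_file, out_file, out_file1, ratio=0.85, seed=21):
    lines = open(input_file, 'r').readlines()
    random.Random(seed).shuffle(lines) # deterministic shuffle
    d = int(len(lines) * ratio)
    open(out_file, 'w').writelines(lines[:d])
    open(out_file1, 'w').writelines(lines[d:])
    return d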
def feature_extraction(new_sentences,model,size):
vec_titles = [] # sentence vectors (mean of the word vectors)
for val in range(len(new_sentences)):
vec = np.zeros(shape=(1, size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
vec_titles.append(vec)
return vec_titles
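# feature_extraction averages the word vectors of a sentence into a single
# size-dimensional row. An equivalent numpy sketch (mean_sentence_vector is a new
# helper name; it assumes the same Python 2 "model[word]" lookup used above):
def mean_sentence_vector(tokens, model, size):
    vecs = [model[t.decode('utf8')] for t in tokens if t.decode('utf8') in model]
    if not vecs:
        return np.zeros((1, size)) # no known words: fall back to the zero vector
    return np.mean(vecs, axis=0).reshape(1, size)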
# In[4]:
'''
import threading
import time
gl_num = 0
vec_titles = []
lock = threading.RLock()
def feature_extraction(new_sentences,model,size,val_start):
lock.acquire()
global vec_titles
for val in range(val_start,len(new_sentences),4):
vec = np.zeros(shape=(1, size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
vec_titles.append(vec)
time.sleep(1)
print len(vec_titles)
lock.release()
thread_list = []
for i in range(4):
t = threading.Thread(target=feature_extraction,args = (new_sentences,model,size,i))
thread_list.append(t)
for t in thread_list:
t.start()
'''
# In[5]:
# import jieba.analyse
# def get_keyword(model, num_keywords, new_sentences):
# # 获取关键词
# content = open('new_sentences.txt', 'rb').read()
# jieba.analyse.set_stop_words('chinese_stopwords.txt')
# keywords = jieba.analyse.extract_tags(content, topK=num_keywords, withWeight=False, allowPOS=())
# print u'keywords长度:', len(keywords)
# # 获取在模型中的关键词
# keywords_in_model = []
# for i in range(len(keywords)):
# if keywords[i].decode('utf8') in model:
# keywords_in_model.append(keywords[i])
# print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # 计算每一个keywords中的一个词,与句子中所有词语最大的值
# keywords_indexes = [] # 20w * 2k
# for i in range(len(new_sentences)):
# keywords_million_value = []
# for val in range(num_keywords): # key_words 2k
# similar_values = []
# for j in range(len(new_sentences[i])): # 每一个title里的词
# try:
# value = model.similarity(new_sentences[i][j].decode('utf-8'), keywords_in_model[val].decode('utf-8'))
# similar_values.append(max(value, 0))
# except:
# print new_sentences[i][j]
# try:
# keywords_one_value = max(similar_values) # 得到第一个句子与第一词的相似度最大值
# except:
# keywords_one_value = 0
# print i
# keywords_million_value.append(keywords_one_value) # 1w个
# keywords_indexes.append(keywords_million_value)
# print np.array(keywords_indexes).shape
# # 每个标题 1w维向量!
# np.save("train_data_similar_vec.npy", keywords_indexes)
# print u'词袋生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# return keywords_indexes
# keywords_indexes = get_keyword(model,80,new_sentences)
# In[6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def get_tiidf_vec(filename):
# join the token lists into whitespace-separated strings, the format CountVectorizer expects
corpus = [' '.join(a) for a in filename]
stopword = [u' ']
#vectorizer = CountVectorizer(min_df=0,stop_words=stopword,token_pattern='(?u)\\b\\w+\\b') # 词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
#vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vectorizer = CountVectorizer(min_df=0,token_pattern=r"(?u)\W{1}|(?u)\b\w+\b")
result = vectorizer.fit_transform(corpus) # term-frequency matrix
transformer = TfidfTransformer() # computes the tf-idf weight of every term
tfidf = transformer.fit_transform(result) # fit_transform does the tf-idf calculation
vecs = [] # tf-idf values (unused below)
# hz = result.toarray()
weight = tfidf.toarray()
word = vectorizer.get_feature_names() # all terms in the bag-of-words vocabulary
print len(word)
# print hz.shape
print weight.shape
# return word,weight
# for i in range(len(weight)):#打印每类文本的tf-idf词语权重,第一个for遍历所有文本,第二个for便利某一类文本下的词语权重
# print u"-------这里输出第",i,u"类文本的词语tf-idf权重------"
# for j in range(len(word)):
# print word[j],weight[i][j]
tfidf_cos = []
hz_cos = []
jk_cos = []
for i in range(weight.shape[0]/2):
# numerator = np.sum(np.min(hz[2*i:2*i+2,:], axis=0))
# denominator = np.sum(np.max(hz[2*i:2*i+2,:], axis=0))
value = np.dot(weight[2*i], weight[2*i+1])
# value_hz = np.dot(hz[2*i], hz[2*i+1]) / (norm(hz[2*i]) * norm(hz[2*i+1]))
value = 0.5 + 0.5 * value
# value_hz = 0.5 + 0.5 * value_hz
tfidf_cos.append([value])
# hz_cos.append([value_hz])
# jk_cos.append([1.0 * numerator / denominator])
return tfidf_cos,hz_cos,jk_cos
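# Note on the cosine above: TfidfTransformer L2-normalizes each row by default
# (norm='l2'), so the plain dot product np.dot(weight[2*i], weight[2*i+1]) inside
# get_tiidf_vec already equals the cosine similarity of the two sentences. As a
# cross-check (a sketch only, using the sparse rows before .toarray()):
# from sklearn.metrics.pairwise import cosine_similarity
# # cosine_similarity(tfidf[2*i], tfidf[2*i+1])[0][0] == np.dot(weight[2*i], weight[2*i+1])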
# word_list,tfidf = get_tiidf_vec(new_sentences)
# In[5]:
# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split # 随机分割
# from scipy.linalg import norm
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf-8')
# # 合并两个csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
# SUBMIT = False
# if SUBMIT:
# inpath, outpath = sys.argv[1], sys.argv[2]
# testpath = combine_file
# test_num = len_merge_sum
# else:
# num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
# inpath, outpath = 'merge_test.csv', 'output.csv'
# testpath = 'merge_train.csv'
# #test_num = 92228
# test_num = 87105
# # inpath, outpath = 'empty.csv', 'output.csv'
# # # testpath = 'merge_sum.csv'
# # # test_num = 92228
# filename = 'sentences.txt'
# corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba 分词
# print u'语料corpus生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # # 训练词向量模型
# sentences = word2vec.Text8Corpus('sentences.txt')
# model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# model.save('result') # save
# size = 100 # model_train size
# print u'词向量训练完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出当前时间
# # 导入model
# model = word2vec.Word2Vec.load('result')
# new_sentences = filter_word_in_model(model, filename)
# print "开始计算特征向量"
# tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# # feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
# vec_titles = [] # 获取句子的向量
# max_titles = []
# for val in range(len(new_sentences)):
# vec = np.zeros(shape=(1, size))
# mat = np.zeros(shape=(30, size))
# for i in range(len(new_sentences[val])):
# print len(new_sentences[val])
# if i < 30:
# vec += model[new_sentences[val][i].decode('utf8')]
# mat[i] = model[new_sentences[val][i].decode('utf8')]
# if len(new_sentences[val]):
# vec = vec/len(new_sentences[val])
# vec_titles.append(vec)
# max_titles.append(mat)
# print "计算特征向量完毕"
# #vec_titles = feature_extraction(new_sentences,model,size)
# vec_titles = list(map(lambda x: x[0], vec_titles)) # 去掉外部的[], 获得title 的向量形式
# print(np.array(max_titles).shape)
# np.save("train_data_title_vec.npy", vec_titles)
# np.save("train_data_title_max.npy", max_titles)
# print u'生成train_data_title_vec完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出时间
# trains = np.load('train_data_title_max.npy')
# In[7]:
# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split # 随机分割
# from scipy.linalg import norm
# # from keras.datasets import mnist
# # from keras.models import Sequential
# # from keras.layers import Dense, Dropout, Activation, Flatten
# # from keras.layers import Convolution2D, MaxPooling2D
# # from keras.utils import np_utils
# # from keras import backend as K
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf-8')
# # 合并两个csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
# SUBMIT = False
# if SUBMIT:
# inpath, outpath = sys.argv[1], sys.argv[2]
# testpath = combine_file
# test_num = len_merge_sum
# else:
# num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
# inpath, outpath = 'merge_test.csv', 'output.csv'
# testpath = 'merge_train.csv'
# #test_num = 92228
# test_num = 87105
# # inpath, outpath = 'empty.csv', 'output.csv'
# # # testpath = 'merge_sum.csv'
# # # test_num = 92228
# filename = 'sentences.txt'
# corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba 分词
# print u'语料corpus生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # # 训练词向量模型
# sentences = word2vec.Text8Corpus('sentences.txt')
# model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# model.save('result') # save
# size = 100 # model_train size
# print u'词向量训练完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出当前时间
# # 导入model
# model = word2vec.Word2Vec.load('result')
# new_sentences = filter_word_in_model(model, filename)
# print "开始计算特征向量"
# # tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# # feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
# vec_titles = [] # 获取句子的向量
# max_titles = []
# for val in range(len(new_sentences)/2):
# mat = np.zeros(shape=(50, size))
# for i in range(len(new_sentences[2*val])):
# if i < 25:
# mat[i] = model[new_sentences[2*val][i].decode('utf8')]
# for i in range(len(new_sentences[2*val+1])):
# if i < 25:
# mat[i+25] = model[new_sentences[2*val+1][i].decode('utf8')]
# max_titles.append(mat)
# print "计算特征向量完毕"
# #vec_titles = feature_extraction(new_sentences,model,size)
# # vec_titles = list(map(lambda x: x[0], vec_titles)) # 去掉外部的[], 获得title 的向量形式
# print (np.array(max_titles).shape)
# np.save("train_data_title_max.npy", max_titles)
# print u'生成train_data_title_vec完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出时间
# trains = np.load('train_data_title_max.npy')
# # nb_filters = 28
# # # size of pooling area for max pooling
# # pool_size = (2, 2)
# # # convolution kernel size
# # kernel_size = (3, 100)
# # input_shape = (img_rows, img_cols, 1)
# # model = Sequential()
# # """
# # model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
# # border_mode='same',
# # input_shape=input_shape))
# # """
# # model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]),
# # padding='same',
# # input_shape=input_shape)) # 卷积层1
# # model.add(Activation('relu')) #激活层
# # model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]))) #卷积层2
# # model.add(Activation('relu')) #激活层
# # model.add(MaxPooling2D(pool_size=pool_size)) #池化层
# # model.add(Dropout(0.25)) #神经元随机失活
# # model.add(Flatten()) #拉成一维数据
# # model.add(Dense(128)) #全连接层1
# # model.add(Activation('relu')) #激活层
# # model.add(Dropout(0.5)) #随机失活
# # model.add(Dense(nb_classes)) #全连接层2
# # model.add(Activation('softmax')) #Softmax评分
# # #编译模型
# # model.compile(loss='categorical_crossentropy',
# # optimizer='adadelta',
# # metrics=['accuracy'])
# # #训练模型
# # model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
# # verbose=1, validation_data=(X_test, Y_test))
# In[8]:
# print trains[0]
# In[28]:
# -*- coding: utf-8 -*-
import numpy as np
import sys
import time
from gensim.models import word2vec
import lightgbm as lgb
from sklearn.model_selection import train_test_split # 随机分割
from scipy.linalg import norm
# import bm25
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
# merge the two CSV files into one
filename1 = 'atec_nlp_sim_train.csv'
filename2 = 'atec_nlp_sim_train_add.csv'
combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
if __name__ == '__main__':
SUBMIT = True
if SUBMIT:
inpath, outpath = sys.argv[1], sys.argv[2]
testpath = combine_file
test_num = len_merge_sum
else:
num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
inpath, outpath = 'merge_test.csv', 'output.csv'
testpath = 'merge_train.csv'
#test_num = 92228
test_num = 87105
# inpath, outpath = 'empty.csv', 'output.csv'
# # testpath = 'merge_sum.csv'
# # test_num = 92228
filename = 'sentences.txt'
corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba word segmentation
print u'corpus generated:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # train the word2vec model
sentences = word2vec.Text8Corpus('sentences.txt')
model = word2vec.Word2Vec(sentences, sg=1, size=120, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
model.save('result') # save
size = 120 # vector size; must match the size used to train the model
print u'word2vec training finished', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
# load the model back
model = word2vec.Word2Vec.load('result')
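# Version note (an assumption about newer libraries, not part of the original post):
# the Word2Vec call above uses the gensim 3.x API that this Python 2 script targets.
# Under gensim >= 4.0 the equivalent call would be roughly:
# model = word2vec.Word2Vec(sentences, sg=1, vector_size=120, window=5,
#                           min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# and word lookups would go through model.wv[token] instead of model[token].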
new_sentences = filter_word_in_model(model, filename)
print "开始计算特征向量"
tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
vec_titles = [] # 获取句子的向量
value_eig = []
v_max = []
v_v = [0,0,0]
for val in range(len(new_sentences)):
vec = np.zeros(shape=(1, size))
# mat = np.zeros(shape=(len(new_sentences[val]), size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
# mat[i] = model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
# a,b,c = np.linalg.svd(mat)
# b_l = list(b)
# b_s = sorted(b_l)
# b_1 = np.max(b)
# v_max.append(b_s[-1])
# if len(b_s) < 2:
# v_max.append(0)
# else:
# v_max.append(b_s[-2])
# if len(b_s) < 3:
# v_max.append(0)
# else:
# v_max.append(b_s[-3])
vec_titles.append(vec)
# if len(v_max) == 6:
# v_v[0] = abs(v_max[3]-v_max[0])
# v_v[1] = abs(v_max[4]-v_max[1])
# v_v[2] = abs(v_max[5]-v_max[2])
# value_eig.append(v_v)
# v_max = []
print "计算特征向量完毕"
# vec_titles = []
# for val in range(len(new_sentences)):
# vec = np.zeros(shape=(len(new_sentences[val]), size))
# for i in range(len(new_sentences[val])):
# vec[i] = model[new_sentences[val][i].decode('utf8')]
# if len(new_sentences[val]):
# V = np.dot(vec.transpose(),vec)
# a,b = np.linalg.eig(V)
# vec1 = np.zeros(shape = (size,1))
# for k in range(a.shape[0]):
# if a[k] > 0.1:
# vec1 = vec1+(a[k]*b[:,k]).reshape(size,1)
# print len(vec_titles)
# vec_titles.append(abs(vec1).reshape(1,size))
#vec_titles = feature_extraction(new_sentences,model,size)
vec_titles = list(map(lambda x: x[0], vec_titles)) # strip the outer [] so each title becomes a flat size-dimensional vector
print(np.array(vec_titles).shape)
np.save("train_data_title_vec.npy", vec_titles)
print u'train_data_title_vec saved', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
trains = np.load('train_data_title_vec.npy')
# bm_score = np.load('bm_score.npy')
# size = 100
new_sentences = []
with open('new_sentences.txt', 'r') as f:
for eachLine in f:
word = eachLine.decode('utf8').strip().split(' ')
new_sentences.append(word)
# scalar (one-dimensional) features
# 6: Jaccard-style overlap of the two token lists
J_dist = []
for val in range(len(new_sentences) / 2):
j = cal_jaccard(new_sentences[2 * val], new_sentences[2 * val + 1])
J_dist.append(j)
# 1: cosine of the two sentence vectors, 2: Euclidean distance between them, 3: bm_score (disabled)
juzi = []
f1 = open('sentences_1.txt','r')
for eachLine in f1:
word = eachLine.decode('utf8').strip()
juzi.append(list(word))
distance = []
for i in range(len(juzi)/2):
j = cal_jaccard(juzi[2 * i], juzi[2 * i + 1])
distance.append([j])
f1.close()
cos_val = []
E_Dist = []
print 'train', len(trains)
for i in range(len(trains) / 2):
score1 = cos_Vector(trains[2 * i], trains[2 * i + 1])
cos_val += [score1]
score2 = calEuclideanDistance(trains[2 * i], trains[2 * i + 1])
E_Dist += [score2]
# stack two scalar features into one 2-column matrix
combine_feature1 = np.vstack((cos_val, J_dist)).transpose() # 2 features
np.save("cos_val.npy", cos_val), np.save('E_Dist.npy', E_Dist)
# 4,5: the two sentence lengths (2 columns), 6: length difference (dropped)
len_ = []
dif_length = []
for val in range(len(new_sentences) / 2):
a = [0.1*len(new_sentences[2 * val]), 0.1*len(new_sentences[2 * val + 1])]
# length features (scaled by 0.1)
len_.append(a)
b = abs(len(new_sentences[2 * val]) - len(new_sentences[2 * val + 1]))
# length-difference feature
dif_length.append(b)
# combine_feature2 = np.vstack((np.transpose(len_), J_dist)).transpose() # 3个
# combine_feature = np.hstack((combine_feature1, combine_feature2)) # 5个
# combine_feature2 = len_ # 2-column length feature (disabled)
combine_feature3 = tfidf_cos
combine_feature4 = distance
# combine_feature = np.vstack((np.transpose(combine_feature), J_dist)).transpose() # 6 features
combine_feature = np.hstack((combine_feature1, combine_feature3, combine_feature4)) # 4 scalar features in total
print combine_feature.shape
# number of scalar feature columns appended after the sentence vectors
dim_1_num = 4 # cosine, word-level Jaccard, tf-idf cosine, character-level Jaccard
# dim_1_num = 6 # when the extra length features are enabled
print 'combine_feature length:', len(combine_feature)
# build the high-dimensional features (size * feature_num columns)
feature1_val = []
feature2_val = []
feature3_val = []
feature4_val = []
feature5_val = []
feature6_val = []
print 'train', len(trains)
# trains is the matrix of sentence vectors, two consecutive rows per sentence pair
for i in range(len(trains)/2):
vec1 = trains[2*i]
vec2 = trains[2*i+1]
feature1_val.append(vec1)
feature2_val.append(vec2)
for i in range(len(trains)/2):
vec3 = vec_minus(trains[2*i], trains[2*i+1])
vec4 = vec_multi(trains[2*i], trains[2*i+1])
feature3_val.append(vec3)
feature4_val.append(vec4)
print 'feature3_val length:', len(feature3_val)
# feature_val = np.hstack((feature1_val, feature2_val, feature3_val, feature4_val, combine_feature)) # 400列///406
# concatenate everything into the final feature matrix feature_val
feature_val = np.hstack((feature1_val, feature2_val,combine_feature)) # feature_num*size + dim_1_num columns
#feature_val = combine_feature
np.save("feature_val.npy", feature_val)
# feature_num = 4
feature_num = 2 # number of size-dimensional sentence-vector blocks in the feature matrix
print u'features generated'
y_true = []
with open(testpath, 'r') as f, open(inpath, 'r') as fin:
for line in f.readlines():
pair_id, sen1, sen2, label = line.strip().split('\t')
label = int(label)
y_true += [label]
np.save('y_true.npy', y_true)
print 'y_true length', len(y_true)
# reload the data
y_true = np.load('y_true.npy')
# bm_score = np.load('bm_score.npy')
# bm_score_train = bm_score[:test_num]
# print 'bm_score_train length', len(bm_score_train)
# bm_score_test = bm_score[test_num:]
# bm_score = pd.Series([bm_score], index=['bm_score'])
cos_val = np.load('cos_val.npy')
cos_val_train = cos_val[:test_num]
cos_val_test = cos_val[test_num:]
feature_val = np.load('feature_val.npy')
feature_val_train = feature_val[:test_num]
feature_val_test = feature_val[test_num:]
# each row is one training example; the last column is the label
trains = np.vstack((np.transpose(feature_val_train), y_true)).transpose()
print np.array(trains).shape
print u"数据拆分"
train, val = train_test_split(trains, test_size=0.2, random_state=21)
print 'train length', len(train)
print 'val length', len(val)
print u"训练集"
y = [train[i][feature_num*size+dim_1_num] for i in range(len(train))] # 训练集标签
X = [train[i][:feature_num*size+dim_1_num] for i in range(len(train))] # 训练集特征矩阵
print u"验证集"
val_y = [val[i][feature_num*size+dim_1_num] for i in range(len(val))] # 验证集标签
val_X = [val[i][:feature_num*size+dim_1_num] for i in range(len(val))] # 验证集特征矩阵
print u"测试集"
tests = feature_val_test
# convert to the LightGBM Dataset format
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)
# start training
print u'setting parameters'
params = {
'num_threads' : '4',
'boosting_type': 'gbdt',
'boosting': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'learning_rate': 0.1,
'num_leaves': 25,
'max_depth': 3,
'max_bin': 10,
'min_data_in_leaf': 8,
'feature_fraction': 1,
'bagging_fraction': 0.7,
'bagging_freq': 5,
'lambda_l1': 0,
'lambda_l2': 0,
'min_split_gain': 0
}
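# Side note: 'boosting_type' and 'boosting' are aliases for the same LightGBM setting,
# so listing both is redundant (only one is needed). For reference, a roughly equivalent
# model with the sklearn-style wrapper (an alternative sketch, not what this script uses):
# clf = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', learning_rate=0.1,
#                          num_leaves=25, max_depth=3, max_bin=10, min_child_samples=8,
#                          subsample=0.7, subsample_freq=5, colsample_bytree=1.0,
#                          reg_alpha=0, reg_lambda=0, n_estimators=3000, n_jobs=4)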
print u"开始训练"
gbm = lgb.train(params, # 参数字典
lgb_train, # 训练集
num_boost_round=3000, # 迭代次数
valid_sets=lgb_eval, # 验证集
early_stopping_rounds=30) # 早停系数
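# Compatibility note (an assumption about newer LightGBM releases, not the version the
# author used): in LightGBM >= 4.0 the early_stopping_rounds keyword of lgb.train was
# replaced by callbacks, so the call above would become:
# gbm = lgb.train(params, lgb_train, num_boost_round=3000, valid_sets=lgb_eval,
#                 callbacks=[lgb.early_stopping(stopping_rounds=30)])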
# save the model
from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl')
print u"预测,并输出在 outpath"
preds_offline = gbm.predict(tests, num_iteration=gbm.best_iteration) # 输出概率
np.save('preds.npy', preds_offline)
if not SUBMIT:
N = 200
score_best = 0
with open('merge_test.csv', 'r') as f1:
y_true_10 = []
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
a = int(label)
y_true_10.append(a)
for thred in range(1,N): # threshold search: scan N evenly spaced thresholds and keep the one with the best F1
thred = thred * (np.max(preds_offline) - np.min(preds_offline)) / N + np.min(preds_offline)
pred = []
for i in range(len(preds_offline)):
if preds_offline[i] > thred:
pred.append(1)
else:
pred.append(0)
score = eval_file(y_true_10, pred)
if score > score_best:
score_best = score
thred_best = thred
print u'best threshold:', thred_best
for i in range(len(preds_offline)):
if preds_offline[i] > thred_best:
preds_offline[i] = 1
else:
preds_offline[i] = 0
print len(preds_offline)
f1_score = eval_file(y_true_10, preds_offline)
print 'F1 score is :' + str(f1_score)
fout = open(outpath,'w')
for t in preds_offline:
fout.write(str(t))
fout.write('\n')
fout.write('F1 score is :' + str(f1_score))
fout.close()
else:
with open(inpath, 'r') as fin, open(outpath, 'w') as fout:
line_id = []
for line in fin:
lineno, sen1, sen2 = line.strip().split('\t')
line_id.append(lineno)
for i in range(len(line_id)):
if preds_offline[i] >= 0.246:
fout.write(line_id[i] + '\t1\n')
else:
fout.write(line_id[i] + '\t0\n')
print u'finished', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
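# An alternative way to pick the decision threshold offline (a sketch, assuming the raw
# probabilities saved to preds.npy and the labels in y_true_10): scan the
# precision/recall curve instead of N evenly spaced candidate thresholds.
# from sklearn.metrics import precision_recall_curve
# probs = np.load('preds.npy')
# precision, recall, thresholds = precision_recall_curve(y_true_10, probs)
# f1_all = 2 * precision * recall / (precision + recall + 1e-12)
# print u'best threshold:', thresholds[np.argmax(f1_all[:-1])]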