This solution mainly uses the lightgbm library:
# coding: utf-8
# In[1]:
# -*- coding: utf-8 -*-
import sys
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# Merge the two CSV files into one; pairs with label == 1 are also written to true_value.txt
def combine(combine_file, filename1, filename2):
f = open("true_value.txt",'w')
len_merge_sum = 0
with open(combine_file, 'w') as fout:
with open(filename1, 'r') as f1:
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
fout.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
if int(label) == 1:
f.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
len_merge_sum += 1
with open(filename2, 'r') as f1:
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
fout.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
if int(label) == 1:
f.write(sen1 + '\t' + sen2 + '\t' + label + '\n')
len_merge_sum += 1
fout.close()
f.close()
return combine_file, len_merge_sum
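# Both input files are expected to be tab-separated with four columns per line:
# "lineno<TAB>sentence1<TAB>sentence2<TAB>label". A usage sketch with the competition
# files referenced later in this script (kept commented out):
# combine_file, len_merge_sum = combine('merge_sum.csv',
#                                       'atec_nlp_sim_train.csv',
#                                       'atec_nlp_sim_train_add.csv')
# print combine_file, len_merge_sum # merged file name and number of positive pairs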
# In[2]:
# -*- coding: utf-8 -*-
from gensim.models import word2vec
import pandas as pd
import numpy as np
import sys
import time
import re
import jieba
import io
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
def process_simi_stop(simiwords, stopwords, line):
for word, subword in simiwords.iteritems():
if word in line:
# print line
#line = re.sub(word, subword, line)
line = line.replace(word,subword)
# print subword
words1 = [w for w in jieba.cut(line) if w.strip()]
word1 = []
for i in words1:
if i not in stopwords:
word1.append(i)
return word1,line
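# A minimal usage sketch of process_simi_stop (the synonym pair and stop words below
# are made-up illustrations, not entries from simiwords.txt / chinese_stopwords.txt):
# simiwords_demo = {u'为什么': u'为何'} # rewrite a surface form to its canonical form
# stopwords_demo = [u'的', u'了'] # tokens to drop after segmentation
# tokens, normalized = process_simi_stop(simiwords_demo, stopwords_demo, u'为什么我的花呗不能用了')
# print '/'.join(tokens) # segmented tokens with stop words removed
# print normalized # the input line after synonym replacement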
def splitSentence(inputFile, inpath, segment, submit):
print u'word segmentation started:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
start = time.clock()
jieba.load_userdict("jieba_dict.txt")
corpus = []
simiwords = {}
with io.open("simiwords.txt", encoding='utf-8') as fr:
for line in fr:
words = re.split(",", line.strip())
simiwords[words[0]] = words[1]
stopwords = [] # stop words
fstop = open('chinese_stopwords.txt', 'r')
for eachWord in fstop:
stopwords.append(eachWord.strip())
# print eachWord.strip()
fin = open(inputFile, 'r') # open inputFile for reading
fin1 = open('sentences_1.txt','w')
for eachLine in fin:
# line = eachLine.strip() # 去除每行首尾可能出现的空格
# line = re.sub("[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……&*()]+", "", eachLine)
eachLine = re.sub("\*", " ", eachLine)
# jieba.del_word('年')
lineno, sen1, sen2, label = eachLine.strip().split('\t')
word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
fin1.write(sen_1)
fin1.write("\n")
fin1.write(sen_2)
fin1.write("\n")
# sen_11 = ' '.join(sen_1.decode('utf8'))
# sen_12 = sen_11.split(" ")
# for s in sen_12:
# if s != sen_12[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
# sen_21 = ' '.join(sen_2.decode('utf8'))
# sen_22 = sen_21.split(" ")
# for s in sen_22:
# if s != sen_22[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
corpus.append(word1)
corpus.append(word2)
print len(corpus)
with open(inpath, 'r') as fin2: # inpath
for eachLine in fin2:
eachLine = re.sub("\*", " ", eachLine)
if submit:
lineno, sen1, sen2 = eachLine.strip().split('\t')
#print "ceshijieshisha:", sen1, sen2
else:
lineno, sen1, sen2, label = eachLine.strip().split('\t') # in offline mode the test file still carries a label column
#print "ceshijieshisha:", sen1, sen2
word1,sen_1 = process_simi_stop(simiwords, stopwords, sen1)
word2,sen_2 = process_simi_stop(simiwords, stopwords, sen2)
fin1.write(sen_1)
fin1.write("\n")
fin1.write(sen_2)
fin1.write("\n")
# sen_11 = ' '.join(sen_1.decode('utf8'))
# sen_12 = sen_11.split(" ")
# for s in sen_12:
# if s != sen_12[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
# sen_21 = ' '.join(sen_2.decode('utf8'))
# sen_22 = sen_21.split(" ")
# for s in sen_22:
# if s != sen_22[-1]:
# fin1.write(s+" ")
# else:
# fin1.write(s)
# fin1.write('\n')
corpus.append(word1)
corpus.append(word2)
print len(corpus) # 204954
fin1.close()
with open(segment, 'w') as fs:
for word in corpus:
# print type(word)
for w in word:
# print w
fs.write(w) # 将分词好的结果写入到输出文件
fs.writelines(' ')
fs.write('\n')
end = time.clock()
print u'segmentation took (seconds):', end - start
return corpus
def filter_word_in_model(model, filename):
a = []
with open(filename, 'r') as file_to_read:
for line in file_to_read:
a.append(line)
sentences = [] # token lists read from the sentences file
for i in range(len(a)):
b = a[i].strip().split()
sentences.append(b)
print 'sentences length:', len(sentences)
new_sentences = [] # keep only the tokens that have a vector in the trained model
for i in range(len(sentences)):
new_sentence = []
for j in range(len(sentences[i])):
if sentences[i][j].decode('utf8') in model:
new_sentence.append(sentences[i][j])
new_sentences.append(new_sentence)
print 'new_sentences length: ', len(new_sentences)
# print(np.array(new_sentences).shape)
print u'new_sentences built at', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
with open('new_sentences.txt', 'w') as fs: # write new_sentences to disk
for word in new_sentences:
for w in word:
fs.write(w) # 将分词好的结果写入到输出文件
fs.writelines(' ')
fs.write('\n')
return new_sentences
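# Note: "token in model" and "model[token]" above use the older gensim lookup API that
# this Python 2 script was written against. A small helper (an assumption about newer
# gensim versions, not part of the original code) that works on both old and new gensim:
def in_vocab(model, token):
    try:
        return token in model.wv.key_to_index # gensim >= 4.0
    except AttributeError:
        return token in model # older gensim, as used in this script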
def eval_file(label1, pre):
tp, tn, fp, fn = 1e-7, 1e-7, 1e-7, 1e-7 # small smoothing terms so precision/recall never divide by zero
for la, pr in zip(label1, pre):
if la == 1 and pr == 1:
tp += 1
elif la == 1 and pr == 0:
fn += 1
elif la == 0 and pr == 0:
tn += 1
elif la == 0 and pr == 1:
fp += 1
recall = float(tp)/float(tp+fn)
precision = float(tp)/float(tp+fp)
f11 = 2*recall*precision/(recall+precision)
return f11
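# A quick sanity check for eval_file (a sketch; assumes scikit-learn is available):
# up to the tiny smoothing constants it should agree with sklearn's f1_score.
# from sklearn.metrics import f1_score
# y_demo = [1, 0, 1, 1, 0]
# p_demo = [1, 0, 0, 1, 1]
# print eval_file(y_demo, p_demo), f1_score(y_demo, p_demo) # both ~0.667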
def cos_Vector(x, y): # cosine similarity, rescaled to [0, 1]
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
num = (x * y.T)
num = float(num.sum())
if num == 0:
return 0
denom = np.linalg.norm(x) * np.linalg.norm(y)
if denom == 0:
return 0
cos = num / denom # cosine value
sim = 0.5 + 0.5 * cos # rescale from [-1, 1] to [0, 1]
return sim
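# Worked example of the rescaling to [0, 1] (note that the early "num == 0" branch
# returns 0 rather than 0.5 when the dot product is zero):
# print cos_Vector([1, 2], [2, 4]) # 1.0: parallel vectors, raw cosine 1
# print cos_Vector([1, 0], [1, 1]) # ~0.85: raw cosine ~0.707 -> 0.5 + 0.5*0.707
# print cos_Vector([1, 0], [0, 1]) # 0: dot product is 0, early return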
def vec_minus(x, y): # element-wise absolute difference
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
sim = abs(x-y)
return sim
def vec_multi(x, y): # element-wise product
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
x = np.array(x)
y = np.array(y)
sim1 = x * y
return sim1
def calEuclideanDistance(x, y):
if len(x) != len(y):
print u'error: x and y do not have the same dimension'
return
dist = np.sqrt(np.sum(np.square(x - y)))
return dist
# overlap between two token lists: |A ∩ B| / min(|A|, |B|)
def cal_jaccard(list1, list2):
set1 = set(list1)
set2 = set(list2)
avg_len = (len(set1) + len(set2)) / 2
min_len = min(len(set1), len(set2))
# return len(set1 & set2) * 1.0 / (len(set1) + len(set2) - len(set1 & set2))
if min_len == 0:
return 0
else:
return len(set1 & set2) * 1.0 / min_len
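# Note: dividing by min_len makes this the overlap coefficient rather than the classic
# Jaccard index (the commented-out return above). Worked example:
# print cal_jaccard([u'a', u'b', u'c'], [u'b', u'c']) # 2 / 2 = 1.0 (overlap coefficient)
# # the classic Jaccard for the same pair would be 2 / 3 ~ 0.67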
def zishu(X,useStatus):
if useStatus:
return 1.0 / (1 + np.exp(-(X)));
else:
return (X);
# In[3]:
def fenge(input_file,out_file,out_file1):
f = open(input_file,'r')
f_1 = open(out_file,'w')
f_2 = open(out_file1,'w')
lines = f.readlines()
Row = len(lines)
D = int(Row*0.85)
for i in range(Row):
if i < D:
lineno, sen1, sen2, label = lines[i].strip().split('\t')
f_1.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
else:
lineno, sen1, sen2, label = lines[i].strip().split('\t')
f_2.write(lineno + '\t' + sen1 + '\t' + sen2 + '\t' + label + '\n')
f.close()
f_1.close()
f_2.close()
return D
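# fenge takes the first 85% of the merged file as training data and the rest as the
# offline test split, preserving file order. A shuffled variant (a sketch, closer in
# spirit to the train_test_split call used later; fenge_shuffled is a new helper name,
# not part of the original pipeline):
import random
def fenge_shuffled(input_file, out_file, out_file1, ratio=0.85, seed=21):
    lines = open(input_file, 'r').readlines()
    random.Random(seed).shuffle(lines) # deterministic shuffle
    d = int(len(lines) * ratio)
    open(out_file, 'w').writelines(lines[:d])
    open(out_file1, 'w').writelines(lines[d:])
    return d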
def feature_extraction(new_sentences,model,size):
vec_titles = [] # sentence vectors (mean of the word vectors)
for val in range(len(new_sentences)):
vec = np.zeros(shape=(1, size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
vec_titles.append(vec)
return vec_titles
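# feature_extraction averages the word vectors of a sentence into a single
# size-dimensional row. An equivalent numpy sketch (mean_sentence_vector is a new
# helper name; it assumes the same Python 2 "model[word]" lookup used above):
def mean_sentence_vector(tokens, model, size):
    vecs = [model[t.decode('utf8')] for t in tokens if t.decode('utf8') in model]
    if not vecs:
        return np.zeros((1, size)) # no known words: fall back to the zero vector
    return np.mean(vecs, axis=0).reshape(1, size)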
# In[4]:
'''
import threading
import time
gl_num = 0
vec_titles = []
lock = threading.RLock()
def feature_extraction(new_sentences,model,size,val_start):
lock.acquire()
global vec_titles
for val in range(val_start,len(new_sentences),4):
vec = np.zeros(shape=(1, size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
vec_titles.append(vec)
time.sleep(1)
print len(vec_titles)
lock.release()
thread_list = []
for i in range(4):
t = threading.Thread(target=feature_extraction,args = (new_sentences,model,size,i))
thread_list.append(t)
for t in thread_list:
t.start()
'''
# In[5]:
# import jieba.analyse
# def get_keyword(model, num_keywords, new_sentences):
# # 获取关键词
# content = open('new_sentences.txt', 'rb').read()
# jieba.analyse.set_stop_words('chinese_stopwords.txt')
# keywords = jieba.analyse.extract_tags(content, topK=num_keywords, withWeight=False, allowPOS=())
# print u'keywords长度:', len(keywords)
# # 获取在模型中的关键词
# keywords_in_model = []
# for i in range(len(keywords)):
# if keywords[i].decode('utf8') in model:
# keywords_in_model.append(keywords[i])
# print time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # 计算每一个keywords中的一个词,与句子中所有词语最大的值
# keywords_indexes = [] # 20w * 2k
# for i in range(len(new_sentences)):
# keywords_million_value = []
# for val in range(num_keywords): # key_words 2k
# similar_values = []
# for j in range(len(new_sentences[i])): # 每一个title里的词
# try:
# value = model.similarity(new_sentences[i][j].decode('utf-8'), keywords_in_model[val].decode('utf-8'))
# similar_values.append(max(value, 0))
# except:
# print new_sentences[i][j]
# try:
# keywords_one_value = max(similar_values) # 得到第一个句子与第一词的相似度最大值
# except:
# keywords_one_value = 0
# print i
# keywords_million_value.append(keywords_one_value) # 1w个
# keywords_indexes.append(keywords_million_value)
# print np.array(keywords_indexes).shape
# # 每个标题 1w维向量!
# np.save("train_data_similar_vec.npy", keywords_indexes)
# print u'词袋生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# return keywords_indexes
# keywords_indexes = get_keyword(model,80,new_sentences)
# In[6]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def get_tiidf_vec(filename):
# join the token lists into whitespace-separated strings, the format CountVectorizer expects
corpus = [' '.join(a) for a in filename]
stopword = [u' ']
#vectorizer = CountVectorizer(min_df=0,stop_words=stopword,token_pattern='(?u)\\b\\w+\\b') # 词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
#vectorizer = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vectorizer = CountVectorizer(min_df=0,token_pattern=r"(?u)\W{1}|(?u)\b\w+\b")
result = vectorizer.fit_transform(corpus) # term-frequency matrix
transformer = TfidfTransformer() # computes the tf-idf weight of every term
tfidf = transformer.fit_transform(result) # fit_transform does the tf-idf calculation
vecs = [] # tf-idf values (unused below)
# hz = result.toarray()
weight = tfidf.toarray()
word = vectorizer.get_feature_names() # all terms in the bag-of-words vocabulary
print len(word)
# print hz.shape
print weight.shape
# return word,weight
# for i in range(len(weight)):#打印每类文本的tf-idf词语权重,第一个for遍历所有文本,第二个for便利某一类文本下的词语权重
# print u"-------这里输出第",i,u"类文本的词语tf-idf权重------"
# for j in range(len(word)):
# print word[j],weight[i][j]
tfidf_cos = []
hz_cos = []
jk_cos = []
for i in range(weight.shape[0]/2):
# numerator = np.sum(np.min(hz[2*i:2*i+2,:], axis=0))
# denominator = np.sum(np.max(hz[2*i:2*i+2,:], axis=0))
value = np.dot(weight[2*i], weight[2*i+1])
# value_hz = np.dot(hz[2*i], hz[2*i+1]) / (norm(hz[2*i]) * norm(hz[2*i+1]))
value = 0.5 + 0.5 * value
# value_hz = 0.5 + 0.5 * value_hz
tfidf_cos.append([value])
# hz_cos.append([value_hz])
# jk_cos.append([1.0 * numerator / denominator])
return tfidf_cos,hz_cos,jk_cos
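# Note on the cosine above: TfidfTransformer L2-normalizes each row by default
# (norm='l2'), so the plain dot product np.dot(weight[2*i], weight[2*i+1]) inside
# get_tiidf_vec already equals the cosine similarity of the two sentences. As a
# cross-check (a sketch only, using the sparse rows before .toarray()):
# from sklearn.metrics.pairwise import cosine_similarity
# # cosine_similarity(tfidf[2*i], tfidf[2*i+1])[0][0] == np.dot(weight[2*i], weight[2*i+1])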
# word_list,tfidf = get_tiidf_vec(new_sentences)
# In[5]:
# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split # 随机分割
# from scipy.linalg import norm
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf-8')
# # 合并两个csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
# SUBMIT = False
# if SUBMIT:
# inpath, outpath = sys.argv[1], sys.argv[2]
# testpath = combine_file
# test_num = len_merge_sum
# else:
# num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
# inpath, outpath = 'merge_test.csv', 'output.csv'
# testpath = 'merge_train.csv'
# #test_num = 92228
# test_num = 87105
# # inpath, outpath = 'empty.csv', 'output.csv'
# # # testpath = 'merge_sum.csv'
# # # test_num = 92228
# filename = 'sentences.txt'
# corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba 分词
# print u'语料corpus生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # # 训练词向量模型
# sentences = word2vec.Text8Corpus('sentences.txt')
# model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# model.save('result') # save
# size = 100 # model_train size
# print u'词向量训练完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出当前时间
# # 导入model
# model = word2vec.Word2Vec.load('result')
# new_sentences = filter_word_in_model(model, filename)
# print "开始计算特征向量"
# tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# # feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
# vec_titles = [] # 获取句子的向量
# max_titles = []
# for val in range(len(new_sentences)):
# vec = np.zeros(shape=(1, size))
# mat = np.zeros(shape=(30, size))
# for i in range(len(new_sentences[val])):
# print len(new_sentences[val])
# if i < 30:
# vec += model[new_sentences[val][i].decode('utf8')]
# mat[i] = model[new_sentences[val][i].decode('utf8')]
# if len(new_sentences[val]):
# vec = vec/len(new_sentences[val])
# vec_titles.append(vec)
# max_titles.append(mat)
# print "计算特征向量完毕"
# #vec_titles = feature_extraction(new_sentences,model,size)
# vec_titles = list(map(lambda x: x[0], vec_titles)) # 去掉外部的[], 获得title 的向量形式
# print(np.array(max_titles).shape)
# np.save("train_data_title_vec.npy", vec_titles)
# np.save("train_data_title_max.npy", max_titles)
# print u'生成train_data_title_vec完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出时间
# trains = np.load('train_data_title_max.npy')
# In[7]:
# # -*- coding: utf-8 -*-
# import numpy as np
# import sys
# import time
# from gensim.models import word2vec
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split # 随机分割
# from scipy.linalg import norm
# # from keras.datasets import mnist
# # from keras.models import Sequential
# # from keras.layers import Dense, Dropout, Activation, Flatten
# # from keras.layers import Convolution2D, MaxPooling2D
# # from keras.utils import np_utils
# # from keras import backend as K
# # import bm25
# stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
# reload(sys)
# sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
# sys.setdefaultencoding('utf-8')
# # 合并两个csv文件到
# filename1 = 'atec_nlp_sim_train.csv'
# filename2 = 'atec_nlp_sim_train_add.csv'
# combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
# if __name__ == '__main__':
# SUBMIT = False
# if SUBMIT:
# inpath, outpath = sys.argv[1], sys.argv[2]
# testpath = combine_file
# test_num = len_merge_sum
# else:
# num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
# inpath, outpath = 'merge_test.csv', 'output.csv'
# testpath = 'merge_train.csv'
# #test_num = 92228
# test_num = 87105
# # inpath, outpath = 'empty.csv', 'output.csv'
# # # testpath = 'merge_sum.csv'
# # # test_num = 92228
# filename = 'sentences.txt'
# corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba 分词
# print u'语料corpus生成完毕:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # # 训练词向量模型
# sentences = word2vec.Text8Corpus('sentences.txt')
# model = word2vec.Word2Vec(sentences, sg=1, size=100, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# model.save('result') # save
# size = 100 # model_train size
# print u'词向量训练完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出当前时间
# # 导入model
# model = word2vec.Word2Vec.load('result')
# new_sentences = filter_word_in_model(model, filename)
# print "开始计算特征向量"
# # tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# # feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
# vec_titles = [] # 获取句子的向量
# max_titles = []
# for val in range(len(new_sentences)/2):
# mat = np.zeros(shape=(50, size))
# for i in range(len(new_sentences[2*val])):
# if i < 25:
# mat[i] = model[new_sentences[2*val][i].decode('utf8')]
# for i in range(len(new_sentences[2*val+1])):
# if i < 25:
# mat[i+25] = model[new_sentences[2*val+1][i].decode('utf8')]
# max_titles.append(mat)
# print "计算特征向量完毕"
# #vec_titles = feature_extraction(new_sentences,model,size)
# # vec_titles = list(map(lambda x: x[0], vec_titles)) # 去掉外部的[], 获得title 的向量形式
# print (np.array(max_titles).shape)
# np.save("train_data_title_max.npy", max_titles)
# print u'生成train_data_title_vec完毕', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # 输出时间
# trains = np.load('train_data_title_max.npy')
# # nb_filters = 28
# # # size of pooling area for max pooling
# # pool_size = (2, 2)
# # # convolution kernel size
# # kernel_size = (3, 100)
# # input_shape = (img_rows, img_cols, 1)
# # model = Sequential()
# # """
# # model.add(Convolution2D(nb_filters, kernel_size[0], kernel_size[1],
# # border_mode='same',
# # input_shape=input_shape))
# # """
# # model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]),
# # padding='same',
# # input_shape=input_shape)) # 卷积层1
# # model.add(Activation('relu')) #激活层
# # model.add(Convolution2D(nb_filters, (kernel_size[0], kernel_size[1]))) #卷积层2
# # model.add(Activation('relu')) #激活层
# # model.add(MaxPooling2D(pool_size=pool_size)) #池化层
# # model.add(Dropout(0.25)) #神经元随机失活
# # model.add(Flatten()) #拉成一维数据
# # model.add(Dense(128)) #全连接层1
# # model.add(Activation('relu')) #激活层
# # model.add(Dropout(0.5)) #随机失活
# # model.add(Dense(nb_classes)) #全连接层2
# # model.add(Activation('softmax')) #Softmax评分
# # #编译模型
# # model.compile(loss='categorical_crossentropy',
# # optimizer='adadelta',
# # metrics=['accuracy'])
# # #训练模型
# # model.fit(X_train, Y_train, batch_size=batch_size, epochs=epochs,
# # verbose=1, validation_data=(X_test, Y_test))
# In[8]:
# print trains[0]
# In[28]:
# -*- coding: utf-8 -*-
import numpy as np
import sys
import time
from gensim.models import word2vec
import lightgbm as lgb
from sklearn.model_selection import train_test_split # 随机分割
from scipy.linalg import norm
# import bm25
stdi,stdo,stde=sys.stdin,sys.stdout,sys.stderr
reload(sys)
sys.stdin,sys.stdout,sys.stderr=stdi,stdo,stde
sys.setdefaultencoding('utf-8')
# merge the two CSV files into one
filename1 = 'atec_nlp_sim_train.csv'
filename2 = 'atec_nlp_sim_train_add.csv'
combine_file, len_merge_sum = combine('merge_sum.csv', filename1, filename2)
if __name__ == '__main__':
SUBMIT = True
if SUBMIT:
inpath, outpath = sys.argv[1], sys.argv[2]
testpath = combine_file
test_num = len_merge_sum
else:
num = fenge("merge_sum.csv","merge_train.csv","merge_test.csv")
inpath, outpath = 'merge_test.csv', 'output.csv'
testpath = 'merge_train.csv'
#test_num = 92228
test_num = 87105
# inpath, outpath = 'empty.csv', 'output.csv'
# # testpath = 'merge_sum.csv'
# # test_num = 92228
filename = 'sentences.txt'
corpus = splitSentence(testpath, inpath, filename, SUBMIT) # jieba word segmentation
print u'corpus generated:', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
# # train the word2vec model
sentences = word2vec.Text8Corpus('sentences.txt')
model = word2vec.Word2Vec(sentences, sg=1, size=120, window=5, min_count=5, negative=3, sample=0.001, hs=1, workers=4)
model.save('result') # save
size = 120 # vector size; must match the size used to train the model
print u'word2vec training finished', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
# load the model back
model = word2vec.Word2Vec.load('result')
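# Version note (an assumption about newer libraries, not part of the original post):
# the Word2Vec call above uses the gensim 3.x API that this Python 2 script targets.
# Under gensim >= 4.0 the equivalent call would be roughly:
# model = word2vec.Word2Vec(sentences, sg=1, vector_size=120, window=5,
#                           min_count=5, negative=3, sample=0.001, hs=1, workers=4)
# and word lookups would go through model.wv[token] instead of model[token].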
new_sentences = filter_word_in_model(model, filename)
print "开始计算特征向量"
tfidf_cos,hz_cos,jk_cos = get_tiidf_vec(new_sentences)
# feature_2 = np.hstack((tfidf_cos,hz_cos,jk_cos))
vec_titles = [] # 获取句子的向量
value_eig = []
v_max = []
v_v = [0,0,0]
for val in range(len(new_sentences)):
vec = np.zeros(shape=(1, size))
# mat = np.zeros(shape=(len(new_sentences[val]), size))
for i in range(len(new_sentences[val])):
vec += model[new_sentences[val][i].decode('utf8')]
# mat[i] = model[new_sentences[val][i].decode('utf8')]
if len(new_sentences[val]):
vec = vec/len(new_sentences[val])
# a,b,c = np.linalg.svd(mat)
# b_l = list(b)
# b_s = sorted(b_l)
# b_1 = np.max(b)
# v_max.append(b_s[-1])
# if len(b_s) < 2:
# v_max.append(0)
# else:
# v_max.append(b_s[-2])
# if len(b_s) < 3:
# v_max.append(0)
# else:
# v_max.append(b_s[-3])
vec_titles.append(vec)
# if len(v_max) == 6:
# v_v[0] = abs(v_max[3]-v_max[0])
# v_v[1] = abs(v_max[4]-v_max[1])
# v_v[2] = abs(v_max[5]-v_max[2])
# value_eig.append(v_v)
# v_max = []
print "计算特征向量完毕"
# vec_titles = []
# for val in range(len(new_sentences)):
# vec = np.zeros(shape=(len(new_sentences[val]), size))
# for i in range(len(new_sentences[val])):
# vec[i] = model[new_sentences[val][i].decode('utf8')]
# if len(new_sentences[val]):
# V = np.dot(vec.transpose(),vec)
# a,b = np.linalg.eig(V)
# vec1 = np.zeros(shape = (size,1))
# for k in range(a.shape[0]):
# if a[k] > 0.1:
# vec1 = vec1+(a[k]*b[:,k]).reshape(size,1)
# print len(vec_titles)
# vec_titles.append(abs(vec1).reshape(1,size))
#vec_titles = feature_extraction(new_sentences,model,size)
vec_titles = list(map(lambda x: x[0], vec_titles)) # strip the outer [] so each title becomes a flat size-dimensional vector
print(np.array(vec_titles).shape)
np.save("train_data_title_vec.npy", vec_titles)
print u'train_data_title_vec saved', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
trains = np.load('train_data_title_vec.npy')
# bm_score = np.load('bm_score.npy')
# size = 100
new_sentences = []
with open('new_sentences.txt', 'r') as f:
for eachLine in f:
word = eachLine.decode('utf8').strip().split(' ')
new_sentences.append(word)
# scalar (one-dimensional) features
# 6: Jaccard-style overlap of the two token lists
J_dist = []
for val in range(len(new_sentences) / 2):
j = cal_jaccard(new_sentences[2 * val], new_sentences[2 * val + 1])
J_dist.append(j)
# 1: cosine of the two sentence vectors, 2: Euclidean distance between them, 3: bm_score (disabled)
juzi = []
f1 = open('sentences_1.txt','r')
for eachLine in f1:
word = eachLine.decode('utf8').strip()
juzi.append(list(word))
distance = []
for i in range(len(juzi)/2):
j = cal_jaccard(juzi[2 * i], juzi[2 * i + 1])
distance.append([j])
f1.close()
cos_val = []
E_Dist = []
print 'train', len(trains)
for i in range(len(trains) / 2):
score1 = cos_Vector(trains[2 * i], trains[2 * i + 1])
cos_val += [score1]
score2 = calEuclideanDistance(trains[2 * i], trains[2 * i + 1])
E_Dist += [score2]
# stack two scalar features into one 2-column matrix
combine_feature1 = np.vstack((cos_val, J_dist)).transpose() # 2 features
np.save("cos_val.npy", cos_val), np.save('E_Dist.npy', E_Dist)
# 4,5: the two sentence lengths (2 columns), 6: length difference (dropped)
len_ = []
dif_length = []
for val in range(len(new_sentences) / 2):
a = [0.1*len(new_sentences[2 * val]), 0.1*len(new_sentences[2 * val + 1])]
# length features (scaled by 0.1)
len_.append(a)
b = abs(len(new_sentences[2 * val]) - len(new_sentences[2 * val + 1]))
# length-difference feature
dif_length.append(b)
# combine_feature2 = np.vstack((np.transpose(len_), J_dist)).transpose() # 3个
# combine_feature = np.hstack((combine_feature1, combine_feature2)) # 5个
# combine_feature2 = len_ # 2-column length feature (disabled)
combine_feature3 = tfidf_cos
combine_feature4 = distance
# combine_feature = np.vstack((np.transpose(combine_feature), J_dist)).transpose() # 6 features
combine_feature = np.hstack((combine_feature1, combine_feature3, combine_feature4)) # 4 scalar features in total
print combine_feature.shape
# number of scalar feature columns appended after the sentence vectors
dim_1_num = 4 # cosine, word-level Jaccard, tf-idf cosine, character-level Jaccard
# dim_1_num = 6 # when the extra length features are enabled
print 'combine_feature length:', len(combine_feature)
# build the high-dimensional features (size * feature_num columns)
feature1_val = []
feature2_val = []
feature3_val = []
feature4_val = []
feature5_val = []
feature6_val = []
print 'train', len(trains)
# trains is the matrix of sentence vectors, two consecutive rows per sentence pair
for i in range(len(trains)/2):
vec1 = trains[2*i]
vec2 = trains[2*i+1]
feature1_val.append(vec1)
feature2_val.append(vec2)
for i in range(len(trains)/2):
vec3 = vec_minus(trains[2*i], trains[2*i+1])
vec4 = vec_multi(trains[2*i], trains[2*i+1])
feature3_val.append(vec3)
feature4_val.append(vec4)
print 'feature3_val length:', len(feature3_val)
# feature_val = np.hstack((feature1_val, feature2_val, feature3_val, feature4_val, combine_feature)) # 400列///406
# concatenate everything into the final feature matrix feature_val
feature_val = np.hstack((feature1_val, feature2_val,combine_feature)) # feature_num*size + dim_1_num columns
#feature_val = combine_feature
np.save("feature_val.npy", feature_val)
# feature_num = 4
feature_num = 2 # number of size-dimensional sentence-vector blocks in the feature matrix
print u'features generated'
y_true = []
with open(testpath, 'r') as f, open(inpath, 'r') as fin:
for line in f.readlines():
pair_id, sen1, sen2, label = line.strip().split('\t')
label = int(label)
y_true += [label]
np.save('y_true.npy', y_true)
print 'y_true length', len(y_true)
# reload the data
y_true = np.load('y_true.npy')
# bm_score = np.load('bm_score.npy')
# bm_score_train = bm_score[:test_num]
# print 'bm_score_train length', len(bm_score_train)
# bm_score_test = bm_score[test_num:]
# bm_score = pd.Series([bm_score], index=['bm_score'])
cos_val = np.load('cos_val.npy')
cos_val_train = cos_val[:test_num]
cos_val_test = cos_val[test_num:]
feature_val = np.load('feature_val.npy')
feature_val_train = feature_val[:test_num]
feature_val_test = feature_val[test_num:]
# each row is one training example; the last column is the label
trains = np.vstack((np.transpose(feature_val_train), y_true)).transpose()
print np.array(trains).shape
print u"数据拆分"
train, val = train_test_split(trains, test_size=0.2, random_state=21)
print 'train length', len(train)
print 'val length', len(val)
print u"训练集"
y = [train[i][feature_num*size+dim_1_num] for i in range(len(train))] # 训练集标签
X = [train[i][:feature_num*size+dim_1_num] for i in range(len(train))] # 训练集特征矩阵
print u"验证集"
val_y = [val[i][feature_num*size+dim_1_num] for i in range(len(val))] # 验证集标签
val_X = [val[i][:feature_num*size+dim_1_num] for i in range(len(val))] # 验证集特征矩阵
print u"测试集"
tests = feature_val_test
# convert to the LightGBM Dataset format
lgb_train = lgb.Dataset(X, y, free_raw_data=False)
lgb_eval = lgb.Dataset(val_X, val_y, reference=lgb_train, free_raw_data=False)
# start training
print u'setting parameters'
params = {
'num_threads' : '4',
'boosting_type': 'gbdt',
'boosting': 'gbdt',
'objective': 'binary',
'metric': 'binary_logloss',
'learning_rate': 0.1,
'num_leaves': 25,
'max_depth': 3,
'max_bin': 10,
'min_data_in_leaf': 8,
'feature_fraction': 1,
'bagging_fraction': 0.7,
'bagging_freq': 5,
'lambda_l1': 0,
'lambda_l2': 0,
'min_split_gain': 0
}
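# Side note: 'boosting_type' and 'boosting' are aliases for the same LightGBM setting,
# so listing both is redundant (only one is needed). For reference, a roughly equivalent
# model with the sklearn-style wrapper (an alternative sketch, not what this script uses):
# clf = lgb.LGBMClassifier(boosting_type='gbdt', objective='binary', learning_rate=0.1,
#                          num_leaves=25, max_depth=3, max_bin=10, min_child_samples=8,
#                          subsample=0.7, subsample_freq=5, colsample_bytree=1.0,
#                          reg_alpha=0, reg_lambda=0, n_estimators=3000, n_jobs=4)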
print u"开始训练"
gbm = lgb.train(params, # 参数字典
lgb_train, # 训练集
num_boost_round=3000, # 迭代次数
valid_sets=lgb_eval, # 验证集
early_stopping_rounds=30) # 早停系数
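# Compatibility note (an assumption about newer LightGBM releases, not the version the
# author used): in LightGBM >= 4.0 the early_stopping_rounds keyword of lgb.train was
# replaced by callbacks, so the call above would become:
# gbm = lgb.train(params, lgb_train, num_boost_round=3000, valid_sets=lgb_eval,
#                 callbacks=[lgb.early_stopping(stopping_rounds=30)])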
# save the model
from sklearn.externals import joblib
joblib.dump(gbm, 'gbm.pkl')
print u"预测,并输出在 outpath"
preds_offline = gbm.predict(tests, num_iteration=gbm.best_iteration) # 输出概率
np.save('preds.npy', preds_offline)
if not SUBMIT:
N = 200
score_best = 0
with open('merge_test.csv', 'r') as f1:
y_true_10 = []
for eachLine in f1:
lineno, sen1, sen2, label = eachLine.strip().split('\t')
a = int(label)
y_true_10.append(a)
for thred in range(1,N): # threshold search: scan N evenly spaced thresholds and keep the one with the best F1
thred = thred * (np.max(preds_offline) - np.min(preds_offline)) / N + np.min(preds_offline)
pred = []
for i in range(len(preds_offline)):
if preds_offline[i] > thred:
pred.append(1)
else:
pred.append(0)
score = eval_file(y_true_10, pred)
if score > score_best:
score_best = score
thred_best = thred
print u'best threshold:', thred_best
for i in range(len(preds_offline)):
if preds_offline[i] > thred_best:
preds_offline[i] = 1
else:
preds_offline[i] = 0
print len(preds_offline)
f1_score = eval_file(y_true_10, preds_offline)
print 'F1 score is :' + str(f1_score)
fout = open(outpath,'w')
for t in preds_offline:
fout.write(str(t))
fout.write('\n')
fout.write('F1 score is :' + str(f1_score))
fout.close()
else:
with open(inpath, 'r') as fin, open(outpath, 'w') as fout:
line_id = []
for line in fin:
lineno, sen1, sen2 = line.strip().split('\t')
line_id.append(lineno)
for i in range(len(line_id)):
if preds_offline[i] >= 0.246:
fout.write(line_id[i] + '\t1\n')
else:
fout.write(line_id[i] + '\t0\n')
print u'finished', time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) # print current time
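# An alternative way to pick the decision threshold offline (a sketch, assuming the raw
# probabilities saved to preds.npy and the labels in y_true_10): scan the
# precision/recall curve instead of N evenly spaced candidate thresholds.
# from sklearn.metrics import precision_recall_curve
# probs = np.load('preds.npy')
# precision, recall, thresholds = precision_recall_curve(y_true_10, probs)
# f1_all = 2 * precision * recall / (precision + recall + 1e-12)
# print u'best threshold:', thresholds[np.argmax(f1_all[:-1])]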