Below is a collection of code for the common steps involved in machine-learning competitions. The following Python modules are used throughout:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
import seaborn as sns
%matplotlib inline
import os
import pickle
Reading and saving data
# Load a DataFrame from csv
data = pd.read_csv('path/to/your/file', header=None, sep='\t')
data.columns = ['id', 'feature1', 'feature2', '...']  # set the column names
# Load a DataFrame from pkl
data = pickle.load(open("data/data_feature.pkl", 'rb'))
data.columns = ['id', 'feature1', 'feature2', '...']  # set the column names
It is recommended to read data in the unified way below: the first time a file is opened, save it as pkl (reading pkl is fast, especially for large files; intermediate features should also be saved as pkl):
def save_feature(data, names, file):  # names: list of column names to save
    data = data[names]
    output = open(file, 'wb')
    pickle.dump(data, output)
    output.close()

names = 'u_id,gender,key_word'.split(",")  # custom column-name list
file = 'data/member_info_0926.pkl'
usingSaved = os.path.exists(file)
# Read the data: from the raw txt on the first run, from the cached pkl afterwards
if not usingSaved:
    data = pd.read_csv('data/member_info_0926.txt', sep='\t', names=names)
    save_feature(data, data.columns, file)
else:
    data = pickle.load(open(file, 'rb'))
data.head()
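The save-on-first-read pattern above can also be wrapped in a small reusable helper so that every file in a project is cached the same way. A minimal sketch; load_df and its parameter names are hypothetical, not part of the original code:

def load_df(csv_path, pkl_path, names, sep='\t'):
    """Load csv_path once, then serve the cached pkl on later runs (sketch)."""
    if os.path.exists(pkl_path):
        return pickle.load(open(pkl_path, 'rb'))
    df = pd.read_csv(csv_path, sep=sep, names=names)
    save_feature(df, df.columns, pkl_path)  # reuse the helper defined above
    return df

data = load_df('data/member_info_0926.txt', 'data/member_info_0926.pkl', names)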
Concatenating and splitting the train and test sets
In general, feature-processing steps such as normalization, label encoding, and discretization should be applied to the train and test sets together.
length = len(train)
data = pd.concat([train, test], axis=0, sort=True)
# -------------------------
# ... feature engineering on data goes here ...
# -------------------------
# Split data back into train and test
train_features = ['fea1', 'fea2', 'fea3']  # the feature names actually fed to the model
train = data[train_features][0:length]
test = data[train_features][length:]
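One detail worth noting: the target column exists only in train, so it is usually set aside before the concat and re-attached after the split. A minimal sketch, assuming the target column is named 'label':

y = train['label'].values  # keep the target aside; test has no label
length = len(train)
data = pd.concat([train.drop(columns=['label']), test], axis=0, sort=True)
# ... feature engineering on data ...
train = data[train_features][0:length]
test = data[train_features][length:]
# y lines up with train row-for-row and is passed to the model as the target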
Common feature-engineering functions
For categorical features (e.g., user id), the most common encodings are label encoding, count vectors, and mean encoding.
name = ['性别','邀请创建时间-hour','邀请创建时间-week','用户多分类特征d','用户多分类特征c','用户多分类特征a']  # dataset-specific columns to encode
'''
label encoding
data: the whole DataFrame
feats: names of the features to label-encode
Label encoding is required before an embedding layer; as a direct LightGBM input it adds little value when the feature is very sparse.
'''
def get_label_encoding(data, feats):
    encoder = LabelEncoder()
    for feat in feats:
        col_name = '{}_label_enc'.format(feat)
        data[feat + "_count"] = data[feat].map(data[feat].value_counts().astype(int))
        data.loc[data[feat + "_count"] < 2, feat] = "-1"  # collapse values seen only once; the threshold is adjustable
        encoder.fit(data[feat])
        data[col_name] = encoder.transform(data[feat])
    return data
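Typical usage with the column list defined above (a sketch; the new columns follow the '{feat}_label_enc' and '{feat}_count' naming used inside the function):

data = get_label_encoding(data, name)
data[['性别_label_enc', '性别_count']].head()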
'''
count vec
data: the whole DataFrame
feats: names of the features to count-encode
Can be fed directly to LightGBM.
'''
def get_count_vec(data, feats):
    for feat in feats:
        col_name = '{}_count_vec'.format(feat)
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
        data.loc[data[col_name] < 2, feat] = -1  # collapse values seen only once; the threshold is adjustable
        data[feat] += 1  # shift values by one so the truncated -1 becomes 0 (assumes a numeric column)
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))  # recount after truncation
        data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())  # min-max normalize
    return data
'''
expanding mean encoding
data: the whole DataFrame
feats: names of the features to mean-encode
target: the classification target label
Can be fed directly to LightGBM.
'''
def get_expanding_mean_encoding(data, feats, target):
    for fea in feats:
        # cumulative sum/count of the target excluding the current row (avoids target leakage)
        cumsum = data.groupby(fea)[target].cumsum() - data[target]
        cumcnt = data.groupby(fea).cumcount()
        data[fea + "_mean_encoding"] = cumsum / cumcnt
    return data
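The first occurrence of each category has cumcount 0, so the division above yields NaN there. A common fix, sketched below with a hypothetical target column 'label', is to fall back to the global target mean after calling the function:

data = get_expanding_mean_encoding(data, name, 'label')
for fea in name:
    col = fea + "_mean_encoding"
    data[col] = data[col].fillna(data['label'].mean())  # global-mean fallback for first occurrences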
Multi-valued categorical features must be made fixed-length, which requires truncation and padding. Depending on the situation, the target length can be the feature's average length, half its maximum length, or its maximum length (see the snippet below for computing these candidates).
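A quick way to inspect the candidate lengths (a sketch; '关注话题' stands in for any comma-separated multi-value column):

lens = data['关注话题'].map(lambda x: len(str(x).split(',')))
print('avg: {:.1f}, half max: {}, max: {}'.format(lens.mean(), lens.max() // 2, lens.max()))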
# Encode multi-valued categorical features
def get_var_feature(data, names, nums):  # nums: the fixed length chosen for each feature
    encoding_list = []
    key2index = {}  # vocabulary shared across all features in names (here they all hold topic ids)
    for i, feat in enumerate(names):
        def __split(x):
            key_ans = x.split(',')  # multiple values usually share one column, joined by a separator
            for key in key_ans:
                if key not in key2index:
                    key2index[key] = len(key2index) + 1  # index 0 is reserved for padding
            return list(map(lambda x: key2index[x], key_ans))
        genres_list = list(map(__split, data[feat].values))
        genres_length = np.array(list(map(len, genres_list)))  # raw lengths, useful for choosing max_len
        max_len = nums[i]
        # Choose carefully whether to pad/truncate at the front or the back
        genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', truncating='pre')
        data_topic = pd.DataFrame(genres_list)
        # Generate column names
        data_topic.columns = [feat + str(j) for j in range(max_len)] if max_len > 1 else [feat + '_enc']
        encoding_list.append(data_topic)
    print("fea length {}".format(len(key2index) + 1))
    data = data.reset_index(drop=True)
    data = pd.concat([data] + encoding_list, axis=1)
    return data
data = get_var_feature(data, ['问题绑定话题', '关注话题', '用户上次回答话题'], [6, 11, 6])  # per-feature fixed lengths
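The padded id columns are intended for an embedding layer rather than for LightGBM. A minimal sketch of wiring one feature's columns into Keras, assuming the vocabulary size is the len(key2index) + 1 value printed by get_var_feature (layer shapes and dimensions here are illustrative):

from tensorflow.keras import layers

vocab_size = 100000  # placeholder: use len(key2index) + 1 from get_var_feature
max_len = 11         # the fixed length chosen for '关注话题' above
inp = layers.Input(shape=(max_len,), dtype='int32')
emb = layers.Embedding(vocab_size, 16)(inp)   # ids -> dense vectors; 0 is the padding id
vec = layers.GlobalAveragePooling1D()(emb)    # pool the id embeddings into one feature vector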