数据竞赛/kaggle/数据分析入门汇总

以下为参加各种比赛时,涉及各种过程的代码汇总。主要使用以下Python模块:

import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
import seaborn as sns 
%matplotlib inline 
import os
import pickle

读取和保存数据

# Load a DataFrame from csv (no header row in the file, tab-separated)
data = pd.read_csv('自己的路径', header=None, sep='\t')
data.columns = ['id','feature1','feature2','...']  # set the column names (NOT the row index)
# Load a DataFrame from a pickle file
data = pickle.load(open("data/data_feature.pkl",'rb'))
data.columns = ['id','feature1','feature2','...']  # set the column names (NOT the row index)

建议统一采用以下的方式读取数据,即在第一次打开时保存为pkl(读取pkl文件速度快,特别在文件较大时,中间的特征也需要采用pkl保存):

def save_feature(data, names, file):
    """Persist selected DataFrame columns to disk via pickle.

    Parameters
    ----------
    data : pd.DataFrame
        Source frame.
    names : list
        Column names to keep (the rest are dropped before saving).
    file : str
        Destination path for the pickle file.
    """
    data = data[names]
    # 'with' guarantees the handle is closed even if pickle.dump raises
    # (the original leaked the handle on error).
    with open(file, 'wb') as output:
        pickle.dump(data, output)

names = 'u_id,gender,key_word'.split(",") # custom column-name list

file = 'data/member_info_0926.pkl'
usingSaved = os.path.exists(file)
# Read the member table: parse the raw txt on the first run and cache it
# as a pickle; later runs load the (much faster) pickle instead.
if not usingSaved:    
    data = pd.read_csv('data/member_info_0926.txt', sep='\t', names = names)
    save_feature(data,data.columns,file)
else:
    data = pickle.load(open(file, 'rb'))
    
data.head()

测试集和训练集的拼接与分离

一般来说,归一化、label encoding、离散化等处理特征的方法需要测试集和训练集一块处理。

# Concatenate train and test so shared preprocessing (normalisation,
# label encoding, discretisation, ...) sees both sets at once.
length = len(train)
data = pd.concat([train, test], axis=0, sort=True)
#-------------------------
# ... feature engineering on `data` goes here ...
# (the original had a bare, non-comment prose line here — a SyntaxError)
#-------------------------
# Split back into train / test by the remembered train length.
train_features = ['fea1','fea2','fea3']  # the features actually fed to the model
train = data[train_features][0:length]
test = data[train_features][length:]

特征工程常用函数

离散特征(例如用户id),最常见的编码方法,label encoding, count vec 和mean encoding

name = ['性别','邀请创建时间-hour','邀请创建时间-week','用户多分类特征d','用户多分类特征c','用户多分类特征a']

def ge_label_encoding(data, features):
    """Label-encode the given columns in place, collapsing rare values.

    data: the full DataFrame (train + test concatenated)
    features: column names to label-encode
    Needed before feeding categoricals to an embedding layer; of little
    value as raw lightgbm input when the feature is very sparse.

    BUG FIX: the original ignored the `features` argument and iterated
    over the global `name` list instead.
    """
    encoder = LabelEncoder()
    for feat in features:
        col_name = '{}_label_enc'.format(feat)
        # Frequency of each value of this feature.
        data[feat + "_count"] = data[feat].map(data[feat].value_counts().astype(int))
        # Collapse values seen fewer than 2 times into "-1" (threshold adjustable).
        data.loc[data[feat + "_count"] < 2, feat] = "-1"
        encoder.fit(data[feat])
        data[col_name] = encoder.transform(data[feat])
    return data
'''
count vec 
data: 整个dataframe
feats: 需要进行count vec的特征名称
可直接作为lightgbm的输入
'''
def get_count_vec(data,feats):
    """Count-encode each column in *feats*, min-max normalised to [0, 1].

    data: the full DataFrame (train + test concatenated)
    feats: column names to encode
    The resulting `<feat>_count_vec` columns can be fed to lightgbm directly.
    """
    for feat in feats:
        col_name = '{}_count_vec'.format(feat)
        # First pass: raw frequency of each value.
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
        data.loc[data[col_name] < 2, feat] = -1 # collapse values seen once into -1 (threshold adjustable)
        # NOTE(review): `+= 1` assumes the feature column is numeric; it shifts
        # every id by one (presumably to keep 0 free) — TODO confirm intent.
        data[feat] += 1
        # Second pass: recompute counts after the rare-value collapse above.
        data[col_name] = data[feat].map(data[feat].value_counts().astype(int))
        # Min-max normalise. NOTE(review): divides by zero when all counts
        # are identical — verify inputs or guard upstream.
        data[col_name] = (data[col_name] - data[col_name].min()) / (data[col_name].max() - data[col_name].min())
    return data
'''
expand meaning encoding
data: 整个dataframe
feats: 需要进行meaning encoding的特征名称
target: 分类的目标标签
可直接作为lightgbm的输入
'''
def get_expand_meaning_encoding(data,feas,target):
    """Expanding (leave-current-out) mean target encoding.

    For each row, the feature value is encoded as the mean of *target*
    over all strictly earlier rows sharing that value; the first
    occurrence of a value therefore yields NaN (0 / 0).

    data: the full DataFrame
    feas: column names to encode
    target: name of the classification target column
    Output columns `<fea>_mean_encoding` can be fed to lightgbm directly.
    """
    for column in feas:
        grouped = data.groupby(column)
        # Target sum over strictly-earlier rows with the same value
        # (cumsum includes the current row, so subtract it back out).
        prior_sum = grouped[target].cumsum() - data[target]
        # Number of strictly-earlier rows with the same value.
        prior_cnt = grouped.cumcount()
        data[column + "_mean_encoding"] = prior_sum / prior_cnt
    return data

多值离散特征的处理,需要变成定长的特征,因此需要截断和填补,对于其长度,根据具体情况可使用多值离散特征的平均长度,最长长度的一半,最长长度。

#对多值离散特征进行编码
def get_var_feature(data,names,nums):
    encoding_list = []
    key2index = {}
    for i,feat in enumerate(names):
        def __split(x):
            key_ans = x.split(',') #多值离散特征通常在同一列,并使用某种分隔符分割
            for key in key_ans:
                if key not in key2index:         
                    key2index[key] = len(key2index) + 1
            return list(map(lambda x: key2index[x], key_ans))
        genres_list = list(map(__split, data[feat].values))
        genres_length = np.array(list(map(len, genres_list)))
        max_len = nums[i]
        #注意选择填充和截断的位置在前还是在后
        genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post',truncating='pre' )
        data_topic = pd.DataFrame(genres_list)
        #生成列名字
        data_topic.columns = [feat+str(i) for i in range(max_len)] if max_len>1 else [feat+'_enc']
        encoding_list.append(data_topic)
    print("fea length {}".format(len(key2index)+1))
    data = data.reset_index(drop= True)
    data = pd.concat([data]+encoding_list, axis=1)
    return data
data = get_var_feature(data,['问题绑定话题','关注话题','用户上次回答话题'],[6,11,6])

你可能感兴趣的:(数据竞赛/kaggle/数据分析入门汇总)