目标编码_一般形式和函数封装

目标编码代码一般形式

import pandas as pd
from sklearn.model_selection import KFold

pd.set_option('display.max_columns', None)  # show every column when printing

data = pd.read_csv(r"C:\Users\15187\Desktop\games1.csv")
# Keep only the columns this demo needs; nothing more is implied by the choice.
colums = ["firstBlood", "winner"]
train = data[colums]

columns = ["firstBlood"]  # grouping key(s)
lable = ["winner"]  # target column(s) (Y)
folds = KFold(n_splits=5, shuffle=True, random_state=2020)  # 5 shuffled folds
caculate = ["count", "mean"]
aggs = {col: caculate for col in lable}

# For every fold, aggregate the target over the rows that are NOT held out.
fold_stats = [
    train.iloc[trn_idx].groupby(columns).agg(aggs)
    for trn_idx, _ in folds.split(train)
]
tt = pd.concat(fold_stats)

# Average the per-fold statistics (count and mean) within each group.
re = tt.groupby(level=columns).mean()
# agg() with a dict of lists yields 2-level columns. Merging those onto the
# 1-level columns of `train` warns on older pandas and raises MergeError on
# pandas >= 2.0, so collapse them to plain tuple labels such as
# ("winner", "count") first.
re.columns = re.columns.to_flat_index()

# Left join; `train` here could equally be a test set or the original data.
result = pd.merge(train, re, how="left", on=columns)
print(result)

结果如下表所示(输出开头的 UserWarning 来自 pandas:当 merge 左右两侧的列索引层数不一致时会给出该警告)

 warnings.warn(msg, UserWarning)
       firstBlood  winner  (winner, count)  (winner, mean)
0               2       1          19857.6         1.58718
1               1       1          20890.4         1.40520
2               2       1          19857.6         1.58718
3               1       1          20890.4         1.40520
4               2       1          19857.6         1.58718

函数封装形式

def aim_code(columns, lable, tain, test):
    """Attach K-fold averaged target statistics to ``test``.

    ``tain`` is split into 5 shuffled folds (``random_state=2020``). For each
    fold, the rows outside the held-out part are grouped by ``columns`` and
    the ``count`` and ``mean`` of every column in ``lable`` are computed. The
    per-fold statistics are then averaged per group and left-joined onto
    ``test``.

    Args:
        columns: Grouping column name(s); a single name or a list of names.
        lable: Target column name(s) to aggregate; a single name or a list.
        tain: DataFrame the statistics are computed from.
        test: DataFrame the statistics are joined onto (a test set, or the
            original data set itself).

    Returns:
        A new DataFrame holding the columns of ``test`` followed by one
        column per (target, statistic) pair, labelled with the tuple
        ``(target, "count")`` / ``(target, "mean")``. Rows whose group does
        not occur in the aggregated statistics get NaN.

    Raises:
        ValueError: If ``columns`` or ``lable`` names no column at all.
    """
    group_keys = [columns] if isinstance(columns, str) else list(columns)
    targets = [lable] if isinstance(lable, str) else list(lable)
    if not group_keys or not targets:
        raise ValueError("columns and lable must each name at least one column")

    from sklearn.model_selection import KFold

    aggs = {target: ["count", "mean"] for target in targets}
    folds = KFold(n_splits=5, shuffle=True, random_state=2020)

    # One aggregation per fold, computed on the rows that are not held out.
    fold_stats = [
        tain.iloc[trn_idx].groupby(group_keys).agg(aggs)
        for trn_idx, _ in folds.split(tain)
    ]

    # Average the per-fold statistics within each group. Grouping by index
    # level avoids any ambiguity with identically named column labels.
    averaged = pd.concat(fold_stats).groupby(level=group_keys).mean()

    # agg() produced 2-level columns; merging them onto a 1-level frame warns
    # on older pandas and raises MergeError on pandas >= 2.0. Collapsing to
    # tuple labels keeps the resulting column names unchanged.
    averaged.columns = averaged.columns.to_flat_index()

    return pd.merge(test, averaged, how="left", on=group_keys)

你可能感兴趣的:(机器学习,机器学习,python,数据挖掘)