阿里云天池:https://tianchi.aliyun.com/dataset/dataDetail?dataId=45
了解背景:
预测指标:
预览数据:
购买数据
标签(Ali_Mum_Baby) | 定义 | 值 |
---|---|---|
user_id | 用户id | |
auction_id | 购买行为编号 | |
cat_id | 商品种类ID | |
cat1 | 商品属于哪个类别 | |
property | 商品属性 | XX:XX;XX:XX;… |
buy_mount | 购买数量 | |
day | 购买时间 | 年月日 ‘YYYYMMDD’ |
婴儿信息
标签(Ali_Baby) | 定义 | 值 |
---|---|---|
user_id | 用户id | |
birthday | 出生日期 | |
gender | 性别 | 0 男性;1 女性 |
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
# Silence every warning for the rest of the session.
# NOTE(review): this also hides library deprecation warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')
查看购买信息,婴儿信息:
# Load the purchase table and the baby-information table from CSV.
bought=pd.read_csv(r'../baby shop/表1购买商品.csv')
baby=pd.read_csv(r'../baby shop/表2婴儿信息.csv')
# Preview the first rows of each table.
bought.head()
baby.head()
# Print column dtypes and non-null counts for both tables, separated by a rule.
bought.info()
print("-"*30)
baby.info()
查看重复值
# Count the rows whose user_id already appeared in an earlier row of the table.
bought.duplicated('user_id').sum()
baby.duplicated('user_id').sum()
合并数据然后再拆分
# Left-join the baby table onto the purchases so every purchase row is kept.
df = bought.merge(baby, how='left', on='user_id')
# Rows with a known birthday become the training set, the others the test
# set; both get a fresh 0..N-1 index.
has_birthday = df['birthday'].notnull()
train = df[has_birthday].reset_index(drop=True)
test = df[~has_birthday].reset_index(drop=True)
查看训练数据图表
# Build a profiling report for the training data and save it as HTML.
train_profile = pandas_profiling.ProfileReport(train)
train_profile.to_file(r'../baby shop/example.html')
再合并
# Stack the training rows on top of the test rows with a fresh 0..N-1 index.
# pd.concat is used because DataFrame.append was removed in pandas 2.0.
df = pd.concat([train, test], ignore_index=True)
# Drop the two columns by *label*. Passing the column itself
# (df['auction_id']) would make pandas look for columns named after its
# values and raise KeyError.
df.drop(columns=['auction_id', 'birthday'], inplace=True)
df.info()
df.describe()
异常值处理
# Orders of more than 6 items are treated as buyers who are not purchasing
# for their own use: drop those rows.
df.drop(df.index[df['buy_mount'] > 6], inplace=True)
# gender == 2 is outside the documented 0/1 coding: drop those rows too.
# drop() expects row labels, so the matching index is passed rather than the
# column values.
df.drop(df.index[df['gender'] == 2], inplace=True)
property(商品属性)缺失值处理:用出现次数最多的商品属性(众数)进行填充。
# Fill missing property values with the most frequent value (the first mode).
most_common_property = df['property'].mode().iloc[0]
df['property'] = df['property'].fillna(most_common_property)
gender(婴儿性别)的缺失值在特征工程之后再进行填充。
对property商品属性按“;”字符进行分列,提取商品的各个属性
# Make sure every property value is a string before splitting.
df['property'] = df['property'].astype('str')
# Split each property string on ';' into its individual tokens. The original
# call mixed ASCII and full-width quotes and was a syntax error.
# y_property is a list with one token list per row.
y_property = df['property'].map(lambda x: x.split(';')).tolist()
def flatten(items):
    """Recursively flatten nested lists/tuples into one flat list.

    Args:
        items: An iterable whose elements may themselves be lists or tuples,
            nested to any depth.

    Returns:
        A new list with every non-list, non-tuple element in the order it is
        encountered. Strings are treated as leaves, not as sequences.
    """
    result = []
    for item in items:
        if isinstance(item, (tuple, list)):
            # Nested container: splice its flattened content in place.
            result.extend(flatten(item))
        else:
            result.append(item)
    return result
# Distinct property tokens across all rows. sorted() gives a stable order;
# iterating a set of strings can change between interpreter runs.
# The original author's note reports 28599 distinct tokens.
X_property = sorted(set(flatten(y_property)))
将商品属性编码后与数据合并
# Multi-hot encode the ';'-separated property tokens: one 0/1 column per
# distinct token, named after the token. df_property was previously never
# defined.
# NOTE(review): this is a dense matrix with one column per token — check the
# memory footprint on the full data set.
df_property = df['property'].astype(str).str.get_dummies(sep=';')
# axis=1 appends the encoded columns next to the existing ones (same row
# index); axis=0 would have stacked them as extra rows instead.
df = pd.concat([df, df_property], axis=1)
年龄 = 购买日期 - 出生日期
import arrow  # no longer needed by this cell; kept in case other cells rely on it
# Age in years = purchase date - birth date, both stored as YYYYMMDD numbers.
# pd.to_datetime parses whole columns at once and yields a true timedelta
# column after subtraction, so the .dt accessor is guaranteed to work.
purchase_date = pd.to_datetime(train['day'].astype(str), format='%Y%m%d')
birth_date = pd.to_datetime(train['birthday'].astype(int).astype(str), format='%Y%m%d')
age_years = ((purchase_date - birth_date).dt.days / 365).round(2)
# Parents may buy before the baby is born; negative ages are clamped to 0.
# The assignment aligns on the index, so df rows with no matching train row
# get NaN.
df['Age'] = age_years.clip(lower=0)
# Replace the birth date in train with the derived age. The column is dropped
# by label; passing train['birthday'] itself would raise KeyError.
train.drop(columns=['birthday'], inplace=True)
train['Age'] = df['Age']
婴儿的性别对用户购买的行为有一定的影响,查看各个商品属性与性别的相关度,取最高的几个。
# Select the encoded property columns plus the baby's gender. The gender
# column is exposed as 'Sex', the name this cell and the imputation code use.
# list.extend() returns None and would also have appended the characters
# 'g','e','n',... one by one, so a new list is built instead.
Sexpre = df[list(X_property) + ['gender']].rename(columns={'gender': 'Sex'})
# Pairwise correlations; the 'Sex' column shows how each property correlates
# with gender.
Sexcov = Sexpre.corr()
Sexcov['Sex'].sort_values()
使用随机森林对其填充
# Split the rows by whether the gender label ('Sex') is known.
SexKnown = Sexpre[Sexpre['Sex'].notnull()]
SexUnKnown = Sexpre[Sexpre['Sex'].isnull()]
SexKnown_X = SexKnown.drop(columns=['Sex'])
SexUnKnown_X = SexUnKnown.drop(columns=['Sex'])
SexKnown_y = SexKnown['Sex']
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
# Gender is a 0/1 label, so a classifier is used; a regressor would fill the
# gaps with fractional values that are not valid gender codes.
rfr = RandomForestClassifier(random_state=None, n_estimators=500, n_jobs=-1)
rfr.fit(SexKnown_X, SexKnown_y)
# Accuracy on the training rows themselves (not a held-out estimate).
rfr.score(SexKnown_X, SexKnown_y)
if not SexUnKnown_X.empty:
    SexUnKnown_y = rfr.predict(SexUnKnown_X)
    # df stores the label under 'gender' (there is no 'Sex' column in df);
    # write the predictions back by row label.
    df.loc[SexUnKnown.index, 'gender'] = SexUnKnown_y
# Original note: "no missing values left".
# NOTE(review): verify — Age is only assigned for train rows.
df.info()
先编码
# Drop the raw property string column by label. Passing df['property'] itself
# would be interpreted as a list of column names and raise KeyError.
df.drop(columns=['property'], inplace=True)
# One-hot encode the remaining non-numeric columns.
df = pd.get_dummies(df)
PCA降维
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import RobustScaler  # was used without being imported
# Split the data: rows with an Age came from train, the rest from test.
has_age = df['Age'].notnull()
# Number of training rows. len() of the mask would count every row and leave
# test_X empty.
n = int(has_age.sum())
# Age is the split key and is entirely NaN on the test side, so it cannot be
# a feature (PCA rejects NaN input).
features = df.drop(columns=['Age'])
X = features[has_age]
test_X = features[~has_age]
# Fit the scaler on the training rows only and reuse it for the test rows so
# both are on the same scale.
scaler = RobustScaler().fit(X)
X_scaled = scaler.transform(X)
# NOTE(review): 'birthday' is removed from train earlier in the notebook and
# is an odd classification target — confirm the intended label. Rows are
# selected via X.index so the label stays aligned with the kept rows.
y_price = train.loc[X.index, 'birthday']
test_X_scaled = scaler.transform(test_X)
# PCA cannot extract more components than min(n_samples, n_features).
pca = PCA(n_components=min(1000, *X_scaled.shape))
X_scaled = pca.fit_transform(X_scaled)
test_X_scaled = pca.transform(test_X_scaled)
X_scaled.shape, test_X_scaled.shape
# 导入机器学习算法库
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, \
ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
# Stratified 5-fold cross-validation splitter.
kfold = StratifiedKFold(n_splits=5)
# Aliases for the training features, training labels and test features.
exercise_X, exercise_y, test_x = X_scaled, y_price, test_X_scaled
# Candidate models, all with default hyper-parameters.
classifiers = [
    SVC(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    KNeighborsClassifier(),
    LogisticRegression(),
    LinearDiscriminantAnalysis(),
]
# Model names for the result table, taken directly from the class. This
# avoids pushing estimator objects through a DataFrame and parsing their repr.
dfclass = pd.Series([type(clf).__name__ for clf in classifiers])
# Cross-validated accuracy for every candidate model: one array of per-fold
# scores per classifier.
results = []
for classifier in classifiers:
    results.append(cross_val_score(classifier,
                                   exercise_X,
                                   exercise_y, scoring='accuracy', cv=kfold, n_jobs=-1))
# Summarise each model's fold scores by mean and standard deviation.
results_means, results_std = [], []
for result in results:
    results_means.append(result.mean())
    results_std.append(result.std())
# Result table: one row per model with the mean and standard deviation of its
# fold scores.
resultsDF = pd.DataFrame(
    {'std': results_std,
     'means': results_means,
     'sklean': dfclass})
# Save the table itself. `result` is only the last per-fold score array (a
# NumPy array with no to_csv method).
resultsDF.to_csv('../baby shop/result1.csv', index=False)