数据包含约20万用户数据,分成12组,同时提供了用户行为属性,如:手机品牌、型号、APP的类型。
评价指标:logloss
步骤:
1.解读数据;
2.特征工程;
3.模型训练与参数选择;
数据集说明:
每一个用户用一个ID表示,一个用户的行为记录在一系列的Events里面,每个Event里面的信息包括该ID发生的时间、地理坐标信息、安装的APP类型、手机型号类别等。
涉及知识点:
1.pandas多表连接、数据处理;
2.OneHot编码;
3.特征选择;
4.交叉验证选择参数
main.py
# -*- coding: utf-8 -*-
import pandas as pd
import os
from pd_tools import split_train_test, get_part_data
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.decomposition import PCA
from ml_tools import get_best_model
from sklearn.metrics import log_loss
from sklearn.feature_selection import VarianceThreshold
# Dataset variable declarations: data directory and CSV file names
dataset_path = './dataset'
# Labelled data: device_id with gender/age group
gender_age_filename = 'gender_age.csv'
# Phone brand and device model per device_id
phone_brand_device_model_filename = 'phone_brand_device_model.csv'
# Behavioural event records (event_id, device_id, ...)
events_filename = 'events.csv'
# App activity per event (event_id, app_id, ...)
app_events_filename = 'app_events.csv'
# Loaded only in commented-out code below; kept for completeness
app_labels_filename = 'app_labels.csv'
label_categories_filename = 'label_categories.csv'
# Files produced by the initial train/test split (see run_main)
train_gender_age_filename = 'gender_age_train.csv'
test_gender_age_filename = 'gender_age_test.csv'
# Set to True on the first run to (re)generate the train/test split files
is_first_run = False
def run_main():
    """
    Main entry point.

    Pipeline: (optionally) split the raw dataset, load train/test data,
    build brand/model/app-usage features, standardize + select + PCA-reduce
    them, then train and evaluate logistic-regression and SVM models with
    the log-loss metric.
    """
    if is_first_run:
        # 1. Split the full labelled dataset into train/test and persist them
        print('分割数据集')
        all_gender_age = pd.read_csv(os.path.join(dataset_path, gender_age_filename))
        df_train, df_test = split_train_test(all_gender_age)
        # Inspect per-class sample counts of both splits
        print('训练集中各类的数据个数:', df_train.groupby('group').size())
        print('测试集中各类的数据个数:', df_test.groupby('group').size())
        # Save the split datasets
        df_train.to_csv(os.path.join(dataset_path, train_gender_age_filename),
                        index=False)
        df_test.to_csv(os.path.join(dataset_path, test_gender_age_filename),
                       index=False)

    # 2. Load data, indexed by device_id so features align by index below
    print('加载数据')
    gender_age_train = pd.read_csv(os.path.join(dataset_path, train_gender_age_filename),
                                   index_col='device_id')
    gender_age_test = pd.read_csv(os.path.join(dataset_path, test_gender_age_filename),
                                  index_col='device_id')

    # Use only a fraction of the data to keep the experiment fast
    percent = 0.1
    gender_age_train = get_part_data(gender_age_train, percent=percent)
    gender_age_test = get_part_data(gender_age_test, percent=percent)

    phone_brand_device_model = pd.read_csv(os.path.join(dataset_path, phone_brand_device_model_filename))
    # Drop duplicated devices and index by device_id for index alignment
    phone_brand_device_model = phone_brand_device_model.drop_duplicates('device_id').set_index('device_id')

    events = pd.read_csv(os.path.join(dataset_path, events_filename),
                         usecols=['device_id', 'event_id'], index_col='event_id')
    app_events = pd.read_csv(os.path.join(dataset_path, app_events_filename),
                             usecols=['event_id', 'app_id'])
    # app_labels = pd.read_csv(os.path.join(dataset_path, app_labels_filename))

    # 3. Feature engineering
    # 3.1 Phone-brand feature
    # LabelEncoder: brand string -> integer code
    brand_label_encoder = LabelEncoder()
    brand_label_encoder.fit(phone_brand_device_model['phone_brand'].values)
    phone_brand_device_model['brand_label_code'] = \
        brand_label_encoder.transform(phone_brand_device_model['phone_brand'].values)
    # Align brand codes to train/test devices via the shared device_id index.
    # NOTE(review): devices absent from the brand table become NaN here and
    # would break OneHotEncoder.transform — assumed not to occur; verify.
    gender_age_train['brand_label_code'] = phone_brand_device_model['brand_label_code']
    gender_age_test['brand_label_code'] = phone_brand_device_model['brand_label_code']
    # OneHotEncoder: integer code -> one-hot vector
    brand_onehot_encoder = OneHotEncoder()
    brand_onehot_encoder.fit(phone_brand_device_model['brand_label_code'].values.reshape(-1, 1))
    tr_brand_feat = brand_onehot_encoder.transform(gender_age_train['brand_label_code'].values.reshape(-1, 1))
    te_brand_feat = brand_onehot_encoder.transform(gender_age_test['brand_label_code'].values.reshape(-1, 1))
    print('[手机品牌]特征维度:', tr_brand_feat.shape[1])

    # 3.2 Phone-model feature
    # Concatenate brand and model strings so equal model names under
    # different brands stay distinct
    phone_brand_device_model['brand_model'] = \
        phone_brand_device_model['phone_brand'].str.cat(phone_brand_device_model['device_model'])
    # LabelEncoder: brand+model string -> integer code
    model_label_encoder = LabelEncoder()
    model_label_encoder.fit(phone_brand_device_model['brand_model'].values)
    phone_brand_device_model['brand_model_label_code'] = \
        model_label_encoder.transform(phone_brand_device_model['brand_model'].values)
    gender_age_train['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']
    gender_age_test['brand_model_label_code'] = phone_brand_device_model['brand_model_label_code']
    # OneHotEncoder: integer code -> one-hot vector
    model_onehot_encoder = OneHotEncoder()
    model_onehot_encoder.fit(phone_brand_device_model['brand_model_label_code'].values.reshape(-1, 1))
    tr_model_feat = model_onehot_encoder.transform(gender_age_train['brand_model_label_code'].values.reshape(-1, 1))
    te_model_feat = model_onehot_encoder.transform(gender_age_test['brand_model_label_code'].values.reshape(-1, 1))
    print('[手机型号]特征维度:', tr_model_feat.shape[1])

    # 3.3 Installed-app usage features
    device_app = app_events.merge(events, how='left', left_on='event_id', right_index=True)
    # Total number of app events per device
    n_run_s = device_app['app_id'].groupby(device_app['device_id']).size()
    # Number of distinct apps per device
    n_app_s = device_app['app_id'].groupby(device_app['device_id']).nunique()

    gender_age_train['n_run'] = n_run_s
    gender_age_train['n_app'] = n_app_s
    # BUGFIX: assign the filled column back instead of calling
    # fillna(..., inplace=True) on a column selection — that is chained
    # assignment, which may act on a copy and is unreliable/deprecated
    # under pandas copy-on-write. Devices with no events get 0.
    gender_age_train['n_run'] = gender_age_train['n_run'].fillna(0)
    gender_age_train['n_app'] = gender_age_train['n_app'].fillna(0)

    gender_age_test['n_run'] = n_run_s
    gender_age_test['n_app'] = n_app_s
    gender_age_test['n_run'] = gender_age_test['n_run'].fillna(0)
    gender_age_test['n_app'] = gender_age_test['n_app'].fillna(0)

    tr_run_feat = gender_age_train['n_run'].values.reshape(-1, 1)
    tr_app_feat = gender_age_train['n_app'].values.reshape(-1, 1)
    te_run_feat = gender_age_test['n_run'].values.reshape(-1, 1)
    te_app_feat = gender_age_test['n_app'].values.reshape(-1, 1)

    # 3.4 Stack all features into dense matrices (one-hot blocks are sparse)
    tr_feat = np.hstack((tr_brand_feat.toarray(), tr_model_feat.toarray(), tr_run_feat, tr_app_feat))
    te_feat = np.hstack((te_brand_feat.toarray(), te_model_feat.toarray(), te_run_feat, te_app_feat))
    print('特征提取结束')
    print('每个样本特征维度:', tr_feat.shape[1])

    # 3.5 Standardize features (fit on train only to avoid leakage)
    scaler = StandardScaler()
    tr_feat_scaled = scaler.fit_transform(tr_feat)
    te_feat_scaled = scaler.transform(te_feat)

    # 3.6 Drop low-variance features
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    tr_feat_scaled_sel = sel.fit_transform(tr_feat_scaled)
    te_feat_scaled_sel = sel.transform(te_feat_scaled)

    # 3.7 PCA keeping 95% of the explained variance
    pca = PCA(n_components=0.95)
    tr_feat_scaled_sel_pca = pca.fit_transform(tr_feat_scaled_sel)
    te_feat_scaled_sel_pca = pca.transform(te_feat_scaled_sel)
    print('特征处理结束')
    print('处理后每个样本特征维度:', tr_feat_scaled_sel_pca.shape[1])

    # 4. Encode the target 'group' labels as integers
    group_label_encoder = LabelEncoder()
    group_label_encoder.fit(gender_age_train['group'].values)
    y_train = group_label_encoder.transform(gender_age_train['group'].values)
    y_test = group_label_encoder.transform(gender_age_test['group'].values)

    # 5. Train models
    # 5.1 Logistic regression, C tuned by 3-fold cross-validation
    print('训练逻辑回归模型...')
    lr_param_grid = [
        {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100]}
    ]
    lr_model = LogisticRegression()
    best_lr_model = get_best_model(lr_model,
                                   tr_feat_scaled_sel_pca, y_train,
                                   lr_param_grid, cv=3)
    y_pred_lr = best_lr_model.predict_proba(te_feat_scaled_sel_pca)

    # 5.2 RBF-kernel SVM, C and gamma tuned by 3-fold cross-validation
    print('训练SVM模型...')
    svm_param_grid = [
        {'C': [1e-2, 1e-1, 1, 10, 100], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
    ]
    # probability=True is required so predict_proba is available for log-loss
    svm_model = svm.SVC(probability=True)
    best_svm_model = get_best_model(svm_model,
                                    tr_feat_scaled_sel_pca, y_train,
                                    svm_param_grid, cv=3)
    y_pred_svm = best_svm_model.predict_proba(te_feat_scaled_sel_pca)

    # 6. Report test-set log-loss of both models
    print('逻辑回归模型 logloss:', log_loss(y_test, y_pred_lr))
    print('SVM logloss:', log_loss(y_test, y_pred_svm))
# Script entry point: run the full pipeline only when executed directly
if __name__ == '__main__':
    run_main()
ml_tools.py
# -*- coding: utf-8 -*-
from sklearn.model_selection import GridSearchCV
def get_best_model(model, X_train, y_train, params, cv=5):
    """
    Select the best estimator via cross-validated grid search.

    Exhaustively searches the parameter grid `params` using `cv`-fold
    cross-validation (5 folds by default) and returns the estimator
    refitted on the full training data with the best parameters.
    """
    # GridSearchCV.fit returns the fitted search object, so we can chain
    return GridSearchCV(model, params, cv=cv).fit(X_train, y_train).best_estimator_
pd_tools.py
# -*- coding: utf-8 -*-
import pandas as pd
import math
def split_train_test(df_data, size=0.8):
    """
    Split a DataFrame into train/test parts, stratified by the 'group' column.

    To keep the class proportions equal in both parts, each class is split
    separately: the first `size` fraction of its rows (no shuffling) goes to
    the training set and the remainder to the test set.

    Args:
        df_data: DataFrame containing a 'group' column.
        size: fraction of each class placed in the training set (default 0.8).

    Returns:
        (df_train, df_test): two DataFrames with fresh integer indices.
    """
    train_parts = []
    test_parts = []
    for label in df_data['group'].unique():
        # Rows of this class, re-indexed from 0 so positional slicing is simple
        df_w_label = df_data[df_data['group'] == label].reset_index()
        # Deterministic split point for this class
        n_lines = df_w_label.shape[0]
        split_line_no = math.floor(n_lines * size)
        train_parts.append(df_w_label.iloc[:split_line_no, :])
        test_parts.append(df_w_label.iloc[split_line_no:, :])
    # BUGFIX: DataFrame.append was removed in pandas 2.0 — accumulate the
    # per-class parts and concatenate once with pd.concat instead.
    df_train = pd.concat(train_parts).reset_index() if train_parts else pd.DataFrame()
    df_test = pd.concat(test_parts).reset_index() if test_parts else pd.DataFrame()
    return df_train, df_test
def get_part_data(df_data, percent=1):
    """
    Take the leading `percent` fraction of rows from every 'group' class.

    Args:
        df_data: DataFrame containing a 'group' column.
        percent: fraction (0..1] of each class to keep (default 1 = all).

    Returns:
        DataFrame with the selected rows; original indices are preserved.
    """
    parts = []
    for _, group in df_data.groupby('group'):
        # floor() keeps at most `percent` of this class
        n_part_size = math.floor(group.shape[0] * percent)
        parts.append(group.iloc[:n_part_size, :])
    # BUGFIX: DataFrame.append was removed in pandas 2.0 — concatenate the
    # collected parts once with pd.concat instead.
    return pd.concat(parts) if parts else pd.DataFrame()