KDD Cup 2020 - Debiasing:user feature

Written by wanping7

from datetime import datetime

# data process
import numpy as np, pandas as pd
from  datetime import datetime, timedelta

# visualize
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
%config ZMQInteractiveShell.ast_node_interactivity='all'

# sys
import os, sys

import warnings
# Suppress every warning for the rest of the session so library notices
# do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Default figure size for seaborn/matplotlib plots
sns.set(rc={'figure.figsize':(13,7)})
# Plot style
sns.set_style("whitegrid")


# Root data directory, relative to the notebook's working directory.
PATH = "../data/"

结论

  • 数据少量缺失(age/gender/city)
    • age/gender同时为空其余有值的样本:81
    • city为空其余有值的样本:22
    • age为空其余有值的样本:2
    • 不存在三个特征全部为空的样例
  • 数据有异常(user_id存在三个用户各重复1次)
  • 数据分布
    • 男女比例=22:78
    • 男性年龄等级由高到低前6(90%+):5->4->7->6->2->3
    • 女性年龄等级由高到低前6(90%+):4->5->2->3->7->6
    • 男性不同年龄之间的地域分布差异较大
    • 女性不同年龄之间的地域分布差异较小
    • 男女相同年龄之间的地域分布部分存在差异
# Sub-directory of PATH from which the user feature CSV is loaded.
TRAIN_PATH = PATH + "underexpose_train/"

用户特征

  • underexpose_user_feat.csv
    • user_id
    • user_age_level
    • user_gender
    • user_city_level

1 缺失情况

# Load the user feature table. The CSV has no header row, so the column
# names are supplied explicitly.
user_feat = pd.read_csv(
    TRAIN_PATH + "underexpose_user_feat.csv",
    header=None,
    names=["user_id", "user_age_level", "user_gender", "user_city_level"],
)

# Per-column missing-value statistics.
na_sta = pd.DataFrame(user_feat.isna().sum(), columns=["nan_num"])
# The column is labelled "(%)", so scale the fraction of rows to a percentage.
na_sta["nan_percent(%)"] = na_sta["nan_num"] / user_feat.shape[0] * 100
print("=======================================================>缺失情况:")
na_sta

# Row-level null masks, computed once and combined below.
age_na = user_feat.user_age_level.isnull()
gender_na = user_feat.user_gender.isnull()
city_na = user_feat.user_city_level.isnull()

# Number of rows that have at least one missing value.
print("=======================================================>带空值的行数:", user_feat.isnull().any(axis=1).sum())
print("=======================================================>带空值的详情:")
# All three demographic features missing.
print("========> age/gender/city  同时为空:", (age_na & gender_na & city_na).sum())

# Age and gender both missing (city may or may not be present).
print("========> age/gender  同时为空:", (age_na & gender_na).sum())

# Only city missing.
print("========> city  为空其余特征有值:", (~age_na & ~gender_na & city_na).sum())

# Only age missing.
print("========> age  为空其余特征有值:", (age_na & ~gender_na & ~city_na).sum())
print("=======================================================>数据预览:")
user_feat.head(2)
=======================================================>缺失情况:
nan_num nan_percent(%)
user_id 0 0.000000
user_age_level 83 0.012226
user_gender 81 0.011931
user_city_level 22 0.003241
=======================================================>带空值的行数: 105
=======================================================>带空值的详情:
========> age/gender/city  同时为空: 0
========> age/gender  同时为空: 81
========> city  为空其余特征有值: 22
========> age  为空其余特征有值: 2
=======================================================>数据预览:
user_id user_age_level user_gender user_city_level
0 17 8.0 M 4.0
1 26 7.0 M 2.0

2 用户重复

  • 用户 32152、23453、14818 各重复 1 次(即各出现 2 次)
# Compare the raw row count with the number of distinct user ids.
print("===================================>查看用户id数:")
print("================> 未去重用户数:", user_feat.shape[0])
# dropna=False keeps the count identical to pd.unique(), which also counts NaN.
print("================> 去重用户数:  ", user_feat.user_id.nunique(dropna=False))
print("===================================>查看重复id用户:")
# Count occurrences once and keep only the ids that appear more than once.
user_id_counts = user_feat.user_id.value_counts()
duplicated_id_counts = user_id_counts[user_id_counts > 1]
duplicated_id_counts
print("===================================>取出重复用户id的数据:")
# Select the rows of every duplicated id, derived from the counts above
# instead of hard-coding the ids, so the cell stays valid if the data changes.
user_feat[user_feat.user_id.isin(duplicated_id_counts.index)]
===================================>查看用户id数:
================> 未去重用户数: 6789
================> 去重用户数:   6786
===================================>查看重复id用户:





32152    2
23453    2
14818    2
Name: user_id, dtype: int64



===================================>取出重复用户id的数据:
user_id user_age_level user_gender user_city_level
1466 14818 3.0 M 3.0
1467 14818 2.0 M 3.0
5733 23453 5.0 F 2.0
5734 23453 5.0 F 5.0
6513 32152 1.0 F 6.0
6514 32152 2.0 F 6.0

3 数据分布

性别与年龄

  • 性别分布
  • 性别下的年龄分布
# Number of users per age level, most frequent first.
age_agg = (
    user_feat.user_age_level.value_counts()
    .rename_axis("user_age_level")
    .reset_index(name="count_")
)

# Number of users per gender, most frequent first.
gender_agg = (
    user_feat.user_gender.value_counts()
    .rename_axis("user_gender")
    .reset_index(name="count_")
)

# Number of users per (age level, gender) combination.
age_gender_agg = (
    user_feat.groupby(["user_age_level", "user_gender"])
    .size()
    .reset_index(name="count_")
)

# Age-level counts split by gender, used for the second row of pies.
male_age_agg = age_gender_agg[age_gender_agg.user_gender == "M"]
female_age_agg = age_gender_agg[age_gender_agg.user_gender == "F"]

# 2x2 subplot grid; pie traces need 'domain'-type cells.
fig = make_subplots(
    rows=2,
    cols=2,
    specs=[[{"type": "domain"} for _col in range(2)] for _row in range(2)],
)

# Results are bound to throwaway names (x / y), presumably so the notebook
# does not echo the returned figure after every statement.
x = fig.add_trace(
    go.Pie(values=age_agg["count_"].values, labels=age_agg["user_age_level"].values, title='年龄等级'),
    row=1, col=1,
)
x = fig.add_trace(
    go.Pie(values=gender_agg["count_"].values, labels=gender_agg["user_gender"].values, title='性别'),
    row=1, col=2,
)
x = fig.add_trace(
    go.Pie(values=male_age_agg["count_"].values, labels=male_age_agg["user_age_level"].values, title='男性年龄等级'),
    row=2, col=1,
)
x = fig.add_trace(
    go.Pie(values=female_age_agg["count_"].values, labels=female_age_agg["user_age_level"].values, title='女性年龄等级'),
    row=2, col=2,
)

# Apply a 0.4 hole to every pie and show percentage plus label inside each slice.
x = fig.update_traces(hole=.4, textposition='inside', textinfo='percent+label')

# Fixed-size canvas with a left margin only.
y = fig.update_layout(
    grid={"columns": 2, "rows": 2},
    autosize=False,
    width=700,
    height=400,
    margin={"t": 0, "l": 200, "r": 0, "b": 0},
)
fig.show()

KDD Cup 2020 - Debiasing:user feature_第1张图片

性别、年龄与城市

  • 性别下的年龄与城市
# Number of users per (age level, gender, city level) combination.
age_gender_city_agg = (
    user_feat.groupby(["user_age_level", "user_gender", "user_city_level"])
    .size()
    .reset_index(name="count_")
)

# Convert the two level columns to strings before plotting.
age_gender_city_agg = age_gender_city_agg.astype(
    {"user_age_level": str, "user_city_level": str}
)

# Per-gender slices of the aggregated table.
agg_genderM_city_agg = age_gender_city_agg.loc[age_gender_city_agg.user_gender == "M"]
agg_genderF_city_agg = age_gender_city_agg.loc[age_gender_city_agg.user_gender == "F"]

# Ring order from the centre outwards, and the layout shared by both figures.
gender_sunburst_path = ["user_gender", 'user_age_level', 'user_city_level']
gender_sunburst_layout = {
    "grid": {"columns": 2, "rows": 1},
    "autosize": False,
    "width": 800,
    "height": 250,
    "margin": {"t": 0, "l": 200, "r": 0, "b": 0},
}

# Male users: gender -> age level -> city level, coloured by age level.
fig1 = px.sunburst(
    agg_genderM_city_agg,
    path=gender_sunburst_path,
    values='count_',
    color='user_age_level',
)
# x = fig1.update_traces(marker_coloraxis=None)
fig1.update_layout(**gender_sunburst_layout)

# Female users: same hierarchy and colouring.
fig2 = px.sunburst(
    agg_genderF_city_agg,
    path=gender_sunburst_path,
    values='count_',
    color='user_age_level',
)
# x = fig2.update_traces(marker_coloraxis=None)
fig2.update_layout(**gender_sunburst_layout)



# fig = go.Figure()
# x = fig.add_trace(go.Sunburst(
#     ids=fig1["data"][0]["ids"],
#     labels=fig1["data"][0]["labels"],
#     parents=fig1["data"][0]["parents"],
#     values=fig1["data"][0]["values"],
#     domain=dict(column=0)
    
# ))
# x = fig.add_trace(go.Sunburst(
#     ids=fig2["data"][0]["ids"],
#     labels=fig2["data"][0]["labels"],
#     parents=fig2["data"][0]["parents"],
#     values=fig2["data"][0]["values"],
#     domain=dict(column=1)
# ))

# fig.update_layout(
#     grid= dict(columns=2, rows=1),
#     autosize=False,
#     width=700,
#     height=400,
#     margin = dict(t=0, l=150, r=0, b=0)
# )

KDD Cup 2020 - Debiasing:user feature_第2张图片

整体分布

  • 由内到外依次是:性别 -> 年龄 -> 城市
# Sunburst over all users: gender (inner ring) -> age level -> city level,
# sized by user count and coloured by age level.
fig3 = px.sunburst(
    age_gender_city_agg,
    path=["user_gender", 'user_age_level', 'user_city_level'],
    values='count_',
    color='user_age_level',
)
# x = fig3.update_traces(marker_coloraxis=None)
# Fixed-size canvas with a left margin only.
fig3.update_layout(
    grid={"columns": 2, "rows": 1},
    autosize=False,
    width=800,
    height=500,
    margin={"t": 0, "l": 200, "r": 0, "b": 0},
)

KDD Cup 2020 - Debiasing:user feature_第3张图片

你可能感兴趣的:(比赛)