section8

### The goal of this section is to identify the target user groups, so as to better serve existing users.

### Knowledge points

### 1. Plotting: displaying Chinese characters

plt.rcParams['font.sans-serif'] = ['SimHei']  # Step 1: substitute the sans-serif font
plt.rcParams['axes.unicode_minus'] = False    # Step 2: fix minus-sign rendering on the axes

### 2. Database access: a sqlalchemy engine

engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')

### 3. Batch-reading files: usage of os.walk() and os.path.join()
for root, dirs, files in os.walk(path):
    for file in files:
        rfile = os.path.join(root, file)
        if rfile.split('.')[-1] == 'tsv':
            rdf = pd.read_csv(rfile, sep='\t')
            df = df.append(rdf)
### 4. Combining groupby() and agg() to apply different functions to different columns

- Aggregate by month:

affc = {'payment': 'sum', 'log_date': 'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()

- Rename the columns:

renam = {'log_date': 'access_days'}
dfm.rename(columns=renam, inplace=True)

### 5. Using KMeans clustering

- Clustering a single column (reshape it into one column with .values.reshape(-1, 1)):

from sklearn.cluster import KMeans
a47 = action['A47'].values.reshape(-1, 1)
kms = KMeans(n_clusters=3).fit(a47)

- The cluster labels live in the labels_ attribute:

cluster = kms.labels_

- Attach the labels to the source data and inspect the groups with groupby():

action['cluster'] = cluster
action.groupby(['cluster'])['user_id'].count()

- Visualize the groups:
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
### 6. Principal component analysis

- Data preprocessing
  - Select the columns to run PCA on: paction = acc.iloc[:, 3:(len(acc.columns)-1)]
  - Drop columns that are mostly zeros:

cc = paction[paction==0].count(axis=0)/len(paction)
cc.plot()
dd = cc[cc<.95]

### 一、Library imports and displaying Chinese in matplotlib
import pandas as pd
import numpy as np
import pymysql
from sqlalchemy import create_engine
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
import os

plt.rcParams['font.sans-serif'] = ['SimHei'] # Step 1: substitute the sans-serif font
plt.rcParams['axes.unicode_minus'] = False   # Step 2: fix minus-sign rendering on the axes
%matplotlib inline
Database engine
engine = create_engine('mysql+pymysql://root:123456@localhost:3306/datascience')
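With the engine in hand, tables can be written with to_sql (done further below) and read back with pd.read_sql. A minimal sketch, assuming the s8_* tables written later in this section already exist; kept commented out like the write calls below:

# dau_db = pd.read_sql('SELECT * FROM s8_dau', engine)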
### 二、Batch-reading files
def read_files(path):
    """Walk `path` recursively and concatenate every .tsv file into one DataFrame."""
    df = pd.DataFrame()
    for root, dirs, files in os.walk(path):
        for file in files:
            rfile = os.path.join(root, file)
            if rfile.split('.')[-1] == 'tsv':
                rdf = pd.read_csv(rfile, sep='\t')
                df = df.append(rdf)  # note: DataFrame.append was removed in pandas 2.0; use pd.concat there
    return df
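The same result can be had more concisely with glob; a sketch under the same directory-layout assumptions (the helper name read_files_glob is ours):

import glob

def read_files_glob(path):
    # Recursively collect all .tsv files and concatenate them in one call.
    tsv_files = glob.glob(os.path.join(path, '**', '*.tsv'), recursive=True)
    return pd.concat((pd.read_csv(f, sep='\t') for f in tsv_files), ignore_index=True)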
action_path  = 'data/sample-data/section8/daily/action/'
dau_path = 'data/sample-data/section8/daily/dau/'
dpu_path = 'data/sample-data/section8/daily/dpu/'

action = read_files(action_path)
dau = read_files(dau_path)
dpu = read_files(dpu_path)
Check data completeness and preview the head
print(action.isnull().sum().sum())
print(action.shape)
# print(action.info())
action.head()
0
(2653, 57)
log_date app_name user_id A1 A2 A3 A4 A5 A6 A7 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54
0 2013-10-31 game-01 654133 0 0 0 0 0 0 0 0 0 380 25655 0 0 0 0 0.0 46
1 2013-10-31 game-01 425530 0 0 0 0 10 1 233 19 20 180543 347 36 22 4 0 0.0 71
2 2013-10-31 game-01 709596 0 0 0 0 0 0 0 0 0 416 24817 0 0 0 0 0.0 2
3 2013-10-31 game-01 525047 0 2 0 0 9 0 0 22 22 35200 6412 21 0 0 0 0.0 109
4 2013-10-31 game-01 796908 0 0 0 0 0 0 0 29 29 388 25444 1 0 0 0 0.0 64

5 rows × 57 columns

print(dau.isnull().sum().sum())
print(dau.shape)
print(dau.info())
dau.head()
0
(509754, 3)
print(dpu.isnull().sum().sum())
print(dpu.shape)
print(dpu.info())
dpu.head()
0
(3532, 4)
# Write the tables to the database

# action.to_sql('s8_action', engine, index=False)
# dau.to_sql('s8_dau', engine, index=False)
# dpu.to_sql('s8_dpu', engine, index=False)
## 三、Data preprocessing

### 1. Merge DAU and DPU
df = pd.merge(dau, dpu[['log_date','user_id','payment']], how='left', on=['user_id','log_date'])
df.head()
log_date app_name user_id payment
0 2013-05-01 game-01 608801 NaN
1 2013-05-01 game-01 712453 NaN
2 2013-05-01 game-01 776853 NaN
3 2013-05-01 game-01 823486 NaN
4 2013-05-01 game-01 113600 NaN
# Set the payment of users with no payment records to 0
print(df.payment.isnull().sum())
df['payment'].fillna(0, inplace=True)
print(df.payment.isnull().sum())
507151
0
# Add a payment flag column
df['is_pay'] = df['payment'].apply( lambda x: 1 if x>0 else 0 )
df.head()
log_date app_name user_id payment is_pay
0 2013-05-01 game-01 608801 0.0 0
1 2013-05-01 game-01 712453 0.0 0
2 2013-05-01 game-01 776853 0.0 0
3 2013-05-01 game-01 823486 0.0 0
4 2013-05-01 game-01 113600 0.0 0
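As an aside, the same flag can be computed without apply; a vectorized equivalent:

df['is_pay'] = (df['payment'] > 0).astype(int)  # vectorized form of the lambda above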
### 2. Monthly aggregation
# Add a month column
df['log_month'] = df['log_date'].apply(lambda x: x[0:7])
df.head()
log_date app_name user_id payment is_pay log_month
0 2013-05-01 game-01 608801 0.0 0 2013-05
1 2013-05-01 game-01 712453 0.0 0 2013-05
2 2013-05-01 game-01 776853 0.0 0 2013-05
3 2013-05-01 game-01 823486 0.0 0 2013-05
4 2013-05-01 game-01 113600 0.0 0 2013-05

Using groupby and agg together, compute each user's monthly payment total and number of login days.

# Aggregate by month
affc = {'payment':'sum', 'log_date':'count'}
dfm = df.groupby(['log_month', 'user_id']).agg(affc).reset_index()
# Rename the column
renam = {'log_date':'access_days'}
dfm.rename(columns=renam, inplace=True)
dfm.head()
log_month user_id payment access_days
0 2013-05 65 0.0 1
1 2013-05 115 0.0 1
2 2013-05 194 0.0 1
3 2013-05 426 0.0 4
4 2013-05 539 0.0 1
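For reference, the same aggregation can be written with pandas named aggregation (available since pandas 0.25), which folds the rename step into agg; a sketch:

dfm = (df.groupby(['log_month', 'user_id'])
         .agg(payment=('payment', 'sum'), access_days=('log_date', 'count'))
         .reset_index())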
### 3. Use KMeans to split users into heavy, moderate, and light users

Column A47 is the leaderboard score. The distribution shows that most users have very low scores, consistent with a power-law curve.
action['A47'].hist(bins=50, figsize=(6,4))
sns.distplot(action['A47'],bins=50,kde=True)  # note: distplot is deprecated in newer seaborn; histplot/displot replace it
#### Cluster column A47 into 3 groups
from sklearn.cluster import KMeans

a47 = action['A47'].values.reshape(-1, 1)  # .values avoids the deprecated Series.reshape

kms = KMeans(n_clusters=3).fit(a47)
cluster = kms.labels_
kms.cluster_centers_
array([[ 9359.84787792], [ 69386.11297071], [185857.17948718]])
action['cluster'] = cluster
action.head()
log_date app_name user_id A1 A2 A3 A4 A5 A6 A7 A46 A47 A48 A49 A50 A51 A52 A53 A54 cluster
0 2013-10-31 game-01 654133 0 0 0 0 0 0 0 0 380 25655 0 0 0 0 0.0 46 0
1 2013-10-31 game-01 425530 0 0 0 0 10 1 233 20 180543 347 36 22 4 0 0.0 71 2
2 2013-10-31 game-01 709596 0 0 0 0 0 0 0 0 416 24817 0 0 0 0 0.0 2 0
3 2013-10-31 game-01 525047 0 2 0 0 9 0 0 22 35200 6412 21 0 0 0 0.0 109 0
4 2013-10-31 game-01 796908 0 0 0 0 0 0 0 29 388 25444 1 0 0 0 0.0 64 0

5 rows × 58 columns

action.groupby(['cluster'])['user_id'].count()
cluster
0    2096
1     479
2      78
Name: user_id, dtype: int64

The counts show that clustering splits users into three groups: cluster 0 are light users with the lowest leaderboard scores; cluster 1 are moderate users with mid-range scores; cluster 2 are heavy users with high scores and, fittingly, the smallest population.
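One caveat, not in the original: KMeans numbers its clusters arbitrarily, so the 0/1/2 to light/moderate/heavy mapping is not guaranteed across runs. A minimal sketch, using the kms model fitted above, that remaps labels in ascending order of the cluster centers:

# Reorder cluster ids so that 0 = lowest-score cluster and 2 = highest-score cluster.
order = np.argsort(kms.cluster_centers_.ravel())
remap = {old: new for new, old in enumerate(order)}
action['cluster'] = [remap[c] for c in kms.labels_]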
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))
sns.scatterplot(x='user',y='A47',hue='cluster',data=snsdf, palette='rainbow', alpha=.2)
snsdf = action[['user_id','A47','cluster']].sort_values(by='A47',ascending=False)
snsdf['user'] = range(len(snsdf))

plt.figure(figsize=(8,5))
snsdf1 = snsdf.reset_index()
snsdf1[snsdf1['cluster']==2]['A47'].plot(color='r',label='2:重度用户')
snsdf1[snsdf1['cluster']==1]['A47'].plot(color='g',label='1:中度用户')
snsdf1[snsdf1['cluster']==0]['A47'].plot(color='b',label='0:轻度用户')
plt.legend()
plt.xlabel('用户分布')
plt.ylabel('排行榜得分')
Text(0, 0.5, '排行榜得分')

![png](output_33_1.png)

#### Keep only the top-ranked users, i.e. the higher-scoring heavy and moderate users, for the analysis that follows
acc = action[action['cluster']>=1]
acc.head()
log_date app_name user_id A1 A2 A3 A4 A5 A6 A7 A46 A47 A48 A49 A50 A51 A52 A53 A54 cluster
1 2013-10-31 game-01 425530 0 0 0 0 10 1 233 20 180543 347 36 22 4 0 0.0 71 2
5 2013-10-31 game-01 776120 0 0 0 0 9 0 0 38 142214 684 37 15 0 0 0.0 312 2
7 2013-10-31 game-01 276197 0 0 0 0 7 0 58 15 54602 4226 15 0 8 0 0.0 95 1
8 2013-10-31 game-01 221572 0 0 0 0 1 0 0 24 39891 5792 4 0 0 0 0.0 21 1
9 2013-10-31 game-01 692433 0 0 0 0 6 0 0 28 50706 4549 16 8 0 0 0.0 154 1

5 rows × 58 columns

## 5. Principal component analysis: extracting the key parameters
paction = acc.iloc[:,3:(len(acc.columns)-1)]
paction.index=acc.user_id
paction.head()
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54
user_id
425530 0 0 0 0 10 1 233 58.25 288 230 19 20 180543 347 36 22 4 0 0.0 71
776120 0 0 0 0 9 0 0 0.00 325 195 19 38 142214 684 37 15 0 0 0.0 312
276197 0 0 0 0 7 0 58 7.25 150 100 15 15 54602 4226 15 0 8 0 0.0 95
221572 0 0 0 0 1 0 0 0.00 40 14 24 24 39891 5792 4 0 0 0 0.0 21
692433 0 0 0 0 6 0 0 0.00 102 95 15 28 50706 4549 16 8 0 0 0.0 154

5 rows × 54 columns

#### 1. Drop columns with many zeros
cc = paction[paction==0].count(axis=0)/len(paction)  # fraction of zeros in each column
print(cc.head())
cc.plot()
A1    1.000000
A2    0.926391
A3    1.000000
A4    0.994614
A5    0.055655
dtype: float64
# cc[cc>.8]
dd = cc[cc<.95]              # keep columns that are less than 95% zeros
paction = paction[dd.index]
paction.head()
A2 A5 A6 A7 A8 A9 A10 A11 A12 A13 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54
user_id
425530 0 10 1 233 58.25 288 230 19 2 19 19 20 180543 347 36 22 4 0 0.0 71
776120 0 9 0 0 0.00 325 195 38 8 19 19 38 142214 684 37 15 0 0 0.0 312
276197 0 7 0 58 7.25 150 100 15 3 11 15 15 54602 4226 15 0 8 0 0.0 95
221572 0 1 0 0 0.00 40 14 0 0 3 24 24 39891 5792 4 0 0 0 0.0 21
692433 0 6 0 0 0.00 102 95 0 0 2 15 28 50706 4549 16 8 0 0 0.0 154

5 rows × 32 columns
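The two steps above can also be collapsed into a single boolean selection; an equivalent sketch:

# Keep columns whose fraction of zeros is below 95% (same result as cc/dd above).
paction = paction.loc[:, (paction == 0).mean() < .95]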

#### 2. Drop strongly correlated columns
corp = paction.corr()
plt.figure(figsize=(15,8))
sns.heatmap(corp)
# Mask the upper triangle so each correlation pair is drawn only once.
mask = np.array(corp)
mask[np.tril_indices_from(mask)] = False
fig,ax = plt.subplots()
fig.set_size_inches(15,8)
sns.heatmap(corp,mask=mask)
# Keep only the lower triangle of the correlation matrix (upper half zeroed out).
coll = corp.columns
corp = pd.DataFrame(np.tril(corp, -1))
corp.columns = coll
corp.head()
A2 A5 A6 A7 A8 A9 A10 A11 A12 A13 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54
0 0.000000 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.069744 0.000000 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.076185 0.178833 0.000000 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.158735 0.219395 0.371360 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.167200 0.186124 0.242025 0.803161 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 32 columns

pac2 = paction.loc[:,(corp.abs()<.7).all()]      # keep columns whose correlations are all below 0.7
pac2.head()
A2 A11 A12 A13 A20 A23 A24 A43 A44 A46 A48 A49 A50 A51 A53 A54
user_id
425530 0 19 2 19 0 0 0.5 23 0.92174 20 347 36 22 4 0.0 71
776120 0 38 8 19 0 0 0.0 20 0.90256 38 684 37 15 0 0.0 312
276197 0 15 3 11 0 0 0.0 10 0.92000 15 4226 15 0 8 0.0 95
221572 0 0 0 3 0 0 0.0 2 0.85714 24 5792 4 0 0 0.0 21
692433 0 0 0 2 0 0 0.0 11 0.73684 28 4549 16 8 0 0.0 154
### Run the principal component analysis
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(pac2)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
ratio = pca.explained_variance_ratio_
print(ratio)
print(pca.singular_values_)
[9.97843804e-01 1.92024564e-03 1.20120771e-04 5.57014208e-05 2.67905481e-05 1.54533752e-05 9.31262940e-06 4.38846214e-06 3.02317261e-06 8.36725295e-07 1.31874979e-07 9.78197162e-08 3.86464536e-08 2.94647596e-08 1.82272465e-08 7.54580333e-09] [3.96183910e+04 1.73797668e+03 4.34684952e+02 2.96004755e+02 2.05284590e+02 1.55911168e+02 1.21032418e+02 8.30848288e+01 6.89599635e+01 3.62791414e+01 1.44027941e+01 1.24044853e+01 7.79687146e+00 6.80796010e+00 5.35458829e+00 3.44523057e+00]
recu = ratio.cumsum()
print(recu)
x = np.arange(len(recu))
plt.plot(recu, color='r')
[0.9978438  0.99976405 0.99988417 0.99993987 0.99996666 0.99998212 0.99999143 0.99999582 0.99999884 0.99999968 0.99999981 0.99999991 0.99999994 0.99999997 0.99999999 1.        ]

#### Get the reduced-dimension data
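The cumulative curve shows the first component alone explains roughly 99.8% of the variance. Rather than hard-coding 10 components, scikit-learn's PCA also accepts a float n_components, picking the smallest number of components that reaches that share of variance; a sketch:

# Keep as many components as needed to explain 99.9% of the variance.
pca_auto = PCA(n_components=0.999)
pac_auto = pca_auto.fit_transform(pac2)
print(pca_auto.n_components_)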
pca.set_params(n_components=10)
pac3 = pd.DataFrame(pca.fit_transform(pac2))
pacsse = pac3.copy()
pac3.head()
0 1 2 3 4 5 6 7 8 9
0 2706.266005 -100.824346 -1.874787 -1.577536 12.481591 -2.394320 9.770878 7.807535 0.021273 -2.169596
1 2373.811140 147.314930 -16.386795 -8.428655 10.019577 -3.004725 6.009771 0.961469 -1.598531 2.144615
2 -1171.733361 -5.493081 0.744995 0.542033 -0.785251 -5.756412 -1.012336 -1.778067 7.256884 0.343277
3 -2738.903900 -50.468487 2.328491 2.965415 -5.794347 11.891289 2.965366 -1.182413 0.065619 1.245358
4 -1493.642618 58.686385 -10.807612 11.777973 7.664692 9.312968 4.376429 1.994214 -1.568050 0.426246
## 6. KMeans clustering
from sklearn.cluster import KMeans

km = KMeans(n_clusters=5)
km.fit(pac3)
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001, verbose=0)
clu = km.labels_
pac3['clu'] = clu
pac3.head()
0 1 2 3 4 5 6 7 8 9 clu
0 2706.266005 -100.824346 -1.874787 -1.577536 12.481591 -2.394320 9.770878 7.807535 0.021273 -2.169596 0
1 2373.811140 147.314930 -16.386795 -8.428655 10.019577 -3.004725 6.009771 0.961469 -1.598531 2.144615 0
2 -1171.733361 -5.493081 0.744995 0.542033 -0.785251 -5.756412 -1.012336 -1.778067 7.256884 0.343277 1
3 -2738.903900 -50.468487 2.328491 2.965415 -5.794347 11.891289 2.965366 -1.182413 0.065619 1.245358 4
4 -1493.642618 58.686385 -10.807612 11.777973 7.664692 9.312968 4.376429 1.994214 -1.568050 0.426246 1
pac3.groupby('clu')[2].count()
clu
0     90
1    113
2    122
3    109
4    123
Name: 2, dtype: int64

#### Color styles available for palette:

Accent, Accent_r, Blues, Blues_r, BrBG, BrBG_r, BuGn, BuGn_r, BuPu, BuPu_r, CMRmap, CMRmap_r, Dark2, Dark2_r, GnBu, GnBu_r, Greens, Greens_r, Greys, Greys_r, OrRd, OrRd_r, Oranges, Oranges_r, PRGn, PRGn_r, Paired, Paired_r, Pastel1, Pastel1_r, Pastel2, Pastel2_r, PiYG, PiYG_r, PuBu, PuBuGn, PuBuGn_r, PuBu_r, PuOr, PuOr_r, PuRd, PuRd_r, Purples, Purples_r, RdBu, RdBu_r, RdGy, RdGy_r, RdPu, RdPu_r, RdYlBu, RdYlBu_r, RdYlGn, RdYlGn_r, Reds, Reds_r, Set1, Set1_r, Set2, Set2_r, Set3, Set3_r, Spectral, Spectral_r, Vega10, Vega10_r, Vega20, Vega20_r, Vega20b, Vega20b_r, Vega20c, Vega20c_r, Wistia, Wistia_r, YlGn, YlGnBu, YlGnBu_r, YlGn_r, YlOrBr, YlOrBr_r, YlOrRd, YlOrRd_r, afmhot, afmhot_r, autumn, autumn_r, binary, binary_r, bone, bone_r, brg, brg_r, bwr, bwr_r, cool, cool_r, coolwarm, coolwarm_r, copper, copper_r, cubehelix, cubehelix_r, flag, flag_r, gist_earth, gist_earth_r, gist_gray, gist_gray_r, gist_heat, gist_heat_r, gist_ncar, gist_ncar_r, gist_rainbow, gist_rainbow_r, gist_stern, gist_stern_r, gist_yarg, gist_yarg_r, gnuplot, gnuplot2, gnuplot2_r, gnuplot_r, gray, gray_r, hot, hot_r, hsv, hsv_r, icefire, icefire_r, inferno, inferno_r, jet, jet_r, magma, magma_r, mako, mako_r, nipy_spectral, nipy_spectral_r, ocean, ocean_r, pink, pink_r, plasma, plasma_r, prism, prism_r, rainbow, rainbow_r, rocket, rocket_r, seismic, seismic_r, spectral, spectral_r, spring, spring_r, summer, summer_r, tab10, tab10_r, tab20, tab20_r, tab20b, tab20b_r, tab20c, tab20c_r, terrain, terrain_r, viridis, viridis_r, vlag, vlag_r, winter, winter_r
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pac3,style='clu',hue='clu', palette='autumn')
### Attach the cluster labels back to the source data
pac4 = pac2.copy()
pac4['cluster'] = list(pac3.clu)
pac4.head()
A2 A11 A12 A13 A20 A23 A24 A43 A44 A46 A48 A49 A50 A51 A53 A54 cluster
user_id
425530 0 19 2 19 0 0 0.5 23 0.92174 20 347 36 22 4 0.0 71 0
776120 0 38 8 19 0 0 0.0 20 0.90256 38 684 37 15 0 0.0 312 0
276197 0 15 3 11 0 0 0.0 10 0.92000 15 4226 15 0 8 0.0 95 1
221572 0 0 0 3 0 0 0.0 2 0.85714 24 5792 4 0 0 0.0 21 4
692433 0 0 0 2 0 0 0.0 11 0.73684 28 4549 16 8 0 0.0 154 1
# Compute the mean of each cluster
clu5 = pac4.groupby('cluster').mean()
# Drop a highly correlated column
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
A2 A20 A23 A24 A44 A46 A50 A51 A54
cluster
0 0.022222 0.322222 0.655556 0.167691 0.858193 27.600000 10.666667 2.011111 166.711111
1 0.079646 0.274336 0.362832 0.095231 0.844027 20.159292 3.008850 1.469027 102.106195
2 0.073770 0.377049 0.336066 0.070628 0.849343 24.737705 4.286885 1.844262 121.909836
3 0.018349 0.229358 0.284404 0.098252 0.845981 24.119266 5.266055 1.733945 146.871560
4 0.203252 0.292683 0.243902 0.063686 0.775076 18.983740 2.130081 0.975610 84.032520
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))
ccccc.columns = cccc.columns
ccccc
A2 A20 A23 A24 A44 A46 A50 A51 A54
0 -0.855590 0.468859 1.918400 1.862020 0.785882 1.422970 1.867773 1.118457 1.424282
1 0.002962 -0.503392 -0.094337 -0.104961 0.315530 -0.940402 -0.688647 -0.381093 -0.746672
2 -0.084884 1.582038 -0.278379 -0.772826 0.492038 0.513827 -0.261998 0.656909 -0.081200
3 -0.913505 -1.416613 -0.633601 -0.022944 0.380387 0.317394 0.064879 0.351742 0.757602
4 1.851016 -0.130892 -0.912083 -0.961289 -1.973837 -1.313789 -0.982007 -1.746015 -1.354012
plt.figure(figsize=(8,8))
# Number of axes on the polar plot
N = ccccc.shape[1]
# Angles that split the circle evenly, one per indicator
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# Repeat the first angle so the radar polygon closes
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # Build the data for one cluster
    values = ccccc.loc[i,:]
    # Repeat the first value so the polygon closes
    values = np.concatenate((values,[values[0]]))
    # Draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# Label the polar axes (drop the duplicated closing angle so labels match)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5, 1.05, '重要指标雷达图呈现')

![png](output_70_1.png)

## Dimensionality reduction without preprocessing
dfp = acc.iloc[:,3:(len(acc.columns)-1)]
dfp.index=acc.user_id
dfp.head()
A1 A2 A3 A4 A5 A6 A7 A8 A9 A10 A45 A46 A47 A48 A49 A50 A51 A52 A53 A54
user_id
425530 0 0 0 0 10 1 233 58.25 288 230 19 20 180543 347 36 22 4 0 0.0 71
776120 0 0 0 0 9 0 0 0.00 325 195 19 38 142214 684 37 15 0 0 0.0 312
276197 0 0 0 0 7 0 58 7.25 150 100 15 15 54602 4226 15 0 8 0 0.0 95
221572 0 0 0 0 1 0 0 0.00 40 14 24 24 39891 5792 4 0 0 0 0.0 21
692433 0 0 0 0 6 0 0 0.00 102 95 15 28 50706 4549 16 8 0 0 0.0 154

5 rows × 54 columns

from sklearn.decomposition import PCA

pca = PCA(whiten=False)
pca.fit(dfp)
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None, svd_solver='auto', tol=0.0, whiten=False)
ratio = pca.explained_variance_ratio_
# print(ratio)
# print(pca.singular_values_)

rec = ratio.cumsum()
print(rec)
x = np.arange(len(rec))
plt.plot(rec, color='r')
[0.9996008 0.99995245 0.99997489 0.99999016 0.9999933 0.99999564 0.99999759 0.99999838 0.99999897 0.9999995 0.99999962 0.99999972 0.99999979 0.99999986 0.9999999 0.99999993 0.99999996 0.99999997 0.99999997 0.99999998 0.99999998 0.99999999 0.99999999 0.99999999 0.99999999 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. ]
pca.set_params(n_components=10)
pacsse = pd.DataFrame(pca.fit_transform(dfp))
pacsse.head()
0 1 2 3 4 5 6 7 8 9
0 94938.293061 -342.891655 -161.442878 -199.616210 1.830692 73.107938 153.124982 124.440657 -34.371612 46.548951
1 56613.313155 -960.580156 -38.560364 -45.836571 13.670166 90.767620 -145.846645 -40.255134 10.508203 16.287863
2 -31060.195159 388.005529 -6.932692 -0.948812 -5.332728 18.237293 11.393467 14.689011 -7.994909 32.398532
3 -45806.252443 1579.357883 -81.812845 -96.488345 -18.477649 -90.059217 31.377291 -22.865193 -19.724837 16.293640
4 -34963.135693 611.858506 -18.187490 -16.454233 -5.597209 -9.722257 -63.112236 -3.943266 7.222725 -10.889839
## Using the elbow method to find the optimal K
from sklearn.cluster import KMeans

df_features = pacsse  # the data to cluster
# Use SSE to choose k
SSE = []  # sum of squared errors for each k
for k in range(1,9):
    estimator = KMeans(n_clusters=k)  # build the clusterer
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
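As a cross-check on the elbow (not part of the original analysis), scikit-learn's silhouette score can be computed over the same range of k; higher is better, and it is only defined for k >= 2:

from sklearn.metrics import silhouette_score

# Silhouette score for each candidate k, using the same features as above.
for k in range(2, 9):
    labels = KMeans(n_clusters=k).fit_predict(df_features)
    print(k, silhouette_score(df_features, labels))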
#### Clearly, standardizing the data first is not appropriate
# Clearly, standardizing the data first is not appropriate

df_features = pd.DataFrame(scale(pacsse)) 

SSE = []  
for k in range(1,9):
    estimator = KMeans(n_clusters=k) 
    estimator.fit(df_features)
    SSE.append(estimator.inertia_)
X = range(1,9)
plt.xlabel('k')
plt.ylabel('SSE')
plt.plot(X,SSE,'o-')
km = KMeans(n_clusters=4)
km.fit(pacsse)
clu = km.labels_
pacsse['clu'] = clu
pacsse.head()
0 1 2 3 4 5 6 7 8 9 clu
0 94938.293061 -342.891655 -161.442878 -199.616210 1.830692 73.107938 153.124982 124.440657 -34.371612 46.548951 2
1 56613.313155 -960.580156 -38.560364 -45.836571 13.670166 90.767620 -145.846645 -40.255134 10.508203 16.287863 0
2 -31060.195159 388.005529 -6.932692 -0.948812 -5.332728 18.237293 11.393467 14.689011 -7.994909 32.398532 1
3 -45806.252443 1579.357883 -81.812845 -96.488345 -18.477649 -90.059217 31.377291 -22.865193 -19.724837 16.293640 1
4 -34963.135693 611.858506 -18.187490 -16.454233 -5.597209 -9.722257 -63.112236 -3.943266 7.222725 -10.889839 1
pacsse.groupby('clu')[2].count()
clu
0    153
1    344
2     54
3      6
Name: 2, dtype: int64
plt.figure(figsize=(13,7))
sns.scatterplot(x=0, y=1, data=pacsse,style='clu',hue='clu', palette='autumn')
### Clearly, clustering without preprocessing is problematic: the first and second principal components are obviously correlated
pac4 = pac2.copy()
pac4['cluster'] = list(pacsse.clu)
pac4.head()

clu5 = pac4.groupby('cluster').mean()
clu5.drop(columns='A53',inplace=True)
c5cor = clu5.corr()
plt.figure(figsize=(15,8))
sns.heatmap(c5cor,annot=True)
ccrp = pd.DataFrame(np.tril(c5cor,-1))
ccrp.columns = clu5.columns
cccc = clu5.loc[:,(ccrp.abs()<.95).all()]
cccc
A12 A20 A51 A54
cluster
0 3.398693 0.228758 1.810458 146.287582
1 1.938953 0.316860 1.433140 101.531977
2 4.592593 0.407407 1.870370 169.777778
3 2.166667 0.166667 1.666667 213.833333
from sklearn.preprocessing import scale

ccccc = pd.DataFrame(scale(cccc))

ccccc.columns = cccc.columns
ccccc
A12 A20 A51 A54
0 0.352533 -0.562784 0.684599 -0.285229
1 -1.021705 0.406288 -1.555764 -1.388557
2 1.476502 1.402249 1.040338 0.293858
3 -0.807330 -1.245753 -0.169173 1.379928
plt.figure(figsize=(8,8))
# Number of axes on the polar plot
N = ccccc.shape[1]
# Angles that split the circle evenly, one per indicator
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
# Repeat the first angle so the radar polygon closes
angles = np.concatenate((angles,[angles[0]]))
for i in range(len(ccccc)):
    # Build the data for one cluster
    values = ccccc.loc[i,:]
    # Repeat the first value so the polygon closes
    values = np.concatenate((values,[values[0]]))
    # Draw
    plt.polar(angles, values, 'o-', linewidth=2)
plt.legend(ccccc.index, loc='lower right')
# Label the polar axes (drop the duplicated closing angle so labels match)
plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(ccccc.columns))
plt.title('重要指标雷达图呈现')
Text(0.5,1.05,'重要指标雷达图呈现')
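Since the radar-chart cell above is repeated verbatim earlier in this section, it could be folded into a small helper; a sketch (the name plot_radar is ours):

def plot_radar(df, title):
    """Draw one closed radar polygon per row of df, with one axis per column."""
    n = df.shape[1]
    angles = np.linspace(0, 2*np.pi, n, endpoint=False)
    angles = np.concatenate((angles, [angles[0]]))
    plt.figure(figsize=(8, 8))
    for i in range(len(df)):
        row = df.iloc[i].values
        values = np.concatenate((row, [row[0]]))  # close the polygon
        plt.polar(angles, values, 'o-', linewidth=2)
    plt.legend(df.index, loc='lower right')
    plt.thetagrids(angles[:-1] * 180/np.pi, labels=list(df.columns))
    plt.title(title)

# plot_radar(ccccc, '重要指标雷达图呈现')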
