Project

import pandas as pd
c1=pd.read_excel('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c1.xlsx')
c2=pd.read_csv('C:/Users/50502/Desktop/Python项目/02-百货商场data/data/cumcm2018c2.csv')
D:\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3057: DtypeWarning: Columns (0,11) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
c1.head()
kh csrq xb djsj
0 c68b20b4 2002-11-02 00:00:00 0.0 2013-05-11 00:00:00.000
1 1ca15332 NaN 0.0 2004-11-04 16:31:52.436
2 a37cc182 1967-02-17 00:00:00 0.0 2004-12-31 21:24:34.216
3 2ab88539 1982-06-01 00:00:00 0.0 2010-11-19 00:00:00.000
4 b4c77269 1964-02-05 00:00:00 0.0 2007-12-14 00:00:00.000
c2.head()
kh dtime spbm sl sj je spmc jf syjh djh gzbm gzmc
0 1be1e3fe 2015-01-01 00:05:41.593 f09c9303 1 290.0 270.20 兰芝化妆品正价瓶 270.20 6 25bb 8077.0 兰芝柜
1 1be1e3fe 2015-01-01 00:05:41.593 f09c9303 1 325.0 302.80 兰芝化妆品正价瓶 302.80 6 25bb 8077.0 兰芝柜
2 1be1e3fe 2015-01-01 00:05:41.593 f09c9303 1 195.0 181.80 兰芝化妆品正价瓶 181.80 6 25bb 8077.0 兰芝柜
3 1be1e3fe 2015-01-01 00:05:41.593 f09c9303 1 270.0 251.55 兰芝化妆品正价瓶 251.55 6 25bb 8077.0 兰芝柜
4 1be1e3fe 2015-01-01 00:05:41.593 f09c9303 2 245.0 456.55 兰芝化妆品正价瓶 456.55 6 25bb 8077.0 兰芝柜
#去重
c1=c1.drop_duplicates()
c2=c2.drop_duplicates()
#合并数据集
c = pd.merge(c1,c2)
c.head()
kh csrq xb djsj dtime spbm sl sj je spmc jf syjh djh gzbm gzmc
0 a37cc182 1967-02-17 00:00:00 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 499.0 297.07 WMF D无 297.08 101 7cd8 7296.0 WMF 柜
1 a37cc182 1967-02-17 00:00:00 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 12288.0 11089.93 WMF D无 11089.92 101 7cd8 7296.0 WMF 柜
2 a37cc182 1967-02-17 00:00:00 0.0 2004-12-31 21:24:34.216 2017-02-28 17:17:35.533 252403ef 1 598.0 598.00 双立人商品 F无 598.00 102 ed0a 7242.0 双立人柜
3 34c2dc93 1986-07-18 00:00:00 0.0 2007-04-19 00:00:00.000 2017-11-13 13:36:45.656 4708e0bb 1 260.0 260.00 植村秀三色眼影/眼线液/眉笔. 260.00 97 c3f4 8156.0 Shu Uemura
4 828aa9b4 1951-12-29 00:00:00 0.0 2004-09-24 16:39:08.716 2017-05-17 11:30:39.080 a4809307 1 1288.0 1288.00 酷彩F件 1288.00 102 95b0 7300.0 Le Creuset(酷彩)
# 查看数据的形状
print("数据的形状:", c.shape)
# 查看每列数据的非空值的个数和数据类型
print("每列数据的非空值的个数和数据类型:\n", c.info())
# 查看每列数据的描述统计信息
print("数据的描述统计信息:\n", c.describe())
# 查看每列数据的空值个数
print("每列数据的空值个数:\n", c.isnull().sum())
数据的形状: (506006, 15)

Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh       506006 non-null object
csrq     482173 non-null object
xb       495568 non-null float64
djsj     474547 non-null datetime64[ns]
dtime    506006 non-null object
spbm     506006 non-null object
sl       506006 non-null int64
sj       506006 non-null float64
je       506006 non-null float64
spmc     506006 non-null object
jf       506006 non-null float64
syjh     506006 non-null int64
djh      506006 non-null object
gzbm     506006 non-null float64
gzmc     492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
每列数据的非空值的个数和数据类型:
 None
数据的描述统计信息:
                   xb             sl            sj            je  \
count  495568.000000  506006.000000  5.060060e+05  5.060060e+05   
mean        0.098372       1.054300  1.430813e+03  1.356862e+03   
std         0.297817       2.366001  3.695847e+03  3.706632e+03   
min         0.000000     -30.000000  1.000000e-01 -6.712575e+05   
25%         0.000000       1.000000  3.400000e+02  3.200000e+02   
50%         0.000000       1.000000  7.200000e+02  6.960000e+02   
75%         0.000000       1.000000  1.622000e+03  1.577430e+03   
max         1.000000    1492.000000  1.342515e+06  1.342515e+06   

                  jf           syjh           gzbm  
count  506006.000000  506006.000000  506006.000000  
mean     1030.375453     111.340160    6296.818781  
std      2125.932871      82.427647    2363.259002  
min   -201616.000000       1.000000       0.000000  
25%       145.000000      43.000000    4334.000000  
50%       500.000000      97.000000    8064.000000  
75%      1270.000000     170.000000    8121.000000  
max    268503.000000     320.000000    9005.000000  
每列数据的空值个数:
 kh           0
csrq     23833
xb       10438
djsj     31459
dtime        0
spbm         0
sl           0
sj           0
je           0
spmc         0
jf           0
syjh         0
djh          0
gzbm         0
gzmc     13543
dtype: int64
c.describe()
xb sl sj je jf syjh gzbm
count 495568.000000 506006.000000 5.060060e+05 5.060060e+05 506006.000000 506006.000000 506006.000000
mean 0.098372 1.054300 1.430813e+03 1.356862e+03 1030.375453 111.340160 6296.818781
std 0.297817 2.366001 3.695847e+03 3.706632e+03 2125.932871 82.427647 2363.259002
min 0.000000 -30.000000 1.000000e-01 -6.712575e+05 -201616.000000 1.000000 0.000000
25% 0.000000 1.000000 3.400000e+02 3.200000e+02 145.000000 43.000000 4334.000000
50% 0.000000 1.000000 7.200000e+02 6.960000e+02 500.000000 97.000000 8064.000000
75% 0.000000 1.000000 1.622000e+03 1.577430e+03 1270.000000 170.000000 8121.000000
max 1.000000 1492.000000 1.342515e+06 1.342515e+06 268503.000000 320.000000 9005.000000
c.info()

Int64Index: 506006 entries, 0 to 506005
Data columns (total 15 columns):
kh       506006 non-null object
csrq     482173 non-null object
xb       495568 non-null float64
djsj     474547 non-null datetime64[ns]
dtime    506006 non-null object
spbm     506006 non-null object
sl       506006 non-null int64
sj       506006 non-null float64
je       506006 non-null float64
spmc     506006 non-null object
jf       506006 non-null float64
syjh     506006 non-null int64
djh      506006 non-null object
gzbm     506006 non-null float64
gzmc     492463 non-null object
dtypes: datetime64[ns](1), float64(5), int64(2), object(7)
memory usage: 61.8+ MB
# 查看销售日期列数据中每个值的个数
c.dtime.value_counts()
2017-11-25 15:36:23.626    54
2015-04-25 11:16:56.140    45
2015-03-02 12:12:11.923    38
2015-04-26 13:19:38.080    37
2016-06-24 14:27:57.830    37
2016-09-23 14:50:44.063    36
2015-04-24 18:47:34.390    35
2015-06-03 16:07:30.813    34
2016-12-31 16:45:42.343    33
2015-04-26 17:40:18.063    33
2015-04-24 17:03:14.423    32
2017-11-26 16:02:26.936    32
2017-08-27 17:29:24.376    31
2016-11-25 12:19:11.720    31
2016-08-27 12:13:19.203    31
2015-04-26 11:20:21.813    30
2015-01-10 17:13:34.343    30
2015-04-24 15:17:28.390    30
2016-09-23 18:09:19.796    30
2015-08-17 13:44:25.686    30
2017-05-12 16:14:14.360    29
2015-01-01 11:52:18.890    29
2017-09-23 15:48:34.186    29
2017-10-08 16:41:18.283    28
2015-04-24 17:17:20.110    27
2017-08-26 15:06:12.046    27
2015-04-25 15:24:03.436    27
2017-11-26 15:05:00.516    27
2015-02-16 17:28:15.906    26
2017-05-12 18:59:47.906    26
                           ..
2016-11-04 14:03:26.453     1
2016-03-30 16:48:55.110     1
2017-10-19 17:27:16.126     1
2016-05-24 15:16:28.936     1
2017-05-23 14:34:51.546     1
2015-04-24 16:33:23.423     1
2017-01-19 17:12:30.610     1
2015-06-24 12:10:02.813     1
2016-07-26 19:34:21.330     1
2015-04-02 20:47:07.453     1
2016-01-09 16:42:14.156     1
2017-05-08 13:52:32.266     1
2017-06-18 16:39:42.436     1
2015-01-16 16:54:14.063     1
2017-05-05 13:18:11.686     1
2017-02-03 18:28:31.203     1
2017-05-19 12:08:02.500     1
2017-09-17 15:24:35.500     1
2017-08-17 12:41:08.626     1
2015-07-13 11:04:57.796     1
2015-04-30 17:17:53.110     1
2017-07-05 18:30:27.813     1
2016-06-28 12:15:22.093     1
2015-07-27 12:07:34.093     1
2015-05-02 19:00:01.626     1
2017-10-13 13:44:54.173     1
2017-09-22 17:02:49.360     1
2015-02-01 17:17:37.580     1
2017-04-08 12:51:00.110     1
2016-06-28 11:58:35.296     1
Name: dtime, Length: 264022, dtype: int64
## 查看销售日期列数据中每年的个数
import datetime as dt
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
c.dtime.dt.year.value_counts()
2017    231373
2016    146958
2015    123852
2018      3823
Name: dtime, dtype: int64
# 数据去重
new_c = c.drop_duplicates()
print(new_c.shape)
(506006, 15)
# 去除数据中的空值
c = new_c.dropna()
c.shape
(445894, 15)
c.info()

Int64Index: 445894 entries, 0 to 506005
Data columns (total 15 columns):
kh       445894 non-null object
csrq     445894 non-null object
xb       445894 non-null float64
djsj     445894 non-null datetime64[ns]
dtime    445894 non-null datetime64[ns]
spbm     445894 non-null object
sl       445894 non-null int64
sj       445894 non-null float64
je       445894 non-null float64
spmc     445894 non-null object
jf       445894 non-null float64
syjh     445894 non-null int64
djh      445894 non-null object
gzbm     445894 non-null float64
gzmc     445894 non-null object
dtypes: datetime64[ns](2), float64(5), int64(2), object(6)
memory usage: 54.4+ MB
#定义年龄0-44为青年,45-59为中年人,60以上为老年人
#在c表中csrq
import datetime as dt
#提取年数
c['csrq']=pd.to_datetime(c['csrq'],errors='coerce')
c['year'] = c['csrq'].dt.year
#年龄计算
now_year = dt.datetime.now().year
c['age'] = now_year - c['year']
#年龄阶层
bins = [0,44,59,120] 
labels = ['青年 ','中年人', '老年人'] 
c['年龄阶层'] = pd.cut(c['age'], bins, labels=labels)
#季节
c['month']=c['dtime'].dt.month
bins = [0,2.9,5,8,11,12.1] 
labels = ['冬季 ','春季', '夏季', '秋季','冬季']
#labels = ['Winter ','Spring', 'Summer', 'Autumn','Winter'] 
c['季节'] = pd.cut(c['month'], bins, labels=labels)
#入会时长数据定义、处理与描述
c['djsj']=pd.to_datetime(c['djsj'],errors='coerce')
c['入会时长']=dt.datetime.now().year-c['djsj'].dt.year
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:9: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:13: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:15: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:19: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:21: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:22: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
c.head()
kh csrq xb djsj dtime spbm sl sj je spmc ... syjh djh gzbm gzmc year age 年龄阶层 month 季节 入会时长
0 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 499.0 297.07 WMF D无 ... 101 7cd8 7296.0 WMF 柜 1967 53 中年人 9 秋季 16
1 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 12288.0 11089.93 WMF D无 ... 101 7cd8 7296.0 WMF 柜 1967 53 中年人 9 秋季 16
2 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2017-02-28 17:17:35.533 252403ef 1 598.0 598.00 双立人商品 F无 ... 102 ed0a 7242.0 双立人柜 1967 53 中年人 2 冬季 16
3 34c2dc93 1986-07-18 0.0 2007-04-19 00:00:00.000 2017-11-13 13:36:45.656 4708e0bb 1 260.0 260.00 植村秀三色眼影/眼线液/眉笔. ... 97 c3f4 8156.0 Shu Uemura 1986 34 青年 11 秋季 13
4 828aa9b4 1951-12-29 0.0 2004-09-24 16:39:08.716 2017-05-17 11:30:39.080 a4809307 1 1288.0 1288.00 酷彩F件 ... 102 95b0 7300.0 Le Creuset(酷彩) 1951 69 老年人 5 春季 16

5 rows × 21 columns

c.age.describe()
count    445894.000000
mean         63.031622
std          62.198657
min           3.000000
25%          38.000000
50%          46.000000
75%          53.000000
max         267.000000
Name: age, dtype: float64
#去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
c['age'] = c['age'].drop(c['age'][c['age'] > 120].index)
#去除不符合逻辑的值:删除消费金额小于0的行
c['je']=c['je'].drop(c['je'][c['je'] < 0].index)
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
# c.rename(
#     columns={'kh':'卡号', 'dtime':'消费时间', 'spbm':'商品编码', 'sl':'数量',
#              'sj':'售价', 'je':'实际消费金额', 'spmc':'商品名称', 'jf':'积分',
#              'syjh':'收银机号', 'djh':'单据号', 'gzbm':'柜组编码',
#              'gzmc':'柜组名称'}, inplace = True)
import numpy as np
c.groupby(by=["spmc"])["je"].agg({"各类商品销售金额":np.sum})
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
各类商品销售金额
spmc
.雷达钟表无 1370300.52
23区女装系列A件 1655993.77
23区女装系列B.5件 253656.22
23区女装系列B件 280106.95
23区女装系列C件 258243.90
23区女装系列D件 63571.14
23区女装系列E件 105316.01
23区女装系列F件 1426.00
23区女装系列正价件 14810.40
3S 系列B无 1599.00
3S 系列C无 1399.00
5+E 285.00
5+E. 335.00
5+服饰A件 1521.00
5+服饰C件 978.00
5+服饰E件 445.00
6F-NIKEKIDS正价件 199.00
6F欣欣玩具正价个 713.00
8848 正价个 270080.00
A.O.史密斯 正价件 146028.04
ABLE JEANS正价件 1399.00
AD 正价无 799.00
ADSE 正价无 699.00
ADSE 系列A无 508.00
AD集合店正价 1996.00
AO2F件 5615.00
AO2正价件 1434.00
AOJO 正价副 549.00
APM D无 48688.00
APM 正价无 2364639.24
... ...
香奈儿限量35ml邂逅35ml 6077.00
香奈儿青春光彩保湿粉凝霜. 16182.50
香奈儿青春光彩柔润粉芯. 22616.94
香奈儿青春光彩柔润粉饼13g 281251.06
香奈儿青春光彩水润粉底液30ml 2899.16
香奈儿魅力润体乳200ml 1010.00
香水 正价无 95674.49
马天奴 A件 2788.00
骆驼新B* 670.00
高美高手袋系列A个 14881.11
高美高手袋系列B个 51714.90
高美高手袋系列F个 3397.00
高美高手袋系列正价个 2088.00
高美高正价* 1888.00
高美高特卖个 5568.00
高美高系列正价个 479329.78
魅可唇膏支 510.00
魅可持久防水眼线笔支 200.00
魅可新式卷笔刀支 40.00
魅可时尚焦点小眼影(片)支 115.00
魅可焦点小眼影支 155.00
黄金金条件 37978.20
黄金饰品克 105375.00
黛安芬内衣系列3件 36.00
黛安芬内衣系列B件 47239.81
黛安芬内衣系列C件 24655.99
黛安芬内衣系列E件 2124.00
黛安芬内衣系列F件 2647.92
黛安芬内衣系列件 475907.51
黛安芬内衣系列员购件 8669.00

5936 rows × 1 columns

je = c.groupby(by=["spmc"])["je"].agg({"sum":np.sum})[0:5]
#index = range(len(je))
index = range(5)
plt.bar(index, je["sum"], color="dodgerblue")
plt.xticks(index, je.index)
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:1: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  """Entry point for launching an IPython kernel.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-59b1cBLf-1584007168716)(output_18_1.png)]

import matplotlib.pyplot as plt
qn=c.age[(c.age < 45)].shape[0]
zn=c.age[(c.age < 60)&(c.age >= 45)].shape[0]
ln=c.age[(c.age >= 60)].shape[0]
import matplotlib.pyplot as plt
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member age')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XOo9KI81-1584007168721)(output_19_0.png)]

#性别人数比例和不同性别消费金额比例
#性别,0女,1男
#性别比例图
#年龄占比图
import matplotlib.pyplot as plt
Female=c['xb'][(c['xb'] == 0)].shape[0]
Male=c['xb'][(c['xb'] == 1)].shape[0]
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightgreen','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member sex')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-XPB7UnkN-1584007168723)(output_20_0.png)]

#不同年龄阶层消费总额占比
qne = c[(c.age < 45)]['je'].sum()
zne = c[(c.age < 60)&(c.age >= 45)]['je'].sum()
lne = c[(c.age >= 60)]['je'].sum()
labels=('Young','Middle-aged','Elderly')
sizes=(qn,zn,ln)
colors=['lightgreen','gold','lightskyblue']
explode=0,0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different age member''s payment')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-muUEQQEd-1584007168727)(output_21_0.png)]

#不同性别消费金额占比
import matplotlib.pyplot as plt
Female=c[(c['xb'] == 0)]['je'].sum()
Male=c[(c['xb'] == 1)]['je'].sum()
labels=('Female','Male')
sizes=(Female,Male)
colors=['lightskyblue','gold']
explode=0,0.1
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of different members sex payment')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-AyHHa9WB-1584007168729)(output_22_0.png)]

#会员订单数=会员字段非空且单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
hy=c2[(c2['kh'].isnull()==False)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
#非会员订单数=会员字段为空但单据号字段非空的行数;(剔除不符合逻辑的消费金额小于零值)
fhy=c2[(c2['kh'].isnull()==True)&(c2['djh'].isnull()==False)&(c2['je']>=0)]
# #提取年数
# hy['csrq']=pd.to_datetime(hy['csrq'],errors='coerce')
# hy['year'] = hy['csrq'].dt.year
# fhy['csrq']=pd.to_datetime(fhy['csrq'],errors='coerce')
# fhy['year'] = fhy['csrq'].dt.year
# #年龄计算
# now_year = dt.datetime.now().year
# hy['age'] = now_year - hy['year']
# fhy['age'] = now_year - fhy['year']
# #去除不符合逻辑的值:删除年龄大于>120的行(认为会员最高年龄不超过120随)
# hy['age'] = hy['age'].drop(hy['age'][hy['age'] > 120].index)
# fhy['age'] = fhy['age'].drop(fhy['age'][fhy['age'] > 120].index)
#c2['kh'](c2['kh'].isnull()==True)
num_hy=hy.iloc[:,0].size
num_fhy=fhy.iloc[:,0].size
print (num_hy,num_fhy)
869072 997811
#会员与非会员的订单数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(num_hy,num_fhy)
colors=['lightgreen','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Srh33bjN-1584007168731)(output_24_0.png)]

#会员与非会员的消费金额总数饼图
import matplotlib.pyplot as plt
labels=('Member orders','Non-member orders')
sizes=(hy['je'].sum(),fhy['je'].sum())
colors=['lightblue','gold']
explode=0,0
plt.pie(sizes,explode=explode,labels=labels,
        colors=colors,autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member orders')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-jERIprf7-1584007168740)(output_25_0.png)]

#2015-2018各年会员消费总额
#对hy表格增加年、月两列,删除除je、year、month的列
hy['dtime']=pd.to_datetime(hy['dtime'],errors='coerce')
hyy=hy
hyy['year']=hyy['dtime'].dt.year
hyy['month']=hyy['dtime'].dt.month
hyy=hyy.drop(['kh', 'dtime','spbm','sl','sj','spmc','jf','syjh','djh','gzbm','gzmc'], axis=1)
#会员2015-2018各年会员消费总额
import numpy as np
je_counts=hyy.groupby(by=["year"])["je"].agg({"各年销售金额":np.sum})
#饼图
plt.pie(je_counts, labels=je_counts.index,explode=[0.1,0,0.1,0.1],
        colors=['lightblue','gold','lightgreen','lightcoral'], 
        autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of yearly payment')
plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:10: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
  # Remove the CWD from sys.path while we load stuff.
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:14: MatplotlibDeprecationWarning: Non-1D inputs to pie() are currently squeeze()d, but this behavior is deprecated since 3.1 and will be removed in 3.3; pass a 1D array instead.

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-tUu3vx9P-1584007168751)(output_26_1.png)]

#2015-2018年每个月的消费金额
c['dtime']=pd.to_datetime(c['dtime'],errors='coerce')
for i in range(2015, 2019):
    monthly = []
    for j in range(0,13):
        monthly.append(c[(c['dtime'].dt.year == i) & (c['dtime'].dt.month == j)]['je'].sum())
    plt.plot(range(0,len(monthly)), monthly,color=(0.4, 0.8, 0.6))
    plt.xlabel("Month")
    plt.ylabel("Amount")
    plt.grid(True)
    plt.title('The consumption in '+str(i))
    plt.show()    
# plt.plot(sl_counts.index, sl_counts, "-", color=(0.4, 0.8, 0.6))
# plt.xlabel("日期")
# plt.xticks(rotation=15)
# plt.ylabel("销量")
# plt.grid(True)
# plt.title("折线图")
# plt.show()
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IGATV2CI-1584007168765)(output_27_1.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-yrT9weJs-1584007168776)(output_27_2.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-8O0NO7gq-1584007168780)(output_27_3.png)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uJ5IRmQ4-1584007168786)(output_27_4.png)]

#各时段消费人数
zs=c['dtime'][(c['dtime'].dt.hour >= 5)&(c['dtime'].dt.hour < 11)].shape[0]
zw=c['dtime'][(c['dtime'].dt.hour >= 11)&(c['dtime'].dt.hour < 14)].shape[0]
xw=c['dtime'][(c['dtime'].dt.hour >= 14)&(c['dtime'].dt.hour < 19)].shape[0]
ws=c['dtime'][(c['dtime'].dt.hour >= 19)&(c['dtime'].dt.hour < 24)].shape[0]
lc=c['dtime'][(c['dtime'].dt.hour >= 0)&(c['dtime'].dt.hour < 5)].shape[0]
print ('早上:05:00:00~10:59:59订单数量:%d'%zs)
print ('中午:11:00:00~13:59:59订单数量:%d'%zw)
print ('下午:14:00:00~18:59:59订单数量:%d'%xw)
print ('晚上:19:00:00~23:59:59订单数量:%d'%ws)
print ('凌晨:00:00:00~04:59:59订单数量:%d'%lc)
早上:05:00:00~10:59:59订单数量:17833
中午:11:00:00~13:59:59订单数量:104883
下午:14:00:00~18:59:59订单数量:223950
晚上:19:00:00~23:59:59订单数量:99136
凌晨:00:00:00~04:59:59订单数量:92
#各时段人数可视化图
import matplotlib.pyplot as plt
labels=('Morning','Noon','Early morning','Afternoon','Evening')
sizes=(zs,zw,lc,xw,ws)
explode=[0,0.1,0,0.1,0.1]
plt.pie(sizes,explode=explode,labels=labels,colors=['lightblue','gold','lightgreen','lightcoral','lightgreen'],
        autopct='%1.1f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of member time')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-N8C60Yuv-1584007168789)(output_29_0.png)]

#春季3、4、5月;夏季6、7、8月;秋季9、10、11月;冬季12、1、2月
Spring=c['dtime'][(c['dtime'].dt.month >= 3)&(c['dtime'].dt.month <= 5)].shape[0]
Summer=c['dtime'][(c['dtime'].dt.month >= 6)&(c['dtime'].dt.month <= 8)].shape[0]
Autumn=c['dtime'][(c['dtime'].dt.month >= 9)&(c['dtime'].dt.month <= 11)].shape[0]
Winter=c['dtime'][(c['dtime'].dt.month >= 12)&(c['dtime'].dt.month <= 2)].shape[0]
print ('春季3、4、5月订单数量:%d'%Spring)
print ('夏季6、7、8月订单数量:%d'%Summer)
print ('秋季9、10、11月订单数量:%d'%Autumn)
print ('冬季12、1、2月订单数量:%d'%Winter)
春季3、4、5月订单数量:132681
夏季6、7、8月订单数量:118389
秋季9、10、11月订单数量:100197
冬季12、1、2月订单数量:0
#各季节会员消费人数
import matplotlib.pyplot as plt
labels=('Spring','Summer','Autumn','Winter')
sizes=(Spring,Summer,Autumn,Winter)
explode=[0.1,0,0.1,0]
colors=['lightgreen','gold','lightskyblue','lightcoral']
plt.pie(sizes,explode=explode,labels=labels,colors=colors
        ,autopct='%.2f%%',shadow=True,startangle=50)
plt.axis('equal')
plt.title('Distribution of season')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EwYd2g5S-1584007168790)(output_31_0.png)]

c.head()
kh csrq xb djsj dtime spbm sl sj je spmc ... syjh djh gzbm gzmc year age 年龄阶层 month 季节 入会时长
0 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 499.0 297.07 WMF D无 ... 101 7cd8 7296.0 WMF 柜 1967 53.0 中年人 9 秋季 16
1 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 12288.0 11089.93 WMF D无 ... 101 7cd8 7296.0 WMF 柜 1967 53.0 中年人 9 秋季 16
2 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2017-02-28 17:17:35.533 252403ef 1 598.0 598.00 双立人商品 F无 ... 102 ed0a 7242.0 双立人柜 1967 53.0 中年人 2 冬季 16
3 34c2dc93 1986-07-18 0.0 2007-04-19 00:00:00.000 2017-11-13 13:36:45.656 4708e0bb 1 260.0 260.00 植村秀三色眼影/眼线液/眉笔. ... 97 c3f4 8156.0 Shu Uemura 1986 34.0 青年 11 秋季 13
4 828aa9b4 1951-12-29 0.0 2004-09-24 16:39:08.716 2017-05-17 11:30:39.080 a4809307 1 1288.0 1288.00 酷彩F件 ... 102 95b0 7300.0 Le Creuset(酷彩) 1951 69.0 老年人 5 春季 16

5 rows × 21 columns

##任务3.1:构建会员用户“基本”特征标签
#固有基础属性标签:性别、年龄阶层、入会时长
c['性别']=c['xb'].map({1.0:'男',0.0:'女'})
# c['年龄']=aaa.replace("黑色","黄色")
im=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj',
              'syjh','djh','gzbm','gzmc','year','age','month'], axis=1)
im.head()
kh je spmc jf 年龄阶层 季节 入会时长 性别 消费水平 新老会员
0 a37cc182 297.07 WMF D无 297.08 中年人 秋季 16 低消费 老会员
1 a37cc182 11089.93 WMF D无 11089.92 中年人 秋季 16 高消费 老会员
2 a37cc182 598.00 双立人商品 F无 598.00 中年人 冬季 16 中等消费 老会员
3 34c2dc93 260.00 植村秀三色眼影/眼线液/眉笔. 260.00 青年 秋季 13 低消费 老会员
4 828aa9b4 1288.00 酷彩F件 1288.00 老年人 春季 16 中等消费 老会员
#任务3.2:构建会员用户“业务”特征标签
#基础性息处理后得到标签:消费水平(300以下低消费;300-1500中等消费;1500以上高消费)
#入会时长:新老会员;消费水平:低中高;有价值的用户信息
#消费金额数据描述
im['je'].describe()
im['入会时长'].describe()
count    445894.000000
mean          6.322101
std           2.989830
min           2.000000
25%           4.000000
50%           5.000000
75%           8.000000
max          18.000000
Name: 入会时长, dtype: float64
im['消费水平'] = pd.cut(im['je'], bins=[0,300,1500,9999999], labels=['低消费','中等消费', '高消费'] )
im['新老会员'] = pd.cut(im['入会时长'], bins=[0,5,999], labels=['新会员','老会员'] )
im.head()
kh je jf 年龄阶层 季节 入会时长 性别 消费水平 新老会员
0 a37cc182 297.07 297.08 中年人 秋季 16 低消费 老会员
1 a37cc182 11089.93 11089.92 中年人 秋季 16 高消费 老会员
2 a37cc182 598.00 598.00 中年人 冬季 16 中等消费 老会员
3 34c2dc93 260.00 260.00 青年 秋季 13 低消费 老会员
4 828aa9b4 1288.00 1288.00 老年人 春季 16 中等消费 老会员
#任务3.3:构建会员用户“偏好”特征标签
#用户行为推测标签:经常购买运动T、裤、鞋为体育标签;购物行为——怀孕标签
#购物时间段、季节、商品类型偏好
im['购物时间段'] = pd.cut(c['dtime'].dt.hour, bins=[0,5,11,14,19,24], labels=['凌晨','早上', '中午','下午','晚上'] )
im.head()
kh je spmc jf 年龄阶层 季节 入会时长 性别 消费水平 新老会员 购物时间段
0 a37cc182 297.07 WMF D无 297.08 中年人 秋季 16 低消费 老会员 早上
1 a37cc182 11089.93 WMF D无 11089.92 中年人 秋季 16 高消费 老会员 早上
2 a37cc182 598.00 双立人商品 F无 598.00 中年人 冬季 16 中等消费 老会员 下午
3 34c2dc93 260.00 植村秀三色眼影/眼线液/眉笔. 260.00 青年 秋季 13 低消费 老会员 中午
4 828aa9b4 1288.00 酷彩F件 1288.00 老年人 春季 16 中等消费 老会员 早上
#删去无关列
im=im.drop(['je','jf','入会时长'], axis=1)
im.head()
kh spmc 年龄阶层 季节 性别 消费水平 新老会员 购物时间段
0 a37cc182 WMF D无 中年人 秋季 低消费 老会员 早上
1 a37cc182 WMF D无 中年人 秋季 高消费 老会员 早上
2 a37cc182 双立人商品 F无 中年人 冬季 中等消费 老会员 下午
3 34c2dc93 植村秀三色眼影/眼线液/眉笔. 青年 秋季 低消费 老会员 中午
4 828aa9b4 酷彩F件 老年人 春季 中等消费 老会员 早上
#任务3.4:建立用户画像
#整合所有标签信息
#1.文字呈现每个用户的画像结果(#会员卡号、性别、偏好品类、近期消费次数)
#2.可视化方式呈现
#对2bde7c95
bd=im[(im['kh']=='2bde7c95')]
bd=bd.drop(['kh'], axis=1)
bd.head()
bd.to_csv('D:/test.csv') 


# import matplotlib.pyplot as plt
# mid_je = im
# mid_je = mid_je.reset_index()
# dic = {x[0]:x[1] for x in mid_je.loc[:,:].values}
# wc= wordcloud.WordCloud(scale=16, font_path='simhei.ttf',
#                         background_color='white', max_words=100,colormap="coolwarm")
# X = wc.generate_from_frequencies(dic)
# plt.axis('off')
# plt.imshow(X)
# wordcloud = WordCloud(background_color='white',scale=1.5).generate(bd)
# #显示词云图片
# plt.imshow(wordcloud)
# plt.axis('off')
# plt.show()
#导入wordcloud模块和matplotlib模块
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy.misc import imread
#读取一个txt文件
with open("D:/test.csv" ,encoding="utf-8")as file:
    #读取文本内容
    text=file.read()
#读入背景图片
# bg_pic = imread('D:/3.png')
#生成词云
wordcloud = WordCloud(background_color='white',font_path='simhei.ttf',scale=1.5,
                       max_words=100,colormap="coolwarm").generate(text)
# image_colors = ImageColorGenerator(bg_pic)
#显示词云图片
plt.imshow(wordcloud)
plt.axis('off')
plt.show()
#保存图片
# wordcloud.to_file('test.jpg')

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-P9HCikf3-1584007168794)(output_39_0.png)]

c.head()
kh csrq xb djsj dtime spbm sl sj je spmc ... gzmc year age 年龄阶层 month 季节 入会时长 性别 消费水平 新老会员
0 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 499.0 297.07 WMF D无 ... WMF 柜 1967 53.0 中年人 9 秋季 16 低消费 老会员
1 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2016-09-25 10:49:14.016 d62a69e3 1 12288.0 11089.93 WMF D无 ... WMF 柜 1967 53.0 中年人 9 秋季 16 高消费 老会员
2 a37cc182 1967-02-17 0.0 2004-12-31 21:24:34.216 2017-02-28 17:17:35.533 252403ef 1 598.0 598.00 双立人商品 F无 ... 双立人柜 1967 53.0 中年人 2 冬季 16 中等消费 老会员
3 34c2dc93 1986-07-18 0.0 2007-04-19 00:00:00.000 2017-11-13 13:36:45.656 4708e0bb 1 260.0 260.00 植村秀三色眼影/眼线液/眉笔. ... Shu Uemura 1986 34.0 青年 11 秋季 13 低消费 老会员
4 828aa9b4 1951-12-29 0.0 2004-09-24 16:39:08.716 2017-05-17 11:30:39.080 a4809307 1 1288.0 1288.00 酷彩F件 ... Le Creuset(酷彩) 1951 69.0 老年人 5 春季 16 中等消费 老会员

5 rows × 24 columns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
km=c.drop(['csrq','xb','djsj','dtime','spbm','sl','sj','spmc',
              'syjh','gzbm','gzmc','year','age','month','jf',
           '年龄阶层','季节','性别','消费水平','新老会员'], axis=1)
km_cp=km.copy()
km.head()
kh je djh 入会时长
0 a37cc182 297.07 7cd8 16
1 a37cc182 11089.93 7cd8 16
2 a37cc182 598.00 ed0a 16
3 34c2dc93 260.00 c3f4 13
4 828aa9b4 1288.00 95b0 16
#入会时长 (Duration)
#消费频率 (Frequency)
#消费金额 (Monetary)
#计算最近一次消费
D=km_cp[['kh','入会时长']]
#计算消费次数
Fre=km_cp[['kh','djh']].groupby('kh').count().reset_index()
#用户个人总消费金额
Mone=km_cp[['kh','je']].groupby('kh').sum().reset_index()
#合并数据集
km_1=pd.merge(D,Fre,on='kh',how='inner')
km_2=pd.merge(km_1,Mone,on='kh',how='inner')
km_2=km_2.rename(columns={'kh':'卡号','djh':'消费频率','je':'消费金额'})
km_2.describe()
入会时长 消费频率 消费金额
count 445894.000000 445894.000000 4.458940e+05
mean 6.322101 85.904888 1.172853e+05
std 2.989830 262.171340 3.098385e+05
min 2.000000 1.000000 0.000000e+00
25% 4.000000 11.000000 1.071100e+04
50% 5.000000 32.000000 3.541280e+04
75% 8.000000 81.000000 1.081595e+05
max 18.000000 3039.000000 3.257258e+06
#K-means模型训练
km_future=km_2[['入会时长','消费频率','消费金额']]
km_2_SC=StandardScaler().fit_transform(data_future)
KMeans_model=KMeans(n_clusters=3)
fit_model=KMeans_model.fit(data_b_SC) 
print(KMeans_model.cluster_centers_)
[[ 1.30518985 -0.10829497 -0.13086107]
 [-1.11113518 11.26400174 10.13423447]
 [-0.54919514 -0.06399474 -0.04323129]]
#K-means聚类雷达图绘制
angles=np.linspace(0,2*np.pi,3,endpoint=False) 
angles=np.concatenate((angles,[angles[0]]))
centers=KMeans_model.cluster_centers_ 
plt_data=np.concatenate((centers,centers[:,[0]]),axis=1)
label=['Duration','Frequency','Monetary']
fig=plt.figure(figsize=(6,6))
ax=fig.add_subplot(111,polar=True)
for i in range(len(plt_data)):
    ax.plot(angles,plt_data[i],'o-',label=label[i])
ax.set_thetagrids(angles*180/np.pi,label)
plt.legend(bbox_to_anchor=(0.8,1.15),ncol=3)
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-IFPDvVQF-1584007168796)(output_44_0.png)]

# from sklearn.preprocessing import StandardScaler # 导入数据标准化模块
# from sklearn.cluster import KMeans # 导入K-Means聚类模块
# from sklearn.datasets import load_iris # 导入鸢尾花数据
# iris = load_iris()
# iris_data = iris['data']
# iris_target = iris['target']
# iris_names = iris['feature_names']
# sc = StandardScaler()
# iris_data = sc.fit_transform(iris_data) # 数据标准化
# model = KMeans(n_clusters=3,random_state=0,max_iter=500) # 构建聚类模块
# fit_model = model.fit(iris_data) # 训练聚类模块
# #输出聚类中心
# print('聚类中心\n',model.cluster_centers_)
# print('类别\n',pd.Series(model.labels_).value_counts())
# angles = np.linspace(0,2*np.pi,4,endpoint=False) # 将圆根据标签的个数等比分
# angles = np.concatenate((angles,[angles[0]])) # 闭合
# centers = model.cluster_centers_  # 获取聚类中心数据
# plot_data = np.concatenate((centers,centers[:,[0]]),axis=1)
# label = iris_names # 设置标签
# fig = plt.figure(figsize=(6,6))
# ax = fig.add_subplot(111,polar = True)
# for i in range(len(plot_data)):
#     ax.plot(angles, plot_data[i], 'o-', label='聚类群体'+str(i+1))
# ax.set_thetagrids(angles*180/np.pi, label)
# plt.legend(bbox_to_anchor=(0.8, 1.15),ncol=3) # 设置图例的位置
# plt.show()

你可能感兴趣的:(Python)