大家好!我是未来村村长!就是那个“请你跟我这样做,我就跟你那样做!”的村长!
# coding=gbk
import pandas as pd
import numpy as np
import calendar
# Silence pandas' chained-assignment warning for the rest of the script.
pd.options.mode.chained_assignment = None
df_salesRecords = pd.read_csv(r"C:\Users\官二的磊子\Desktop\泰迪杯\附件.csv",encoding='gbk')

# Part 1 - required preprocessing: clean up abnormal records.
# 1. Missing values.
all_null = df_salesRecords.isnull().sum().sum()  # missing cells in the whole frame
print("总缺失值个数:",all_null)
# NOTE(review): this is "missing cells per row" (it can exceed 1), not a
# fraction of all cells; kept as-is so the printed figure does not change.
null_rate = all_null/len(df_salesRecords)
print("缺失值占比:",null_rate)
df_salesRecords.dropna(how='any',inplace=True)  # drop every row with a missing cell

# 2. Duplicated rows.
print("重复行数为:",df_salesRecords.duplicated().sum())
df_salesRecords.drop_duplicates(inplace=True)

# 3. Renumber the surviving rows 0..n-1.
# reset_index(drop=True) relabels the rows that are left.  reindex(range(n))
# would instead *select* by the old labels: it re-creates an all-NaN row for
# every label that was just dropped and loses rows whose old label is >= n.
df_salesRecords = df_salesRecords.reset_index(drop=True)

print(df_salesRecords.head())
# Part 2 - sales-amount statistics for every major category (大类名称).
df_bigCategory_sales = df_salesRecords.loc[:,['大类名称','销售金额']]
# Reductions are given by name rather than as NumPy callables: pandas 2.1+
# deprecates passing np.sum/np.std/... to agg, and the string names pin
# pandas' own groupby reductions.  Result columns are labelled
# sum / mean / std / max / min under '销售金额'.
group_bigCategory_sales = df_bigCategory_sales.groupby(['大类名称']).agg(['sum','mean','std','max','min'])
# Part 3 - promotional vs. non-promotional sales amount for every
# mid-level category (中类名称).
df_middleCategory_sales = df_salesRecords.loc[:,['中类名称','销售金额','是否促销']]
df_promotion = df_middleCategory_sales[df_middleCategory_sales['是否促销'] == '是']
df_notPromotion = df_middleCategory_sales[df_middleCategory_sales['是否促销'] == '否']
# Only '销售金额' is aggregated.  The filtered frames still carry the text
# column '是否促销'; asking for its mean/std raises TypeError on current pandas.
# Selecting with a list keeps the two-level ('销售金额', <stat>) column layout.
group_middleCategory_promotion = df_promotion.groupby(['中类名称'])[['销售金额']].agg(['sum','mean','std'])
group_middleCategory_notPromotion = df_notPromotion.groupby(['中类名称'])[['销售金额']].agg(['sum','mean','std'])
# Part 4 - weekly sales amount for fresh products (生鲜) and for general
# merchandise (一般商品).
# calendar.monthrange returns (weekday of the 1st, number of days); Monday is 0.
print(calendar.monthrange(2015,1))
df_salesRecord = df_salesRecords.loc[:,['商品类型','销售金额','销售日期']].copy()

# Weekday counter value for the first record; presumably the weekday printed
# above (3) - confirm that the data really starts on 2015-01-01.
first_day = 3

# Number every distinct sale day in row order.  Only equality between
# neighbouring rows is needed, so the raw date values are used unparsed.
# Assumes rows are ordered by date and that no calendar day is missing.
sale_dates = df_salesRecord['销售日期']
# The first row is compared with a missing value and so counts as a change;
# subtracting 1 makes the first day number 0.
day_number = sale_dates.ne(sale_dates.shift()).cumsum() - 1

# A new week starts each time the running weekday counter reaches a multiple
# of 7.  Every row of a day gets that day's week, not just the first row.
week_number = (first_day + day_number) // 7 + 1
df_salesRecord['周次'] = '第' + week_number.astype(str) + '周'

df_FreshCategory_sales = df_salesRecord[df_salesRecord['商品类型'] == '生鲜']
df_CommonCategory_sales = df_salesRecord[df_salesRecord['商品类型'] == '一般商品']
print(df_FreshCategory_sales)
# Reductions are named by string (NumPy callables in agg are deprecated).
group_FreshCategory_sales = df_FreshCategory_sales.loc[:,['周次','销售金额']].groupby(['周次']).agg(['sum','mean','std'])
group_CommonCategory_sales = df_CommonCategory_sales.loc[:,['周次','销售金额']].groupby(['周次']).agg(['sum','mean','std'])
# Part 5 - monthly spend and number of shopping days per customer.
# Spend per record = quantity * unit price; stored on the master frame
# because later steps read '用户消费额' from it.
df_salesRecords['用户消费额'] = df_salesRecords['销售数量'] * df_salesRecords['商品单价']
df_customer_Record = df_salesRecords.loc[:,['顾客编号','用户消费额','销售月份','销售日期']]
# ('用户消费额', 'sum')   - total spend in the month
# ('用户消费额', 'size')  - number of sales records in the month
# ('销售日期', 'nunique') - number of distinct days with a purchase; the
#                           record count alone over-counts days whenever a
#                           customer buys several items on one day.
group_customer_Record = df_customer_Record.groupby(['销售月份','顾客编号']).agg(
    {'用户消费额': ['sum','size'], '销售日期': ['nunique']}
)
可使用 Tableau 进行绘图,此处不再细说;编程绘图成本较大,不建议使用。
先在任务一的数据处理的基础上,导出累计消费额排名前十的用户的消费情况。
# Rank customers by cumulative spend and keep the ten largest.
df_customer_Record2 = df_salesRecords.loc[:,['顾客编号','用户消费额']]
customer_totals = df_customer_Record2.groupby(['顾客编号']).sum().reset_index()
customer_totals = customer_totals.sort_values(by=['用户消费额'], ascending=False)
customer_totals = customer_totals.reset_index(drop=True)  # positions 0..n-1 in rank order
customer_10 = customer_totals.loc[:9,['顾客编号']]  # .loc slicing is inclusive: rows 0-9
customer_10_list = customer_10['顾客编号'].values.tolist()

# Pull every sales record of those ten customers, in rank order.
# The lookup runs on a temporary indexed view, so df_salesRecords keeps its
# '顾客编号' column and this section can be re-run.
df_salesRecord_10 = df_salesRecords.set_index(['顾客编号']).loc[customer_10_list,:].reset_index()

# Group by customer id.  The key is a plain column name, so get_group accepts
# the bare id; a length-1 list key makes current pandas expect a 1-tuple.
group_customer = df_salesRecord_10.loc[:,['顾客编号','中类名称','商品类型','用户消费额']].groupby('顾客编号')

# Write one CSV per customer (gbk-encoded, named 用户<id>.csv).
output_dir = r"C:\Users\官二的磊子\Desktop\泰迪杯\用户数据"
for customer_id in customer_10_list:
    filename = '用户' + str(customer_id) + '.csv'
    group_customer.get_group(customer_id).to_csv(output_dir + "\\" + filename ,encoding='gbk')
然后根据其消费信息制作词云
1、词云函数
def Customer_WordCloud(filename,filepath,customer_id=None):
    """Render a word cloud from a customer's exported CSV and save it as PNG.

    Expects ``plt`` and ``WordCloud`` to be available at module level
    (presumably ``matplotlib.pyplot`` and ``wordcloud.WordCloud`` - this
    section does not show their imports).

    Args:
        filename: Name of the customer's CSV file, e.g. ``用户<id>.csv``; the
            image is saved next to it as ``<filename>.png``.
        filepath: Full path of the CSV file whose text feeds the word cloud.
        customer_id: Id shown in the figure title.  When omitted it is taken
            from ``filename`` by stripping the ``用户`` prefix and the
            ``.csv`` suffix.
    """
    # The title used to read a module-level loop variable; derive the id from
    # the arguments so the function no longer depends on caller state.
    if customer_id is None:
        customer_id = filename
        if customer_id.startswith('用户'):
            customer_id = customer_id[len('用户'):]
        if customer_id.endswith('.csv'):
            customer_id = customer_id[:-len('.csv')]

    # The per-customer CSV files are written with gbk encoding, so read them
    # back the same way instead of relying on the platform default.
    with open(filepath, "r", encoding='gbk') as f:
        txt = f.read()

    plt.figure(dpi=120)
    plt.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese labels
    wc = WordCloud(font_path=r'C:\Users\官二的磊子\AppData\Local\Programs\Python\Python38\Lib\site-packages\wordcloud\simhei.ttf',  # a Chinese font is required, otherwise glyphs show as boxes
                   background_color='white',
                   max_words=200,      # maximum number of words drawn
                   max_font_size=500,  # largest font size
                   width=1000,
                   height=800,
                   )
    wc.generate(txt)
    plt.title("顾客" + str(customer_id) + "画像")
    plt.imshow(wc)   # draw the cloud on the current figure
    plt.axis('off')  # hide the axes
    plt.savefig(r"C:\Users\官二的磊子\Desktop\泰迪杯\用户数据" + "\\" + filename + ".png")
    plt.close()  # release the figure; one is created per call
2、制作词云
# Build one word cloud per top-ten customer from its exported CSV file.
output_dir = r"C:\Users\官二的磊子\Desktop\泰迪杯\用户数据"
for i, customer_id in enumerate(customer_10_list):
    filename = '用户' + str(customer_id) + '.csv'
    filepath = output_dir + "\\" + filename
    Customer_WordCloud(filename, filepath)
实例:
任务2主要是以大类和销售额为依据,对每周的销售额或总销售额进行汇总统计,对比得出大类的销售情况和销售变化。
任务3主要是通过同类产品促销与不促销去看销量对比。