购药时间:格式为“2018-01-01 星期五”
社保卡号:一个卡号代表一个人
商品编码:一个编码未必只对应一个商品 还存在一个商品有两个编码的情况
商品名称:购买商品的名称
销售数量:售出的该商品数量
应收金额:应收取的金额
实收金额:实际收取的金额 有的与应收金额不同 可能是打折
但同样的应收金额也存在与实收金额相等的情况 所以也可能是价格波动或医保优惠导致
参考:https://blog.csdn.net/huangxiaoyun1900/article/details/82357194
https://blog.csdn.net/zgcr654321/article/details/88018806
https://zhuanlan.zhihu.com/p/37457901?from_voters_page=true
1、客户月均消费次数
2、客户月均消费金额
3、客单价=销售总金额/顾客总数 即每位客户平均购买商品的金额
4、客户消费金额与消费件数的关系如何
5、医院月均销售金额
6、医院月均销售次数
7、医院每月销售金额趋势
8、医院每月销售次数趋势
9、医院每月平均每单销售金额
10、最大和最小日销售金额及对应日期
11、各个月单独的每日销售金额
12、周几销售金额趋势图
13、药品销售趋势&销量前十的药品
14、销量前三药品的天数销售分布
15、销量前三药品的星期销售分布
16、一周七天内每天卖的最多的药品
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Plot styling: seaborn dark grid with enlarged fonts.
sns.set(style='darkgrid', font_scale=1.5)
# Use the SimHei font (presumably so the Chinese labels render) and turn off
# the Unicode minus sign for axis tick labels.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the GBK-encoded sales records; the card number, purchase time and
# product code columns are forced to strings.
string_columns = ['社保卡号', '购药时间', '商品编码']
data = pd.read_csv(
    '朝阳医院2018年销售数据.csv',
    dtype={column: str for column in string_columns},
    encoding='gbk',
)

# Quick look at the raw data: first rows, column dtypes, missing-value counts.
data.head()
data.info()
data.isnull().sum()
因为在6000+行中只有一两行有空值,而且购药时间、社保卡号、商品编码、商品名称没法补,所以干脆删掉
# Remove every row that contains at least one missing value (in place).
data.dropna(axis='index', how='any', inplace=True)
其他需要归一类型的数据类型已经在读取数据时规定好了
# Split the purchase time into a date part and a weekday part, then split the
# date into year / month / day. All derived columns stay strings.
time_parts = [stamp.split(' ') for stamp in data['购药时间']]
data['date'] = [parts[0] for parts in time_parts]
data['week'] = [parts[1] for parts in time_parts]

date_parts = [day_string.split('-') for day_string in data['date']]
data['year'] = [parts[0] for parts in date_parts]
data['month'] = [parts[1] for parts in date_parts]
data['day'] = [parts[2] for parts in date_parts]

# Summary statistics of the numeric columns.
data.describe()
没有数据说明,无法确定带负数的是否为退货订单,先删掉
# Remove rows whose sold quantity is zero or negative (in place).
non_positive_rows = data.loc[data['销售数量'] <= 0].index
data.drop(index=non_positive_rows, inplace=True)
暂不删除重复值,因为可能是一人一天买了多次
整理删除后的数据
# Sort chronologically; the purchase time starts with an ISO-style date, so a
# plain string sort orders rows by date.
data = data.sort_values(by='购药时间', ascending=True)
# reset_index() returns a new frame, so the result must be assigned back;
# drop=True discards the old row labels instead of adding an 'index' column.
data = data.reset_index(drop=True)
data.columns
客户月均消费次数=客户总消费次数/月份数(数据共覆盖7个月)
# Number of purchase records per customer (one social-security card number is
# treated as one customer), then averaged over the 7 months.
user_times = data.groupby('社保卡号')['date'].count()
user_times_by_month = user_times.div(7)
user_times_by_month
通过describe()查看最大值、中位数、平均值,来确定直方图的取值
# Histogram of monthly purchase counts per customer.
# np.arange is used because range() only accepts integer steps.
bins = np.arange(0, 1, 0.1)  # bin edges of the histogram
plt.hist(user_times_by_month, bins=bins)

x = np.arange(0.1, 1, 0.1)  # tick positions on the x axis
plt.xticks(x)
plt.locator_params(axis='x', nbins=10)
plt.show()
range(1,10,1) 只能创建int分布区间
创建浮点数分布区间需要用np.arange(0,1,0.1)
# Monthly average spend per customer = customer's total paid amount / 7 months.
user_total = data.groupby('社保卡号')['实收金额'].sum()
user_total_by_month = user_total.div(7)
user_total_by_month
同样通过describe()查看最大值、中位数、平均值,来确定直方图的取值
# Histogram of monthly spend per customer, in integer-width bins.
bins = range(0, 100, 10)  # bin edges of the histogram
plt.hist(user_total_by_month, bins=bins)

x = range(0, 100, 10)  # tick positions on the x axis
plt.xticks(x)
plt.locator_params(axis='x', nbins=10)
plt.show()
客单价=销售总金额/顾客总数 即每位客户平均购买商品的金额
# Total paid amount over the whole data set.
sell_total = data['实收金额'].sum()
# Customer count: number of distinct social-security card numbers.
# NOTE: this rebinds user_total from a per-customer Series to a plain count.
user_total = data['社保卡号'].drop_duplicates(keep='first').size
# Average spend per customer.
kedan = sell_total / user_total
客户数为2413,客单价为126.59
# Per-customer totals, used to compare quantity bought with amount paid.
user = data.groupby('社保卡号').sum()
# Drop customers with more than 100 items so outliers do not squash the plot.
heavy_buyers = user.loc[user['销售数量'] > 100].index
user.drop(index=heavy_buyers, inplace=True)

plt.scatter(user['销售数量'], user['实收金额'])
plt.show()
# Hospital-wide monthly averages over the 7 months: paid amount and number of
# sales records.
sell_by_month = sell_total / 7
times_by_month = len(data) / 7

# Monthly trend of the paid amount.
hospital_total_by_month = data.groupby('month')['实收金额'].sum()
hospital_total_by_month
plt.figure(figsize=(16, 9))
sns.lineplot(data=hospital_total_by_month)
plt.show()

# Monthly trend of the number of sales records.
hospital_times_by_month = data.groupby('month')['date'].count()
hospital_times_by_month
plt.figure(figsize=(16, 9))
sns.lineplot(data=hospital_times_by_month)
plt.show()
医院每月平均每单销售金额=每月的总销售额/每月单数
# Mean paid amount per sales record for each month:
# monthly total amount divided by monthly record count.
by_month = data.groupby('month')
mean_by_month = by_month.sum()
mean_by_list = by_month.count()
mean_by_month_list = mean_by_month['实收金额'].div(mean_by_list['date'])
mean_by_month_list
# Per-date totals, then the rows holding the largest and the smallest
# daily paid amount.
total_date = data.groupby('date').sum()
daily_amount = total_date['实收金额']

date_max = daily_amount.max()
total_date[daily_amount == date_max]

date_min = daily_amount.min()
total_date[daily_amount == date_min]
# Paid amount summed by day-of-month.
# NOTE(review): grouping on 'day' alone pools the same day number across all
# months; confirm whether a per-month breakdown was intended.
total_by_day = data.groupby('day')['实收金额'].sum()

plt.figure(figsize=(16, 9))
sns.lineplot(data=total_by_day)
plt.show()
# Histogram of the paid amount per calendar date.
total_by_day = data.groupby('date')['实收金额'].sum()
total_by_day.describe()

bins = range(0, 5300, 100)  # bin edges of the histogram
plt.hist(total_by_day, bins=bins)

x = range(0, 5000, 100)  # tick positions on the x axis
plt.xticks(x)
plt.locator_params(axis='x', nbins=10)
plt.show()
因为排序不对所以用一种很蠢的方法把行名改了
# Trend of the paid amount by weekday.
# The Chinese weekday names do not sort in calendar order, so each name is
# replaced by its number ('1' = Monday ... '7' = Sunday) before sorting.
weekday_number = {
    '星期一': '1',
    '星期二': '2',
    '星期三': '3',
    '星期四': '4',
    '星期五': '5',
    '星期六': '6',
    '星期日': '7',
}
total_by_week = data.groupby('week')['实收金额'].sum().reset_index()
# Unknown labels are left untouched.
total_by_week['week'] = total_by_week['week'].map(
    lambda name: weekday_number.get(name, name)
)
# Sort by weekday number and use it as the row index.
total_by_week = total_by_week.sort_values(by='week').set_index('week')
total_by_week

plt.figure(figsize=(16, 9))
sns.lineplot(data=total_by_week, markers=True)
plt.show()
# Drug sales: per-drug totals, share of total paid amount, and the ten / three
# best-selling drugs.
total_by_medicine = data.groupby('商品名称').sum()
total_by_medicine.sort_values(by='销售数量')

# Total paid amount; summing the single column avoids aggregating every
# (mostly string) column of the frame.
total_sale = data['实收金额'].sum()
total_sale

# Each drug's share of the total paid amount, in percent.
total_by_medicine['金额占比'] = total_by_medicine['实收金额'] / total_sale * 100
total_by_medicine.sort_values(by='销售数量')

# Top ten drugs by quantity sold. The original sorted by a column named
# '占比', which does not exist (the share column is '金额占比') and raised
# KeyError; ranking by '销售数量' matches the stated "top by sales quantity".
ranked_by_quantity = total_by_medicine.sort_values(by='销售数量', ascending=False)
total_by_medicine_head = ranked_by_quantity.head(10)
total_by_medicine_head

# Bars: quantity sold (left axis). Line: share of paid amount (right axis).
p1 = plt.bar(x=total_by_medicine_head.index, height=total_by_medicine_head['销售数量'], label=u'销售数量')
plt.xticks(rotation=90)
plt.legend(loc=0)  # legend for the bar series
p2 = plt.twinx()
p2 = plt.plot(total_by_medicine_head.index, total_by_medicine_head['金额占比'], 'o-', color='red', label=u'金额占比')
plt.legend(loc=5)  # legend for the line series
plt.show()

# Top three drugs by quantity sold, used for the per-day / per-weekday views.
total_by_medicine_head = ranked_by_quantity.head(3)
data.index取行名
# Quantity sold per (drug, day-of-month), then one slice per top-three drug.
total_by_medicine_day = data.groupby(['商品名称', 'day']).sum().reset_index()
total_by_medicine_day1 = total_by_medicine_day[total_by_medicine_day['商品名称'] == total_by_medicine_head.index[0]]
total_by_medicine_day1
total_by_medicine_day2 = total_by_medicine_day[total_by_medicine_day['商品名称'] == total_by_medicine_head.index[1]]
total_by_medicine_day3 = total_by_medicine_day[total_by_medicine_day['商品名称'] == total_by_medicine_head.index[2]]

plt.figure(figsize=(10, 10))
# Plot every drug on one shared, sorted day axis. The original reused the
# first drug's days as x for the other two drugs, which misaligns points (or
# raises on a length mismatch) whenever the drugs were not sold on exactly the
# same days. Days without a record for a drug are drawn as 0 sold.
day_axis = sorted(total_by_medicine_day['day'].unique())
top_three_by_day = (total_by_medicine_day1, total_by_medicine_day2, total_by_medicine_day3)
# Legend labels come from the ranking itself rather than hard-coded names, so
# they always match the drug that each slice was filtered on.
for drug_name, drug_days in zip(total_by_medicine_head.index[:3], top_three_by_day):
    quantities = drug_days.set_index('day')['销售数量'].reindex(day_axis, fill_value=0)
    plt.plot(day_axis, quantities.to_numpy(), label=drug_name)
plt.xticks(rotation=90)
plt.legend()  # show the legend
plt.show()
# Totals per (drug, weekday), then one slice per top-three drug.
total_by_medicine_week = data.groupby(['商品名称', 'week']).sum().reset_index()
drug_of_row = total_by_medicine_week['商品名称']
total_by_medicine_week1 = total_by_medicine_week[drug_of_row == total_by_medicine_head.index[0]]
total_by_medicine_week2 = total_by_medicine_week[drug_of_row == total_by_medicine_head.index[1]]
total_by_medicine_week3 = total_by_medicine_week[drug_of_row == total_by_medicine_head.index[2]]
total_by_medicine_week1
又到了愚蠢的改星期名称时间
# Weekday name -> weekday number ('1' = Monday ... '7' = Sunday); the Chinese
# names do not sort in calendar order.
WEEKDAY_NUMBER = {
    '星期一': '1',
    '星期二': '2',
    '星期三': '3',
    '星期四': '4',
    '星期五': '5',
    '星期六': '6',
    '星期日': '7',
}


def _with_numbered_weekdays(frame):
    """Return a copy of *frame* with 'week' renumbered and sorted by it.

    Working on a copy avoids assigning into a filtered slice of another
    frame (the source of SettingWithCopyWarning in the original code).
    Labels missing from WEEKDAY_NUMBER are left unchanged.
    """
    numbered = frame.copy()
    numbered['week'] = numbered['week'].map(
        lambda name: WEEKDAY_NUMBER.get(name, name)
    )
    return numbered.sort_values(by='week')


total_by_medicine_week1 = _with_numbered_weekdays(total_by_medicine_week1)
total_by_medicine_week2 = _with_numbered_weekdays(total_by_medicine_week2)
total_by_medicine_week3 = _with_numbered_weekdays(total_by_medicine_week3)

plt.figure(figsize=(10, 10))
# Plot every drug on the same '1'..'7' axis. The original reused the first
# drug's weekdays as x for the other two, which misaligns points (or raises
# on a length mismatch) if a drug has no sales on some weekday. Weekdays
# without a record for a drug are drawn as 0 sold.
weekday_axis = sorted(WEEKDAY_NUMBER.values())
top_three_by_week = (total_by_medicine_week1, total_by_medicine_week2, total_by_medicine_week3)
# Legend labels come from the ranking itself rather than hard-coded names.
for drug_name, drug_weeks in zip(total_by_medicine_head.index[:3], top_three_by_week):
    quantities = drug_weeks.set_index('week')['销售数量'].reindex(weekday_axis, fill_value=0)
    plt.plot(weekday_axis, quantities.to_numpy(), label=drug_name)
plt.legend()  # show the legend
plt.show()
先按销量排序 再删掉除了周几第一行以外的行 即可获得周几卖得最多的药品
# Best-selling drug for each weekday: total the quantity per (weekday, drug),
# order by quantity descending, then keep only the first row of each weekday.
quantity_by_week_and_drug = data.groupby(['week', '商品名称'])['销售数量'].sum().reset_index()
total_medince_week = quantity_by_week_and_drug.sort_values(by='销售数量', ascending=False)
total_medince_week = total_medince_week.drop_duplicates(subset=['week'], keep='first')
# reset_index() keeps the previous row labels as an 'index' column.
total_medince_week = total_medince_week.reset_index()
total_medince_week
没有找到适合的方法改这个,只能这么改
# Replace each weekday name by its number ('1' = Monday ... '7' = Sunday) so
# the rows can be sorted in calendar order; unknown labels are left untouched.
weekday_number = {
    '星期一': '1',
    '星期二': '2',
    '星期三': '3',
    '星期四': '4',
    '星期五': '5',
    '星期六': '6',
    '星期日': '7',
}
total_medince_week['week'] = total_medince_week['week'].map(
    lambda name: weekday_number.get(name, name)
)
total_medince_week = total_medince_week.sort_values(by='week')
# Left disabled in the original: use the weekday number as the row index.
#total_medince_week.set_index("week",inplace=True)
1、客户的月均消费次数大多集中在0.1-0.3之间,说明在七个月内大多数客户仅消费了1~2次
2、客户的月均消费金额集中在30元以下,其中0~10元呈压倒性的分布,说明医院开给患者的药价格一般比较低廉
3、七个月内的客户数为2413,客单价为126.59元
4、销售数量与实收金额大概呈线性关系
5、医院月均销售金额为46636元
6、医院月均销售次数为933次
7、7月的销售数据未到月底,故不作比较。2月的销售金额明显下降,相较于1-6月的其他月份处在最低点,可能是因为春节假期导致
8、4月的销售次数达到峰值,可能是由于春季为流感多发期。6月的销售金额较高但销售次数较低,可能说明该月售出的药品更贵
9、通过医院每月平均每单销售金额的数据可以验证8中的结论,6月平均每单销售金额最高,其次为2月
10、2月8日和4月15日分别为上半年中销售金额最低与最高的两日
11、每月的日销售额大概呈现一个每隔10日达到顶峰的规律(原因待查),日销售额集中在500元-1500元之间
12、周五为全星期消费金额之最,周六次之,周三消费金额最少。这可能是因为患者常在周五下班后或周六去医院看病取药
13、图上可以反映出销量前十药品的单价对比。销量与金额占比都高的药品可以多进货,销量高与金额占比低、销量低与金额占比高的药品控制进货数量,而药品销量在倒数前十的药品可以少进货【后来去看其他人的博客发现这里还分析了前十药品的销售数量占总销售数量的占比并画出了帕累托图】
14、前三药品的天数销量分布与所有药品的天数销量分布趋势基本相同
15、销售第一药品的星期销量分布与所有药品的星期销量分布趋势基本相同,但其他两种药品一周七天内的销售量差别不大,可能是排名第一的药品以一己之力把星期五的销量拉高
16、基本被销量前二的药品囊括。以及周五卖出的苯磺酸氨氯地平片(安内真)高出其他一大截,接下来是销量也较高的周六卖出的苯磺酸氨氯地平片(安内真),说明确实是苯磺酸氨氯地平片(安内真)以一己之力拉高了周五、周六的销售数量,搜索了一下这款药是治疗高血压和心绞痛的,可能是客户周五下班后和周末去医院为父母补充降压药