import pandas as pd
import numpy as np
# Load the meal-order detail workbook and report its (rows, columns) shape.
data = pd.read_excel(r'meal_order_detail.xlsx')
n_rows, n_cols = data.shape
print((n_rows, n_cols))
(2779, 19)
# Show the column labels of the loaded table.
column_labels = data.columns
print(column_labels)
Index(['detail_id', 'order_id', 'dishes_id', 'logicprn_name',
'parent_class_name', 'dishes_name', 'itemis_add', 'counts', 'amounts',
'cost', 'place_order_time', 'discount_amt', 'discount_reason',
'kick_back', 'add_inprice', 'add_info', 'bar_code', 'picture_file',
'emp_id'],
dtype='object')
一、时间处理:
# Show the dtype of every column (place_order_time is already datetime64).
column_dtypes = data.dtypes
print(column_dtypes)
detail_id int64
order_id int64
dishes_id int64
logicprn_name float64
parent_class_name float64
dishes_name object
itemis_add int64
counts int64
amounts int64
cost float64
place_order_time datetime64[ns]
discount_amt float64
discount_reason float64
kick_back float64
add_inprice int64
add_info float64
bar_code float64
picture_file object
emp_id int64
dtype: object
为了展示字符串类型转标准时间格式,先将原来的标准时间转字符串
# Turn the datetime column into plain strings so that the string -> datetime
# conversion can be demonstrated in the next step.
order_time_column = data['place_order_time']
place_order_time = order_time_column.astype('str')
print(place_order_time)
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
...
2774 2016-08-10 21:56:24
2775 2016-08-10 21:56:48
2776 2016-08-10 22:01:52
2777 2016-08-10 22:03:58
2778 2016-08-10 22:04:30
Name: place_order_time, Length: 2779, dtype: object
# Parse the string timestamps back into datetime64 values and store them
# in the original column, then preview the first rows.
parsed_times = pd.to_datetime(place_order_time)
data['place_order_time'] = parsed_times
print(data.head())
detail_id order_id dishes_id logicprn_name parent_class_name \
0 2956 417 610062 NaN NaN
1 2958 417 609957 NaN NaN
2 2961 417 609950 NaN NaN
3 2966 417 610038 NaN NaN
4 2968 417 610003 NaN NaN
dishes_name itemis_add counts amounts cost place_order_time \
0 蒜蓉生蚝 0 1 49 NaN 2016-08-01 11:05:36
1 蒙古烤羊腿 0 1 48 NaN 2016-08-01 11:07:07
2 大蒜苋菜 0 1 30 NaN 2016-08-01 11:07:40
3 芝麻烤紫菜 0 1 25 NaN 2016-08-01 11:11:11
4 蒜香包 0 1 13 NaN 2016-08-01 11:11:30
discount_amt discount_reason kick_back add_inprice add_info bar_code \
0 NaN NaN NaN 0 NaN NaN
1 NaN NaN NaN 0 NaN NaN
2 NaN NaN NaN 0 NaN NaN
3 NaN NaN NaN 0 NaN NaN
4 NaN NaN NaN 0 NaN NaN
picture_file emp_id
0 caipu/104001.jpg 1442
1 caipu/202003.jpg 1442
2 caipu/303001.jpg 1442
3 caipu/105002.jpg 1442
4 caipu/503002.jpg 1442
ser.dt.时间属性:
year、month、day、hour、minute、second、
date(日期)、week(一年中第几个星期;pandas 2.0 起已移除,请改用 dt.isocalendar().week)、
quarter(第几个季度)、dayofyear(一年中的第几天)、
dayofweek(一周的第几天,0开始)、is_leap_year(是否是闰年,返回布尔值)、
daysinmonth(这个月有多少天)、
month_name()(月份名)、day_name()(星期名称)
order_times = data['place_order_time']
# Store the calendar year of each order in a new 'year' column.
data['year'] = order_times.dt.year
print(order_times.head())
# Number of days in the month that each order falls in.
order_times.dt.daysinmonth
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
Name: place_order_time, dtype: datetime64[ns]
0 31
1 31
2 31
3 31
4 31
..
2774 31
2775 31
2776 31
2777 31
2778 31
Name: place_order_time, Length: 2779, dtype: int64
(1)标准时间加减运算
# Last five order times. reset_index() renumbers the rows from 0 and moves
# the old labels (2774-2778) into a new 'index' column.
tail_frame = data['place_order_time'].tail().reset_index()
# First five order times (labels 0-4).
time1 = data['place_order_time'].head()
print(tail_frame)
# Keep only the timestamp column, which now carries labels 0-4.
time2 = tail_frame['place_order_time']
print(time1)
print(time2)
# Subtraction aligns on index labels, so the tail had to be re-labelled
# to 0-4 to line up with the head before the two can be subtracted.
index place_order_time
0 2774 2016-08-10 21:56:24
1 2775 2016-08-10 21:56:48
2 2776 2016-08-10 22:01:52
3 2777 2016-08-10 22:03:58
4 2778 2016-08-10 22:04:30
0 2016-08-01 11:05:36
1 2016-08-01 11:07:07
2 2016-08-01 11:07:40
3 2016-08-01 11:11:11
4 2016-08-01 11:11:30
Name: place_order_time, dtype: datetime64[ns]
0 2016-08-10 21:56:24
1 2016-08-10 21:56:48
2 2016-08-10 22:01:52
3 2016-08-10 22:03:58
4 2016-08-10 22:04:30
Name: place_order_time, dtype: datetime64[ns]
# Element-wise difference between the last five and the first five order
# times; the result has dtype timedelta64[ns].
elapsed = time2 - time1
print(elapsed)
0 9 days 10:50:48
1 9 days 10:49:41
2 9 days 10:54:12
3 9 days 10:52:47
4 9 days 10:53:00
Name: place_order_time, dtype: timedelta64[ns]
(2)整体加上时间差(Timedelta)
weeks 星期 milliseconds 毫秒
days 天 hours 小时 minutes 分钟 seconds 秒
# A fixed duration of one day.
one_day = pd.Timedelta(days=1)
one_day
Timedelta('1 days 00:00:00')
# Every order time shifted forward by one hour. The result is a new Series;
# the column in `data` is not modified.
one_hour = pd.Timedelta(hours=1)
data['place_order_time'] + one_hour
0 2016-08-01 12:05:36
1 2016-08-01 12:07:07
2 2016-08-01 12:07:40
3 2016-08-01 12:11:11
4 2016-08-01 12:11:30
...
2774 2016-08-10 22:56:24
2775 2016-08-10 22:56:48
2776 2016-08-10 23:01:52
2777 2016-08-10 23:03:58
2778 2016-08-10 23:04:30
Name: place_order_time, Length: 2779, dtype: datetime64[ns]
# Amount payable per order: sum `amounts` and `counts` within each order_id.
# sort_values: ascending=False sorts descending; True (the default) ascending.
# NOTE(review): if `amounts` is a unit price, the payable total would be
# (amounts * counts) summed per order -- confirm against the data dictionary.
per_order_totals = data.groupby(by='order_id')[['amounts', 'counts']].sum()
per_order_totals.sort_values(by='counts', ascending=False)
amounts | counts | |
---|---|---|
order_id | ||
557 | 957 | 30 |
1186 | 655 | 28 |
1146 | 944 | 27 |
1026 | 1021 | 24 |
392 | 704 | 24 |
... | ... | ... |
492 | 301 | 3 |
1029 | 123 | 3 |
1054 | 98 | 3 |
703 | 127 | 2 |
1064 | 48 | 1 |
278 rows × 2 columns
# Selecting with single brackets gives a one-dimensional Series, which has
# no column labels, so sort_values() needs no `by` argument.
# With double brackets ([['amounts']]) the result is a two-dimensional
# DataFrame and `by` must be passed.
amount_per_order = data.groupby(by='order_id')['amounts'].sum()
amount_per_order.sort_values(ascending=True)
order_id
1064 48
1054 98
856 107
342 108
1174 110
...
1148 1066
584 1121
1121 1146
408 1148
1317 1210
Name: amounts, Length: 278, dtype: int64
1、pandas、numpy中的统计分析方法:sum、mean、std、var
2、特殊函数完成聚合操作
# agg() takes a list of the aggregations to compute and returns one row per
# statistic. Numeric columns take numeric aggregations; categorical columns
# need aggregations that make sense for categories.
# The aggregations are named by string instead of being passed as NumPy
# callables (np.sum, np.mean, ...): pandas >= 2.1 warns about the callables
# and will eventually call them directly, which would silently switch
# std/var from the sample (ddof=1) to the population (ddof=0) formula.
data[['amounts', 'counts']].agg(['sum', 'mean', 'std', 'var'])
amounts | counts | |
---|---|---|
sum | 125992.000000 | 3088.000000 |
mean | 45.337172 | 1.111191 |
std | 36.808550 | 0.625428 |
var | 1354.869356 | 0.391160 |
# Different statistics per column: the mean of `amounts`, and the sum and
# standard deviation of `counts`. Statistics not requested for a column
# show up as NaN in the result.
# String names are used instead of NumPy callables, which are deprecated as
# agg() arguments (FutureWarning since pandas 2.1) and would change the std
# result from ddof=1 to ddof=0 once pandas calls them directly.
data[['amounts', 'counts']].agg({
    'amounts': 'mean',
    'counts': ['sum', 'std'],
})
amounts | counts | |
---|---|---|
mean | 45.337172 | NaN |
std | NaN | 0.625428 |
sum | NaN | 3088.000000 |
# Group by order id and compute the mean and sum of `amounts` and `counts`
# for each order; the result has two-level columns (column, statistic).
# String names replace the NumPy callables, which are deprecated as agg()
# arguments (FutureWarning since pandas 2.1).
data.groupby(by='order_id')[['amounts', 'counts']].agg(['mean', 'sum'])
amounts | counts | |||
---|---|---|---|---|
mean | sum | mean | sum | |
order_id | ||||
137 | 32.333333 | 194 | 1.500000 | 9 |
165 | 52.944444 | 953 | 1.166667 | 21 |
166 | 48.200000 | 241 | 1.400000 | 7 |
171 | 36.285714 | 254 | 1.428571 | 10 |
177 | 34.250000 | 137 | 1.000000 | 4 |
... | ... | ... | ... | ... |
1309 | 34.076923 | 443 | 1.153846 | 15 |
1314 | 42.333333 | 508 | 1.000000 | 12 |
1317 | 67.222222 | 1210 | 1.000000 | 18 |
1319 | 67.777778 | 610 | 1.000000 | 9 |
1323 | 50.933333 | 764 | 1.000000 | 15 |
278 rows × 4 columns
# Same per-order mean/sum as above, then keep only the `amounts` statistics
# by selecting the top level of the column index.
# String names replace the NumPy callables, which are deprecated as agg()
# arguments (FutureWarning since pandas 2.1).
order_stats = data.groupby(by='order_id')[['amounts', 'counts']].agg(['mean', 'sum'])
order_stats['amounts']
mean | sum | |
---|---|---|
order_id | ||
137 | 32.333333 | 194 |
165 | 52.944444 | 953 |
166 | 48.200000 | 241 |
171 | 36.285714 | 254 |
177 | 34.250000 | 137 |
... | ... | ... |
1309 | 34.076923 | 443 |
1314 | 42.333333 | 508 |
1317 | 67.222222 | 1210 |
1319 | 67.777778 | 610 |
1323 | 50.933333 | 764 |
278 rows × 2 columns
# Load the GBK-encoded, comma-separated order export and summarise it.
order = pd.read_csv(r'order-14.3.csv', sep=',', encoding='gbk')
# info() writes its report to stdout itself and returns None, so printing
# the return value adds the trailing "None" line to the output.
info_result = order.info()
print(info_result)
RangeIndex: 3478 entries, 0 to 3477
Data columns (total 7 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 商品ID 3478 non-null int64
1 类别ID 3478 non-null int64
2 门店编号 3478 non-null object
3 单价 3478 non-null float64
4 销量 3478 non-null float64
5 成交时间 3478 non-null object
6 订单ID 3478 non-null object
dtypes: float64(2), int64(2), object(3)
memory usage: 190.3+ KB
None
# Reload the meal-order detail workbook (this discards the columns added
# earlier) and summarise it.
data = pd.read_excel(r'D:\jupyter\meal_order_detail.xlsx')
# info() writes its report to stdout itself and returns None, so printing
# the return value adds the trailing "None" line to the output.
info_result = data.info()
print(info_result)
RangeIndex: 2779 entries, 0 to 2778
Data columns (total 19 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 detail_id 2779 non-null int64
1 order_id 2779 non-null int64
2 dishes_id 2779 non-null int64
3 logicprn_name 0 non-null float64
4 parent_class_name 0 non-null float64
5 dishes_name 2779 non-null object
6 itemis_add 2779 non-null int64
7 counts 2779 non-null int64
8 amounts 2779 non-null int64
9 cost 0 non-null float64
10 place_order_time 2779 non-null datetime64[ns]
11 discount_amt 0 non-null float64
12 discount_reason 0 non-null float64
13 kick_back 0 non-null float64
14 add_inprice 2779 non-null int64
15 add_info 0 non-null float64
16 bar_code 0 non-null float64
17 picture_file 2779 non-null object
18 emp_id 2779 non-null int64
dtypes: datetime64[ns](1), float64(8), int64(8), object(2)
memory usage: 412.6+ KB
None
# Preview the dish-name column.
dish_names = data['dishes_name']
dish_names
0 蒜蓉生蚝
1 蒙古烤羊腿
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
...
2774 白饭/大碗
2775 牛尾汤
2776 意文柠檬汁
2777 金玉良缘
2778 酸辣藕丁
Name: dishes_name, Length: 2779, dtype: object
# Strip leading/trailing whitespace from the dish names.
data['dishes_name'] = data['dishes_name'].str.strip()
# Boolean mask: True where the dish name contains '番茄' (tomato), else False.
# regex=False matches the keyword as a literal substring rather than as a
# regular expression; na=False maps missing names to False so the mask is
# always a valid boolean indexer for .loc (a NaN in the mask would raise).
mask = data['dishes_name'].str.contains('番茄', regex=False, na=False)
data.loc[mask, 'dishes_name']
9 番茄有机花菜
16 番茄甘蓝
18 番茄炖秋葵
23 番茄炖牛腩
92 番茄甘蓝
...
2579 番茄炖牛腩
2584 番茄有机花菜
2620 番茄甘蓝
2717 番茄炖秋葵
2728 番茄甘蓝
Name: dishes_name, Length: 98, dtype: object
一维的 Series.drop_duplicates 没有 subset 参数(subset 只用于二维的 DataFrame):
# One-dimensional case: de-duplicate a single Series, keeping the first
# occurrence of each dish name.
dish_names = data['dishes_name']
dish_names.drop_duplicates(keep='first')
0 蒜蓉生蚝
1 蒙古烤羊腿
2 大蒜苋菜
3 芝麻烤紫菜
4 蒜香包
...
1024 海带结豆腐汤
1169 冰镇花螺
1411 冬瓜炒苦瓜
1659 超人气广式肠粉
2438 百里香奶油烤紅酒牛肉
Name: dishes_name, Length: 145, dtype: object
# Two-dimensional case: drop whole rows whose dish name has already
# appeared, keeping the first row for each name.
data.drop_duplicates(subset=['dishes_name'], keep='first')
detail_id | order_id | dishes_id | logicprn_name | parent_class_name | dishes_name | itemis_add | counts | amounts | cost | place_order_time | discount_amt | discount_reason | kick_back | add_inprice | add_info | bar_code | picture_file | emp_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2956 | 417 | 610062 | NaN | NaN | 蒜蓉生蚝 | 0 | 1 | 49 | NaN | 2016-08-01 11:05:36 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/104001.jpg | 1442 |
1 | 2958 | 417 | 609957 | NaN | NaN | 蒙古烤羊腿 | 0 | 1 | 48 | NaN | 2016-08-01 11:07:07 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/202003.jpg | 1442 |
2 | 2961 | 417 | 609950 | NaN | NaN | 大蒜苋菜 | 0 | 1 | 30 | NaN | 2016-08-01 11:07:40 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/303001.jpg | 1442 |
3 | 2966 | 417 | 610038 | NaN | NaN | 芝麻烤紫菜 | 0 | 1 | 25 | NaN | 2016-08-01 11:11:11 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/105002.jpg | 1442 |
4 | 2968 | 417 | 610003 | NaN | NaN | 蒜香包 | 0 | 1 | 13 | NaN | 2016-08-01 11:11:30 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/503002.jpg | 1442 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1024 | 7064 | 800 | 610040 | NaN | NaN | 海带结豆腐汤 | 0 | 1 | 30 | NaN | 2016-08-06 14:11:38 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/105004.jpg | 1159 |
1169 | 4683 | 584 | 610067 | NaN | NaN | 冰镇花螺 | 0 | 1 | 35 | NaN | 2016-08-06 18:04:14 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/104006.jpg | 1487 |
1411 | 4115 | 1148 | 610045 | NaN | NaN | 冬瓜炒苦瓜 | 0 | 1 | 29 | NaN | 2016-08-06 21:23:30 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/305004.jpg | 1092 |
1659 | 7168 | 812 | 610058 | NaN | NaN | 超人气广式肠粉 | 0 | 1 | 18 | NaN | 2016-08-07 12:13:40 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/604002.jpg | 1122 |
2438 | 858 | 165 | 609707 | NaN | NaN | 百里香奶油烤紅酒牛肉 | 0 | 1 | 178 | NaN | 2016-08-09 12:47:36 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/201001.jpg | 1097 |
145 rows × 19 columns
# A row counts as a duplicate, and is dropped, only when both the dish name
# and the dish id match an earlier row.
dedup_keys = ['dishes_name', 'dishes_id']
data.drop_duplicates(subset=dedup_keys)
detail_id | order_id | dishes_id | logicprn_name | parent_class_name | dishes_name | itemis_add | counts | amounts | cost | place_order_time | discount_amt | discount_reason | kick_back | add_inprice | add_info | bar_code | picture_file | emp_id | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2956 | 417 | 610062 | NaN | NaN | 蒜蓉生蚝 | 0 | 1 | 49 | NaN | 2016-08-01 11:05:36 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/104001.jpg | 1442 |
1 | 2958 | 417 | 609957 | NaN | NaN | 蒙古烤羊腿 | 0 | 1 | 48 | NaN | 2016-08-01 11:07:07 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/202003.jpg | 1442 |
2 | 2961 | 417 | 609950 | NaN | NaN | 大蒜苋菜 | 0 | 1 | 30 | NaN | 2016-08-01 11:07:40 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/303001.jpg | 1442 |
3 | 2966 | 417 | 610038 | NaN | NaN | 芝麻烤紫菜 | 0 | 1 | 25 | NaN | 2016-08-01 11:11:11 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/105002.jpg | 1442 |
4 | 2968 | 417 | 610003 | NaN | NaN | 蒜香包 | 0 | 1 | 13 | NaN | 2016-08-01 11:11:30 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/503002.jpg | 1442 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1024 | 7064 | 800 | 610040 | NaN | NaN | 海带结豆腐汤 | 0 | 1 | 30 | NaN | 2016-08-06 14:11:38 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/105004.jpg | 1159 |
1169 | 4683 | 584 | 610067 | NaN | NaN | 冰镇花螺 | 0 | 1 | 35 | NaN | 2016-08-06 18:04:14 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/104006.jpg | 1487 |
1411 | 4115 | 1148 | 610045 | NaN | NaN | 冬瓜炒苦瓜 | 0 | 1 | 29 | NaN | 2016-08-06 21:23:30 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/305004.jpg | 1092 |
1659 | 7168 | 812 | 610058 | NaN | NaN | 超人气广式肠粉 | 0 | 1 | 18 | NaN | 2016-08-07 12:13:40 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/604002.jpg | 1122 |
2438 | 858 | 165 | 609707 | NaN | NaN | 百里香奶油烤紅酒牛肉 | 0 | 1 | 178 | NaN | 2016-08-09 12:47:36 | NaN | NaN | NaN | 0 | NaN | NaN | caipu/201001.jpg | 1097 |
146 rows × 19 columns
# Drop every column whose values are all missing; the result is a new frame
# and `data` itself is left unchanged.
data.dropna(axis='columns', how='all')
detail_id | order_id | dishes_id | dishes_name | itemis_add | counts | amounts | place_order_time | add_inprice | picture_file | emp_id | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2956 | 417 | 610062 | 蒜蓉生蚝 | 0 | 1 | 49 | 2016-08-01 11:05:36 | 0 | caipu/104001.jpg | 1442 |
1 | 2958 | 417 | 609957 | 蒙古烤羊腿 | 0 | 1 | 48 | 2016-08-01 11:07:07 | 0 | caipu/202003.jpg | 1442 |
2 | 2961 | 417 | 609950 | 大蒜苋菜 | 0 | 1 | 30 | 2016-08-01 11:07:40 | 0 | caipu/303001.jpg | 1442 |
3 | 2966 | 417 | 610038 | 芝麻烤紫菜 | 0 | 1 | 25 | 2016-08-01 11:11:11 | 0 | caipu/105002.jpg | 1442 |
4 | 2968 | 417 | 610003 | 蒜香包 | 0 | 1 | 13 | 2016-08-01 11:11:30 | 0 | caipu/503002.jpg | 1442 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2774 | 6750 | 774 | 610011 | 白饭/大碗 | 0 | 1 | 10 | 2016-08-10 21:56:24 | 0 | caipu/601005.jpg | 1138 |
2775 | 6742 | 774 | 609996 | 牛尾汤 | 0 | 1 | 40 | 2016-08-10 21:56:48 | 0 | caipu/201006.jpg | 1138 |
2776 | 6756 | 774 | 609949 | 意文柠檬汁 | 0 | 1 | 13 | 2016-08-10 22:01:52 | 0 | caipu/404005.jpg | 1138 |
2777 | 6763 | 774 | 610014 | 金玉良缘 | 0 | 1 | 30 | 2016-08-10 22:03:58 | 0 | caipu/302003.jpg | 1138 |
2778 | 6764 | 774 | 610017 | 酸辣藕丁 | 0 | 1 | 33 | 2016-08-10 22:04:30 | 0 | caipu/302006.jpg | 1138 |
2779 rows × 11 columns