import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv('./order_info_2016.csv', index_col = 'id')
df.info()
Int64Index: 104557 entries, 1 to 104557
Data columns (total 10 columns):
orderId 104557 non-null int64
userId 104557 non-null int64
productId 104557 non-null int64
cityId 104557 non-null int64
price 104557 non-null int64
payMoney 104557 non-null int64
channelId 104549 non-null object
deviceType 104557 non-null int64
createTime 104557 non-null object
payTime 104557 non-null object
dtypes: int64(7), object(3)
memory usage: 8.8+ MB
df.head()
|
orderId |
userId |
productId |
cityId |
price |
payMoney |
channelId |
deviceType |
createTime |
payTime |
id |
|
|
|
|
|
|
|
|
|
|
1 |
232148841 |
2794924 |
268 |
110001 |
35300 |
35300 |
9058255c90 |
3 |
2016-01-01 12:53:02 |
2016-01-01 12:53:24 |
2 |
222298971 |
1664684 |
801 |
330001 |
51200 |
49900 |
e0e6019897 |
2 |
2016-01-01 21:42:51 |
2016-01-01 21:43:30 |
3 |
211494392 |
2669372 |
934 |
220002 |
62100 |
62100 |
9058255c90 |
3 |
2016-01-01 14:10:13 |
2016-01-01 14:11:18 |
4 |
334575272 |
1924727 |
422 |
230001 |
50600 |
42000 |
46d5cea30d |
2 |
2016-01-01 17:43:35 |
2016-01-01 17:43:53 |
5 |
144825651 |
4148671 |
473 |
130006 |
149100 |
142000 |
6ff1752b69 |
2 |
2016-01-01 18:52:04 |
2016-01-01 18:52:47 |
df.describe()
|
orderId |
userId |
productId |
cityId |
price |
payMoney |
deviceType |
count |
1.045570e+05 |
1.045570e+05 |
104557.000000 |
104557.000000 |
1.045570e+05 |
1.045570e+05 |
104557.000000 |
mean |
2.993341e+08 |
3.270527e+06 |
504.566275 |
154410.947225 |
9.167350e+04 |
8.686689e+04 |
2.385292 |
std |
5.149818e+07 |
4.138208e+07 |
288.130647 |
72197.163762 |
9.158836e+04 |
9.072028e+04 |
0.648472 |
min |
1.035627e+08 |
2.930600e+04 |
0.000000 |
30000.000000 |
6.000000e+02 |
-1.000000e+03 |
1.000000 |
25% |
2.633627e+08 |
2.179538e+06 |
254.000000 |
100011.000000 |
3.790000e+04 |
3.360000e+04 |
2.000000 |
50% |
2.989828e+08 |
2.705995e+06 |
507.000000 |
150001.000000 |
5.920000e+04 |
5.500000e+04 |
2.000000 |
75% |
3.349972e+08 |
3.271237e+06 |
758.000000 |
220002.000000 |
1.080000e+05 |
1.040000e+05 |
3.000000 |
max |
4.871430e+08 |
3.072939e+09 |
1000.000000 |
380001.000000 |
2.295600e+06 |
2.294200e+06 |
6.000000 |
device_type = pd.read_csv('./device_type.txt')
device_type
|
id |
deviceType |
0 |
1 |
PC |
1 |
2 |
Android |
2 |
3 |
iPhone |
3 |
4 |
Wap |
4 |
5 |
other |
df.orderId.unique().size
df.orderId.unique().size < df.orderId.size
True
df.productId[df['productId'] == 0].size
177
df.cityId.unique().size
331
df['price'] = df['price']/100
df.drop(df[df['payMoney'] < 0].index, inplace = True)
df[df.payMoney < 0].index
df['payMoney'] = df['payMoney']/100
df.drop(df[df['channelId'].isnull()].index, inplace = True)
df['createTime'] = pd.to_datetime(df['createTime'])
df['payTime'] = pd.to_datetime(df['payTime'])
import datetime
startTime = datetime.datetime(2016, 1, 1)
endTime = datetime.datetime(2016, 12, 12, 23, 59, 59)
df.drop(df[df['createTime'] < startTime].index, inplace = True)
df.drop(df[df['createTime'] > endTime].index, inplace = True)
df.drop(df[df['createTime'] > df['payTime']].index, inplace = True)
df.drop(df[df['orderId'].duplicated()].index, inplace = True)
print(df[df['orderId'].duplicated()])
df.drop(df[df['productId'] == 0].index, inplace = True)
Empty DataFrame
Columns: [orderId, userId, productId, cityId, price, payMoney, channelId, deviceType, createTime, payTime]
Index: []
print('总订单量',df.orderId.count())
print('总用户量',df.userId.count())
print('总销售额(元)',df.payMoney.sum())
print('有流水的商品数',df.productId.unique().size)
总订单量 100614
总用户量 100614
总销售额(元) 86986488.5
有流水的商品数 1000
df_pro = df.groupby('productId', as_index = False)
df_pro_sort = df_pro.count()[['productId', 'orderId']]
df_pro_sort.rename(columns = {'orderId':'sale_sum'}, inplace = True)
df_pro_sort = df_pro_sort.sort_values('sale_sum')
print('销量前十名商品','\n',df_pro_sort.head())
print('销量后十名商品','\n',df_pro_sort.tail())
销量前十名商品
productId sale_sum
999 1000 13
985 986 16
467 468 17
596 597 18
346 347 20
销量后十名商品
productId sale_sum
37 38 291
586 587 296
102 103 328
761 762 332
894 895 337
df_pro_totalprice = df_pro.sum()[['productId', 'payMoney']]
df_pro_totalprice.sort_values('payMoney', ascending = False, inplace = True)
df_weak_productID = df_pro_sort.tail(200).index.intersection(df_pro_totalprice.tail(200).index)
df_weak_productID
Int64Index([], dtype='int64')
bins = np.arange(0, 24000, 2000)
bins_data = pd.cut(df['payMoney'], bins)
bin_counts = df['payMoney'].groupby(bins_data).count()
bin_counts.plot(kind='bar')
plt.show()
df_xf = df[df['payMoney'] < 2000]
bins2 = np.arange(0, 2000, 200)
bins_data2 = pd.cut(df_xf['payMoney'], bins2)
bin_counts2 = df_xf['payMoney'].groupby(bins_data2).count()
bin_counts2.plot(kind='bar')
plt.show()
df_city = df.groupby('cityId', as_index = False)
df_city_sum = df_city.sum()[['cityId', 'payMoney']]
df_city_cou = df_city.count()[['cityId', 'orderId']]
df_city_sum_cou = pd.merge(df_city_sum, df_city_cou, on = 'cityId',how = 'inner')
df_city_sum_cou.sort_values(['payMoney', 'orderId'], ascending = False, inplace = True)
df_city_sum_cou.head(10)
|
cityId |
payMoney |
orderId |
80 |
110001 |
5897567.8 |
5323 |
215 |
220002 |
4223742.8 |
2921 |
115 |
130001 |
3831381.7 |
3978 |
218 |
220005 |
3193285.7 |
2082 |
13 |
60011 |
2708595.1 |
3540 |
1 |
40001 |
2480940.7 |
3200 |
104 |
120001 |
2294747.8 |
2325 |
24 |
70001 |
2164787.7 |
2045 |
228 |
230001 |
2069079.0 |
2823 |
214 |
220001 |
1896970.4 |
1780 |
df_city_channel = df.groupby(['cityId', 'channelId'], as_index = False)
df_city_channel_sum = df_city_channel.sum()[['channelId', 'payMoney', 'cityId']]
df_city_channel_sum.sort_values('payMoney', ascending = False, inplace = True)
df_city_channel_sum.head(3)
|
channelId |
payMoney |
cityId |
3242 |
9058255c90 |
2207275.9 |
110001 |
9234 |
9058255c90 |
1704169.6 |
220002 |
4747 |
9058255c90 |
1514756.0 |
130001 |