%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings
warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
path = 'traina/'
path1 = 'trainb/'
Guest check-in information
# Load the guest check-in table from both training sources and stack them.
guest_file = '网约房平台入住人表.csv'
cust_info = pd.read_csv(path + guest_file)
cust_info1 = pd.read_csv(path1 + guest_file)
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
|
ORDER_PRIMARY_ID |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
0 |
07C5BF73B18B44B0877DEED007F8771D |
NaN |
19800627 |
222405 |
NaN |
NaN |
1 |
3525D57CAE104A078E4962B2B89377B0 |
371099C301202107090001 |
19951025 |
370523 |
2.021071e+11 |
2.021071e+11 |
gc.collect()
216
# Normalize dtypes: codes/dates to strings; timestamps truncated to the
# 12-character YYYYMMDDHHMM form.
cust['BDATE'] = cust['BDATE'].astype('str')
cust['XZQH'] = cust['XZQH'].astype('str')
for tcol in ('IN_TIME', 'OUT_TIME'):
    # Missing times become 0 -> the string '0.0'; treated as a NaN marker downstream.
    cust[tcol] = cust[tcol].fillna(0).astype('str').str[:12]
cust.head(2)
|
ORDER_PRIMARY_ID |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
0 |
07C5BF73B18B44B0877DEED007F8771D |
NaN |
19800627 |
222405 |
0.0 |
0.0 |
1 |
3525D57CAE104A078E4962B2B89377B0 |
371099C301202107090001 |
19951025 |
370523 |
202107092103 |
202107111158 |
cust.info()
Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_PRIMARY_ID 44030 non-null object
1 GUEST_ID 5158 non-null object
2 BDATE 44030 non-null object
3 XZQH 44030 non-null object
4 IN_TIME 44030 non-null object
5 OUT_TIME 44030 non-null object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID 43883
GUEST_ID 3698
BDATE 9503
XZQH 2659
IN_TIME 3396
OUT_TIME 236
dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[5:10]
03158D5CA62E42339D697612EA347FB3 2
0A99D74F1E0248D68A729CB79FC640E6 2
1FE3E448558347D89C56B0AFCEC8ACFB 2
864E417B7BDE4C7B9449B7E087E9F21E 2
3DEFA75566934CA5929D0C47F0B95FDE 2
Name: ORDER_PRIMARY_ID, dtype: int64
# Per-order guest counts: total guest rows vs rows with a non-null GUEST_ID.
cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'], inplace=True)
grp = cust.groupby(['ORDER_PRIMARY_ID'])
stat = grp['BDATE'].count().reset_index()  # BDATE is never null -> group size
stat.columns = ['ORDER_PRIMARY_ID', 'guest_sum']
stat['guest_sum_notnull'] = grp['GUEST_ID'].agg('count').values
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')
# Collapse to a single row per order (first row after the sort above).
cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first', inplace=True)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
|
ORDER_PRIMARY_ID |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
22801 |
70798782D6C04A438360D80AFE4845C1 |
371099B389202107040001 |
19990314 |
130983 |
0.0 |
0.0 |
3 |
2 |
cust['IN_TIME'].max(), cust['IN_TIME'].min()
('202110041413', '0.0')
Order information
# Load the passenger order table from both training sources and stack them.
order_file = '网约平台旅客订单信息.csv'
order_info = pd.read_csv(path + order_file)
order1 = pd.read_csv(path1 + order_file)
order = pd.concat([order_info, order1])
order.head(2)
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
PRE_IN_TIME |
PRE_OUT_TIME |
ORDER_TIME |
STATUS |
CANCEL_TIME |
INSERT_TIME |
MODIFY_TIME |
FIRM |
0 |
923521 |
96BBDB7CC049421C85826AE07020B139 |
278337 |
202008011200 |
202008021200 |
1.596120e+11 |
1 |
NaN |
20200730224152 |
20200730224152 |
3 |
1 |
923696 |
C72F20539AD1447D86CD1A8E5EAEC63A |
282932 |
202008041400 |
202008061200 |
1.596121e+11 |
1 |
NaN |
20200730225524 |
20200730225524 |
3 |
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
PRE_IN_TIME |
PRE_OUT_TIME |
ORDER_TIME |
STATUS |
CANCEL_TIME |
INSERT_TIME |
MODIFY_TIME |
FIRM |
30970 |
4422027 |
3CDEDB5E03534D379687645675898CA4 |
170898 |
202108081358 |
202108101158 |
2.021071e+11 |
3 |
NaN |
20210712210153 |
20210712210153 |
3 |
# Cast the raw timestamp columns to strings; NaN becomes the literal 'nan'.
for tcol in ('PRE_IN_TIME', 'PRE_OUT_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'CANCEL_TIME'):
    order[tcol] = order[tcol].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').str[:12]
########################## Orders that carry a cancellation time are excluded here.
order = order[(order['CANCEL_TIME']=='nan')]
# Keep only the most recent modification of each order.
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order.drop_duplicates('ORDER_ID', keep='last', inplace=True)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
PRE_IN_TIME |
PRE_OUT_TIME |
ORDER_TIME |
STATUS |
CANCEL_TIME |
INSERT_TIME |
MODIFY_TIME |
FIRM |
30141 |
4402846 |
BC0E8E3602434EA6A5F29A6F6FF42233 |
100177 |
202107111358 |
202107121158 |
202107111032 |
1 |
nan |
20210711140042 |
20210711140042 |
3 |
order.info()
Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 PRE_IN_TIME 29941 non-null object
4 PRE_OUT_TIME 29941 non-null object
5 ORDER_TIME 29941 non-null object
6 STATUS 29941 non-null int64
7 CANCEL_TIME 29941 non-null object
8 INSERT_TIME 29941 non-null object
9 MODIFY_TIME 29941 non-null object
10 FIRM 29941 non-null int64
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()
ORDER_ID 29941
ORDER_PRIMARY_ID 29941
HOTELID 4801
PRE_IN_TIME 6875
PRE_OUT_TIME 729
ORDER_TIME 26818
STATUS 3
CANCEL_TIME 1
INSERT_TIME 25432
MODIFY_TIME 25432
FIRM 2
dtype: int64
order_info['FIRM'].value_counts()
3 32029
10 8521
Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1 26137
2 3221
3 583
Name: STATUS, dtype: int64
order[order['INSERT_TIME']!=order['MODIFY_TIME']]
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
PRE_IN_TIME |
PRE_OUT_TIME |
ORDER_TIME |
STATUS |
CANCEL_TIME |
INSERT_TIME |
MODIFY_TIME |
FIRM |
Merge the order and cust data
# Inner-join the deduplicated orders with the deduplicated guest table.
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')
Days per month: Jan/Mar/May/Jul/Aug/Oct/Dec have 31; Feb has 28; Apr/Jun/Sep/Nov have 30
# '0.0' marked a missing actual time -> restore NaN, then fall back to the
# booked (PRE_) times, and never let actual times precede the booked ones.
for col, pre in (('IN_TIME', 'PRE_IN_TIME'), ('OUT_TIME', 'PRE_OUT_TIME')):
    df[col] = df[col].replace('0.0', np.nan)
    df[col].fillna(df[pre], inplace=True)
# String comparison is valid because both sides are zero-padded YYYYMMDDHHMM.
df['OUT_TIME'] = np.where(df['OUT_TIME'] >= df['PRE_OUT_TIME'], df['OUT_TIME'], df['PRE_OUT_TIME'])
df['IN_TIME'] = np.where(df['IN_TIME'] >= df['PRE_IN_TIME'], df['IN_TIME'], df['PRE_IN_TIME'])
df.drop(['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'], axis=1, inplace=True)
####### TODO: gap between order insert time and order time is not used yet
df.head(2)
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
0 |
706648 |
A4AACE06B518418C8A8CA1935DDD1C5A |
227185 |
202007092241 |
1 |
20200714171021 |
20200714171021 |
3 |
NaN |
20010409 |
371002 |
202007151300 |
202007161200 |
1 |
0 |
1 |
748647 |
67D551AACFD049AE9CC2AB65E9870678 |
9483 |
202007132109 |
1 |
20201002101040 |
20201002101040 |
3 |
NaN |
19881023 |
341202 |
202010011400 |
202010021011 |
1 |
0 |
df.info()
Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 29941 non-null int64
1 ORDER_PRIMARY_ID 29941 non-null object
2 HOTELID 29941 non-null object
3 ORDER_TIME 29941 non-null object
4 STATUS 29941 non-null int64
5 INSERT_TIME 29941 non-null object
6 MODIFY_TIME 29941 non-null object
7 FIRM 29941 non-null int64
8 GUEST_ID 3561 non-null object
9 BDATE 29941 non-null object
10 XZQH 29941 non-null object
11 IN_TIME 29941 non-null object
12 OUT_TIME 29941 non-null object
13 guest_sum 29941 non-null int64
14 guest_sum_notnull 29941 non-null int64
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
# Parse the 12-digit timestamps; unparseable values become NaT, and rows
# without a valid checkout time are dropped.
df['in_time'] = pd.to_datetime(df['IN_TIME'], format='%Y%m%d%H%M', errors='coerce')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], format='%Y%m%d%H%M', errors='coerce')
df = df[df['out_time'].notnull()]
df.head(2)
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
in_time |
out_time |
0 |
706648 |
A4AACE06B518418C8A8CA1935DDD1C5A |
227185 |
202007092241 |
1 |
20200714171021 |
20200714171021 |
3 |
NaN |
20010409 |
371002 |
202007151300 |
202007161200 |
1 |
0 |
2020-07-15 13:00:00 |
2020-07-16 12:00:00 |
1 |
748647 |
67D551AACFD049AE9CC2AB65E9870678 |
9483 |
202007132109 |
1 |
20201002101040 |
20201002101040 |
3 |
NaN |
19881023 |
341202 |
202010011400 |
202010021011 |
1 |
0 |
2020-10-01 14:00:00 |
2020-10-02 10:11:00 |
# Timedelta examples. Fixed: the original used curly quotes (a SyntaxError)
# and referenced a 'lock_time' column that does not exist in df; the existing
# 'in_time' column is used instead.
# Shift one day later:
df['in_time'].iloc[0] + pd.Timedelta(days=1)
# Shift one day earlier:
df['in_time'].iloc[0] + pd.Timedelta(days=-1)
# df['in_time'] - timedelta(days=1)  # push rows with day_gap == 0 back one day; hourly rooms still TODO
# Expand every order's stay into one row per calendar day.
dfs = []
for order_id, grp in df.groupby(['ORDER_PRIMARY_ID']):
    days = pd.date_range(start=grp['IN_TIME'].values[0],
                         end=grp['OUT_TIME'].values[0],
                         freq='D', normalize=False, closed=None)
    span = pd.DataFrame({'datetime': days})
    span['datetime'] = span['datetime'].dt.date.fillna(0).astype('str')
    span['ORDER_PRIMARY_ID'] = grp['ORDER_PRIMARY_ID'].values[0]
    dfs.append(span)
gc.collect()
17
df_date = pd.concat(dfs).reset_index(drop=True)
df_date[:3]
|
datetime |
ORDER_PRIMARY_ID |
0 |
2021-06-30 |
00003FC18B254E86803C00F4BBA382E4 |
1 |
2021-07-01 |
00003FC18B254E86803C00F4BBA382E4 |
2 |
2021-07-02 |
00003FC18B254E86803C00F4BBA382E4 |
# Build a full daily calendar (2020-06-01 .. 2021-08-30) for every hotel.
# The calendar is identical for every hotel, so build it once outside the
# loop instead of recomputing the same date_range per group as the original
# did (thousands of redundant constructions).
calendar = pd.date_range(start='20200601', end='20210830', freq='D',
                         normalize=False, closed=None).astype('str')
dfs1 = []
for hotel_id, group in df.groupby(['HOTELID']):
    stat = pd.DataFrame({'datetime': calendar})
    stat['HOTELID'] = group['HOTELID'].values[0]
    dfs1.append(stat)
df_date1 = pd.concat(dfs1).reset_index(drop=True)
df_date1[:10]
|
datetime |
ORDER_PRIMARY_ID |
0 |
2020-06-01 |
00003FC18B254E86803C00F4BBA382E4 |
1 |
2020-06-02 |
00003FC18B254E86803C00F4BBA382E4 |
2 |
2020-06-03 |
00003FC18B254E86803C00F4BBA382E4 |
3 |
2020-06-04 |
00003FC18B254E86803C00F4BBA382E4 |
4 |
2020-06-05 |
00003FC18B254E86803C00F4BBA382E4 |
5 |
2020-06-06 |
00003FC18B254E86803C00F4BBA382E4 |
6 |
2020-06-07 |
00003FC18B254E86803C00F4BBA382E4 |
7 |
2020-06-08 |
00003FC18B254E86803C00F4BBA382E4 |
8 |
2020-06-09 |
00003FC18B254E86803C00F4BBA382E4 |
9 |
2020-06-10 |
00003FC18B254E86803C00F4BBA382E4 |
# Attach the per-day expansion to order rows, then align against the full
# hotel calendar; calendar rows with no matching order are vacant days.
df11 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
_df1 = df_date1.merge(df11, on=['HOTELID', 'datetime'], how='left')
ddf = _df1[_df1['ORDER_ID'].isnull()]
# Drop the checkout day unless it is also the check-in day (same-day stays count).
df11['in_date'] = df11['in_time'].dt.date
df11['out_date'] = df11['out_time'].dt.date
df11['datetime'] = pd.to_datetime(df11['datetime'], format='%Y-%m-%d', errors='coerce')
df11['date'] = df11['datetime'].dt.date
same_day = (df11['out_date'] == df11['date']) & (df11['in_date'] == df11['out_date'])
_df2 = df11[(df11['out_date'] != df11['date']) | same_day]
_df2.drop(['in_time', 'out_time', 'in_date', 'out_date', 'datetime'], axis=1, inplace=True)
_df2.rename(columns={'date': 'datetime'}, inplace=True)
# Occupancy per hotel-day: number of orders active on that day.
in_num = _df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
_df2 = _df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
_df2.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'datetime',
'in_hotel_num'],
dtype='object')
ddf.columns
Index(['datetime', 'HOTELID', 'ORDER_ID', 'ORDER_PRIMARY_ID', 'ORDER_TIME',
'STATUS', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE',
'XZQH', 'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull',
'in_time', 'out_time'],
dtype='object')
# Vacant hotel-days get occupancy 0, then are stacked with occupied days.
ddf.drop(['in_time', 'out_time'], axis=1, inplace=True)
ddf['in_hotel_num'] = 0
df1 = pd.concat([_df2, ddf]).sort_values(['HOTELID', 'datetime'])
df1.head(2).append(df1.tail(2))
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
datetime |
in_hotel_num |
0 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2020-06-01 |
0 |
1 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2020-06-02 |
0 |
2196800 |
NaN |
NaN |
B109977684 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2021-08-29 |
0 |
2196801 |
NaN |
NaN |
B109977684 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2021-08-30 |
0 |
df1.info()
Int64Index: 2186104 entries, 0 to 2196801
Data columns (total 17 columns):
# Column Dtype
--- ------ -----
0 ORDER_ID float64
1 ORDER_PRIMARY_ID object
2 HOTELID object
3 ORDER_TIME object
4 STATUS float64
5 INSERT_TIME object
6 MODIFY_TIME object
7 FIRM float64
8 GUEST_ID object
9 BDATE object
10 XZQH object
11 IN_TIME object
12 OUT_TIME object
13 guest_sum float64
14 guest_sum_notnull float64
15 datetime object
16 in_hotel_num int64
dtypes: float64(5), int64(1), object(11)
memory usage: 300.2+ MB
# Submission template: one row per (hotel, date) for which ROOM_EMPTY must be predicted.
test = pd.read_csv('testb/submit_example_2.csv')
test.head()
|
HOTELID |
DATE |
ROOM_EMPTY |
0 |
303760 |
2021-09-04 |
0.1 |
1 |
303760 |
2021-09-05 |
0.1 |
2 |
303760 |
2021-09-11 |
0.1 |
3 |
303760 |
2021-09-12 |
0.1 |
4 |
303760 |
2021-09-19 |
0.1 |
# Tag train/test rows and keep only history before the test window.
test['type'] = 'test'
df1['type'] = 'train'
df1['DATE'] = df1['datetime'].astype('str')
# .copy() prevents pandas SettingWithCopy issues: the original mutated this
# slice (feature columns are assigned onto df2 right below).
df2 = df1[df1['DATE'] < '2021-09-01'].copy()
df2.shape
(2185510, 19)
# Expand the stay date into the calendar features used by the model.
df2['datetime'] = pd.to_datetime(df2['datetime'], format='%Y-%m-%d', errors='coerce')
d = df2['datetime'].dt
df2['in_year'] = d.year.fillna(0).astype('int')
df2['in_month'] = d.month.fillna(0).astype('int')
df2['in_day'] = d.day.fillna(0).astype('int')
df2['in_quarter'] = d.quarter.fillna(0).astype('int')
df2['in_dayofweek'] = d.dayofweek.fillna(0).astype('int')
df2['in_dayofyear'] = d.dayofyear.fillna(0).astype('int')
df2['in_weekofyear'] = d.weekofyear.fillna(0).astype('int')
df2['in_is_wknd'] = d.dayofweek // 5  # Saturday/Sunday -> 1
df2.head(3)
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
datetime |
in_hotel_num |
type |
DATE |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
0 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2020-06-01 |
0 |
train |
2020-06-01 |
2020 |
6 |
1 |
2 |
0 |
153 |
23 |
0 |
1 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2020-06-02 |
0 |
train |
2020-06-02 |
2020 |
6 |
2 |
2 |
1 |
154 |
23 |
0 |
2 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2020-06-03 |
0 |
train |
2020-06-03 |
2020 |
6 |
3 |
2 |
2 |
155 |
23 |
0 |
# Select in-season weekend rows and explicit holiday rows.
use_month = [5, 6, 7, 8, 9]
# .copy() on both slices: the original assigned 'holiday' onto views of df2,
# which triggers pandas SettingWithCopy and can silently fail to write.
df_1 = df2[(df2['in_is_wknd']==1) & (df2['in_month'].isin(use_month))].copy()
# 2021 May Day, Dragon Boat Festival and Mid-Autumn Festival dates.
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05',
           '2021-06-12', '2021-06-13', '2021-06-14',
           '2021-09-19', '2021-09-20', '2021-09-21']
df_2 = df2[df2['DATE'].isin(holiday)].copy()
df_2['holiday'] = 1
df_1['holiday'] = 0
df3 = pd.concat([df_2, df_1])
df3.shape
(373093, 28)
# Rows that are both holiday and weekend appear twice after the concat;
# dedupe on every column except 'holiday' so the holiday==1 copy wins
# (df_2 was concatenated first and keep='first').
drop_col = [c for c in df3.columns if c != 'holiday']
df3.drop_duplicates(drop_col, keep='first', inplace=True)
df3.shape
(354019, 28)
# Same calendar features for the test set.
test['datetime'] = pd.to_datetime(test['DATE'], format='%Y-%m-%d', errors='coerce')
d = test['datetime'].dt
test['in_year'] = d.year.fillna(0).astype('int')
test['in_month'] = d.month.fillna(0).astype('int')
test['in_day'] = d.day.fillna(0).astype('int')
test['in_quarter'] = d.quarter.fillna(0).astype('int')
test['in_dayofweek'] = d.dayofweek.fillna(0).astype('int')
test['in_dayofyear'] = d.dayofyear.fillna(0).astype('int')
test['in_weekofyear'] = d.weekofyear.fillna(0).astype('int')
test['in_is_wknd'] = d.dayofweek // 5  # Saturday/Sunday -> 1
test[:3]
|
HOTELID |
DATE |
ROOM_EMPTY |
type |
datetime |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
0 |
303760 |
2021-09-04 |
0.1 |
test |
2021-09-04 |
2021 |
9 |
4 |
3 |
5 |
247 |
35 |
1 |
1 |
303760 |
2021-09-05 |
0.1 |
test |
2021-09-05 |
2021 |
9 |
5 |
3 |
6 |
248 |
35 |
1 |
2 |
303760 |
2021-09-11 |
0.1 |
test |
2021-09-11 |
2021 |
9 |
11 |
3 |
5 |
254 |
36 |
1 |
# Holiday flag for the test set: 1 if the date is a listed holiday, else 0.
# Vectorized isin replaces the original row-wise apply (same 0/1 result,
# much faster).
test['holiday'] = test['DATE'].isin(holiday).astype(int)
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
df3.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
'holiday'],
dtype='object')
df3.drop(['IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
# Outer merge keeps test rows (which have no order history) next to train rows.
join_keys = ['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day',
             'in_quarter', 'in_dayofweek', 'in_dayofyear', 'in_weekofyear',
             'in_is_wknd', 'holiday']
df4 = df3.merge(test, on=join_keys, how='outer')
df4.shape
(357330, 25)
dd = df4[df4['DATE']>'2021-09-01']
dd['type'].value_counts()
test 3311
Name: type, dtype: int64
df4.info()
Int64Index: 357330 entries, 0 to 357329
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 357330 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 354019 non-null float64
14 type 357330 non-null object
15 DATE 357330 non-null object
16 in_year 357330 non-null int64
17 in_month 357330 non-null int64
18 in_day 357330 non-null int64
19 in_quarter 357330 non-null int64
20 in_dayofweek 357330 non-null int64
21 in_dayofyear 357330 non-null int64
22 in_weekofyear 357330 non-null int64
23 in_is_wknd 357330 non-null int64
24 holiday 357330 non-null int64
dtypes: float64(6), int64(9), object(10)
memory usage: 70.9+ MB
Room (registered listing) information
# Registered homestay (listing) metadata: area, room/bed counts, location, timestamps.
room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
|
CODE |
HOTELID |
JYMJ |
ROOM_NUM |
BED_NUM |
FWLY |
CZLY |
CALLED |
CITY_CODE |
BUR_CODE |
STA_CODE |
SSX |
ADDRESS |
MPHM |
JYQK |
FIRM |
DJSJ |
BGSJ |
STATUS |
AUDITSTATUS |
0 |
100177 |
3710830002 |
0.0 |
1 |
1 |
2 |
1 |
【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 |
371000000000 |
NaN |
NaN |
371083 |
乳山市长江路 银泰海景花园 55-301 |
ROOM001 |
【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 |
3 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
NaN |
NaN |
room_info['STATUS'].value_counts()
1.0 2658
Name: STATUS, dtype: int64
# CODE is the real hotel key here; drop the unused columns and rename it.
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'],
               axis=1, inplace=True)
room_info.rename(columns={'CODE': 'HOTELID'}, inplace=True)
room_info['FIRM'].value_counts()
3 3429
10 1878
Name: FIRM, dtype: int64
room_info.nunique()
HOTELID 5307
JYMJ 173
ROOM_NUM 11
BED_NUM 17
CZLY 4
CALLED 4232
BUR_CODE 7
STA_CODE 54
SSX 5
ADDRESS 4850
MPHM 584
FIRM 2
DJSJ 3753
BGSJ 3294
dtype: int64
Only two listings have identical registration and modification times
# A business area (JYMJ) of 0 means missing.
room_info['JYMJ'] = room_info['JYMJ'].replace(0, np.nan)
# NOTE(review): room_use_col still lists 'STATUS', which was dropped above;
# the list is never used downstream — confirm before relying on it.
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
# Frequency-encode the categorical/location/room columns.
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY',
            'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    counts = room_info[col].value_counts().reset_index()
    counts.columns = [col, col + '_freq']
    room_info = room_info.merge(counts, on=col, how='left')
# Density features: area per room / per bed, beds per room.
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']
# Registration (DJSJ) vs modification (BGSJ) timestamps and their gap in days.
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
Timestamp('2021-10-19 16:05:41'),
Timestamp('2020-07-15 10:23:06'),
Timestamp('2021-10-19 16:09:15'))
# Visual check of the business-area (JYMJ) distribution.
plt.figure(figsize=(20, 8))
sns.countplot(room_info['JYMJ'])
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xMlhlyik-1646247028436)(output_119_1.png)]
JYMJ (business area) is missing for more than half the rows (3000+); consider dropping it
Merge the room info into df
# FIRM already exists on the order side; drop the duplicate before merging.
room_info.drop(columns=['FIRM'], inplace=True)
df = df4.merge(room_info, on=['HOTELID'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])
gc.collect()
170
df.head(2)
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
guest_sum |
guest_sum_notnull |
in_hotel_num |
type |
DATE |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
holiday |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
CALLED |
BUR_CODE |
STA_CODE |
SSX |
ADDRESS |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_date |
BG_date |
DJ_gap |
37946 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.0 |
train |
2020-06-06 |
2020 |
6 |
6 |
2 |
5 |
158 |
23 |
1 |
0 |
NaN |
1 |
1 |
1 |
【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 |
NaN |
NaN |
371083 |
乳山市长江路 银泰海景花园 55-301 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
NaN |
3091 |
1502 |
4727 |
NaN |
NaN |
150 |
2565 |
3429 |
NaN |
NaN |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
37947 |
NaN |
NaN |
100177 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0.0 |
train |
2020-06-07 |
2020 |
6 |
7 |
2 |
6 |
159 |
23 |
1 |
0 |
NaN |
1 |
1 |
1 |
【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 |
NaN |
NaN |
371083 |
乳山市长江路 银泰海景花园 55-301 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
NaN |
3091 |
1502 |
4727 |
NaN |
NaN |
150 |
2565 |
3429 |
NaN |
NaN |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
df.info()
Int64Index: 357330 entries, 37946 to 354018
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ORDER_ID 12543 non-null float64
1 ORDER_PRIMARY_ID 12543 non-null object
2 HOTELID 357330 non-null object
3 ORDER_TIME 12543 non-null object
4 STATUS 12543 non-null float64
5 INSERT_TIME 12543 non-null object
6 MODIFY_TIME 12543 non-null object
7 FIRM 12543 non-null float64
8 GUEST_ID 1877 non-null object
9 BDATE 12543 non-null object
10 XZQH 12543 non-null object
11 guest_sum 12543 non-null float64
12 guest_sum_notnull 12543 non-null float64
13 in_hotel_num 354019 non-null float64
14 type 357330 non-null object
15 DATE 357330 non-null object
16 in_year 357330 non-null int64
17 in_month 357330 non-null int64
18 in_day 357330 non-null int64
19 in_quarter 357330 non-null int64
20 in_dayofweek 357330 non-null int64
21 in_dayofyear 357330 non-null int64
22 in_weekofyear 357330 non-null int64
23 in_is_wknd 357330 non-null int64
24 holiday 357330 non-null int64
25 JYMJ 141308 non-null float64
26 ROOM_NUM 357330 non-null int64
27 BED_NUM 357330 non-null int64
28 CZLY 357330 non-null int64
29 CALLED 357330 non-null object
30 BUR_CODE 167625 non-null float64
31 STA_CODE 167625 non-null object
32 SSX 357330 non-null int64
33 ADDRESS 357330 non-null object
34 MPHM 357330 non-null object
35 DJSJ 356948 non-null datetime64[ns]
36 BGSJ 357330 non-null datetime64[ns]
37 CALLED_freq 357330 non-null int64
38 ADDRESS_freq 357330 non-null int64
39 JYMJ_freq 141308 non-null float64
40 ROOM_NUM_freq 357330 non-null int64
41 BED_NUM_freq 357330 non-null int64
42 CZLY_freq 357330 non-null int64
43 BUR_CODE_freq 167625 non-null float64
44 STA_CODE_freq 167625 non-null float64
45 SSX_freq 357330 non-null int64
46 MPHM_freq 357330 non-null int64
47 FIRM_freq 357330 non-null int64
48 room_ratio 141308 non-null float64
49 bed_ratio 141308 non-null float64
50 room_bed 357330 non-null float64
51 DJ_date 356948 non-null object
52 BG_date 357330 non-null object
53 DJ_gap 356948 non-null float64
dtypes: datetime64[ns](2), float64(15), int64(21), object(16)
memory usage: 149.9+ MB
# Drop columns that are mostly missing (>180k of ~357k rows).
missing = df.isnull().sum()
missing_col = missing[missing > 180000].index.tolist()
df.drop(columns=missing_col, inplace=True)
df['DATE'].min(), df['DATE'].max()
('2020-06-06', '2021-09-21')
# High-cardinality free-text columns are not usable as features directly.
df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)
df[:5]
|
HOTELID |
in_hotel_num |
type |
DATE |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
holiday |
ROOM_NUM |
BED_NUM |
CZLY |
SSX |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_bed |
DJ_date |
BG_date |
DJ_gap |
37946 |
100177 |
0.0 |
train |
2020-06-06 |
2020 |
6 |
6 |
2 |
5 |
158 |
23 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
37947 |
100177 |
0.0 |
train |
2020-06-07 |
2020 |
6 |
7 |
2 |
6 |
159 |
23 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
37948 |
100177 |
0.0 |
train |
2020-06-13 |
2020 |
6 |
13 |
2 |
5 |
165 |
24 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
37949 |
100177 |
0.0 |
train |
2020-06-14 |
2020 |
6 |
14 |
2 |
6 |
166 |
24 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
37950 |
100177 |
0.0 |
train |
2020-06-20 |
2020 |
6 |
20 |
2 |
5 |
172 |
25 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
def qty_shift(df, val):
    """Add lagged copies of `val` per hotel as last_{k}_qty columns.

    Each column is the per-HOTELID shift of `val` by k rows; remaining NaNs
    are then forward-filled over the WHOLE frame. NOTE: because the ffill
    runs after the groupby, values can leak across hotel boundaries — this
    matches the original feature definition and is kept for parity.

    The original repeated the same statement 12 times; the
    reset_index().sort_index().set_index('index') round-trip it used was an
    index-preserving no-op, so the shifted series is assigned directly.
    """
    grouped = df.groupby('HOTELID')[val]
    for k in (1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20):
        # .ffill() replaces the deprecated fillna(method='ffill').
        df['last_%d_qty' % k] = grouped.shift(k).ffill()
    return df
# Build lag features for each base column in turn.
# NOTE(review): qty_shift writes the SAME last_*_qty column names on every
# call, so each iteration overwrites the previous one — after this loop only
# the lags of 'in_quarter' survive. Looks unintended; confirm.
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']
for val in vals:
    print(val)
    df = qty_shift(df, val)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_rolling(df, window, val, keys):
    """Add lag-1 rolling mean/max/sum of `val` within each `keys` group.

    The series is shifted by one row before rolling so the window only sees
    past values; each window needs at least 3 observations, and the mean is
    triangularly weighted (win_type='triang' requires scipy).
    """
    prefix = 'qty_rolling' + str(window)
    g = df.groupby(keys)[val]
    df[prefix + '_mean'] = g.transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=3, win_type="triang").mean()).values.tolist()
    df[prefix + '_max'] = g.transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=3).max()).values.tolist()
    df[prefix + '_sum'] = g.transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=3).sum()).values.tolist()
    return df
gc.collect()
34
keys = 'HOTELID'
# Rolling features over windows of 3, 4 and 6 rows per hotel.
# NOTE(review): the rolling column names do not include `val`, so each
# iteration overwrites the previous one — only 'in_quarter' rollings remain
# after the loop. Looks unintended; confirm.
for val in vals:
    print(val)
    df = qty_rolling(df, 3, val, keys)
    df = qty_rolling(df, 4, val, keys)
    df = qty_rolling(df, 6, val, keys)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_ewm(df, alpha, val, keys):
    """Add lag-1 exponentially weighted mean/std/corr of `val` per group.

    Column names are fixed ('qty_ewm_mean', ...), so repeated calls with a
    different `val` overwrite earlier results.
    """
    g = df.groupby(keys)[val]
    df['qty_ewm_mean'] = g.transform(lambda x: x.shift(1).ewm(alpha=alpha).mean()).values.tolist()
    df['qty_ewm_std'] = g.transform(lambda x: x.shift(1).ewm(alpha=alpha).std()).values.tolist()
    df['qty_ewm_corr'] = g.transform(lambda x: x.shift(1).ewm(alpha=alpha).corr()).values.tolist()
    return df
# EWM features (alpha=0.95) per hotel.
# NOTE(review): qty_ewm writes fixed column names, so only the last `val`
# ('in_quarter') survives this loop. Looks unintended; confirm.
for val in vals:
    print(val)
    df = qty_ewm(df, 0.95, val, keys)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
df.select_dtypes(include='object').columns
Index(['HOTELID', 'type', 'DATE', 'MPHM', 'DJ_date', 'BG_date'], dtype='object')
# Target definition: 1 = hotel vacant that day (occupancy count was 0), else 0.
# Vectorized comparison replaces the original row-wise apply; NaN == 0 is
# False, so NaN rows map to 0 exactly as before.
df['in_hotel_num'] = (df['in_hotel_num'] == 0).astype(int)
df[:5]
|
HOTELID |
in_hotel_num |
type |
DATE |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
holiday |
ROOM_NUM |
BED_NUM |
CZLY |
SSX |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_bed |
DJ_date |
BG_date |
DJ_gap |
last_1_qty |
last_2_qty |
last_3_qty |
last_4_qty |
last_5_qty |
last_6_qty |
last_7_qty |
last_8_qty |
last_10_qty |
last_12_qty |
last_15_qty |
last_20_qty |
qty_rolling3_mean |
qty_rolling3_max |
qty_rolling3_sum |
qty_rolling4_mean |
qty_rolling4_max |
qty_rolling4_sum |
qty_rolling6_mean |
qty_rolling6_max |
qty_rolling6_sum |
qty_ewm_mean |
qty_ewm_std |
qty_ewm_corr |
37946 |
100177 |
1 |
train |
2020-06-06 |
2020 |
6 |
6 |
2 |
5 |
158 |
23 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
37947 |
100177 |
1 |
train |
2020-06-07 |
2020 |
6 |
7 |
2 |
6 |
159 |
23 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
2.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2.0 |
NaN |
NaN |
37948 |
100177 |
1 |
train |
2020-06-13 |
2020 |
6 |
13 |
2 |
5 |
165 |
24 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
2.0 |
2.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2.0 |
0.0 |
NaN |
37949 |
100177 |
1 |
train |
2020-06-14 |
2020 |
6 |
14 |
2 |
6 |
166 |
24 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
2.0 |
2.0 |
2.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2.0 |
2.0 |
6.0 |
2.0 |
2.0 |
6.0 |
2.0 |
2.0 |
6.0 |
2.0 |
0.0 |
NaN |
37950 |
100177 |
1 |
train |
2020-06-20 |
2020 |
6 |
20 |
2 |
5 |
172 |
25 |
1 |
0 |
1 |
1 |
1 |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
3091 |
1502 |
4727 |
150 |
2565 |
3429 |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
2.0 |
2.0 |
2.0 |
2.0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2.0 |
2.0 |
6.0 |
2.0 |
2.0 |
8.0 |
2.0 |
2.0 |
8.0 |
2.0 |
0.0 |
NaN |
# Impute all remaining NaNs with 0, then collect the numeric feature
# candidates, excluding the target column.
df = df.fillna(0)
col_list = [c for c in df.select_dtypes(include=['float64', 'int64']).columns
            if c != 'in_hotel_num']
col_list
['in_year',
'in_month',
'in_day',
'in_quarter',
'in_dayofweek',
'in_dayofyear',
'in_weekofyear',
'in_is_wknd',
'holiday',
'ROOM_NUM',
'BED_NUM',
'CZLY',
'SSX',
'CALLED_freq',
'ADDRESS_freq',
'ROOM_NUM_freq',
'BED_NUM_freq',
'CZLY_freq',
'SSX_freq',
'MPHM_freq',
'FIRM_freq',
'room_bed',
'DJ_gap',
'last_1_qty',
'last_2_qty',
'last_3_qty',
'last_4_qty',
'last_5_qty',
'last_6_qty',
'last_7_qty',
'last_8_qty',
'last_10_qty',
'last_12_qty',
'last_15_qty',
'last_20_qty',
'qty_rolling3_mean',
'qty_rolling3_max',
'qty_rolling3_sum',
'qty_rolling4_mean',
'qty_rolling4_max',
'qty_rolling4_sum',
'qty_rolling6_mean',
'qty_rolling6_max',
'qty_rolling6_sum',
'qty_ewm_mean',
'qty_ewm_std',
'qty_ewm_corr']
gc.collect()
102
used_features = col_list
cate_cols = ['HOTELID', 'MPHM']
# Time-based holdout split on the DATE string column.
# NOTE(review): rows dated exactly '2021-08-02' fall into neither train
# (strict <) nor validation (strict >) — confirm the gap is intentional.
train_mask = df["DATE"] < '2021-08-02'
valid_mask = (df["DATE"] > '2021-08-02') & (df["DATE"] < '2021-09-01')
X_train = df.loc[train_mask, used_features].reset_index(drop=True)
y_train = df.loc[train_mask, "in_hotel_num"]
X_valid = df.loc[valid_mask, used_features].reset_index(drop=True)
y_valid = df.loc[valid_mask, "in_hotel_num"]
X_test = df.loc[df["type"] == 'test', used_features].reset_index(drop=True)
# LightGBM classifier with AUC early stopping on the time-based holdout.
clf_1_params = dict(
    num_leaves=256,
    n_estimators=20000,
    learning_rate=0.005,
    verbose=-1,
    max_bin=100,
    max_depth=10,
    feature_fraction_seed=66,
    feature_fraction=0.7,
    bagging_seed=66,
    bagging_freq=1,
    bagging_fraction=0.95,
    metric='auc',
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_child_weight=30,
    n_jobs=80,
)
clf_1 = LGBMClassifier(**clf_1_params)
clf_1.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds=100, verbose=200)
gc.collect()
[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10] valid_0's auc: 0.541449
357
oof_prob = clf_1.predict_proba(X_valid[used_features])[:, 1]
oof_prob.min(), oof_prob.max()
(0.9610922808728843, 0.9625584615170373)
oof_prob1 = clf_1.predict_proba(X_test[used_features])[:, 1]
oof_prob1.min(), oof_prob1.max()
(0.960959321386754, 0.9624418595928675)
X_test[used_features][:10]
|
STATUS |
FIRM |
guest_sum |
guest_sum_notnull |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
holiday |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
BUR_CODE |
SSX |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_gap |
last_1_qty |
last_2_qty |
last_3_qty |
last_4_qty |
last_5_qty |
last_6_qty |
last_7_qty |
last_8_qty |
last_10_qty |
last_12_qty |
last_15_qty |
last_20_qty |
qty_rolling3_mean |
qty_rolling3_max |
qty_rolling3_sum |
qty_rolling4_mean |
qty_rolling4_max |
qty_rolling4_sum |
qty_rolling6_mean |
qty_rolling6_max |
qty_rolling6_sum |
qty_ewm_mean |
qty_ewm_std |
qty_ewm_corr |
0 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
4 |
3 |
5 |
247 |
35 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
6.328848e-12 |
1.0 |
1 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
5 |
3 |
6 |
248 |
35 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
1.415174e-12 |
1.0 |
2 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
11 |
3 |
5 |
254 |
36 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
3.164424e-13 |
1.0 |
3 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
12 |
3 |
6 |
255 |
36 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
7.075868e-14 |
1.0 |
4 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
19 |
3 |
6 |
262 |
37 |
1 |
1 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
1.582212e-14 |
1.0 |
5 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
20 |
3 |
0 |
263 |
38 |
0 |
1 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
3.537934e-15 |
1.0 |
6 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
21 |
3 |
1 |
264 |
38 |
0 |
1 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
7.911060e-16 |
1.0 |
7 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
4 |
3 |
5 |
247 |
35 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
6.328848e-12 |
1.0 |
8 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
5 |
3 |
6 |
248 |
35 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
1.415174e-12 |
1.0 |
9 |
NaN |
NaN |
NaN |
NaN |
2021 |
9 |
11 |
3 |
5 |
254 |
36 |
1 |
0 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
3.164424e-13 |
1.0 |
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']
# .copy() makes the subsequent in-place drop well-defined (the original
# mutated a slice of df, triggering chained-assignment ambiguity).
train = df[df['type'] == 'train'].copy()
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features]
test = df[df['type'] == 'test'].copy()
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'
params = {
    # 'objective' added: labels are 0/1 and the metric is AUC, so the
    # native lgb.train API needs an explicit binary objective (its default
    # is regression). Boost-round count is passed via num_boost_round below
    # instead of the sklearn-style 'n_estimators' key.
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'auc'},
    'verbose': -1
}
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Only the last (largest) of the 5 expanding folds is trained/validated.
    if fold_n in [0, 1, 2, 3]:
        continue
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    # BUG FIX: the original called `LGBMClassifier.fit(params, trn_data, ...)`
    # on the class itself with a params dict, which raised
    # AttributeError: 'dict' object has no attribute '_le' (see traceback).
    # Use the native lgb.train API with the Dataset objects built above,
    # matching the corrected cell later in this notebook.
    clf = lgb.train(params, trn_data, num_boost_round=N_round,
                    valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))
    print("ReTraining on all data")
    gc.enable()
    del trn_data, val_data
    gc.collect()
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
Training with validation
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
in
49 val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
50 categorical_feature=cate_cols)
---> 51 clf = LGBMClassifier.fit(params, train.iloc[train_index], df[df['type'] == 'train'].iloc[train_index]['in_hotel_num'], eval_set=[trn_data, val_data], verbose=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
52 val = clf.predict(train.iloc[valid_index])
53 mae_ = mean_absolute_error(labels[valid_index], val)
F:\anaconda\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
783 _LGBMAssertAllFinite(y)
784 _LGBMCheckClassificationTargets(y)
--> 785 self._le = _LGBMLabelEncoder().fit(y)
786 _y = self._le.transform(y)
787 self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))
AttributeError: 'dict' object has no attribute '_le'
# Cast high-cardinality identifier columns to pandas 'category' dtype
# so LightGBM can treat them as categorical features.
for cat_col in ('HOTELID', 'MPHM'):
    df[cat_col] = df[cat_col].astype('category')
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
# Regression variant of the CV training cell: treats the 0/1 target as a
# continuous value and scores with L1/L2 instead of AUC.
cate_cols = []
drop_col = ['in_hotel_num', 'type']
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
# NOTE(review): dropping on a slice of df mutates a view and the drop is
# redundant anyway — the next line re-selects used_features only.
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features]
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]
# Expanding-window time-series CV; only the last fold is used below.
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1
}
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Skip the four smaller folds; train/validate on the largest split only.
    if fold_n in [0, 1, 2, 3]:
        continue
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                            categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                            categorical_feature=cate_cols)
    # Native LightGBM training with early stopping on the validation fold.
    clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))
    print("ReTraining on all data")
    gc.enable()
    del trn_data, val_data
    gc.collect()
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    # Refit on all training data for 1.2x the early-stopped iteration count.
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
Training with validation
Training until validation scores don't improve for 100 rounds
[500] training's l2: 0.0357266 training's l1: 0.0733432 valid_1's l2: 0.0140039 valid_1's l1: 0.0424839
[1000] training's l2: 0.034654 training's l1: 0.0715026 valid_1's l2: 0.0136241 valid_1's l1: 0.0362867
[1500] training's l2: 0.0340906 training's l1: 0.0702526 valid_1's l2: 0.0134038 valid_1's l1: 0.0323129
[2000] training's l2: 0.0337541 training's l1: 0.0694053 valid_1's l2: 0.0132898 valid_1's l1: 0.0298233
[2500] training's l2: 0.0335207 training's l1: 0.0688021 valid_1's l2: 0.0131952 valid_1's l1: 0.028312
[3000] training's l2: 0.0333523 training's l1: 0.068367 valid_1's l2: 0.0131123 valid_1's l1: 0.0273034
[3500] training's l2: 0.033219 training's l1: 0.0680436 valid_1's l2: 0.0130372 valid_1's l1: 0.0264549
[4000] training's l2: 0.033109 training's l1: 0.0678005 valid_1's l2: 0.0129586 valid_1's l1: 0.0259314
[4500] training's l2: 0.0330175 training's l1: 0.0676386 valid_1's l2: 0.0129 valid_1's l1: 0.0256456
[5000] training's l2: 0.0329394 training's l1: 0.0675283 valid_1's l2: 0.0128614 valid_1's l1: 0.0254616
[5500] training's l2: 0.0328693 training's l1: 0.0674366 valid_1's l2: 0.0128206 valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586] training's l2: 0.0328584 training's l1: 0.067422 valid_1's l2: 0.0128154 valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration: 5586
pred1 = clf.predict(test)
pred1.min(), pred1.max()
(0.2106748391712024, 0.9860467852769609)
# Attach the regression predictions to the test rows and keep the
# submission columns; `pp` isolates the 2021-09-21 slice for inspection.
pre_d = df.loc[df['type'] == 'test']
pre_d['ROOM_EMPTY'] = pred1.tolist()
pre_d1 = pre_d.loc[:, ['HOTELID', 'DATE', 'ROOM_EMPTY']]
pp = pre_d1.loc[pre_d1['DATE'] == '2021-09-21']
pp[:5]
|
HOTELID |
DATE |
ROOM_EMPTY |
356552 |
10083 |
2021-09-21 |
0.726068 |
355831 |
10125 |
2021-09-21 |
0.825109 |
354872 |
10237 |
2021-09-21 |
0.841640 |
356181 |
10273 |
2021-09-21 |
0.841640 |
356825 |
104814 |
2021-09-21 |
0.726068 |
np.median(pp['ROOM_EMPTY'])
0.8079157059926819
pre_d1[pre_d1['ROOM_EMPTY']>0.807].shape
(2667, 3)
pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x>0.807 else 1)
pre_d1['ROOM_EMPTY'].sum()
644
pre_d1.to_csv('pre_02_reg.csv', index=False)
pre_d1['ROOM_EMPTY'].median()
0.8752981387595966
pre_d1['ROOM_EMPTY'].quantile(0.2)
0.8070988321951651
sns.countplot(pre_d1['ROOM_EMPTY'])
[figure: seaborn countplot of ROOM_EMPTY — original image link (output_177_1.png) broken in the export; re-upload the image to restore it]
pre_d1.shape
(3311, 3)
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
nan
pred.min(), pred.max()
(0.03637936444857589, 1.8554153046371984)
pre_df = df[df['type'] == 'test']
pre_df['ROOM_EMPTY'] = pred.tolist()
pre_df1.to_csv('origion.csv', index=False)
pre_df[:5]
|
HOTELID |
in_hotel_num |
type |
DATE |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
in_weekofyear |
in_is_wknd |
holiday |
last_1_qty |
last_2_qty |
last_3_qty |
last_4_qty |
last_5_qty |
last_6_qty |
last_7_qty |
last_8_qty |
last_10_qty |
last_12_qty |
last_15_qty |
last_20_qty |
qty_rolling3_mean |
qty_rolling3_max |
qty_rolling3_sum |
qty_rolling4_mean |
qty_rolling4_max |
qty_rolling4_sum |
qty_rolling6_mean |
qty_rolling6_max |
qty_rolling6_sum |
qty_ewm_mean |
qty_ewm_std |
qty_ewm_corr |
ROOM_EMPTY |
356546 |
10083 |
0 |
test |
2021-09-04 |
2021 |
9 |
4 |
3 |
5 |
247 |
35 |
1 |
0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
6.328848e-12 |
1.0 |
0.054686 |
356547 |
10083 |
0 |
test |
2021-09-05 |
2021 |
9 |
5 |
3 |
6 |
248 |
35 |
1 |
0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
2.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
1.415174e-12 |
1.0 |
0.055554 |
356548 |
10083 |
0 |
test |
2021-09-11 |
2021 |
9 |
11 |
3 |
5 |
254 |
36 |
1 |
0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
3.164424e-13 |
1.0 |
0.247202 |
356549 |
10083 |
0 |
test |
2021-09-12 |
2021 |
9 |
12 |
3 |
6 |
255 |
36 |
1 |
0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
7.075868e-14 |
1.0 |
0.105092 |
356550 |
10083 |
0 |
test |
2021-09-19 |
2021 |
9 |
19 |
3 |
6 |
262 |
37 |
1 |
1 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
3.0 |
9.0 |
3.0 |
3.0 |
12.0 |
3.0 |
3.0 |
18.0 |
3.0 |
1.582212e-14 |
1.0 |
0.094995 |
pre_df1 = pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']]
pre_df1[pre_df1['ROOM_EMPTY']>0.07].shape
(2380, 3)
pre_df1['ROOM_EMPTY'] = pre_df1['ROOM_EMPTY'].apply(lambda x: 0 if x>=0.25 else 1)
import numpy as np
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
0.7065505364632024
a = pd.DataFrame()
a['y'] = labels[valid_index].tolist()
a['val'] = val.tolist()
fpr, tpr, thresholds = metrics.roc_curve(a['y'], a['val'], pos_label=2)
metrics.auc(fpr, tpr)
nan
np.array(a['val'].values.tolist())
array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
0.99187287])
a['y'].max()
1
a['y'] = a['y'].apply(lambda x: 1 if x==0 else 0)
a['val'] = a['val'].apply(lambda x: (1-x/(4.03212491)))
a[:5]
|
y |
val |
0 |
1 |
0.980780 |
1 |
1 |
0.980282 |
2 |
1 |
0.980909 |
3 |
1 |
0.986415 |
4 |
1 |
0.988064 |
val.max()-val.min()
4.0321249098355265
labels[valid_index]
array([0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0.])
val[140:160]
array([0.02691592, 0.02701038, 0.02690596, 0.02699238, 0.02690596,
0.07806598, 0.07950708, 0.29049198, 0.77018916, 0.98434103,
0.05606983, 0.06270357, 0.04589361, 0.05028036, 0.04938166,
0.05054352, 0.04737767, 0.11532118, 0.05313139, 0.04558791])
gc.collect()
388
from tqdm import tqdm
from sklearn.metrics import *
print("#############################输出第一个模型的评价参数及结果#############################")
def find_best_threshold(y_valid, oof_prob):
    """Grid-search a binarization threshold for `oof_prob` maximizing AUC.

    Scans thresholds in [0.05, 1.0) in steps of 1/2000, binarizes
    `oof_prob` at each threshold, and returns (best_threshold, best_score).

    NOTE(review): despite the surrounding "F2" labels, the score computed
    here is ROC-AUC, not an F2 measure.
    NOTE(review): sklearn's roc_curve signature is (y_true, y_score); this
    call passes the binarized array first. The call site also passes
    (predictions, labels) into (y_valid, oof_prob), so the names are
    swapped relative to their contents — confirm the intended order.
    """
    # Fix: removed dead locals `recall`/`precision` that were never used.
    best_score = 0
    best_th = 0
    for th in tqdm([i / 2000 for i in range(100, 2000)]):
        binarized = oof_prob.copy()
        binarized[binarized >= th] = 1
        binarized[binarized < th] = 0
        fpr, tpr, thresholds = metrics.roc_curve(binarized, y_valid, pos_label=2)
        score = metrics.auc(fpr, tpr)
        if score > best_score:
            best_th = th
            best_score = score
    return best_th, best_score
val1 = val
y1 = labels[valid_index]
best_th, aucs = find_best_threshold(val1, y1)
print("分界值", best_th)
print("F2评价分数", aucs)
1%|▏ | 10/1900 [00:00<00:23, 80.05it/s]
#############################输出第一个模型的评价参数及结果#############################
100%|██████████████████████████████████████| 1900/1900 [00:20<00:00, 94.61it/s]
分界值 0.05
F2评价分数 0.7065505364632024
pre_df1[20:33]
|
HOTELID |
DATE |
ROOM_EMPTY |
354872 |
10237 |
2021-09-21 |
0 |
356175 |
10273 |
2021-09-04 |
0 |
356176 |
10273 |
2021-09-05 |
0 |
356177 |
10273 |
2021-09-11 |
0 |
356178 |
10273 |
2021-09-12 |
0 |
356179 |
10273 |
2021-09-19 |
0 |
356180 |
10273 |
2021-09-20 |
0 |
356181 |
10273 |
2021-09-21 |
0 |
356819 |
104814 |
2021-09-04 |
0 |
356820 |
104814 |
2021-09-05 |
0 |
356821 |
104814 |
2021-09-11 |
0 |
356822 |
104814 |
2021-09-12 |
0 |
356823 |
104814 |
2021-09-19 |
0 |
pre_df1.to_csv('pre_025_regression.csv', index=False)
pre_df1['ROOM_EMPTY'].mode()
0 0.20966
dtype: float64
pre_df1['ROOM_EMPTY'].median()
0.10509209231427108
plt.figure(figsize=(16,8))
sns.countplot(pre_df1['ROOM_EMPTY'])
[figure: seaborn countplot of pre_df1['ROOM_EMPTY'] — original image link (output_213_1.png) broken in the export; re-upload the image to restore it]
df4['date1'] = df4['date'].astype('str')
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]
df_1.shape
(11583, 59)
df_1['in_month'].value_counts()
7 4391
5 2520
6 2192
8 1656
9 824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14',
'2021-09-19', '2021-09-20', '2021-09-21']
df4[df4['date']=='2021-05-03']
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
INSERT_TIME |
MODIFY_TIME |
FIRM |
GUEST_ID |
BDATE |
XZQH |
IN_TIME |
OUT_TIME |
guest_sum |
guest_sum_notnull |
in_time |
out_time |
datetime |
in_date |
out_date |
date |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
CALLED |
BUR_CODE |
STA_CODE |
SSX |
ADDRESS |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_date |
BG_date |
DJ_gap |
in_hotel_num |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
record_is_wknd |
in_is_wknd |
df_2 = df4[df4['date1'].isin(holiday)]
df_2['holiday'] = 1
df_1['holiday'] = 0
df5 = pd.concat([df_2, df_1])
df5.shape
(14662, 61)
df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
df5[:3]
|
ORDER_ID |
ORDER_PRIMARY_ID |
HOTELID |
ORDER_TIME |
STATUS |
FIRM |
GUEST_ID |
BDATE |
XZQH |
guest_sum |
in_time |
out_time |
datetime |
in_date |
out_date |
date |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
BUR_CODE |
STA_CODE |
SSX |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_date |
BG_date |
DJ_gap |
in_hotel_num |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
record_is_wknd |
in_is_wknd |
date1 |
holiday |
9 |
3597092 |
CFD17F6BB142485DB3DEC6B5E2D0B664 |
100193 |
202105012046 |
1 |
3 |
NaN |
19771008 |
132826 |
1 |
2021-05-01 20:46:00 |
2021-05-02 11:58:00 |
2021-05-01 20:46:00 |
2021-05-01 |
2021-05-02 |
2021-05-01 |
NaN |
2 |
2 |
1 |
NaN |
NaN |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
NaN |
1249 |
2281 |
4727 |
NaN |
NaN |
150 |
2565 |
3429 |
NaN |
NaN |
1.0 |
2020-05-18 |
2020-07-15 |
58.0 |
1 |
2021 |
5 |
1 |
2 |
5 |
121 |
1 |
1 |
2021-05-01 |
1 |
11 |
3601837 |
5C96E986C1F04691A3EB07962BCC7B00 |
100195 |
202105021435 |
1 |
3 |
NaN |
19951009 |
370203 |
1 |
2021-05-02 14:35:00 |
2021-05-03 11:58:00 |
2021-05-02 14:35:00 |
2021-05-02 |
2021-05-03 |
2021-05-02 |
NaN |
2 |
3 |
1 |
NaN |
NaN |
371083 |
ROOM001 |
2020-05-18 10:33:55 |
2020-07-15 10:23:58 |
1 |
1 |
NaN |
1249 |
895 |
4727 |
NaN |
NaN |
150 |
2565 |
3429 |
NaN |
NaN |
1.5 |
2020-05-18 |
2020-07-15 |
58.0 |
1 |
2021 |
5 |
2 |
2 |
6 |
122 |
1 |
1 |
2021-05-02 |
1 |
53 |
3314117 |
C0C5516462D3487C895DD654B836F855 |
10037 |
202104082130 |
1 |
3 |
NaN |
19730313 |
120103 |
1 |
2021-04-30 13:58:00 |
2021-05-03 11:58:00 |
2021-05-01 13:58:00 |
2021-04-30 |
2021-05-03 |
2021-05-01 |
NaN |
1 |
2 |
1 |
NaN |
NaN |
371002 |
ROOM001 |
2020-05-18 10:22:13 |
2020-07-15 10:23:58 |
2 |
2 |
NaN |
3091 |
2281 |
4727 |
NaN |
NaN |
4661 |
2565 |
3429 |
NaN |
NaN |
2.0 |
2020-05-18 |
2020-07-15 |
58.0 |
1 |
2021 |
5 |
1 |
2 |
5 |
121 |
1 |
1 |
2021-05-01 |
1 |
# De-duplicate on every column except the holiday flag, so a row that
# appears in both the holiday and weekend frames is kept only once
# (keep='first' retains the earlier concatenated copy).
drop_col = [c for c in df5.columns.tolist() if c != 'holiday']
df5.drop_duplicates(drop_col, keep='first', inplace=True)
df5.shape
(12708, 54)
df5['HOTELID'].nunique()
3133
test = pd.read_csv('testb/submit_example_2.csv')
test[:2]
|
HOTELID |
DATE |
ROOM_EMPTY |
0 |
303760 |
2021-09-04 |
0.1 |
1 |
303760 |
2021-09-05 |
0.1 |
test.info()
RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 3311 non-null object
1 DATE 3311 non-null object
2 ROOM_EMPTY 3311 non-null float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB
hotel = test['HOTELID'].values.tolist()
df5 = df5[df5['HOTELID'].isin(hotel)]
df5.shape
(3708, 54)
d = df5[df5['in_month']==9]
d['date'].value_counts()
2021-09-04 42
2021-09-05 29
2021-09-11 16
2021-09-19 14
2021-09-20 13
2021-09-18 12
2021-09-12 11
2020-09-06 9
2020-09-13 9
2020-09-05 9
2021-09-21 7
2020-09-19 7
2021-09-26 6
2020-09-27 6
2021-09-25 6
2020-09-26 6
2020-09-20 6
2020-09-12 4
Name: date, dtype: int64
test['date1'] = test['DATE']
test1 = test.merge(df5, on=['HOTELID', 'date1'], how='left')
test.shape
(3311, 4)
test1.shape
(3311, 56)
test1.info()
Int64Index: 3311 entries, 0 to 3310
Data columns (total 56 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 HOTELID 3311 non-null object
1 DATE 3311 non-null object
2 ROOM_EMPTY 3311 non-null float64
3 date1 3311 non-null object
4 ORDER_ID 132 non-null float64
5 ORDER_PRIMARY_ID 132 non-null object
6 ORDER_TIME 132 non-null object
7 STATUS 132 non-null float64
8 FIRM 132 non-null float64
9 GUEST_ID 16 non-null object
10 BDATE 132 non-null object
11 XZQH 132 non-null object
12 guest_sum 132 non-null float64
13 in_time 132 non-null datetime64[ns]
14 out_time 132 non-null datetime64[ns]
15 datetime 132 non-null datetime64[ns]
16 in_date 132 non-null object
17 out_date 132 non-null object
18 date 132 non-null object
19 JYMJ 1 non-null float64
20 ROOM_NUM 132 non-null float64
21 BED_NUM 132 non-null float64
22 CZLY 132 non-null float64
23 BUR_CODE 86 non-null float64
24 STA_CODE 86 non-null object
25 SSX 132 non-null float64
26 MPHM 132 non-null object
27 DJSJ 132 non-null datetime64[ns]
28 BGSJ 132 non-null datetime64[ns]
29 CALLED_freq 132 non-null float64
30 ADDRESS_freq 132 non-null float64
31 JYMJ_freq 1 non-null float64
32 ROOM_NUM_freq 132 non-null float64
33 BED_NUM_freq 132 non-null float64
34 CZLY_freq 132 non-null float64
35 BUR_CODE_freq 86 non-null float64
36 STA_CODE_freq 86 non-null float64
37 SSX_freq 132 non-null float64
38 MPHM_freq 132 non-null float64
39 FIRM_freq 132 non-null float64
40 room_ratio 1 non-null float64
41 bed_ratio 1 non-null float64
42 room_bed 132 non-null float64
43 DJ_date 132 non-null object
44 BG_date 132 non-null object
45 DJ_gap 132 non-null float64
46 in_hotel_num 132 non-null float64
47 in_year 132 non-null float64
48 in_month 132 non-null float64
49 in_day 132 non-null float64
50 in_quarter 132 non-null float64
51 in_dayofweek 132 non-null float64
52 in_dayofyear 132 non-null float64
53 record_is_wknd 132 non-null float64
54 in_is_wknd 132 non-null float64
55 holiday 132 non-null float64
dtypes: datetime64[ns](5), float64(36), object(15)
memory usage: 1.4+ MB
test1[:5]
|
HOTELID |
DATE |
ROOM_EMPTY |
date1 |
ORDER_ID |
ORDER_PRIMARY_ID |
ORDER_TIME |
STATUS |
FIRM |
GUEST_ID |
BDATE |
XZQH |
guest_sum |
in_time |
out_time |
datetime |
in_date |
out_date |
date |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
BUR_CODE |
STA_CODE |
SSX |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_date |
BG_date |
DJ_gap |
in_hotel_num |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
record_is_wknd |
in_is_wknd |
holiday |
0 |
303760 |
2021-09-04 |
0.1 |
2021-09-04 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
1 |
303760 |
2021-09-05 |
0.1 |
2021-09-05 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
2 |
303760 |
2021-09-11 |
0.1 |
2021-09-11 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
3 |
303760 |
2021-09-12 |
0.1 |
2021-09-12 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
4 |
303760 |
2021-09-19 |
0.1 |
2021-09-19 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
test1['holiday1'] = test1['holiday'].apply(lambda x: 0 if x else 1)
test1['holiday1'].value_counts()
0 3213
1 98
Name: holiday1, dtype: int64
test1[:10]
|
HOTELID |
DATE |
ROOM_EMPTY |
date1 |
ORDER_ID |
ORDER_PRIMARY_ID |
ORDER_TIME |
STATUS |
FIRM |
GUEST_ID |
BDATE |
XZQH |
guest_sum |
in_time |
out_time |
datetime |
in_date |
out_date |
date |
JYMJ |
ROOM_NUM |
BED_NUM |
CZLY |
BUR_CODE |
STA_CODE |
SSX |
MPHM |
DJSJ |
BGSJ |
CALLED_freq |
ADDRESS_freq |
JYMJ_freq |
ROOM_NUM_freq |
BED_NUM_freq |
CZLY_freq |
BUR_CODE_freq |
STA_CODE_freq |
SSX_freq |
MPHM_freq |
FIRM_freq |
room_ratio |
bed_ratio |
room_bed |
DJ_date |
BG_date |
DJ_gap |
in_hotel_num |
in_year |
in_month |
in_day |
in_quarter |
in_dayofweek |
in_dayofyear |
record_is_wknd |
in_is_wknd |
holiday |
holiday1 |
0 |
303760 |
2021-09-04 |
0.1 |
2021-09-04 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
1 |
303760 |
2021-09-05 |
0.1 |
2021-09-05 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
2 |
303760 |
2021-09-11 |
0.1 |
2021-09-11 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
3 |
303760 |
2021-09-12 |
0.1 |
2021-09-12 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
4 |
303760 |
2021-09-19 |
0.1 |
2021-09-19 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
5 |
303760 |
2021-09-20 |
0.1 |
2021-09-20 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
6 |
303760 |
2021-09-21 |
0.1 |
2021-09-21 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
7 |
315471 |
2021-09-04 |
0.1 |
2021-09-04 |
4798603.0 |
A139D5C20BB44529A66957AA491FB88E |
202108212327 |
1.0 |
3.0 |
NaN |
20001101 |
331021 |
1.0 |
2021-09-03 13:58:00 |
2021-09-05 11:58:00 |
2021-09-04 13:58:00 |
2021-09-03 |
2021-09-05 |
2021-09-04 |
NaN |
1.0 |
2.0 |
1.0 |
3.710980e+11 |
371000000007 |
371002.0 |
ROOM001 |
2021-05-05 13:58:58 |
2021-05-05 14:02:45 |
2.0 |
1.0 |
NaN |
3091.0 |
2281.0 |
4727.0 |
852.0 |
433.0 |
4661.0 |
2565.0 |
3429.0 |
NaN |
NaN |
2.0 |
2021-05-05 |
2021-05-05 |
0.0 |
1.0 |
2021.0 |
9.0 |
4.0 |
3.0 |
5.0 |
247.0 |
1.0 |
1.0 |
0.0 |
1 |
8 |
315471 |
2021-09-05 |
0.1 |
2021-09-05 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
9 |
315471 |
2021-09-11 |
0.1 |
2021-09-11 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaT |
NaT |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
0 |
# Use the inverted holiday flag directly as the room-vacancy prediction,
# then keep only the three columns required for the submission file.
test1['ROOM_EMPTY'] = test1['holiday1']
test2 = test1.filter(items=['HOTELID', 'DATE', 'ROOM_EMPTY'])
test2.head(5)
|
HOTELID |
DATE |
ROOM_EMPTY |
0 |
303760 |
2021-09-04 |
0 |
1 |
303760 |
2021-09-05 |
0 |
2 |
303760 |
2021-09-11 |
0 |
3 |
303760 |
2021-09-12 |
0 |
4 |
303760 |
2021-09-19 |
0 |
# Persist the baseline prediction, then load a previously generated
# prediction file to blend with it.
test2.to_csv('pre.csv', index=False)
pre = pd.read_csv('file41704172.csv')
pre.head(2)
|
HOTELID |
DATE |
room |
0 |
303760 |
2021-09-04 |
0.0 |
1 |
303760 |
2021-09-05 |
0.0 |
# Blend the holiday-based baseline with the previously saved prediction.
# NOTE(review): the loaded file already exposes the column as 'room'
# (see the preview above), so this rename is a no-op safeguard in case
# the file was exported with the 'ROOM_EMPTY' header instead.
pre.rename(columns={'ROOM_EMPTY': 'room'}, inplace=True)
pr = test2.merge(pre, on=['HOTELID', 'DATE'], how='left')
# A left merge leaves NaN in 'room' for keys absent from `pre`; fill
# those with 0 so the blended prediction never becomes NaN.
pr['ROOM_EMPTY'] = pr['ROOM_EMPTY'] + pr['room'].fillna(0)
del pr['room']
pre['room'].value_counts()
0.0 2838
1.0 473
Name: room, dtype: int64
# Export the blended prediction for submission (no row index column).
pr.to_csv('pre1.csv', index=False)
# Monthly order volume: number of orders per (year, month) pair.
dd = df5.groupby(['in_year', 'in_month'], as_index=False)['ORDER_ID'].count()
dd
|
in_year |
in_month |
ORDER_ID |
0 |
2020 |
7 |
12 |
1 |
2020 |
8 |
102 |
2 |
2020 |
9 |
56 |
3 |
2021 |
5 |
735 |
4 |
2021 |
6 |
833 |
5 |
2021 |
7 |
1279 |
6 |
2021 |
8 |
535 |
7 |
2021 |
9 |
156 |
# Collapse year and month into a single YYYYMM integer key.
dd['ym'] = dd['in_month'] + dd['in_year'] * 100
dd['ym'].value_counts()
202105 1
202007 1
202008 1
202009 1
202106 1
202107 1
202108 1
202109 1
Name: ym, dtype: int64
# Visualise the monthly order counts as a scatter over the group index.
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(dd.index, dd['ORDER_ID'])
![Scatter plot of monthly order counts](output_270_1.png)