Untitled555555555

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings


# Silence every library warning for the rest of the session.
warnings.filterwarnings('ignore')

# Matplotlib: use the SimHei font so Chinese labels render, and keep the
# minus sign displayable with that font.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pandas display: no limit on shown columns/rows, cell text cut at 100 characters.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

# Folders holding the two training batches.
path = 'traina/'
path1 = 'trainb/'

入住信息

cust_info = pd.read_csv(path + '网约房平台入住人表.csv')
cust_info1 = pd.read_csv(path1 + '网约房平台入住人表.csv')
cust = pd.concat([cust_info, cust_info1])
cust.head(2)
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME
0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 NaN NaN
1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
gc.collect()
216
# Birth date and region code are handled as text from here on.
cust = cust.astype({'BDATE': 'str', 'XZQH': 'str'})

# Actual check-in / check-out: a missing value is replaced by 0 before the cast
# (so it ends up as the string '0.0'); only the first 12 characters of the text
# form are kept, which matches the '%Y%m%d%H%M' layout parsed later on.
cust['IN_TIME'] = cust['IN_TIME'].fillna(0).astype('str').str[:12]
cust['OUT_TIME'] = cust['OUT_TIME'].fillna(0).astype('str').str[:12]
cust.head(2)
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME
0 07C5BF73B18B44B0877DEED007F8771D NaN 19800627 222405 0.0 0.0
1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158
cust.info()

Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_PRIMARY_ID  44030 non-null  object
 1   GUEST_ID          5158 non-null   object
 2   BDATE             44030 non-null  object
 3   XZQH              44030 non-null  object
 4   IN_TIME           44030 non-null  object
 5   OUT_TIME          44030 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB
cust.nunique()
ORDER_PRIMARY_ID    43883
GUEST_ID             3698
BDATE                9503
XZQH                 2659
IN_TIME              3396
OUT_TIME              236
dtype: int64
cust['ORDER_PRIMARY_ID'].value_counts()[5:10]
03158D5CA62E42339D697612EA347FB3    2
0A99D74F1E0248D68A729CB79FC640E6    2
1FE3E448558347D89C56B0AFCEC8ACFB    2
864E417B7BDE4C7B9449B7E087E9F21E    2
3DEFA75566934CA5929D0C47F0B95FDE    2
Name: ORDER_PRIMARY_ID, dtype: int64
# Sort by order key, then guest id. Missing GUEST_ID values sort last, so the
# first row of each order carries a guest id whenever the order has one.
cust = cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'])

# Per order: count of BDATE values (guest_sum) and count of non-null
# GUEST_ID values (guest_sum_notnull).
stat = (
    cust.groupby('ORDER_PRIMARY_ID')
        .agg(guest_sum=('BDATE', 'count'), guest_sum_notnull=('GUEST_ID', 'count'))
        .reset_index()
)

# Attach both counts to every guest row, then keep one row per order.
cust = (
    cust.merge(stat, on='ORDER_PRIMARY_ID', how='left')
        .drop_duplicates('ORDER_PRIMARY_ID', keep='first')
)
cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']
ORDER_PRIMARY_ID GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull
22801 70798782D6C04A438360D80AFE4845C1 371099B389202107040001 19990314 130983 0.0 0.0 3 2
cust['IN_TIME'].max(), cust['IN_TIME'].min()
('202110041413', '0.0')

订单信息

# platform infomation
order_info = pd.read_csv(path + '网约平台旅客订单信息.csv') 
order1 = pd.read_csv(path1 + '网约平台旅客订单信息.csv')
order = pd.concat([order_info, order1])
order.head(2)
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
0 923521 96BBDB7CC049421C85826AE07020B139 278337 202008011200 202008021200 1.596120e+11 1 NaN 20200730224152 20200730224152 3
1 923696 C72F20539AD1447D86CD1A8E5EAEC63A 282932 202008041400 202008061200 1.596121e+11 1 NaN 20200730225524 20200730225524 3
order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
30970 4422027 3CDEDB5E03534D379687645675898CA4 170898 202108081358 202108101158 2.021071e+11 3 NaN 20210712210153 20210712210153 3
# Timestamp-like columns are handled as text; a missing value becomes the
# literal string 'nan'.
order = order.astype({
    'PRE_IN_TIME': 'str',
    'PRE_OUT_TIME': 'str',
    'INSERT_TIME': 'str',
    'MODIFY_TIME': 'str',
    'CANCEL_TIME': 'str',
})

# ORDER_TIME: a missing value is replaced by 0 before the cast; only the first
# 12 characters of the text form are kept.
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').str[:12]

##########################没有考虑包含取消时间的订单

# Keep orders whose CANCEL_TIME is the string 'nan' (i.e. it was missing before
# the text cast), then keep the row with the greatest MODIFY_TIME per ORDER_ID.
order = (
    order[order['CANCEL_TIME'].eq('nan')]
        .sort_values(['ORDER_ID', 'MODIFY_TIME'])
        .drop_duplicates('ORDER_ID', keep='last')
)
order.shape
(29941, 11)
order[order['ORDER_ID']==4402846]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM
30141 4402846 BC0E8E3602434EA6A5F29A6F6FF42233 100177 202107111358 202107121158 202107111032 1 nan 20210711140042 20210711140042 3
order.info()

Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_ID          29941 non-null  int64 
 1   ORDER_PRIMARY_ID  29941 non-null  object
 2   HOTELID           29941 non-null  object
 3   PRE_IN_TIME       29941 non-null  object
 4   PRE_OUT_TIME      29941 non-null  object
 5   ORDER_TIME        29941 non-null  object
 6   STATUS            29941 non-null  int64 
 7   CANCEL_TIME       29941 non-null  object
 8   INSERT_TIME       29941 non-null  object
 9   MODIFY_TIME       29941 non-null  object
 10  FIRM              29941 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 2.7+ MB
order.nunique()
ORDER_ID            29941
ORDER_PRIMARY_ID    29941
HOTELID              4801
PRE_IN_TIME          6875
PRE_OUT_TIME          729
ORDER_TIME          26818
STATUS                  3
CANCEL_TIME             1
INSERT_TIME         25432
MODIFY_TIME         25432
FIRM                    2
dtype: int64
order_info['FIRM'].value_counts()
3     32029
10     8521
Name: FIRM, dtype: int64
order['STATUS'].value_counts()
1    26137
2     3221
3      583
Name: STATUS, dtype: int64
#插入时间不等于修改时间,就是进行过修改过的订单,最后发现为空,等于没有修改过
order[order['INSERT_TIME']!=order['MODIFY_TIME']]
ORDER_ID ORDER_PRIMARY_ID HOTELID PRE_IN_TIME PRE_OUT_TIME ORDER_TIME STATUS CANCEL_TIME INSERT_TIME MODIFY_TIME FIRM

合并数据order和cust

# order时间包含了cust时间
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

月份天数:1、3、5、7、8、10、12月为31天;4、6、9、11月为30天;2月为28天(闰年为29天)。

# The guest table encodes a missing check-in/check-out as the string '0.0'.
# Treat that as missing and fall back to the planned time from the order table.
# (Plain assignment is used instead of `df[col].fillna(..., inplace=True)`:
# the chained in-place form is deprecated and is a no-op under Copy-on-Write.)
in_actual = df['IN_TIME'].mask(df['IN_TIME'].eq('0.0')).fillna(df['PRE_IN_TIME'])
out_actual = df['OUT_TIME'].mask(df['OUT_TIME'].eq('0.0')).fillna(df['PRE_OUT_TIME'])

# Use the later of (actual, planned) for both ends of the stay.
# The comparison is a plain string comparison, so it is only chronological
# while both sides are equally long yyyymmddHHMM digit strings.
# NOTE(review): this also pushes an early check-in forward to the planned time — confirm that is intended.
df['OUT_TIME'] = out_actual.where(out_actual >= df['PRE_OUT_TIME'], df['PRE_OUT_TIME'])
df['IN_TIME'] = in_actual.where(in_actual >= df['PRE_IN_TIME'], df['PRE_IN_TIME'])

# The planned times and the (constant) cancellation column are no longer needed.
df = df.drop(columns=['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'])

#######订单插入时间和订单时间的时间差

df.head(2)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull
0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0
1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0
df.info()

Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ORDER_ID           29941 non-null  int64 
 1   ORDER_PRIMARY_ID   29941 non-null  object
 2   HOTELID            29941 non-null  object
 3   ORDER_TIME         29941 non-null  object
 4   STATUS             29941 non-null  int64 
 5   INSERT_TIME        29941 non-null  object
 6   MODIFY_TIME        29941 non-null  object
 7   FIRM               29941 non-null  int64 
 8   GUEST_ID           3561 non-null   object
 9   BDATE              29941 non-null  object
 10  XZQH               29941 non-null  object
 11  IN_TIME            29941 non-null  object
 12  OUT_TIME           29941 non-null  object
 13  guest_sum          29941 non-null  int64 
 14  guest_sum_notnull  29941 non-null  int64 
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
df.nunique();
# Parse the yyyymmddHHMM strings; values that do not match become NaT.
df = df.assign(
    in_time=pd.to_datetime(df['IN_TIME'], format='%Y%m%d%H%M', errors='coerce'),
    out_time=pd.to_datetime(df['OUT_TIME'], format='%Y%m%d%H%M', errors='coerce'),
)
# Only rows with a parseable check-out are kept (in_time is not checked here).
df = df[df['out_time'].notna()]
df.head(2)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time
0 706648 A4AACE06B518418C8A8CA1935DDD1C5A 227185 202007092241 1 20200714171021 20200714171021 3 NaN 20010409 371002 202007151300 202007161200 1 0 2020-07-15 13:00:00 2020-07-16 12:00:00
1 748647 67D551AACFD049AE9CC2AB65E9870678 9483 202007132109 1 20201002101040 20201002101040 3 NaN 19881023 341202 202010011400 202010021011 1 0 2020-10-01 14:00:00 2020-10-02 10:11:00
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时
#2月份没有
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)
# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)
# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
# df['day_gap'].sum()

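# Date-shifting reminders (notes only, not executed; df has no 'lock_time' column in this notebook):
# shift forward by one day:  df['lock_time'][0] + pd.Timedelta(days=1)
# shift back by one day:     df['lock_time'][0] + pd.Timedelta(days=-1)
# df['in_time'] - timedelta(days=1)
# idea: push every row whose day_gap is 0 back by one day; hourly rooms also fall in this group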

# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。
# Expand every order into one row per calendar day of its stay.
# The range starts at the check-in timestamp and advances in 24-hour steps up to
# the check-out timestamp; only the date part of each step is kept.
# (`closed=None` was dropped from the call: that keyword was removed from
# pd.date_range in pandas 2.0, and None was its default anyway.)
dfs = []
for order_key, group in df.groupby('ORDER_PRIMARY_ID'):
    first = group.iloc[0]
    days = pd.date_range(start=first['IN_TIME'], end=first['OUT_TIME'], freq='D')
    dfs.append(pd.DataFrame({
        'datetime': days.strftime('%Y-%m-%d'),
        'ORDER_PRIMARY_ID': order_key,
    }))
gc.collect()
17
df_date = pd.concat(dfs).reset_index(drop=True)
df_date[:3]
datetime ORDER_PRIMARY_ID
0 2021-06-30 00003FC18B254E86803C00F4BBA382E4
1 2021-07-01 00003FC18B254E86803C00F4BBA382E4
2 2021-07-02 00003FC18B254E86803C00F4BBA382E4
# Full calendar (2020-06-01 .. 2021-08-30, one 'YYYY-MM-DD' string per day) for
# every hotel that appears in df.
# The day list is identical for all hotels, so it is built once; the removed
# `closed=` keyword of pd.date_range (gone in pandas 2.0) is no longer passed,
# and the earlier `stat` frame is no longer overwritten by a loop temporary.
calendar_days = pd.date_range(start='20200601', end='20210830', freq='D').strftime('%Y-%m-%d')
dfs1 = [
    pd.DataFrame({'datetime': calendar_days, 'HOTELID': hotel_id})
    for hotel_id in df.groupby('HOTELID').size().index
]
df_date1 = pd.concat(dfs1).reset_index(drop=True)
df_date1[:10]
datetime ORDER_PRIMARY_ID
0 2020-06-01 00003FC18B254E86803C00F4BBA382E4
1 2020-06-02 00003FC18B254E86803C00F4BBA382E4
2 2020-06-03 00003FC18B254E86803C00F4BBA382E4
3 2020-06-04 00003FC18B254E86803C00F4BBA382E4
4 2020-06-05 00003FC18B254E86803C00F4BBA382E4
5 2020-06-06 00003FC18B254E86803C00F4BBA382E4
6 2020-06-07 00003FC18B254E86803C00F4BBA382E4
7 2020-06-08 00003FC18B254E86803C00F4BBA382E4
8 2020-06-09 00003FC18B254E86803C00F4BBA382E4
9 2020-06-10 00003FC18B254E86803C00F4BBA382E4
# df11: every order row repeated once per calendar day produced for that order
# (orders without any day in df_date keep a single row with a missing 'datetime').
df11 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
# _df1: the per-hotel calendar left-joined with df11; at this point both
# 'datetime' columns are 'YYYY-MM-DD' strings, so the keys compare as text.
_df1 = df_date1.merge(df11, on=['HOTELID', 'datetime'], how='left') 
# df['guest_sum'] = 0
# df['guest_sum_notnull'] = 0
# ddf: hotel-days of the calendar that matched no order row.
# NOTE(review): df11 still contains check-out-day rows here, while the filter
# that builds _df2 removes them; a hotel-day that only has a check-out therefore
# seems to land in neither ddf nor _df2 — confirm whether it should count as empty.
ddf = _df1[_df1['ORDER_ID'].isnull()]
# Date-only views of check-in, check-out and the expanded calendar day.
df11['in_date'] = df11['in_time'].dt.date
df11['out_date'] = df11['out_time'].dt.date
df11['datetime'] = pd.to_datetime(df11['datetime'], errors='coerce', format='%Y-%m-%d')
df11['date'] = df11['datetime'].dt.date

# Keep a day unless it is the check-out day; a stay that starts and ends on the
# same date keeps that single day.
not_checkout_day = df11['out_date'] != df11['date']
same_day_stay = df11['in_date'] == df11['out_date']
# NOTE(review): after the rename 'datetime' holds datetime.date objects, while
# ddf keeps 'YYYY-MM-DD' strings — confirm the later concat/sort copes with the mix.
_df2 = (
    df11[not_checkout_day | same_day_stay]
        .drop(columns=['in_time', 'out_time', 'in_date', 'out_date', 'datetime'])
        .rename(columns={'date': 'datetime'})
)
# Feature: number of order rows per hotel per day.
in_num = (
    _df2.groupby(['HOTELID', 'datetime'], as_index=False)['ORDER_ID']
        .count()
        .rename(columns={'ORDER_ID': 'in_hotel_num'})
)
_df2 = pd.merge(_df2, in_num, how='left', on=['HOTELID', 'datetime'])
_df2.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'datetime',
       'in_hotel_num'],
      dtype='object')
ddf.columns
Index(['datetime', 'HOTELID', 'ORDER_ID', 'ORDER_PRIMARY_ID', 'ORDER_TIME',
       'STATUS', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE',
       'XZQH', 'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull',
       'in_time', 'out_time'],
      dtype='object')
# Hotel-days without any order: drop the parsed timestamps (absent from _df2)
# and record an order count of 0.
ddf = ddf.drop(columns=['in_time', 'out_time']).assign(in_hotel_num=0)

# Stack occupied and empty hotel-days, ordered by hotel and day.
df1 = pd.concat([_df2, ddf]).sort_values(['HOTELID', 'datetime'])

# First and last two rows (DataFrame.append was removed in pandas 2.0, hence pd.concat).
pd.concat([df1.head(2), df1.tail(2)])
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull datetime in_hotel_num
0 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-01 0
1 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-02 0
2196800 NaN NaN B109977684 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2021-08-29 0
2196801 NaN NaN B109977684 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2021-08-30 0
df1.info()

Int64Index: 2186104 entries, 0 to 2196801
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ORDER_ID           float64
 1   ORDER_PRIMARY_ID   object 
 2   HOTELID            object 
 3   ORDER_TIME         object 
 4   STATUS             float64
 5   INSERT_TIME        object 
 6   MODIFY_TIME        object 
 7   FIRM               float64
 8   GUEST_ID           object 
 9   BDATE              object 
 10  XZQH               object 
 11  IN_TIME            object 
 12  OUT_TIME           object 
 13  guest_sum          float64
 14  guest_sum_notnull  float64
 15  datetime           object 
 16  in_hotel_num       int64  
dtypes: float64(5), int64(1), object(11)
memory usage: 300.2+ MB
test = pd.read_csv('testb/submit_example_2.csv')
test.head()
HOTELID DATE ROOM_EMPTY
0 303760 2021-09-04 0.1
1 303760 2021-09-05 0.1
2 303760 2021-09-11 0.1
3 303760 2021-09-12 0.1
4 303760 2021-09-19 0.1
# #增加特征:民宿单日的总订单量
# in_num = df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
# df2 = df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
test['type'] = 'test'
df1['type'] = 'train'
df1['DATE'] = df1['datetime'].astype('str')
df2 = df1[df1['DATE'] < '2021-09-01']
df2.shape
(2185510, 19)
# Calendar features of the stay date.
# df2 is a filtered view of df1, so new columns are added through assign()
# (a fresh frame) instead of being written onto the slice.
df2 = df2.assign(datetime=pd.to_datetime(df2['datetime'], errors='coerce', format='%Y-%m-%d'))
stay_date = df2['datetime'].dt
df2 = df2.assign(
    **{
        'in_' + part: getattr(stay_date, part).fillna(0).astype('int')
        for part in ('year', 'month', 'day', 'quarter', 'dayofweek', 'dayofyear')
    },
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # ISO week number that replaces it.
    in_weekofyear=stay_date.isocalendar().week.fillna(0).astype('int'),
    # dayofweek is 0..6 starting on Monday, so integer division by 5 yields 1
    # for Saturday/Sunday and 0 otherwise.
    in_is_wknd=stay_date.dayofweek // 5,
)
df2.head(3)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull datetime in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd
0 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-01 0 train 2020-06-01 2020 6 1 2 0 153 23 0
1 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-02 0 train 2020-06-02 2020 6 2 2 1 154 23 0
2 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2020-06-03 0 train 2020-06-03 2020 6 3 2 2 155 23 0
#总数据量49547
# 周末数据量15755
# Months whose weekend rows are used.
use_month = [5, 6, 7, 8, 9]
# Dates flagged as holidays. The 2021-09 entries cannot match df2 (it only keeps
# DATE < '2021-09-01'); they are used when the same list is applied to `test`.
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']

# Weekend rows of the selected months get holiday = 0 ...
df_1 = df2[(df2['in_is_wknd'] == 1) & df2['in_month'].isin(use_month)].assign(holiday=0)
# ... rows on a listed holiday date get holiday = 1.
df_2 = df2[df2['DATE'].isin(holiday)].assign(holiday=1)

# Holiday rows are stacked first.
df3 = pd.concat([df_2, df_1])
df3.shape
(373093, 28)
drop_col = df3.columns.tolist()
drop_col.remove('holiday')
df3.drop_duplicates(drop_col, keep='first', inplace=True)
df3.shape
(354019, 28)
# Calendar features for the submission rows, mirroring the ones built for df2.
test = test.assign(datetime=pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d'))
target_date = test['datetime'].dt
test = test.assign(
    **{
        'in_' + part: getattr(target_date, part).fillna(0).astype('int')
        for part in ('year', 'month', 'day', 'quarter', 'dayofweek', 'dayofyear')
    },
    # Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
    # ISO week number that replaces it.
    in_weekofyear=target_date.isocalendar().week.fillna(0).astype('int'),
    # 1 for Saturday/Sunday (dayofweek 5 or 6), 0 otherwise.
    in_is_wknd=target_date.dayofweek // 5,
)
test[:3]
HOTELID DATE ROOM_EMPTY type datetime in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd
0 303760 2021-09-04 0.1 test 2021-09-04 2021 9 4 3 5 247 35 1
1 303760 2021-09-05 0.1 test 2021-09-05 2021 9 5 3 6 248 35 1
2 303760 2021-09-11 0.1 test 2021-09-11 2021 9 11 3 5 254 36 1
# holiday = 1 when the submission date is in the `holiday` list, else 0
# (vectorised membership test instead of a row-by-row apply).
test['holiday'] = test['DATE'].isin(holiday).astype('int64')

# Drop the helper timestamp from both frames and the placeholder target from test.
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
df3.columns
Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
       'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
       'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
       'holiday'],
      dtype='object')
df3.drop(['IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
df4 = df3.merge(test, on=['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day', 'in_quarter', 'in_dayofweek', 
                          'in_dayofyear', 'in_weekofyear', 'in_is_wknd', 'holiday'], how='outer')
df4.shape
(357330, 25)
dd = df4[df4['DATE']>'2021-09-01']
dd['type'].value_counts()
test    3311
Name: type, dtype: int64
df4.info()

Int64Index: 357330 entries, 0 to 357329
Data columns (total 25 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ORDER_ID           12543 non-null   float64
 1   ORDER_PRIMARY_ID   12543 non-null   object 
 2   HOTELID            357330 non-null  object 
 3   ORDER_TIME         12543 non-null   object 
 4   STATUS             12543 non-null   float64
 5   INSERT_TIME        12543 non-null   object 
 6   MODIFY_TIME        12543 non-null   object 
 7   FIRM               12543 non-null   float64
 8   GUEST_ID           1877 non-null    object 
 9   BDATE              12543 non-null   object 
 10  XZQH               12543 non-null   object 
 11  guest_sum          12543 non-null   float64
 12  guest_sum_notnull  12543 non-null   float64
 13  in_hotel_num       354019 non-null  float64
 14  type               357330 non-null  object 
 15  DATE               357330 non-null  object 
 16  in_year            357330 non-null  int64  
 17  in_month           357330 non-null  int64  
 18  in_day             357330 non-null  int64  
 19  in_quarter         357330 non-null  int64  
 20  in_dayofweek       357330 non-null  int64  
 21  in_dayofyear       357330 non-null  int64  
 22  in_weekofyear      357330 non-null  int64  
 23  in_is_wknd         357330 non-null  int64  
 24  holiday            357330 non-null  int64  
dtypes: float64(6), int64(9), object(10)
memory usage: 70.9+ MB

room信息

room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)
CODE HOTELID JYMJ ROOM_NUM BED_NUM FWLY CZLY CALLED CITY_CODE BUR_CODE STA_CODE SSX ADDRESS MPHM JYQK FIRM DJSJ BGSJ STATUS AUDITSTATUS
0 100177 3710830002 0.0 1 1 2 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 371000000000 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 3 2020-05-18 10:33:55 2020-07-15 10:23:58 NaN NaN
room_info['STATUS'].value_counts()
1.0    2658
Name: STATUS, dtype: int64
room_info.drop(['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS','AUDITSTATUS', 'JYQK'], axis=1, inplace=True)
room_info.rename(columns={'CODE':'HOTELID'}, inplace= True)
room_info['FIRM'].value_counts()
3     3429
10    1878
Name: FIRM, dtype: int64
room_info.nunique()
HOTELID     5307
JYMJ         173
ROOM_NUM      11
BED_NUM       17
CZLY           4
CALLED      4232
BUR_CODE       7
STA_CODE      54
SSX            5
ADDRESS     4850
MPHM         584
FIRM           2
DJSJ        3753
BGSJ        3294
dtype: int64

登记时间和变更时间无变化的只有两家

# A floor area (JYMJ) of 0 is treated as missing.
room_info['JYMJ'] = room_info['JYMJ'].mask(room_info['JYMJ'] == 0)

# NOTE(review): 'STATUS' was dropped from room_info in an earlier cell, and this
# list is not used in the visible part of the notebook — confirm it is still needed.
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']

# Frequency encoding: for each listed column add '<col>_freq', the number of
# rows sharing that row's value (missing values stay missing).
# Mapping the value counts gives the same column as the previous merge-per-column
# loop, without rebinding `st`, which is the alias of scipy.stats imported at the top.
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    room_info[col + '_freq'] = room_info[col].map(room_info[col].value_counts())
# Floor area per room, floor area per bed, and beds per room.
# NOTE(review): a ROOM_NUM or BED_NUM of 0 would produce inf here — confirm such rows cannot occur.
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']

# Registration time (DJSJ) and modification time (BGSJ); values that do not
# match the format become NaT.
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date

# Whole calendar days between registration and modification.
# Computed on the timestamps truncated to midnight rather than on the
# object-dtype date columns, so the result is a proper timedelta column
# regardless of how pandas handles arithmetic on Python date objects.
room_info['DJ_gap'] = (room_info['BGSJ'].dt.normalize() - room_info['DJSJ'].dt.normalize()).dt.days
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
 Timestamp('2021-10-19 16:05:41'),
 Timestamp('2020-07-15 10:23:06'),
 Timestamp('2021-10-19 16:09:15'))
# room['BG_dayofweek'].value_counts()
# Bar chart of how many listings share each floor-area value (JYMJ).
# The Series is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
fig, ax = plt.subplots(figsize=(20, 8))
sns.countplot(x=room_info['JYMJ'], ax=ax)
ax.set(title='Listings per floor-area value (JYMJ)', xlabel='JYMJ', ylabel='Number of listings');

![png](output_119_1.png)

经营面积缺失超过一半,3000多,考虑删除

合并room到df中

del room_info['FIRM']
df = df4.merge(room_info, on=['HOTELID'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])
gc.collect()
170
#增加民宿单日的总订单量
# in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
# df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')
df.head(2)
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH guest_sum guest_sum_notnull in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap
37946 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 train 2020-06-06 2020 6 6 2 5 158 23 1 0 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0
37947 NaN NaN 100177 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.0 train 2020-06-07 2020 6 7 2 6 159 23 1 0 NaN 1 1 1 【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽 NaN NaN 371083 乳山市长江路 银泰海景花园 55-301 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 3091 1502 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0
df.info()

Int64Index: 357330 entries, 37946 to 354018
Data columns (total 54 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   ORDER_ID           12543 non-null   float64       
 1   ORDER_PRIMARY_ID   12543 non-null   object        
 2   HOTELID            357330 non-null  object        
 3   ORDER_TIME         12543 non-null   object        
 4   STATUS             12543 non-null   float64       
 5   INSERT_TIME        12543 non-null   object        
 6   MODIFY_TIME        12543 non-null   object        
 7   FIRM               12543 non-null   float64       
 8   GUEST_ID           1877 non-null    object        
 9   BDATE              12543 non-null   object        
 10  XZQH               12543 non-null   object        
 11  guest_sum          12543 non-null   float64       
 12  guest_sum_notnull  12543 non-null   float64       
 13  in_hotel_num       354019 non-null  float64       
 14  type               357330 non-null  object        
 15  DATE               357330 non-null  object        
 16  in_year            357330 non-null  int64         
 17  in_month           357330 non-null  int64         
 18  in_day             357330 non-null  int64         
 19  in_quarter         357330 non-null  int64         
 20  in_dayofweek       357330 non-null  int64         
 21  in_dayofyear       357330 non-null  int64         
 22  in_weekofyear      357330 non-null  int64         
 23  in_is_wknd         357330 non-null  int64         
 24  holiday            357330 non-null  int64         
 25  JYMJ               141308 non-null  float64       
 26  ROOM_NUM           357330 non-null  int64         
 27  BED_NUM            357330 non-null  int64         
 28  CZLY               357330 non-null  int64         
 29  CALLED             357330 non-null  object        
 30  BUR_CODE           167625 non-null  float64       
 31  STA_CODE           167625 non-null  object        
 32  SSX                357330 non-null  int64         
 33  ADDRESS            357330 non-null  object        
 34  MPHM               357330 non-null  object        
 35  DJSJ               356948 non-null  datetime64[ns]
 36  BGSJ               357330 non-null  datetime64[ns]
 37  CALLED_freq        357330 non-null  int64         
 38  ADDRESS_freq       357330 non-null  int64         
 39  JYMJ_freq          141308 non-null  float64       
 40  ROOM_NUM_freq      357330 non-null  int64         
 41  BED_NUM_freq       357330 non-null  int64         
 42  CZLY_freq          357330 non-null  int64         
 43  BUR_CODE_freq      167625 non-null  float64       
 44  STA_CODE_freq      167625 non-null  float64       
 45  SSX_freq           357330 non-null  int64         
 46  MPHM_freq          357330 non-null  int64         
 47  FIRM_freq          357330 non-null  int64         
 48  room_ratio         141308 non-null  float64       
 49  bed_ratio          141308 non-null  float64       
 50  room_bed           357330 non-null  float64       
 51  DJ_date            356948 non-null  object        
 52  BG_date            357330 non-null  object        
 53  DJ_gap             356948 non-null  float64       
dtypes: datetime64[ns](2), float64(15), int64(21), object(16)
memory usage: 149.9+ MB
missing = df.isnull().sum()
missing = missing[missing>180000]
missing_col = missing.index.tolist()
df.drop(columns=missing_col, inplace=True)
df['DATE'].min(), df['DATE'].max()
('2020-06-06', '2021-09-21')
df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)
df[:5]
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday ROOM_NUM BED_NUM CZLY SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq SSX_freq MPHM_freq FIRM_freq room_bed DJ_date BG_date DJ_gap
37946 100177 0.0 train 2020-06-06 2020 6 6 2 5 158 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
37947 100177 0.0 train 2020-06-07 2020 6 7 2 6 159 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
37948 100177 0.0 train 2020-06-13 2020 6 13 2 5 165 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
37949 100177 0.0 train 2020-06-14 2020 6 14 2 6 166 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
37950 100177 0.0 train 2020-06-20 2020 6 20 2 5 172 25 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0
def qty_shift(df, val, lags=(1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20), col_fmt='last_{lag}_qty'):
    """Add lagged copies of column ``val``, computed separately for each HOTELID.

    For every ``lag`` a column is written holding the value of ``val`` found
    ``lag`` rows earlier within the same hotel (rows are taken in the frame's
    current order, so sort it before calling). Gaps are forward-filled *within
    the hotel*; the first rows of a hotel, which have no earlier row, stay NaN.
    The previous implementation forward-filled across the whole column, which
    copied the tail of the preceding hotel into those rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'HOTELID' and ``val``. Modified in place.
    val : str
        Name of the column to lag.
    lags : iterable of int, optional
        Row offsets to generate. Defaults to the offsets used originally.
    col_fmt : str, optional
        ``str.format`` template for the output column name; receives ``lag``
        and ``val``. With the default the name does not depend on ``val``, so
        calling the function for several columns overwrites the same outputs;
        pass e.g. ``'{val}_last_{lag}'`` to keep them apart.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the lag columns added.
    """
    hotel_keys = df['HOTELID'].to_numpy()
    by_hotel = df.groupby('HOTELID')[val]
    for lag in lags:
        shifted = by_hotel.shift(lag)
        # Forward-fill inside each hotel only, never across hotel boundaries.
        df[col_fmt.format(lag=lag, val=val)] = shifted.groupby(hotel_keys, sort=False).ffill()
    return df
# Columns for which lag features are generated.
# NOTE(review): qty_shift writes the same 'last_<k>_qty' column names on every
# call, so each iteration overwrites the columns of the previous one; after the
# loop only the lags of the last entry ('in_quarter') remain — confirm whether
# per-column names were intended.
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']           
for val in vals:
    print(val)
    df = qty_shift(df, val)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
# #昨天,上周,上个月, 去年
# df['yesterday_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_2_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_3_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_4_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_52_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(52).fillna(method='ffill').reset_index().sort_index().set_index('index')
def qty_rolling(df, window, val, keys, prefix='qty', min_periods=3):
    """Add lagged rolling-window statistics of ``val`` to ``df``, per ``keys`` group.

    Inside each group the series is shifted by one row before the window is
    applied, so a row's statistics are built only from rows that precede it
    in that group.

    Three columns are written:
    ``<prefix>_rolling<window>_mean`` (triangular-weighted mean),
    ``<prefix>_rolling<window>_max`` and ``<prefix>_rolling<window>_sum``.

    NOTE: the column names do not contain ``val``. Calling this repeatedly with
    the default ``prefix`` and different ``val`` overwrites the earlier result;
    pass a distinct ``prefix`` (for example ``prefix=val``) to keep every set.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``val`` and ``keys``; it is modified in place.
    window : int
        Number of rows in the rolling window.
    val : str
        Column the statistics are computed from.
    keys : str or list of str
        Column(s) to group by.
    prefix : str, default 'qty'
        Leading part of the generated column names.
    min_periods : int, default 3
        Minimum number of non-null rows required in a window. It is capped at
        ``window`` because pandas raises when ``min_periods > window``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added or replaced.
    """
    periods = min(min_periods, window)
    base = prefix + '_rolling' + str(window)

    def lagged_stat(stat, **roll_kwargs):
        # .values makes the assignment positional, independent of df's index labels.
        return df.groupby(keys)[val].transform(
            lambda x: getattr(
                x.shift(1).rolling(window=window, min_periods=periods, **roll_kwargs),
                stat,
            )()
        ).values

    # The weighted ("triang") window is only used for the mean.
    df[base + '_mean'] = lagged_stat('mean', win_type="triang")
    df[base + '_max'] = lagged_stat('max')
    df[base + '_sum'] = lagged_stat('sum')
    return df

gc.collect()
34
# Rolling statistics over the previous 3, 4 and 6 rows of each hotel.
keys = 'HOTELID'
for val in vals:
    print(val)
    for window in (3, 4, 6):
        df = qty_rolling(df, window, val, keys)
    # qty_rolling writes to names that do not contain `val`, so each pass would
    # overwrite the previous one and only the last entry of `vals` would remain.
    # Move this pass's columns to val-specific names; pop() + assignment keeps
    # the cell safe to re-run.
    for roll_col in [c for c in df.columns if c.startswith('qty_rolling')]:
        df[val + '_' + roll_col] = df.pop(roll_col)
# keys = ['HOTELID']
# df = qty_rolling(df, 2, 'in_hotel_num', keys)

in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
def qty_ewm(df, alpha, val, keys, prefix='qty'):
    """Add lagged exponentially-weighted statistics of ``val`` to ``df``, per ``keys`` group.

    Inside each group the series is shifted by one row before the exponential
    window is applied, so a row's statistics are built only from rows that
    precede it in that group.

    Three columns are written: ``<prefix>_ewm_mean``, ``<prefix>_ewm_std`` and
    ``<prefix>_ewm_corr``.

    NOTE: the column names do not contain ``val``. Calling this repeatedly with
    the default ``prefix`` and different ``val`` overwrites the earlier result;
    pass a distinct ``prefix`` (for example ``prefix=val``) to keep every set.

    NOTE(review): ``corr()`` is called without a second series, so the shifted
    series is correlated with itself; the notebook output shows only 1.0/NaN in
    that column. Confirm whether a second series was intended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds ``val`` and ``keys``; it is modified in place.
    alpha : float
        Smoothing factor passed to ``Series.ewm``.
    val : str
        Column the statistics are computed from.
    keys : str or list of str
        Column(s) to group by.
    prefix : str, default 'qty'
        Leading part of the generated column names.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the three columns added or replaced.
    """
    def lagged_stat(stat):
        # .values makes the assignment positional, independent of df's index labels.
        return df.groupby(keys)[val].transform(
            lambda x: getattr(x.shift(1).ewm(alpha=alpha), stat)()
        ).values

    for stat in ('mean', 'std', 'corr'):
        df[prefix + '_ewm_' + stat] = lagged_stat(stat)
    return df
for val in vals:
    print(val)
    df = qty_ewm(df, 0.95, val, keys)
    # qty_ewm writes to names that do not contain `val`, so each pass would
    # overwrite the previous one and only the last entry of `vals` would remain.
    # Move this pass's columns to val-specific names; pop() + assignment keeps
    # the cell safe to re-run.
    for ewm_col in [c for c in df.columns if c.startswith('qty_ewm_')]:
        df[val + '_' + ewm_col] = df.pop(ewm_col)
in_hotel_num
in_year
in_month
in_dayofyear
in_quarter
df.select_dtypes(include='object').columns
Index(['HOTELID', 'type', 'DATE', 'MPHM', 'DJ_date', 'BG_date'], dtype='object')
# Turn the count into a binary flag: 1 where in_hotel_num is 0, otherwise 0.
# NOTE(review): this overwrites the column, so running the cell a second time
# flips the flag again. Run it once per kernel session.
df['in_hotel_num'] = df['in_hotel_num'].eq(0).astype('int64')
df.head()
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday ROOM_NUM BED_NUM CZLY SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq SSX_freq MPHM_freq FIRM_freq room_bed DJ_date BG_date DJ_gap last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr
37946 100177 1 train 2020-06-06 2020 6 6 2 5 158 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
37947 100177 1 train 2020-06-07 2020 6 7 2 6 159 23 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 NaN NaN
37948 100177 1 train 2020-06-13 2020 6 13 2 5 165 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 0.0 NaN
37949 100177 1 train 2020-06-14 2020 6 14 2 6 166 24 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.0 6.0 2.0 2.0 6.0 2.0 2.0 6.0 2.0 0.0 NaN
37950 100177 1 train 2020-06-20 2020 6 20 2 5 172 25 1 0 1 1 1 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 3091 1502 4727 150 2565 3429 1.0 2020-05-18 2020-07-15 58.0 2.0 2.0 2.0 2.0 NaN NaN NaN NaN NaN NaN NaN NaN 2.0 2.0 6.0 2.0 2.0 8.0 2.0 2.0 8.0 2.0 0.0 NaN
# Replace every missing value with 0 before the feature list is built.
df = df.fillna(0)

# Candidate features are all float64/int64 columns except the target itself.
numeric_frame = df.select_dtypes(include=['float64', 'int64'])
col_list = list(numeric_frame.columns)
col_list.remove('in_hotel_num')
col_list
['in_year',
 'in_month',
 'in_day',
 'in_quarter',
 'in_dayofweek',
 'in_dayofyear',
 'in_weekofyear',
 'in_is_wknd',
 'holiday',
 'ROOM_NUM',
 'BED_NUM',
 'CZLY',
 'SSX',
 'CALLED_freq',
 'ADDRESS_freq',
 'ROOM_NUM_freq',
 'BED_NUM_freq',
 'CZLY_freq',
 'SSX_freq',
 'MPHM_freq',
 'FIRM_freq',
 'room_bed',
 'DJ_gap',
 'last_1_qty',
 'last_2_qty',
 'last_3_qty',
 'last_4_qty',
 'last_5_qty',
 'last_6_qty',
 'last_7_qty',
 'last_8_qty',
 'last_10_qty',
 'last_12_qty',
 'last_15_qty',
 'last_20_qty',
 'qty_rolling3_mean',
 'qty_rolling3_max',
 'qty_rolling3_sum',
 'qty_rolling4_mean',
 'qty_rolling4_max',
 'qty_rolling4_sum',
 'qty_rolling6_mean',
 'qty_rolling6_max',
 'qty_rolling6_sum',
 'qty_ewm_mean',
 'qty_ewm_std',
 'qty_ewm_corr']
gc.collect()
102
used_features = col_list
cate_cols = ['HOTELID', 'MPHM']

# DATE holds ISO 'YYYY-MM-DD' strings, so string comparison orders rows by date.
TRAIN_END = '2021-08-02'   # first day of the validation window
VALID_END = '2021-09-01'   # first day after the validation window

# Validation starts exactly where training ends. Using `>` here would leave rows
# dated TRAIN_END in neither set.
train_mask = df["DATE"] < TRAIN_END
valid_mask = (df["DATE"] >= TRAIN_END) & (df["DATE"] < VALID_END)

X_train = df.loc[train_mask, used_features].reset_index(drop=True)
y_train = df.loc[train_mask, "in_hotel_num"]
X_valid = df.loc[valid_mask, used_features].reset_index(drop=True)
y_valid = df.loc[valid_mask, "in_hotel_num"]
X_test = df.loc[df["type"] == 'test', used_features].reset_index(drop=True)

# Imported in this cell because the notebook's LightGBM import cell only appears
# further down; without it a fresh kernel fails here with a NameError.
from lightgbm import LGBMClassifier

# Gradient-boosted binary classifier, scored by AUC on the validation window.
# NOTE(review): n_jobs=80 is tied to one machine's core count. Confirm, or use -1.
clf_1 = LGBMClassifier(num_leaves = 256,
                     n_estimators = 20000,
                     learning_rate = 0.005,
                     verbose = -1,
                     max_bin = 100,
                     max_depth = 10,
                     feature_fraction_seed = 66,
                     feature_fraction = 0.7,
                     bagging_seed = 66,
                     bagging_freq = 1,
                     bagging_fraction = 0.95,
                     metric = 'auc', # MultiAuc_score
                     lambda_l1 = 0.1,
                     lambda_l2 = 0.1, 
                     min_child_weight = 30,
                     n_jobs=80)

# Stop once the validation AUC has not improved for 100 rounds; log every 200 rounds.
clf_1.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,verbose=200) #, categorical_feature = object_list

gc.collect()
[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.541449





357
oof_prob = clf_1.predict_proba(X_valid[used_features])[:, 1]  
oof_prob.min(), oof_prob.max()
(0.9610922808728843, 0.9625584615170373)
oof_prob1 = clf_1.predict_proba(X_test[used_features])[:, 1]  
oof_prob1.min(), oof_prob1.max()
(0.960959321386754, 0.9624418595928675)
X_test[used_features][:10]
STATUS FIRM guest_sum guest_sum_notnull in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE SSX CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_gap last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr
0 NaN NaN NaN NaN 2021 9 4 3 5 247 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0
1 NaN NaN NaN NaN 2021 9 5 3 6 248 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0
2 NaN NaN NaN NaN 2021 9 11 3 5 254 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0
3 NaN NaN NaN NaN 2021 9 12 3 6 255 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.075868e-14 1.0
4 NaN NaN NaN NaN 2021 9 19 3 6 262 37 1 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.582212e-14 1.0
5 NaN NaN NaN NaN 2021 9 20 3 0 263 38 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.537934e-15 1.0
6 NaN NaN NaN NaN 2021 9 21 3 1 264 38 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.911060e-16 1.0
7 NaN NaN NaN NaN 2021 9 4 3 5 247 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0
8 NaN NaN NaN NaN 2021 9 5 3 6 248 35 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0
9 NaN NaN NaN NaN 2021 9 11 3 5 254 36 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
# No columns are passed to LightGBM as explicit categoricals.
cate_cols = []
drop_col = ['in_hotel_num', 'type']

# Filtering with a boolean mask returns new frames, so the non-inplace drops
# below never touch `df` and do not trigger chained-assignment warnings.
train = df[df['type'] == 'train']
labels = train['in_hotel_num'].to_numpy()
train = train.drop(columns=drop_col)[used_features]
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test = test.drop(columns=drop_col)[used_features]

# used_features = importance_fea
# NOTE(review): TimeSeriesSplit splits by row position. That is only a split in
# time if `train` is ordered by DATE. Confirm the row order.
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Parameters for the native lgb.train API. The objective must be given
# explicitly here, otherwise LightGBM falls back to regression. The number of
# rounds is passed as num_boost_round, so it is not repeated in the dict.
# NOTE(review): the binary objective expects labels in {0, 1}. Confirm the
# label-inversion cell has been run.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': 'auc',
    'verbose': -1
}

for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Only the last split is used for validation.
    if fold_n in [0, 1, 2, 3]:
        continue

    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                          categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                          categorical_feature=cate_cols)
    # Calling LGBMClassifier.fit on the class with the params dict as first
    # argument binds the dict to `self` and raises AttributeError. The Dataset
    # objects and valid_sets/verbose_eval arguments belong to lgb.train.
    clf = lgb.train(params, trn_data, num_boost_round=N_round,
                    valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    # Booster.predict returns the positive-class probability for a binary objective.
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)

    print('MAE: {}'.format(mae_))

    print("ReTraining on all data")
    gc.enable()
    del trn_data, val_data
    gc.collect()
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
#     clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
  #valid_sets=[trn_data], verbose_eval=Verbose)
  #pred = clf.predict(test[used_features])

Training with validation



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

 in 
     49     val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
     50                           categorical_feature=cate_cols)  
---> 51     clf = LGBMClassifier.fit(params, train.iloc[train_index], df[df['type'] == 'train'].iloc[train_index]['in_hotel_num'], eval_set=[trn_data, val_data], verbose=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
     52     val = clf.predict(train.iloc[valid_index])
     53     mae_ = mean_absolute_error(labels[valid_index], val)


F:\anaconda\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
    783         _LGBMAssertAllFinite(y)
    784         _LGBMCheckClassificationTargets(y)
--> 785         self._le = _LGBMLabelEncoder().fit(y)
    786         _y = self._le.transform(y)
    787         self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))


AttributeError: 'dict' object has no attribute '_le'
# Store the two identifier columns with pandas' category dtype.
for cat_col in ('HOTELID', 'MPHM'):
    df[cat_col] = df[cat_col].astype('category')
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
# No columns are passed to LightGBM as explicit categoricals.
cate_cols = []
drop_col = ['in_hotel_num', 'type']

# Training rows: set the target aside, then narrow to the model features.
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train = train.drop(columns=drop_col)[used_features]

# Test rows get the same column treatment.
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test = test.drop(columns=drop_col)[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Regression objective, tracked with both L2 and L1 error.
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1
}

for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Folds 0-3 are skipped; only the final split is trained and validated.
    if fold_n < 4:
        continue

    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    clf = lgb.train(
        params,
        trn_data,
        num_boost_round=N_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=Verbose,
        early_stopping_rounds=Early_Stopping_Rounds,
    )
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)

    print('MAE: {}'.format(mae_))

    print("ReTraining on all data")
    gc.enable()
    del trn_data, val_data
    gc.collect()
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    # Refit on every training row for 20% more rounds than the early-stopped model.
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
  #valid_sets=[trn_data], verbose_eval=Verbose)
  #pred = clf.predict(test[used_features])

Training with validation
Training until validation scores don't improve for 100 rounds
[500]	training's l2: 0.0357266	training's l1: 0.0733432	valid_1's l2: 0.0140039	valid_1's l1: 0.0424839
[1000]	training's l2: 0.034654	training's l1: 0.0715026	valid_1's l2: 0.0136241	valid_1's l1: 0.0362867
[1500]	training's l2: 0.0340906	training's l1: 0.0702526	valid_1's l2: 0.0134038	valid_1's l1: 0.0323129
[2000]	training's l2: 0.0337541	training's l1: 0.0694053	valid_1's l2: 0.0132898	valid_1's l1: 0.0298233
[2500]	training's l2: 0.0335207	training's l1: 0.0688021	valid_1's l2: 0.0131952	valid_1's l1: 0.028312
[3000]	training's l2: 0.0333523	training's l1: 0.068367	valid_1's l2: 0.0131123	valid_1's l1: 0.0273034
[3500]	training's l2: 0.033219	training's l1: 0.0680436	valid_1's l2: 0.0130372	valid_1's l1: 0.0264549
[4000]	training's l2: 0.033109	training's l1: 0.0678005	valid_1's l2: 0.0129586	valid_1's l1: 0.0259314
[4500]	training's l2: 0.0330175	training's l1: 0.0676386	valid_1's l2: 0.0129	valid_1's l1: 0.0256456
[5000]	training's l2: 0.0329394	training's l1: 0.0675283	valid_1's l2: 0.0128614	valid_1's l1: 0.0254616
[5500]	training's l2: 0.0328693	training's l1: 0.0674366	valid_1's l2: 0.0128206	valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586]	training's l2: 0.0328584	training's l1: 0.067422	valid_1's l2: 0.0128154	valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration:  5586
pred1 = clf.predict(test)
pred1.min(), pred1.max()
(0.2106748391712024, 0.9860467852769609)
pre_d = df[df['type'] == 'test']
pre_d['ROOM_EMPTY'] = pred1.tolist()
pre_d1 = pre_d[['HOTELID', 'DATE', 'ROOM_EMPTY']]
pp = pre_d1[pre_d1['DATE']=='2021-09-21']
pp[:5]
HOTELID DATE ROOM_EMPTY
356552 10083 2021-09-21 0.726068
355831 10125 2021-09-21 0.825109
354872 10237 2021-09-21 0.841640
356181 10273 2021-09-21 0.841640
356825 104814 2021-09-21 0.726068
np.median(pp['ROOM_EMPTY'])
0.8079157059926819
pre_d1[pre_d1['ROOM_EMPTY']>0.807].shape
(2667, 3)
pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x>0.807 else 1)
pre_d1['ROOM_EMPTY'].sum()
644
pre_d1.to_csv('pre_02_reg.csv', index=False)


pre_d1['ROOM_EMPTY'].median()
0.8752981387595966
pre_d1['ROOM_EMPTY'].quantile(0.2)
0.8070988321951651
sns.countplot(pre_d1['ROOM_EMPTY'])

![output_177_1](output_177_1.png)

pre_d1.shape
(3311, 3)
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
nan




pred.min(), pred.max()
(0.03637936444857589, 1.8554153046371984)
# Attach the raw model scores to the test rows. copy() gives an independent
# frame, so adding a column does not warn about writing to a slice of `df`.
# NOTE(review): `pred` is not assigned in any visible cell (the only assignment
# is commented out). Confirm which model's predictions are meant here.
pre_df = df[df['type'] == 'test'].copy()
pre_df['ROOM_EMPTY'] = pred.tolist()
# Select the submission columns here instead of relying on `pre_df1`, which is
# only created in a later cell and is undefined on a fresh top-to-bottom run.
pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']].to_csv('origion.csv', index=False)
pre_df[:5]
HOTELID in_hotel_num type DATE in_year in_month in_day in_quarter in_dayofweek in_dayofyear in_weekofyear in_is_wknd holiday last_1_qty last_2_qty last_3_qty last_4_qty last_5_qty last_6_qty last_7_qty last_8_qty last_10_qty last_12_qty last_15_qty last_20_qty qty_rolling3_mean qty_rolling3_max qty_rolling3_sum qty_rolling4_mean qty_rolling4_max qty_rolling4_sum qty_rolling6_mean qty_rolling6_max qty_rolling6_sum qty_ewm_mean qty_ewm_std qty_ewm_corr ROOM_EMPTY
356546 10083 0 test 2021-09-04 2021 9 4 3 5 247 35 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 6.328848e-12 1.0 0.054686
356547 10083 0 test 2021-09-05 2021 9 5 3 6 248 35 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 2.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.415174e-12 1.0 0.055554
356548 10083 0 test 2021-09-11 2021 9 11 3 5 254 36 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 3.164424e-13 1.0 0.247202
356549 10083 0 test 2021-09-12 2021 9 12 3 6 255 36 1 0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 7.075868e-14 1.0 0.105092
356550 10083 0 test 2021-09-19 2021 9 19 3 6 262 37 1 1 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 9.0 3.0 3.0 12.0 3.0 3.0 18.0 3.0 1.582212e-14 1.0 0.094995
pre_df1 = pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']]
# 8月份500多
pre_df1[pre_df1['ROOM_EMPTY']>0.07].shape
(2380, 3)
pre_df1['ROOM_EMPTY'] = pre_df1['ROOM_EMPTY'].apply(lambda x: 0 if x>=0.25 else 1)

# mean_absolute_error(labels[valid_index], val)  
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
0.7065505364632024
a = pd.DataFrame()
a['y'] = labels[valid_index].tolist()
a['val'] = val.tolist()
# _val = np.array(a['val'].values.tolist())
# _y = np.array(a['y'].values.tolist())
fpr, tpr, thresholds = metrics.roc_curve(a['y'], a['val'], pos_label=2)
metrics.auc(fpr, tpr)
nan
np.array(a['val'].values.tolist())
array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
       0.99187287])
a['y'].max()
1
a['y'] = a['y'].apply(lambda x: 1 if x==0 else 0)
a['val'] = a['val'].apply(lambda x: (1-x/(4.03212491)))
a[:5]
y val
0 1 0.980780
1 1 0.980282
2 1 0.980909
3 1 0.986415
4 1 0.988064
val.max()-val.min()
4.0321249098355265
labels[valid_index]
array([0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])
val[140:160]
array([0.02691592, 0.02701038, 0.02690596, 0.02699238, 0.02690596,
       0.07806598, 0.07950708, 0.29049198, 0.77018916, 0.98434103,
       0.05606983, 0.06270357, 0.04589361, 0.05028036, 0.04938166,
       0.05054352, 0.04737767, 0.11532118, 0.05313139, 0.04558791])

gc.collect()
388
from tqdm import tqdm
from sklearn.metrics import *
print("#############################输出第一个模型的评价参数及结果#############################")

def find_best_threshold(y_valid, oof_prob):
    """Scan cut-offs from 0.05 up to (but excluding) 1.0 and keep the best-scoring one.

    For every candidate ``th`` (steps of 1/2000) a copy of ``oof_prob`` is
    binarised: entries ``>= th`` become 1, entries ``< th`` become 0. The copy
    and ``y_valid`` are passed to ``metrics.roc_curve`` with ``pos_label=2`` and
    the area under that curve is the candidate's score. The first candidate
    with the strictly highest score wins.

    NOTE(review): the binarised copy is passed as the first argument of
    ``roc_curve`` and ``y_valid`` as the second. scikit-learn documents that
    order as (y_true, y_score), so the roles look swapped relative to the
    parameter names. The visible call site passes its arguments in the matching
    swapped order, so confirm both together before changing either.

    Parameters
    ----------
    y_valid : array-like
        Passed unchanged as the second argument of ``roc_curve``.
    oof_prob : array-like supporting ``.copy()`` and boolean-mask assignment
        Values that are thresholded; the input itself is never modified.

    Returns
    -------
    tuple
        ``(best_th, best_score)``; both stay 0 if no candidate scores above 0.
    """
    best_th, best_f2 = 0, 0
    candidates = [step / 2000 for step in range(100, 2000)]

    for th in tqdm(candidates):
        binarised = oof_prob.copy()
        binarised[binarised >= th] = 1
        binarised[binarised < th] = 0

        fpr, tpr, _ = metrics.roc_curve(binarised, y_valid, pos_label=2)
        score = metrics.auc(fpr, tpr)
        if score > best_f2:
            best_th, best_f2 = th, score

    return best_th, best_f2
# Validation predictions and validation labels of the last fold.
val1, y1 = val, labels[valid_index]
# NOTE(review): predictions are passed in the `y_valid` position and labels in
# the `oof_prob` position. Confirm this order is intended.
best_th, aucs = find_best_threshold(val1, y1)
for caption, value in (("分界值", best_th), ("F2评价分数", aucs)):
    print(caption, value)
# print("recall召回率", recall)
# print("precision精确度", precision)
  1%|▏                                       | 10/1900 [00:00<00:23, 80.05it/s]

#############################输出第一个模型的评价参数及结果#############################


100%|██████████████████████████████████████| 1900/1900 [00:20<00:00, 94.61it/s]

分界值 0.05
F2评价分数 0.7065505364632024
pre_df1[20:33]
HOTELID DATE ROOM_EMPTY
354872 10237 2021-09-21 0
356175 10273 2021-09-04 0
356176 10273 2021-09-05 0
356177 10273 2021-09-11 0
356178 10273 2021-09-12 0
356179 10273 2021-09-19 0
356180 10273 2021-09-20 0
356181 10273 2021-09-21 0
356819 104814 2021-09-04 0
356820 104814 2021-09-05 0
356821 104814 2021-09-11 0
356822 104814 2021-09-12 0
356823 104814 2021-09-19 0
pre_df1.to_csv('pre_025_regression.csv', index=False)
pre_df1['ROOM_EMPTY'].mode()
0    0.20966
dtype: float64
pre_df1['ROOM_EMPTY'].median()
0.10509209231427108



plt.figure(figsize=(16,8))
sns.countplot(pre_df1['ROOM_EMPTY'])

![output_213_1](output_213_1.png)


# df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
# df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
# df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
# df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
# df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
# df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
# df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5                 #是否周末
df4['date1'] = df4['date'].astype('str')
#总数据量49547
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]
df_1.shape
(11583, 59)
df_1['in_month'].value_counts()
7    4391
5    2520
6    2192
8    1656
9     824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']
df4[df4['date']=='2021-05-03']
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS INSERT_TIME MODIFY_TIME FIRM GUEST_ID BDATE XZQH IN_TIME OUT_TIME guest_sum guest_sum_notnull in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY CALLED BUR_CODE STA_CODE SSX ADDRESS MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd
# Rows on a listed public holiday get holiday=1; the weekend subset gets holiday=0.
df_2 = df4[df4['date1'].isin(holiday)].assign(holiday=1)
df_1 = df_1.assign(holiday=0)

# Holiday rows go first in the combined frame.
df5 = pd.concat([df_2, df_1])
df5.shape
(14662, 61)
df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
df5[:3]
ORDER_ID ORDER_PRIMARY_ID HOTELID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd date1 holiday
9 3597092 CFD17F6BB142485DB3DEC6B5E2D0B664 100193 202105012046 1 3 NaN 19771008 132826 1 2021-05-01 20:46:00 2021-05-02 11:58:00 2021-05-01 20:46:00 2021-05-01 2021-05-02 2021-05-01 NaN 2 2 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 2281 4727 NaN NaN 150 2565 3429 NaN NaN 1.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1
11 3601837 5C96E986C1F04691A3EB07962BCC7B00 100195 202105021435 1 3 NaN 19951009 370203 1 2021-05-02 14:35:00 2021-05-03 11:58:00 2021-05-02 14:35:00 2021-05-02 2021-05-03 2021-05-02 NaN 2 3 1 NaN NaN 371083 ROOM001 2020-05-18 10:33:55 2020-07-15 10:23:58 1 1 NaN 1249 895 4727 NaN NaN 150 2565 3429 NaN NaN 1.5 2020-05-18 2020-07-15 58.0 1 2021 5 2 2 6 122 1 1 2021-05-02 1
53 3314117 C0C5516462D3487C895DD654B836F855 10037 202104082130 1 3 NaN 19730313 120103 1 2021-04-30 13:58:00 2021-05-03 11:58:00 2021-05-01 13:58:00 2021-04-30 2021-05-03 2021-05-01 NaN 1 2 1 NaN NaN 371002 ROOM001 2020-05-18 10:22:13 2020-07-15 10:23:58 2 2 NaN 3091 2281 4727 NaN NaN 4661 2565 3429 NaN NaN 2.0 2020-05-18 2020-07-15 58.0 1 2021 5 1 2 5 121 1 1 2021-05-01 1
# Rows are duplicates when every column except `holiday` matches; the first
# occurrence is kept.
drop_col = list(df5.columns)
drop_col.remove('holiday')
df5 = df5.drop_duplicates(subset=drop_col, keep='first')
df5.shape
(12708, 54)
df5['HOTELID'].nunique()
3133
test = pd.read_csv('testb/submit_example_2.csv')
test[:2]
HOTELID DATE ROOM_EMPTY
0 303760 2021-09-04 0.1
1 303760 2021-09-05 0.1
test.info()

RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   HOTELID     3311 non-null   object 
 1   DATE        3311 non-null   object 
 2   ROOM_EMPTY  3311 non-null   float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB
hotel = test['HOTELID'].values.tolist()
df5 = df5[df5['HOTELID'].isin(hotel)]
df5.shape
(3708, 54)
d = df5[df5['in_month']==9]
d['date'].value_counts()
2021-09-04    42
2021-09-05    29
2021-09-11    16
2021-09-19    14
2021-09-20    13
2021-09-18    12
2021-09-12    11
2020-09-06     9
2020-09-13     9
2020-09-05     9
2021-09-21     7
2020-09-19     7
2021-09-26     6
2020-09-27     6
2021-09-25     6
2020-09-26     6
2020-09-20     6
2020-09-12     4
Name: date, dtype: int64
test['date1'] = test['DATE']
test1 = test.merge(df5, on=['HOTELID', 'date1'], how='left')
test.shape
(3311, 4)
test1.shape
(3311, 56)
test1.info()

Int64Index: 3311 entries, 0 to 3310
Data columns (total 56 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   HOTELID           3311 non-null   object        
 1   DATE              3311 non-null   object        
 2   ROOM_EMPTY        3311 non-null   float64       
 3   date1             3311 non-null   object        
 4   ORDER_ID          132 non-null    float64       
 5   ORDER_PRIMARY_ID  132 non-null    object        
 6   ORDER_TIME        132 non-null    object        
 7   STATUS            132 non-null    float64       
 8   FIRM              132 non-null    float64       
 9   GUEST_ID          16 non-null     object        
 10  BDATE             132 non-null    object        
 11  XZQH              132 non-null    object        
 12  guest_sum         132 non-null    float64       
 13  in_time           132 non-null    datetime64[ns]
 14  out_time          132 non-null    datetime64[ns]
 15  datetime          132 non-null    datetime64[ns]
 16  in_date           132 non-null    object        
 17  out_date          132 non-null    object        
 18  date              132 non-null    object        
 19  JYMJ              1 non-null      float64       
 20  ROOM_NUM          132 non-null    float64       
 21  BED_NUM           132 non-null    float64       
 22  CZLY              132 non-null    float64       
 23  BUR_CODE          86 non-null     float64       
 24  STA_CODE          86 non-null     object        
 25  SSX               132 non-null    float64       
 26  MPHM              132 non-null    object        
 27  DJSJ              132 non-null    datetime64[ns]
 28  BGSJ              132 non-null    datetime64[ns]
 29  CALLED_freq       132 non-null    float64       
 30  ADDRESS_freq      132 non-null    float64       
 31  JYMJ_freq         1 non-null      float64       
 32  ROOM_NUM_freq     132 non-null    float64       
 33  BED_NUM_freq      132 non-null    float64       
 34  CZLY_freq         132 non-null    float64       
 35  BUR_CODE_freq     86 non-null     float64       
 36  STA_CODE_freq     86 non-null     float64       
 37  SSX_freq          132 non-null    float64       
 38  MPHM_freq         132 non-null    float64       
 39  FIRM_freq         132 non-null    float64       
 40  room_ratio        1 non-null      float64       
 41  bed_ratio         1 non-null      float64       
 42  room_bed          132 non-null    float64       
 43  DJ_date           132 non-null    object        
 44  BG_date           132 non-null    object        
 45  DJ_gap            132 non-null    float64       
 46  in_hotel_num      132 non-null    float64       
 47  in_year           132 non-null    float64       
 48  in_month          132 non-null    float64       
 49  in_day            132 non-null    float64       
 50  in_quarter        132 non-null    float64       
 51  in_dayofweek      132 non-null    float64       
 52  in_dayofyear      132 non-null    float64       
 53  record_is_wknd    132 non-null    float64       
 54  in_is_wknd        132 non-null    float64       
 55  holiday           132 non-null    float64       
dtypes: datetime64[ns](5), float64(36), object(15)
memory usage: 1.4+ MB
# Preview the first five rows of the merged test frame.
test1.head()
HOTELID DATE ROOM_EMPTY date1 ORDER_ID ORDER_PRIMARY_ID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd holiday
0 303760 2021-09-04 0.1 2021-09-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 303760 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 303760 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 303760 2021-09-12 0.1 2021-09-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 303760 2021-09-19 0.1 2021-09-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
# holiday1 is 1 only where `holiday` equals 0; every other value -- including
# NaN, which counts as truthy in a plain Python truth test -- becomes 0.
test1['holiday1'] = test1['holiday'].eq(0).astype('int64')
test1.holiday1.value_counts()
0    3213
1      98
Name: holiday1, dtype: int64
# Preview the first ten rows, now including the new holiday1 column.
test1.head(10)
HOTELID DATE ROOM_EMPTY date1 ORDER_ID ORDER_PRIMARY_ID ORDER_TIME STATUS FIRM GUEST_ID BDATE XZQH guest_sum in_time out_time datetime in_date out_date date JYMJ ROOM_NUM BED_NUM CZLY BUR_CODE STA_CODE SSX MPHM DJSJ BGSJ CALLED_freq ADDRESS_freq JYMJ_freq ROOM_NUM_freq BED_NUM_freq CZLY_freq BUR_CODE_freq STA_CODE_freq SSX_freq MPHM_freq FIRM_freq room_ratio bed_ratio room_bed DJ_date BG_date DJ_gap in_hotel_num in_year in_month in_day in_quarter in_dayofweek in_dayofyear record_is_wknd in_is_wknd holiday holiday1
0 303760 2021-09-04 0.1 2021-09-04 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
1 303760 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
2 303760 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
3 303760 2021-09-12 0.1 2021-09-12 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
4 303760 2021-09-19 0.1 2021-09-19 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
5 303760 2021-09-20 0.1 2021-09-20 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
6 303760 2021-09-21 0.1 2021-09-21 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
7 315471 2021-09-04 0.1 2021-09-04 4798603.0 A139D5C20BB44529A66957AA491FB88E 202108212327 1.0 3.0 NaN 20001101 331021 1.0 2021-09-03 13:58:00 2021-09-05 11:58:00 2021-09-04 13:58:00 2021-09-03 2021-09-05 2021-09-04 NaN 1.0 2.0 1.0 3.710980e+11 371000000007 371002.0 ROOM001 2021-05-05 13:58:58 2021-05-05 14:02:45 2.0 1.0 NaN 3091.0 2281.0 4727.0 852.0 433.0 4661.0 2565.0 3429.0 NaN NaN 2.0 2021-05-05 2021-05-05 0.0 1.0 2021.0 9.0 4.0 3.0 5.0 247.0 1.0 1.0 0.0 1
8 315471 2021-09-05 0.1 2021-09-05 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
9 315471 2021-09-11 0.1 2021-09-11 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaT NaT NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0
# Replace ROOM_EMPTY with the derived holiday1 flag, then keep just the
# three columns HOTELID / DATE / ROOM_EMPTY in a separate frame.
test1['ROOM_EMPTY'] = test1.holiday1
test2 = test1.loc[:, ['HOTELID', 'DATE', 'ROOM_EMPTY']]
test2.head()
HOTELID DATE ROOM_EMPTY
0 303760 2021-09-04 0
1 303760 2021-09-05 0
2 303760 2021-09-11 0
3 303760 2021-09-12 0
4 303760 2021-09-19 0
# Write the three-column frame to disk without the row index.
test2.to_csv(path_or_buf='pre.csv', index=False)
# Load file41704172.csv and rename its ROOM_EMPTY column to `room` in the same
# step. Doing the rename at load time keeps the preview below showing `room`
# on a fresh top-to-bottom run, instead of depending on a later cell having
# been executed first, and avoids a name clash with test2's ROOM_EMPTY when
# the two frames are merged.
pre = pd.read_csv('file41704172.csv').rename(columns={'ROOM_EMPTY': 'room'})
pre.head(2)
HOTELID DATE room
0 303760 2021-09-04 0.0
1 303760 2021-09-05 0.0
# Rename ROOM_EMPTY -> room by rebinding `pre` rather than mutating in place,
# so re-running the cell is harmless.
pre = pre.rename(columns={'ROOM_EMPTY': 'room'})
# Attach the `room` column from `pre` to every (HOTELID, DATE) row of test2.
pr = test2.merge(pre, on=['HOTELID', 'DATE'], how='left')
# A left join leaves `room` as NaN for keys that are absent from `pre`; adding
# NaN would wipe out the ROOM_EMPTY value for those rows and write blanks to
# the output file, so missing `room` values count as 0.
pr['ROOM_EMPTY'] = pr['ROOM_EMPTY'] + pr['room'].fillna(0)
# The helper column is no longer needed once it has been folded into ROOM_EMPTY.
pr = pr.drop(columns='room')
pre['room'].value_counts()
0.0    2838
1.0     473
Name: room, dtype: int64
# Write the combined frame to disk without the row index.
pr.to_csv(path_or_buf='pre1.csv', index=False)



# Count non-null ORDER_ID values for each (in_year, in_month) pair and turn
# the group keys back into ordinary columns.
dd = (
    df5
    .groupby(['in_year', 'in_month'])['ORDER_ID']
    .agg('count')
    .reset_index()
)
dd
in_year in_month ORDER_ID
0 2020 7 12
1 2020 8 102
2 2020 9 56
3 2021 5 735
4 2021 6 833
5 2021 7 1279
6 2021 8 535
7 2021 9 156
# Fold year and month into one integer key: in_year * 100 + in_month
# (e.g. 2021 and 7 -> 202107).
dd['ym'] = dd['in_year'].mul(100).add(dd['in_month'])
dd.ym.value_counts()
202105    1
202007    1
202008    1
202009    1
202106    1
202107    1
202108    1
202109    1
Name: ym, dtype: int64
# Scatter of the ORDER_ID count for each row of dd (one row per in_year/in_month group).
fig, ax = plt.subplots(figsize=(15, 8))
ax.scatter(dd.index, dd['ORDER_ID'])
# The x positions are just dd's row index; label each tick with the row's
# `ym` value so the points can be matched to a year/month.
ax.set_xticks(dd.index)
ax.set_xticklabels(dd['ym'].astype(str))
ax.set_xlabel('ym (in_year * 100 + in_month)')
ax.set_ylabel('ORDER_ID count')
ax.set_title('ORDER_ID count per in_year / in_month');

![png](output_270_1.png)










你可能感兴趣的:(数据分析,python,数据挖掘,开发语言)