大佬喝可乐

Untitled555555555

%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings


# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Matplotlib: SimHei so Chinese labels render, and a plain minus sign so
# negative numbers display correctly.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pandas display: no cap on shown columns/rows, cell text capped at 100 chars.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

# Folder prefixes; the CSV file names are appended to these when loading.
path = 'traina/'
path1 = 'trainb/'

入住信息

# The guest table exists under both folders with the same file name;
# read both copies and stack them (row labels are not reset).
guest_csv = '网约房平台入住人表.csv'
cust_info = pd.read_csv(path + guest_csv)
cust_info1 = pd.read_csv(path1 + guest_csv)
cust = pd.concat([cust_info, cust_info1])
cust.head(2)

	ORDER_PRIMARY_ID	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME
0	07C5BF73B18B44B0877DEED007F8771D	NaN	19800627	222405	NaN	NaN
1	3525D57CAE104A078E4962B2B89377B0	371099C301202107090001	19951025	370523	2.021071e+11	2.021071e+11

gc.collect()

# BDATE and XZQH are handled as text from here on.
for text_col in ('BDATE', 'XZQH'):
    cust[text_col] = cust[text_col].astype('str')

# Missing check-in/out times become 0 before the string cast (so they read
# '0.0'); every value is then cut to its first 12 characters.
for time_col in ('IN_TIME', 'OUT_TIME'):
    cust[time_col] = cust[time_col].fillna(0).astype('str').str[:12]

cust.head(2)

	ORDER_PRIMARY_ID	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME
0	07C5BF73B18B44B0877DEED007F8771D	NaN	19800627	222405	0.0	0.0
1	3525D57CAE104A078E4962B2B89377B0	371099C301202107090001	19951025	370523	202107092103	202107111158

cust.info()


Int64Index: 44030 entries, 0 to 3343
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_PRIMARY_ID  44030 non-null  object
 1   GUEST_ID          5158 non-null   object
 2   BDATE             44030 non-null  object
 3   XZQH              44030 non-null  object
 4   IN_TIME           44030 non-null  object
 5   OUT_TIME          44030 non-null  object
dtypes: object(6)
memory usage: 2.4+ MB

cust.nunique()

ORDER_PRIMARY_ID    43883
GUEST_ID             3698
BDATE                9503
XZQH                 2659
IN_TIME              3396
OUT_TIME              236
dtype: int64

cust['ORDER_PRIMARY_ID'].value_counts()[5:10]

03158D5CA62E42339D697612EA347FB3    2
0A99D74F1E0248D68A729CB79FC640E6    2
1FE3E448558347D89C56B0AFCEC8ACFB    2
864E417B7BDE4C7B9449B7E087E9F21E    2
3DEFA75566934CA5929D0C47F0B95FDE    2
Name: ORDER_PRIMARY_ID, dtype: int64

# Order rows by booking id, then guest id.
cust = cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'])

# Per booking: how many guest rows it has (guest_sum) and how many of those
# carry a non-null GUEST_ID (guest_sum_notnull).
stat = (
    cust.groupby('ORDER_PRIMARY_ID')
    .agg(guest_sum=('BDATE', 'count'), guest_sum_notnull=('GUEST_ID', 'count'))
    .reset_index()
)
cust = cust.merge(stat, on=['ORDER_PRIMARY_ID'], how='left')

# Collapse to one row per booking: the first row in the sort order above.
cust = cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first')

cust[cust['ORDER_PRIMARY_ID']=='70798782D6C04A438360D80AFE4845C1']

	ORDER_PRIMARY_ID	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull
22801	70798782D6C04A438360D80AFE4845C1	371099B389202107040001	19990314	130983	0.0	0.0	3	2

cust['IN_TIME'].max(), cust['IN_TIME'].min()

('202110041413', '0.0')

订单信息

# Platform order information: same file name under both folders, stacked.
order_csv = '网约平台旅客订单信息.csv'
order_info = pd.read_csv(path + order_csv)
order1 = pd.read_csv(path1 + order_csv)
order = pd.concat([order_info, order1])
order.head(2)

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	PRE_IN_TIME	PRE_OUT_TIME	ORDER_TIME	STATUS	CANCEL_TIME	INSERT_TIME	MODIFY_TIME	FIRM
0	923521	96BBDB7CC049421C85826AE07020B139	278337	202008011200	202008021200	1.596120e+11	1	NaN	20200730224152	20200730224152	3
1	923696	C72F20539AD1447D86CD1A8E5EAEC63A	282932	202008041400	202008061200	1.596121e+11	1	NaN	20200730225524	20200730225524	3

order[order['ORDER_PRIMARY_ID']=='3CDEDB5E03534D379687645675898CA4']

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	PRE_IN_TIME	PRE_OUT_TIME	ORDER_TIME	STATUS	CANCEL_TIME	INSERT_TIME	MODIFY_TIME	FIRM
30970	4422027	3CDEDB5E03534D379687645675898CA4	170898	202108081358	202108101158	2.021071e+11	3	NaN	20210712210153	20210712210153	3

# Cast the timestamp-like columns to text. After the cast, a missing
# CANCEL_TIME reads as the literal string 'nan'.
for time_col in ('PRE_IN_TIME', 'PRE_OUT_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'CANCEL_TIME'):
    order[time_col] = order[time_col].astype('str')

# ORDER_TIME: missing values become 0 first, then keep the first 12 characters.
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').str[:12]

# Orders that carry a cancel time are not considered: keep only the rows
# whose CANCEL_TIME was missing.
order = order[order['CANCEL_TIME'] == 'nan']

# One row per ORDER_ID: the one that sorts last by MODIFY_TIME.
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME'])
order = order.drop_duplicates('ORDER_ID', keep='last')

order.shape

(29941, 11)

order[order['ORDER_ID']==4402846]

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	PRE_IN_TIME	PRE_OUT_TIME	ORDER_TIME	STATUS	CANCEL_TIME	INSERT_TIME	MODIFY_TIME	FIRM
30141	4402846	BC0E8E3602434EA6A5F29A6F6FF42233	100177	202107111358	202107121158	202107111032	1	nan	20210711140042	20210711140042	3

order.info()


Int64Index: 29941 entries, 2 to 18590
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ORDER_ID          29941 non-null  int64 
 1   ORDER_PRIMARY_ID  29941 non-null  object
 2   HOTELID           29941 non-null  object
 3   PRE_IN_TIME       29941 non-null  object
 4   PRE_OUT_TIME      29941 non-null  object
 5   ORDER_TIME        29941 non-null  object
 6   STATUS            29941 non-null  int64 
 7   CANCEL_TIME       29941 non-null  object
 8   INSERT_TIME       29941 non-null  object
 9   MODIFY_TIME       29941 non-null  object
 10  FIRM              29941 non-null  int64 
dtypes: int64(3), object(8)
memory usage: 2.7+ MB

order.nunique()

ORDER_ID            29941
ORDER_PRIMARY_ID    29941
HOTELID              4801
PRE_IN_TIME          6875
PRE_OUT_TIME          729
ORDER_TIME          26818
STATUS                  3
CANCEL_TIME             1
INSERT_TIME         25432
MODIFY_TIME         25432
FIRM                    2
dtype: int64

order_info['FIRM'].value_counts()

3     32029
10     8521
Name: FIRM, dtype: int64

order['STATUS'].value_counts()

1    26137
2     3221
3      583
Name: STATUS, dtype: int64

#插入时间不等于修改时间，就是进行过修改过的订单,最后发现为空，等于没有修改过
order[order['INSERT_TIME']!=order['MODIFY_TIME']]

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	PRE_IN_TIME	PRE_OUT_TIME	ORDER_TIME	STATUS	CANCEL_TIME	INSERT_TIME	MODIFY_TIME	FIRM

合并数据order和cust

# Author's note: the order time range contains the cust time range.
# Join each remaining order with its per-booking guest row; merge's default
# inner join keeps only ORDER_PRIMARY_IDs present in both frames.
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')

月份分为1、3、5、7、8、10、12：31天；2：28天；4、6、9、11：30天

# Reconcile the recorded (IN_TIME/OUT_TIME) and planned (PRE_*) stay times.
#
# '0.0' is what a missing recorded time looks like after the earlier
# fillna(0) + string cast; such values fall back to the planned time.
# The result is assigned back explicitly: calling fillna(inplace=True) on a
# column selection is not guaranteed to modify df itself.
for actual, planned in (('OUT_TIME', 'PRE_OUT_TIME'), ('IN_TIME', 'PRE_IN_TIME')):
    recorded = df[actual].mask(df[actual] == '0.0').fillna(df[planned])
    # Keep whichever string compares greater; for equal-length digit
    # timestamps that is the chronologically later one.
    df[actual] = recorded.where(recorded >= df[planned], df[planned])

# The planned times and the (now constant) cancel time are no longer needed.
df = df.drop(columns=['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'])

#######订单插入时间和订单时间的时间差

df.head(2)

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull
0	706648	A4AACE06B518418C8A8CA1935DDD1C5A	227185	202007092241	1	20200714171021	20200714171021	3	NaN	20010409	371002	202007151300	202007161200	1	0
1	748647	67D551AACFD049AE9CC2AB65E9870678	9483	202007132109	1	20201002101040	20201002101040	3	NaN	19881023	341202	202010011400	202010021011	1	0

df.info()


Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ORDER_ID           29941 non-null  int64 
 1   ORDER_PRIMARY_ID   29941 non-null  object
 2   HOTELID            29941 non-null  object
 3   ORDER_TIME         29941 non-null  object
 4   STATUS             29941 non-null  int64 
 5   INSERT_TIME        29941 non-null  object
 6   MODIFY_TIME        29941 non-null  object
 7   FIRM               29941 non-null  int64 
 8   GUEST_ID           3561 non-null   object
 9   BDATE              29941 non-null  object
 10  XZQH               29941 non-null  object
 11  IN_TIME            29941 non-null  object
 12  OUT_TIME           29941 non-null  object
 13  guest_sum          29941 non-null  int64 
 14  guest_sum_notnull  29941 non-null  int64 
dtypes: int64(5), object(10)
memory usage: 3.7+ MB

df.nunique();

# Parse the YYYYMMDDHHMM strings into timestamps; anything that does not
# match the format becomes NaT.
for raw_col, parsed_col in (('IN_TIME', 'in_time'), ('OUT_TIME', 'out_time')):
    df[parsed_col] = pd.to_datetime(df[raw_col], errors='coerce', format='%Y%m%d%H%M')

# Discard rows whose check-out time could not be parsed.
df = df[df['out_time'].notnull()]

df.head(2)

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull	in_time	out_time
0	706648	A4AACE06B518418C8A8CA1935DDD1C5A	227185	202007092241	1	20200714171021	20200714171021	3	NaN	20010409	371002	202007151300	202007161200	1	0	2020-07-15 13:00:00	2020-07-16 12:00:00
1	748647	67D551AACFD049AE9CC2AB65E9870678	9483	202007132109	1	20201002101040	20201002101040	3	NaN	19881023	341202	202010011400	202010021011	1	0	2020-10-01 14:00:00	2020-10-02 10:11:00

# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时

#2月份没有
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)

# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)

# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)

# df['day_gap'].sum()

# Reference notes on shifting a timestamp by one day. They are kept as
# comments only: the original lines used typographic quotes (a SyntaxError)
# and a 'lock_time' column that is not created anywhere in the visible code.
#   one day later:   df['lock_time'][0] + pd.Timedelta(days=1)
#   one day earlier: df['lock_time'][0] + pd.Timedelta(days=-1)
# df['in_time'] - timedelta(days=1)  # idea: push every row whose day_gap is 0 back by one day; hourly rooms are also still to be handled

# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。

# Expand every booking into one row per daily step from its IN_TIME up to
# its OUT_TIME (both bounds inclusive when a step lands on them).
# The `closed=None` keyword was dropped: it was the default, and the keyword
# no longer exists in pandas 2.x. `normalize=False` is likewise the default.
dfs = []
for idx, group in df.groupby('ORDER_PRIMARY_ID'):
    stat1 = pd.DataFrame()
    stat1['datetime'] = pd.date_range(
        start=group['IN_TIME'].values[0],
        end=group['OUT_TIME'].values[0],
        freq='D',
    )
    # Keep only the calendar date, stored as 'YYYY-MM-DD' text.
    stat1['datetime'] = stat1['datetime'].dt.date.fillna(0).astype('str')
    stat1['ORDER_PRIMARY_ID'] = group['ORDER_PRIMARY_ID'].values[0]
    dfs.append(stat1)

gc.collect()

df_date = pd.concat(dfs).reset_index(drop=True)

df_date[:3]

	datetime	ORDER_PRIMARY_ID
0	2021-06-30	00003FC18B254E86803C00F4BBA382E4
1	2021-07-01	00003FC18B254E86803C00F4BBA382E4
2	2021-07-02	00003FC18B254E86803C00F4BBA382E4

# Build the full (hotel, calendar day) grid for 2020-06-01 .. 2021-08-30.
# The calendar is identical for every hotel, so it is built once outside the
# loop. `closed=None` was dropped: it was the default and the keyword no
# longer exists in pandas 2.x.
calendar_days = pd.Series(
    pd.date_range(start='20200601', end='20210830', freq='D')
).astype('str')

dfs1 = []
for hotel_id, _ in df.groupby('HOTELID'):
    stat = pd.DataFrame()
    stat['datetime'] = calendar_days
    stat['HOTELID'] = hotel_id
    dfs1.append(stat)

df_date1 = pd.concat(dfs1).reset_index(drop=True)

df_date1[:10]

	datetime	ORDER_PRIMARY_ID
0	2020-06-01	00003FC18B254E86803C00F4BBA382E4
1	2020-06-02	00003FC18B254E86803C00F4BBA382E4
2	2020-06-03	00003FC18B254E86803C00F4BBA382E4
3	2020-06-04	00003FC18B254E86803C00F4BBA382E4
4	2020-06-05	00003FC18B254E86803C00F4BBA382E4
5	2020-06-06	00003FC18B254E86803C00F4BBA382E4
6	2020-06-07	00003FC18B254E86803C00F4BBA382E4
7	2020-06-08	00003FC18B254E86803C00F4BBA382E4
8	2020-06-09	00003FC18B254E86803C00F4BBA382E4
9	2020-06-10	00003FC18B254E86803C00F4BBA382E4

# Repeat every booking row once per expanded stay day (df_date).
df11 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')

# Attach bookings to the hotel/day grid; grid rows with no booking keep NaN
# in all booking columns.
_df1 = df_date1.merge(df11, on=['HOTELID', 'datetime'], how='left')

# df['guest_sum'] = 0
# df['guest_sum_notnull'] = 0

# Hotel/day combinations that matched no booking.
ddf = _df1[_df1['ORDER_ID'].isnull()]

# Calendar-date versions of check-in, check-out and the expanded stay day.
df11['in_date'] = df11['in_time'].dt.date
df11['out_date'] = df11['out_time'].dt.date
df11['datetime'] = pd.to_datetime(df11['datetime'], errors='coerce', format='%Y-%m-%d')
df11['date'] = df11['datetime'].dt.date

# Keep a stay-day row unless it falls on the check-out date; the exception is
# a stay that checks in and out on the same date, which is kept.
_df2 = df11[(df11['out_date']!=df11['date']) | ((df11['out_date']==df11['date']) & (df11['in_date']==df11['out_date']))]

# NOTE(review): _df2 is a filtered selection of df11, so these in-place edits
# may raise SettingWithCopyWarning (warnings are silenced at the top) — confirm
# an explicit .copy() is not wanted here.
_df2.drop(['in_time', 'out_time', 'in_date', 'out_date', 'datetime'], axis=1, inplace=True)
_df2.rename(columns={'date':'datetime'}, inplace=True)

# Added feature: number of orders a hotel has on a given day.
in_num = _df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
_df2 = _df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')

_df2.columns

Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'datetime',
       'in_hotel_num'],
      dtype='object')

ddf.columns

Index(['datetime', 'HOTELID', 'ORDER_ID', 'ORDER_PRIMARY_ID', 'ORDER_TIME',
       'STATUS', 'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE',
       'XZQH', 'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull',
       'in_time', 'out_time'],
      dtype='object')

# ddf was obtained by filtering another frame, so build a new frame instead
# of editing it in place: drop the parsed timestamps and record zero orders
# for these hotel/day rows.
ddf = ddf.drop(columns=['in_time', 'out_time']).assign(in_hotel_num=0)

# Booked days followed by the unbooked days, ordered by hotel and day.
# NOTE(review): at this point 'datetime' appears to hold date objects in _df2
# and text in ddf — confirm the sort order is what is intended.
df1 = pd.concat([_df2, ddf])
df1 = df1.sort_values(['HOTELID', 'datetime'])

# Preview the first and last two rows. DataFrame.append was removed in
# pandas 2.0; pd.concat produces the same frame.
pd.concat([df1.head(2), df1.tail(2)])

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull	datetime
0	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020-06-01
1	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020-06-02
2196800	NaN	NaN	B109977684	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2021-08-29
2196801	NaN	NaN	B109977684	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2021-08-30

df1.info()


Int64Index: 2186104 entries, 0 to 2196801
Data columns (total 17 columns):
 #   Column             Dtype  
---  ------             -----  
 0   ORDER_ID           float64
 1   ORDER_PRIMARY_ID   object 
 2   HOTELID            object 
 3   ORDER_TIME         object 
 4   STATUS             float64
 5   INSERT_TIME        object 
 6   MODIFY_TIME        object 
 7   FIRM               float64
 8   GUEST_ID           object 
 9   BDATE              object 
 10  XZQH               object 
 11  IN_TIME            object 
 12  OUT_TIME           object 
 13  guest_sum          float64
 14  guest_sum_notnull  float64
 15  datetime           object 
 16  in_hotel_num       int64  
dtypes: float64(5), int64(1), object(11)
memory usage: 300.2+ MB

test = pd.read_csv('testb/submit_example_2.csv')

test.head()

	HOTELID	DATE	ROOM_EMPTY
0	303760	2021-09-04	0.1
1	303760	2021-09-05	0.1
2	303760	2021-09-11	0.1
3	303760	2021-09-12	0.1
4	303760	2021-09-19	0.1

# #增加特征：民宿单日的总订单量
# in_num = df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
# df2 = df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')

# Mark which frame each row comes from.
test['type'] = 'test'
df1 = df1.assign(type='train', DATE=df1['datetime'].astype('str'))

# Text comparison on 'YYYY-MM-DD' strings: keep the rows before 2021-09-01.
df2 = df1.loc[df1['DATE'].lt('2021-09-01')]

df2.shape

(2185510, 19)

# Calendar features derived from each row's day.
# df2 was produced by filtering another frame; work on an explicit copy so
# the column assignments below are unambiguous.
df2 = df2.copy()
df2['datetime'] = pd.to_datetime(df2['datetime'], errors='coerce', format='%Y-%m-%d')

stamp = df2['datetime'].dt
for feature in ('year', 'month', 'day', 'quarter', 'dayofweek', 'dayofyear'):
    # An unparseable day (NaT) yields 0 for the feature.
    df2['in_' + feature] = getattr(stamp, feature).fillna(0).astype('int')

# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
# (pandas >= 1.1) returns the same ISO week number.
df2['in_weekofyear'] = stamp.isocalendar().week.fillna(0).astype('int')

# Weekend flag: dayofweek is 0..6 with Saturday=5 and Sunday=6, so // 5 gives 1 on weekends.
df2['in_is_wknd'] = stamp.dayofweek // 5

df2.head(3)

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull	datetime	type	DATE	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear
0	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020-06-01	train	2020-06-01	2020	6	1	2	0	153	23
1	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020-06-02	train	2020-06-02	2020	6	2	2	1	154	23
2	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2020-06-03	train	2020-06-03	2020	6	3	2	2	155	23

# Author's note: total rows 49547
# Author's note: weekend rows 15755
use_month = [5, 6, 7, 8, 9]
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']

# Weekend rows that fall in one of the months of interest -> holiday = 0.
is_weekend = df2['in_is_wknd'].eq(1)
in_used_month = df2['in_month'].isin(use_month)
df_1 = df2.loc[is_weekend & in_used_month].assign(holiday=0)

# Rows whose DATE is a listed holiday -> holiday = 1.
df_2 = df2.loc[df2['DATE'].isin(holiday)].assign(holiday=1)

# Holiday rows are placed ahead of the weekend rows.
df3 = pd.concat([df_2, df_1])
df3.shape

(373093, 28)

# Rows are duplicates when they agree on every column except the holiday
# flag; the first occurrence is kept.
drop_col = [name for name in df3.columns.tolist() if name != 'holiday']

df3 = df3.drop_duplicates(subset=drop_col, keep='first')
df3.shape

(354019, 28)

# Same calendar features as the training rows, derived from test['DATE'].
test['datetime'] = pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d')

stamp = test['datetime'].dt
for feature in ('year', 'month', 'day', 'quarter', 'dayofweek', 'dayofyear'):
    # An unparseable DATE (NaT) yields 0 for the feature.
    test['in_' + feature] = getattr(stamp, feature).fillna(0).astype('int')

# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week
# (pandas >= 1.1) returns the same ISO week number.
test['in_weekofyear'] = stamp.isocalendar().week.fillna(0).astype('int')

# Weekend flag: dayofweek is 0..6 with Saturday=5 and Sunday=6, so // 5 gives 1 on weekends.
test['in_is_wknd'] = stamp.dayofweek // 5

test[:3]

	HOTELID	DATE	ROOM_EMPTY	type	datetime	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd
0	303760	2021-09-04	0.1	test	2021-09-04	2021	9	4	3	5	247	35	1
1	303760	2021-09-05	0.1	test	2021-09-05	2021	9	5	3	6	248	35	1
2	303760	2021-09-11	0.1	test	2021-09-11	2021	9	11	3	5	254	36	1

# 1 when the row's DATE is one of the listed holidays, otherwise 0.
test['holiday'] = test['DATE'].isin(holiday).astype('int64')

# Remove the parsed-date helper columns and the placeholder target.
test.drop(columns=['datetime', 'ROOM_EMPTY'], inplace=True)
df3.drop(columns=['datetime'], inplace=True)

df3.columns

Index(['ORDER_ID', 'ORDER_PRIMARY_ID', 'HOTELID', 'ORDER_TIME', 'STATUS',
       'INSERT_TIME', 'MODIFY_TIME', 'FIRM', 'GUEST_ID', 'BDATE', 'XZQH',
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
       'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
       'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
       'holiday'],
      dtype='object')

df3 = df3.drop(columns=['IN_TIME', 'OUT_TIME'])

# Outer merge on every column the two frames share, so rows from both frames
# are kept and the remaining df3-only columns are NaN for test rows.
shared_cols = ['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day', 'in_quarter',
               'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd', 'holiday']
df4 = df3.merge(test, on=shared_cols, how='outer')

df4.shape

(357330, 25)

dd = df4[df4['DATE']>'2021-09-01']
dd['type'].value_counts()

test    3311
Name: type, dtype: int64

df4.info()


Int64Index: 357330 entries, 0 to 357329
Data columns (total 25 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   ORDER_ID           12543 non-null   float64
 1   ORDER_PRIMARY_ID   12543 non-null   object 
 2   HOTELID            357330 non-null  object 
 3   ORDER_TIME         12543 non-null   object 
 4   STATUS             12543 non-null   float64
 5   INSERT_TIME        12543 non-null   object 
 6   MODIFY_TIME        12543 non-null   object 
 7   FIRM               12543 non-null   float64
 8   GUEST_ID           1877 non-null    object 
 9   BDATE              12543 non-null   object 
 10  XZQH               12543 non-null   object 
 11  guest_sum          12543 non-null   float64
 12  guest_sum_notnull  12543 non-null   float64
 13  in_hotel_num       354019 non-null  float64
 14  type               357330 non-null  object 
 15  DATE               357330 non-null  object 
 16  in_year            357330 non-null  int64  
 17  in_month           357330 non-null  int64  
 18  in_day             357330 non-null  int64  
 19  in_quarter         357330 non-null  int64  
 20  in_dayofweek       357330 non-null  int64  
 21  in_dayofyear       357330 non-null  int64  
 22  in_weekofyear      357330 non-null  int64  
 23  in_is_wknd         357330 non-null  int64  
 24  holiday            357330 non-null  int64  
dtypes: float64(6), int64(9), object(10)
memory usage: 70.9+ MB

room信息

room_info = pd.read_csv(path + '网约房注册民宿.csv')
room_info.head(1)

	CODE	HOTELID	JYMJ	ROOM_NUM	BED_NUM	FWLY	CZLY	CALLED	CITY_CODE	BUR_CODE	STA_CODE	SSX	ADDRESS	MPHM	JYQK	FIRM	DJSJ	BGSJ	STATUS	AUDITSTATUS
0	100177	3710830002	0.0	1	1	2	1	【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽	371000000000	NaN	NaN	371083	乳山市长江路银泰海景花园 55-301	ROOM001	【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽	3	2020-05-18 10:33:55	2020-07-15 10:23:58	NaN	NaN

room_info['STATUS'].value_counts()

1.0    2658
Name: STATUS, dtype: int64

# Discard the unused columns (including the file's own HOTELID), then let
# CODE take over the HOTELID name so it can serve as the join key.
room_info = (
    room_info
    .drop(columns=['HOTELID', 'FWLY', 'CITY_CODE', 'STATUS', 'AUDITSTATUS', 'JYQK'])
    .rename(columns={'CODE': 'HOTELID'})
)

room_info['FIRM'].value_counts()

3     3429
10    1878
Name: FIRM, dtype: int64

room_info.nunique()

HOTELID     5307
JYMJ         173
ROOM_NUM      11
BED_NUM       17
CZLY           4
CALLED      4232
BUR_CODE       7
STA_CODE      54
SSX            5
ADDRESS     4850
MPHM         584
FIRM           2
DJSJ        3753
BGSJ        3294
dtype: int64

登记时间和变更时间无变化的只有两家

# A business area (JYMJ) of 0 is treated as missing.
room_info['JYMJ'] = room_info['JYMJ'].mask(room_info['JYMJ'] == 0)

# Frequency encoding: for each listed column add `<col>_freq`, the number of
# rows in room_info sharing that row's value (NaN where the value is missing).
# NOTE(review): room_use_col is not used in the visible code and names
# 'STATUS', which has already been dropped from room_info.
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    # Mapping through value_counts() avoids a merge per column and no longer
    # rebinds `st`, which the file imports as the scipy.stats alias.
    room_info[col + '_freq'] = room_info[col].map(room_info[col].value_counts())

# Area per room, area per bed, and beds per room.
area = room_info['JYMJ']
rooms = room_info['ROOM_NUM']
beds = room_info['BED_NUM']
room_info['room_ratio'] = area / rooms
room_info['bed_ratio'] = area / beds
room_info['room_bed'] = beds / rooms

# Registration (DJSJ) and change (BGSJ) timestamps; bad values become NaT.
for stamp_col in ('DJSJ', 'BGSJ'):
    room_info[stamp_col] = pd.to_datetime(room_info[stamp_col], format='%Y-%m-%d %H:%M:%S', errors='coerce')

room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date

# Whole days between the registration date and the change date.
room_info['DJ_gap'] = (room_info['BG_date'] - room_info['DJ_date']).dt.days

room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()

(Timestamp('2020-05-12 13:16:24'),
 Timestamp('2021-10-19 16:05:41'),
 Timestamp('2020-07-15 10:23:06'),
 Timestamp('2021-10-19 16:09:15'))

# room['BG_dayofweek'].value_counts()

# Bar chart of how often each JYMJ value occurs.
plt.figure(figsize=(20, 8))
# Pass the series as `x=`: in current seaborn the first positional parameter
# is `data`, and the plotted variable has to be given by keyword.
sns.countplot(x=room_info['JYMJ'])

![JYMJ 取值计数图](output_119_1.png)

经营面积缺失超过一半，3000多，考虑删除

合并room到df中

# df4 already carries a FIRM column, so the room table's copy is removed
# before joining (its FIRM_freq column stays).
room_info.drop(columns=['FIRM'], inplace=True)

# Attach the room attributes to every hotel/day row, ordered by hotel and date.
df = df4.merge(room_info, on=['HOTELID'], how='left')
df = df.sort_values(by=['HOTELID', 'DATE'])

gc.collect()

#增加民宿单日的总订单量
# in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
# df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')

df.head(2)

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	guest_sum	guest_sum_notnull	in_hotel_num	type	DATE	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd	holiday	JYMJ	ROOM_NUM	BED_NUM	CZLY	CALLED	BUR_CODE	STA_CODE	SSX	ADDRESS	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_date	BG_date	DJ_gap
37946	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	train	2020-06-06	2020	6	6	2	5	158	23	1	0	NaN	1	1	1	【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽	NaN	NaN	371083	乳山市长江路银泰海景花园 55-301	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	NaN	3091	1502	4727	NaN	NaN	150	2565	3429	NaN	NaN	1.0	2020-05-18	2020-07-15	58.0
37947	NaN	NaN	100177	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0.0	train	2020-06-07	2020	6	7	2	6	159	23	1	0	NaN	1	1	1	【山海边公寓】海景大床房设施齐全空调做饭宽带应有尽	NaN	NaN	371083	乳山市长江路银泰海景花园 55-301	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	NaN	3091	1502	4727	NaN	NaN	150	2565	3429	NaN	NaN	1.0	2020-05-18	2020-07-15	58.0

df.info()


Int64Index: 357330 entries, 37946 to 354018
Data columns (total 54 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   ORDER_ID           12543 non-null   float64       
 1   ORDER_PRIMARY_ID   12543 non-null   object        
 2   HOTELID            357330 non-null  object        
 3   ORDER_TIME         12543 non-null   object        
 4   STATUS             12543 non-null   float64       
 5   INSERT_TIME        12543 non-null   object        
 6   MODIFY_TIME        12543 non-null   object        
 7   FIRM               12543 non-null   float64       
 8   GUEST_ID           1877 non-null    object        
 9   BDATE              12543 non-null   object        
 10  XZQH               12543 non-null   object        
 11  guest_sum          12543 non-null   float64       
 12  guest_sum_notnull  12543 non-null   float64       
 13  in_hotel_num       354019 non-null  float64       
 14  type               357330 non-null  object        
 15  DATE               357330 non-null  object        
 16  in_year            357330 non-null  int64         
 17  in_month           357330 non-null  int64         
 18  in_day             357330 non-null  int64         
 19  in_quarter         357330 non-null  int64         
 20  in_dayofweek       357330 non-null  int64         
 21  in_dayofyear       357330 non-null  int64         
 22  in_weekofyear      357330 non-null  int64         
 23  in_is_wknd         357330 non-null  int64         
 24  holiday            357330 non-null  int64         
 25  JYMJ               141308 non-null  float64       
 26  ROOM_NUM           357330 non-null  int64         
 27  BED_NUM            357330 non-null  int64         
 28  CZLY               357330 non-null  int64         
 29  CALLED             357330 non-null  object        
 30  BUR_CODE           167625 non-null  float64       
 31  STA_CODE           167625 non-null  object        
 32  SSX                357330 non-null  int64         
 33  ADDRESS            357330 non-null  object        
 34  MPHM               357330 non-null  object        
 35  DJSJ               356948 non-null  datetime64[ns]
 36  BGSJ               357330 non-null  datetime64[ns]
 37  CALLED_freq        357330 non-null  int64         
 38  ADDRESS_freq       357330 non-null  int64         
 39  JYMJ_freq          141308 non-null  float64       
 40  ROOM_NUM_freq      357330 non-null  int64         
 41  BED_NUM_freq       357330 non-null  int64         
 42  CZLY_freq          357330 non-null  int64         
 43  BUR_CODE_freq      167625 non-null  float64       
 44  STA_CODE_freq      167625 non-null  float64       
 45  SSX_freq           357330 non-null  int64         
 46  MPHM_freq          357330 non-null  int64         
 47  FIRM_freq          357330 non-null  int64         
 48  room_ratio         141308 non-null  float64       
 49  bed_ratio          141308 non-null  float64       
 50  room_bed           357330 non-null  float64       
 51  DJ_date            356948 non-null  object        
 52  BG_date            357330 non-null  object        
 53  DJ_gap             356948 non-null  float64       
dtypes: datetime64[ns](2), float64(15), int64(21), object(16)
memory usage: 149.9+ MB

# Columns with more than 180000 missing values are removed.
null_counts = df.isnull().sum()
missing = null_counts[null_counts.gt(180000)]

missing_col = list(missing.index)

df = df.drop(columns=missing_col)

df['DATE'].min(), df['DATE'].max()

('2020-06-06', '2021-09-21')

df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)

df[:5]

	HOTELID	type	DATE	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd	ROOM_NUM	BED_NUM	CZLY	SSX	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	SSX_freq	MPHM_freq	FIRM_freq	room_bed	DJ_date	BG_date	DJ_gap
37946	100177	train	2020-06-06	2020	6	6	2	5	158	23	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0
37947	100177	train	2020-06-07	2020	6	7	2	6	159	23	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0
37948	100177	train	2020-06-13	2020	6	13	2	5	165	24	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0
37949	100177	train	2020-06-14	2020	6	14	2	6	166	24	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0
37950	100177	train	2020-06-20	2020	6	20	2	5	172	25	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0

def qty_shift(df, val, lags=(1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20), prefix=''):
    """Add per-hotel lag features of column ``val`` to ``df``.

    For every lag ``k`` a column ``<prefix>last_<k>_qty`` is written that holds
    the value of ``val`` ``k`` rows earlier within the same ``HOTELID`` (rows
    are taken in their current order, so sort ``df`` beforehand). Gaps are
    forward-filled inside each hotel only; previously the fill ran over the
    whole column, which copied the previous hotel's values into the leading
    rows of the next hotel.

    Args:
        df: Frame containing ``HOTELID`` and ``val``; modified in place.
        val: Name of the column to lag.
        lags: Row offsets to generate. The default matches the original
            hard-coded set.
        prefix: Optional text prepended to each output column name. With the
            default ``''`` the names are the original ``last_<k>_qty``; pass
            a distinct prefix per ``val`` to keep several feature sets.

    Returns:
        The same ``df`` object with the lag columns added.
    """
    hotel_ids = df['HOTELID']
    values = df[val]
    for lag in lags:
        lagged = values.groupby(hotel_ids).shift(lag)
        # Group-wise ffill: a hotel's leading NaNs stay NaN instead of being
        # filled from whichever hotel precedes it in the frame.
        df['{}last_{}_qty'.format(prefix, lag)] = lagged.groupby(hotel_ids).ffill()
    return df

# Columns for which lag features are generated.
# NOTE(review): qty_shift is called here without anything that distinguishes
# the output columns per `val`, so every iteration writes the same
# `last_<k>_qty` names and overwrites the previous iteration's result; after
# the loop only the lags of the last entry ('in_quarter') remain. Confirm
# whether per-`val` column names were intended.
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']           
for val in vals:
    print(val)
    df = qty_shift(df, val)

in_hotel_num
in_year
in_month
in_dayofyear
in_quarter

# #昨天，上周，上个月， 去年
# df['yesterday_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_2_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_3_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_4_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_52_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(52).fillna(method='ffill').reset_index().sort_index().set_index('index')

def qty_rolling(df, window, val, keys, prefix='qty'):
    """Add lagged rolling-window statistics of ``val`` per ``keys`` group.

    Three columns are written into ``df`` (in place):

    * ``<prefix>_rolling<window>_mean`` - triangular-weighted rolling mean
    * ``<prefix>_rolling<window>_max``  - rolling maximum
    * ``<prefix>_rolling<window>_sum``  - rolling sum

    Every statistic is computed on ``x.shift(1)``, so the value of the current
    row is never part of its own window.

    The output column names do not contain ``val``. Calling this function
    several times with the same ``window`` and ``prefix`` but different
    ``val`` overwrites the earlier result; pass a distinct ``prefix`` to keep
    the statistics of several source columns side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``val`` and ``keys``; it is modified in place.
    window : int
        Rolling window length, in rows.
    val : str
        Name of the column to aggregate.
    keys : str or list of str
        Column(s) to group by.
    prefix : str, default 'qty'
        Leading part of the generated column names. The default reproduces
        the historical names (``qty_rolling<window>_*``).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the three columns added.
    """
    # pandas raises ValueError when min_periods exceeds the window size, so the
    # historical minimum of 3 observations is capped for windows shorter than 3.
    min_periods = min(3, window)
    base = '{}_rolling{}'.format(prefix, window)

    df[base + '_mean'] = df.groupby(keys)[val].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=min_periods,
                                     win_type="triang").mean()).values.tolist()
    df[base + '_max'] = df.groupby(keys)[val].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=min_periods).max()).values.tolist()
    df[base + '_sum'] = df.groupby(keys)[val].transform(
        lambda x: x.shift(1).rolling(window=window, min_periods=min_periods).sum()).values.tolist()
    # Other statistics (std, skew, kurt, quantile, corr) were tried in earlier
    # experiments and are intentionally not generated.
    return df

gc.collect()

# Rolling-window features per hotel for windows of 3, 4 and 6 rows.
# (The earlier note on this cell mentioned 7 and 14 days; the windows actually
# used are 3, 4 and 6.)
# qty_rolling names its output columns by window only, so for each window the
# columns produced for the last `val` in `vals` are the ones that remain.
keys = 'HOTELID'
for val in vals:
    print(val)
    for window in (3, 4, 6):
        df = qty_rolling(df, window, val, keys)
# keys = ['HOTELID']
# df = qty_rolling(df, 2, 'in_hotel_num', keys)

in_hotel_num
in_year
in_month
in_dayofyear
in_quarter

def qty_ewm(df, alpha, val, keys):
    """Add lagged exponentially weighted statistics of ``val`` per ``keys`` group.

    For each group the series is shifted by one row and passed through
    ``ewm(alpha=alpha)``; the resulting mean, std and corr are stored in the
    columns ``qty_ewm_mean``, ``qty_ewm_std`` and ``qty_ewm_corr``. The column
    names are fixed, so a later call replaces the values of an earlier one.

    ``df`` is modified in place and also returned.
    """
    for stat in ('mean', 'std', 'corr'):
        def _lagged_stat(series, _stat=stat):
            # Shift first so the current row does not contribute to its own value.
            weighted = series.shift(1).ewm(alpha=alpha)
            return getattr(weighted, _stat)()

        df['qty_ewm' + '_' + stat] = df.groupby(keys)[val].transform(_lagged_stat).values.tolist()
    return df

# Exponentially weighted features (alpha=0.95) per `keys` group for every
# column listed in `vals`.
# NOTE(review): qty_ewm always writes qty_ewm_mean / qty_ewm_std / qty_ewm_corr,
# so each iteration overwrites the previous one and only the result for the
# last `val` is kept.
for val in vals:
    print(val)
    df = qty_ewm(df, 0.95, val, keys)

in_hotel_num
in_year
in_month
in_dayofyear
in_quarter

df.select_dtypes(include='object').columns

Index(['HOTELID', 'type', 'DATE', 'MPHM', 'DJ_date', 'BG_date'], dtype='object')

# Binarize the target: 1 where in_hotel_num equals 0, otherwise 0.
# This runs after the lag / rolling / ewm features were derived from the
# original (non-binarized) values.
df['in_hotel_num'] = (df['in_hotel_num'] == 0).astype('int64')

df[:5]

	HOTELID	in_hotel_num	type	DATE	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd	ROOM_NUM	BED_NUM	CZLY	SSX	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	SSX_freq	MPHM_freq	FIRM_freq	room_bed	DJ_date	BG_date	DJ_gap	last_1_qty	last_2_qty	last_3_qty	last_4_qty	last_5_qty	last_6_qty	last_7_qty	last_8_qty	last_10_qty	last_12_qty	last_15_qty	last_20_qty	qty_rolling3_mean	qty_rolling3_max	qty_rolling3_sum	qty_rolling4_mean	qty_rolling4_max	qty_rolling4_sum	qty_rolling6_mean	qty_rolling6_max	qty_rolling6_sum	qty_ewm_mean	qty_ewm_std	qty_ewm_corr
37946	100177	1	train	2020-06-06	2020	6	6	2	5	158	23	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
37947	100177	1	train	2020-06-07	2020	6	7	2	6	159	23	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	NaN	NaN
37948	100177	1	train	2020-06-13	2020	6	13	2	5	165	24	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0	2.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	0.0	NaN
37949	100177	1	train	2020-06-14	2020	6	14	2	6	166	24	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0	2.0	2.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	2.0	6.0	2.0	2.0	6.0	2.0	2.0	6.0	2.0	0.0	NaN
37950	100177	1	train	2020-06-20	2020	6	20	2	5	172	25	1	1	1	1	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	3091	1502	4727	150	2565	3429	1.0	2020-05-18	2020-07-15	58.0	2.0	2.0	2.0	2.0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	2.0	2.0	6.0	2.0	2.0	8.0	2.0	2.0	8.0	2.0	0.0	NaN

# Replace every remaining NaN with 0 (the lag / rolling / ewm columns are NaN
# on the first rows of each hotel, as the df[:5] preview shows).
df = df.fillna(0)

# Candidate model features: all float64 / int64 columns except the target.
# list.remove raises ValueError if 'in_hotel_num' is not among them.
col_list = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
col_list.remove('in_hotel_num')

col_list

['in_year',
 'in_month',
 'in_day',
 'in_quarter',
 'in_dayofweek',
 'in_dayofyear',
 'in_weekofyear',
 'in_is_wknd',
 'holiday',
 'ROOM_NUM',
 'BED_NUM',
 'CZLY',
 'SSX',
 'CALLED_freq',
 'ADDRESS_freq',
 'ROOM_NUM_freq',
 'BED_NUM_freq',
 'CZLY_freq',
 'SSX_freq',
 'MPHM_freq',
 'FIRM_freq',
 'room_bed',
 'DJ_gap',
 'last_1_qty',
 'last_2_qty',
 'last_3_qty',
 'last_4_qty',
 'last_5_qty',
 'last_6_qty',
 'last_7_qty',
 'last_8_qty',
 'last_10_qty',
 'last_12_qty',
 'last_15_qty',
 'last_20_qty',
 'qty_rolling3_mean',
 'qty_rolling3_max',
 'qty_rolling3_sum',
 'qty_rolling4_mean',
 'qty_rolling4_max',
 'qty_rolling4_sum',
 'qty_rolling6_mean',
 'qty_rolling6_max',
 'qty_rolling6_sum',
 'qty_ewm_mean',
 'qty_ewm_std',
 'qty_ewm_corr']

gc.collect()

used_features = col_list

cate_cols = ['HOTELID', 'MPHM']

# Date-based hold-out split. DATE holds 'YYYY-MM-DD' strings, so plain string
# comparison orders the rows chronologically.
#   train : DATE <  2021-08-02
#   valid : 2021-08-02 <= DATE < 2021-09-01
# The validation bound is inclusive: with the previous strict '>' any row dated
# exactly 2021-08-02 was left out of both the training and the validation set.
train_mask = df["DATE"] < '2021-08-02'
valid_mask = (df["DATE"] >= '2021-08-02') & (df["DATE"] < '2021-09-01')

X_train = df[train_mask][used_features].reset_index(drop=True)
y_train = df[train_mask]["in_hotel_num"]
X_valid = df[valid_mask][used_features].reset_index(drop=True)
y_valid = df[valid_mask]["in_hotel_num"]
# Rows to predict are identified by the 'type' marker rather than by date.
X_test = df[df["type"] == 'test'][used_features].reset_index(drop=True)


# LightGBM binary classifier for the hold-out experiment: up to 20000 trees at
# learning rate 0.005, AUC as evaluation metric, L1/L2 regularisation of 0.1,
# 70% feature sampling and 95% row bagging on every iteration.
# NOTE(review): LGBMClassifier is only imported in a later cell of this
# notebook; when running top to bottom that import has to be executed first.
# NOTE(review): n_jobs=80 is hard-coded — confirm it matches the machine used.
clf_1 = LGBMClassifier(num_leaves = 256,
                     n_estimators = 20000,
                     learning_rate = 0.005,
                     verbose = -1,
                     max_bin = 100,
                     max_depth = 10,
                     feature_fraction_seed = 66,
                     feature_fraction = 0.7,
                     bagging_seed = 66,
                     bagging_freq = 1,
                     bagging_fraction = 0.95,
                     metric = 'auc', # MultiAuc_score
                     lambda_l1 = 0.1,
                     lambda_l2 = 0.1, 
                     min_child_weight = 30,
                     n_jobs=80)

# Fit on the training split and monitor the validation split; training stops
# once the validation metric has not improved for 100 rounds, and progress is
# logged every 200 rounds.
clf_1.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        early_stopping_rounds=100,verbose=200) #, categorical_feature = object_list

gc.collect()

[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.541449





357

oof_prob = clf_1.predict_proba(X_valid[used_features])[:, 1]

oof_prob.min(), oof_prob.max()

(0.9610922808728843, 0.9625584615170373)

oof_prob1 = clf_1.predict_proba(X_test[used_features])[:, 1]

oof_prob1.min(), oof_prob1.max()

(0.960959321386754, 0.9624418595928675)

X_test[used_features][:10]

	STATUS	FIRM	guest_sum	guest_sum_notnull	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd	holiday	JYMJ	ROOM_NUM	BED_NUM	CZLY	BUR_CODE	SSX	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_gap	last_1_qty	last_2_qty	last_3_qty	last_4_qty	last_5_qty	last_6_qty	last_7_qty	last_8_qty	last_10_qty	last_12_qty	last_15_qty	last_20_qty	qty_rolling3_mean	qty_rolling3_max	qty_rolling3_sum	qty_rolling4_mean	qty_rolling4_max	qty_rolling4_sum	qty_rolling6_mean	qty_rolling6_max	qty_rolling6_sum	qty_ewm_mean	qty_ewm_std	qty_ewm_corr
0	NaN	NaN	NaN	NaN	2021	9	4	3	5	247	35	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	6.328848e-12	1.0
1	NaN	NaN	NaN	NaN	2021	9	5	3	6	248	35	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	1.415174e-12	1.0
2	NaN	NaN	NaN	NaN	2021	9	11	3	5	254	36	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	3.164424e-13	1.0
3	NaN	NaN	NaN	NaN	2021	9	12	3	6	255	36	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	7.075868e-14	1.0
4	NaN	NaN	NaN	NaN	2021	9	19	3	6	262	37	1	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	1.582212e-14	1.0
5	NaN	NaN	NaN	NaN	2021	9	20	3	0	263	38	0	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	3.537934e-15	1.0
6	NaN	NaN	NaN	NaN	2021	9	21	3	1	264	38	0	1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	7.911060e-16	1.0
7	NaN	NaN	NaN	NaN	2021	9	4	3	5	247	35	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	6.328848e-12	1.0
8	NaN	NaN	NaN	NaN	2021	9	5	3	6	248	35	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	1.415174e-12	1.0
9	NaN	NaN	NaN	NaN	2021	9	11	3	5	254	36	1	0	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	3.164424e-13	1.0

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']

# Select the feature columns directly. The previous in-place drop on a filtered
# slice of df was redundant (the frame is restricted to used_features right
# after) and operated on a slice of df.
train_rows = df[df['type'] == 'train']
labels = np.array(train_rows['in_hotel_num'].values.tolist())
train = train_rows[used_features]
test_rows = df[df['type'] == 'test']
test_label = test_rows['in_hotel_num'].values.tolist()
test = test_rows[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Constructor arguments for LGBMClassifier. 'metric' is a plain string (it was
# a set before); the sklearn wrapper combines it with list-typed metrics.
params = {
#     'objective': 'binary',
    'n_estimators': 20000,
    'boosting_type': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': 'auc',
    'verbose': -1
}

# Only the last of the five TimeSeriesSplit folds is used for validation.
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:
        continue

    print('Training with validation')
    X_trn, y_trn = train.iloc[train_index], labels[train_index]
    X_val, y_val = train.iloc[valid_index], labels[valid_index]

    # fit() is an instance method of the sklearn-style estimator: build the
    # estimator from `params` first. Calling LGBMClassifier.fit(params, ...)
    # passed the dict as `self` and failed with
    # "AttributeError: 'dict' object has no attribute '_le'".
    # The early_stopping_rounds / verbose keywords follow the fit() signature of
    # the LightGBM version used by this notebook (see the clf_1 cell).
    clf = LGBMClassifier(**params)
    clf.fit(X_trn, y_trn,
            eval_set=[(X_val, y_val)],
            early_stopping_rounds=Early_Stopping_Rounds,
            verbose=Verbose,
            categorical_feature=cate_cols)

    # Probability of the positive class, comparable to what a raw Booster
    # returns from predict() for a binary objective.
    val = clf.predict_proba(X_val)[:, 1]
    mae_ = mean_absolute_error(y_val, val)

    print('MAE: {}'.format(mae_))

    print("ReTraining on all data")
    gc.enable()
    del X_trn, y_trn, X_val, y_val
    gc.collect()
    # The sklearn wrapper exposes the early-stopping result as best_iteration_.
    Best_iteration = clf.best_iteration_
    print("Best_iteration: ", Best_iteration)
#     clf = LGBMClassifier(**dict(params, n_estimators=int(Best_iteration * 1.2))).fit(train, labels)
  #pred = clf.predict_proba(test[used_features])[:, 1]

Training with validation



---------------------------------------------------------------------------

AttributeError                            Traceback (most recent call last)

 in 
     49     val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
     50                           categorical_feature=cate_cols)  
---> 51     clf = LGBMClassifier.fit(params, train.iloc[train_index], df[df['type'] == 'train'].iloc[train_index]['in_hotel_num'], eval_set=[trn_data, val_data], verbose=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
     52     val = clf.predict(train.iloc[valid_index])
     53     mae_ = mean_absolute_error(labels[valid_index], val)


F:\anaconda\lib\site-packages\lightgbm\sklearn.py in fit(self, X, y, sample_weight, init_score, eval_set, eval_names, eval_sample_weight, eval_class_weight, eval_init_score, eval_metric, early_stopping_rounds, verbose, feature_name, categorical_feature, callbacks, init_model)
    783         _LGBMAssertAllFinite(y)
    784         _LGBMCheckClassificationTargets(y)
--> 785         self._le = _LGBMLabelEncoder().fit(y)
    786         _y = self._le.transform(y)
    787         self._class_map = dict(zip_(self._le.classes_, self._le.transform(self._le.classes_)))


AttributeError: 'dict' object has no attribute '_le'

df['HOTELID'] = df['HOTELID'].astype('category')
df['MPHM'] = df['MPHM'].astype('category')

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
# Regression variant of the experiment, trained through the native lgb.train API.
cate_cols = []
drop_col = ['in_hotel_num', 'type']

# Split df into train / test rows by the 'type' marker and keep only the model
# features; the labels are taken before the target column is removed.
# NOTE(review): the in-place drops act on filtered slices of df and are
# followed by a column selection that already excludes those columns.
train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features] 
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# used_features = importance_fea
# NOTE(review): TimeSeriesSplit splits by row position. The df[:5] preview
# suggests rows are ordered by HOTELID and then DATE, in which case the folds
# are not chronological — confirm the row order before relying on this split.
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Native LightGBM parameters: L2 regression objective, evaluated with both
# l2 and l1 on the validation data.
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1
}

# Folds 0-3 are skipped, so only the last (largest) fold is trained and validated.
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    if fold_n in [0, 1, 2, 3]:  
        continue  
  
    print('Training with validation') 
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                          categorical_feature=cate_cols)  
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                          categorical_feature=cate_cols)  
    # Train with early stopping (100 rounds), logging every 500 rounds.
    clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data], verbose_eval=Verbose,  
    early_stopping_rounds=Early_Stopping_Rounds)
    # `val` keeps the validation-fold predictions; later cells reuse it.
    val = clf.predict(train.iloc[valid_index])   
    mae_ = mean_absolute_error(labels[valid_index], val)  
  
    print('MAE: {}'.format(mae_))  
  
    print("ReTraining on all data")  
    gc.enable()  
    del trn_data, val_data  
    gc.collect()  
    Best_iteration = clf.best_iteration  
    print("Best_iteration: ", Best_iteration)  
    # Refit on all training rows for 1.2x the early-stopped number of rounds;
    # after the loop `clf` refers to this final model.
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)  
    clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
  #valid_sets=[trn_data], verbose_eval=Verbose)  
  #pred = clf.predict(test[used_features])

Training with validation
Training until validation scores don't improve for 100 rounds
[500]	training's l2: 0.0357266	training's l1: 0.0733432	valid_1's l2: 0.0140039	valid_1's l1: 0.0424839
[1000]	training's l2: 0.034654	training's l1: 0.0715026	valid_1's l2: 0.0136241	valid_1's l1: 0.0362867
[1500]	training's l2: 0.0340906	training's l1: 0.0702526	valid_1's l2: 0.0134038	valid_1's l1: 0.0323129
[2000]	training's l2: 0.0337541	training's l1: 0.0694053	valid_1's l2: 0.0132898	valid_1's l1: 0.0298233
[2500]	training's l2: 0.0335207	training's l1: 0.0688021	valid_1's l2: 0.0131952	valid_1's l1: 0.028312
[3000]	training's l2: 0.0333523	training's l1: 0.068367	valid_1's l2: 0.0131123	valid_1's l1: 0.0273034
[3500]	training's l2: 0.033219	training's l1: 0.0680436	valid_1's l2: 0.0130372	valid_1's l1: 0.0264549
[4000]	training's l2: 0.033109	training's l1: 0.0678005	valid_1's l2: 0.0129586	valid_1's l1: 0.0259314
[4500]	training's l2: 0.0330175	training's l1: 0.0676386	valid_1's l2: 0.0129	valid_1's l1: 0.0256456
[5000]	training's l2: 0.0329394	training's l1: 0.0675283	valid_1's l2: 0.0128614	valid_1's l1: 0.0254616
[5500]	training's l2: 0.0328693	training's l1: 0.0674366	valid_1's l2: 0.0128206	valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586]	training's l2: 0.0328584	training's l1: 0.067422	valid_1's l2: 0.0128154	valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration:  5586

pred1 = clf.predict(test)

pred1.min(), pred1.max()

(0.2106748391712024, 0.9860467852769609)

# Work on a copy of the test rows so the prediction column is added to an
# independent frame instead of being assigned onto a filtered slice of df
# (chained assignment, which pandas flags with SettingWithCopyWarning).
# pred1 was produced from the same filter, so the row order matches.
pre_d = df[df['type'] == 'test'].copy()
pre_d['ROOM_EMPTY'] = pred1.tolist()

pre_d1 = pre_d[['HOTELID', 'DATE', 'ROOM_EMPTY']]

pp = pre_d1[pre_d1['DATE']=='2021-09-21']

pp[:5]

	HOTELID	DATE	ROOM_EMPTY
356552	10083	2021-09-21	0.726068
355831	10125	2021-09-21	0.825109
354872	10237	2021-09-21	0.841640
356181	10273	2021-09-21	0.841640
356825	104814	2021-09-21	0.726068

np.median(pp['ROOM_EMPTY'])

0.8079157059926819

pre_d1[pre_d1['ROOM_EMPTY']>0.807].shape

(2667, 3)

pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x>0.807 else 1)

pre_d1['ROOM_EMPTY'].sum()

pre_d1.to_csv('pre_02_reg.csv', index=False)

pre_d1['ROOM_EMPTY'].median()

0.8752981387595966

pre_d1['ROOM_EMPTY'].quantile(0.2)

0.8070988321951651

sns.countplot(pre_d1['ROOM_EMPTY'])

![output_177_1.png](output_177_1.png)

pre_d1.shape

(3311, 3)

import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)

nan

pred.min(), pred.max()

(0.03637936444857589, 1.8554153046371984)

pre_df = df[df['type'] == 'test']
pre_df['ROOM_EMPTY'] = pred.tolist()

pre_df1.to_csv('origion.csv', index=False)

pre_df[:5]

	HOTELID	type	DATE	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	in_weekofyear	in_is_wknd	holiday	last_1_qty	last_2_qty	last_3_qty	last_4_qty	last_5_qty	last_6_qty	last_7_qty	last_8_qty	last_10_qty	last_12_qty	last_15_qty	last_20_qty	qty_rolling3_mean	qty_rolling3_max	qty_rolling3_sum	qty_rolling4_mean	qty_rolling4_max	qty_rolling4_sum	qty_rolling6_mean	qty_rolling6_max	qty_rolling6_sum	qty_ewm_mean	qty_ewm_std	qty_ewm_corr	ROOM_EMPTY
356546	10083	test	2021-09-04	2021	9	4	3	5	247	35	1	0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	6.328848e-12	1.0	0.054686
356547	10083	test	2021-09-05	2021	9	5	3	6	248	35	1	0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	2.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	1.415174e-12	1.0	0.055554
356548	10083	test	2021-09-11	2021	9	11	3	5	254	36	1	0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	3.164424e-13	1.0	0.247202
356549	10083	test	2021-09-12	2021	9	12	3	6	255	36	1	0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	7.075868e-14	1.0	0.105092
356550	10083	test	2021-09-19	2021	9	19	3	6	262	37	1	1	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	3.0	9.0	3.0	3.0	12.0	3.0	3.0	18.0	3.0	1.582212e-14	1.0	0.094995

pre_df1 = pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']]

# 8月份500多
pre_df1[pre_df1['ROOM_EMPTY']>0.07].shape

(2380, 3)

pre_df1['ROOM_EMPTY'] = pre_df1['ROOM_EMPTY'].apply(lambda x: 0 if x>=0.25 else 1)

# mean_absolute_error(labels[valid_index], val)  
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)

0.7065505364632024

a = pd.DataFrame()
a['y'] = labels[valid_index].tolist()
a['val'] = val.tolist()

# _val = np.array(a['val'].values.tolist())
# _y = np.array(a['y'].values.tolist())
fpr, tpr, thresholds = metrics.roc_curve(a['y'], a['val'], pos_label=2)
metrics.auc(fpr, tpr)

nan

np.array(a['val'].values.tolist())

array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
       0.99187287])

a['y'].max()

a['y'] = a['y'].apply(lambda x: 1 if x==0 else 0)

a['val'] = a['val'].apply(lambda x: (1-x/(4.03212491)))

a[:5]

	y	val
0	1	0.980780
1	1	0.980282
2	1	0.980909
3	1	0.986415
4	1	0.988064

val.max()-val.min()

4.0321249098355265

labels[valid_index]

array([0., 0., 0., 0., 0., 0., 2., 2., 2., 2., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.])

val[140:160]

array([0.02691592, 0.02701038, 0.02690596, 0.02699238, 0.02690596,
       0.07806598, 0.07950708, 0.29049198, 0.77018916, 0.98434103,
       0.05606983, 0.06270357, 0.04589361, 0.05028036, 0.04938166,
       0.05054352, 0.04737767, 0.11532118, 0.05313139, 0.04558791])

gc.collect()

from tqdm import tqdm
from sklearn.metrics import *
print("#############################输出第一个模型的评价参数及结果#############################")

def find_best_threshold(y_valid, oof_prob):
    """Scan a fixed grid of thresholds and return the one with the highest AUC.

    The grid is ``i / 2000`` for ``i`` in ``range(100, 2000)``, i.e. 0.05 up to
    0.9995. For each threshold ``th`` a copy of ``oof_prob`` is binarized
    (values ``>= th`` become 1, values ``< th`` become 0) and
    ``metrics.roc_curve`` is called with that binarized copy as its first
    argument, ``y_valid`` as its second argument and ``pos_label=2``; the area
    under the resulting curve is the score of the threshold.

    Parameters
    ----------
    y_valid : array-like
        Passed unchanged as the second argument of ``metrics.roc_curve``.
    oof_prob : numpy.ndarray or pandas.Series
        Values that are thresholded; must support ``.copy()`` and boolean-mask
        assignment. The input itself is not modified.

    Returns
    -------
    tuple
        ``(best_th, best_auc)``. Both stay 0 when no threshold yields an AUC
        greater than 0 (a NaN AUC never replaces the current best).

    NOTE(review): after binarization the first ``roc_curve`` argument only
    holds 0 and 1 (for non-NaN input), so ``pos_label=2`` matches no sample —
    confirm the intended positive label and argument roles.
    """
    best_auc = 0
    best_th = 0
    candidate_thresholds = [i / 2000 for i in range(100, 2000)]
    for th in tqdm(candidate_thresholds):
        # Both masks are taken from the untouched input so the two assignments
        # below cannot influence each other.
        at_or_above = oof_prob >= th
        below = oof_prob < th
        binarized = oof_prob.copy()
        binarized[at_or_above] = 1
        binarized[below] = 0

        fpr, tpr, _ = metrics.roc_curve(binarized, y_valid, pos_label=2)
        auc_value = metrics.auc(fpr, tpr)
        if auc_value > best_auc:
            best_th = th
            best_auc = auc_value

    return best_th, best_auc
# Run the threshold search on the last validation fold.
# NOTE(review): find_best_threshold is declared as (y_valid, oof_prob) but is
# called here with the predictions (`val`) first and the labels second, so
# inside the function it is the labels that get thresholded — confirm this is
# intended.
val1 = val
y1 = labels[valid_index]
best_th, aucs = find_best_threshold(val1, y1)
# The captions printed below read "threshold" and "F2 score"; the second value
# is the AUC returned by find_best_threshold.
print("分界值", best_th)
print("F2评价分数", aucs)
# print("recall召回率", recall)
# print("precision精确度", precision)

  1%|▏                                       | 10/1900 [00:00<00:23, 80.05it/s]

#############################输出第一个模型的评价参数及结果#############################


100%|██████████████████████████████████████| 1900/1900 [00:20<00:00, 94.61it/s]

分界值 0.05
F2评价分数 0.7065505364632024

pre_df1[20:33]

	HOTELID	DATE
354872	10237	2021-09-21
356175	10273	2021-09-04
356176	10273	2021-09-05
356177	10273	2021-09-11
356178	10273	2021-09-12
356179	10273	2021-09-19
356180	10273	2021-09-20
356181	10273	2021-09-21
356819	104814	2021-09-04
356820	104814	2021-09-05
356821	104814	2021-09-11
356822	104814	2021-09-12
356823	104814	2021-09-19

pre_df1.to_csv('pre_025_regression.csv', index=False)

pre_df1['ROOM_EMPTY'].mode()

0    0.20966
dtype: float64

pre_df1['ROOM_EMPTY'].median()

0.10509209231427108

plt.figure(figsize=(16,8))
sns.countplot(pre_df1['ROOM_EMPTY'])

![output_213_1.png](output_213_1.png)

# df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
# df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
# df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
# df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
# df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
# df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
# df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5                 #是否周末

df4['date1'] = df4['date'].astype('str')

# Total number of rows: 49547
# Weekend rows: 15755
# Keep the rows flagged in_is_wknd == 1 whose month is May through September.
use_month = [5, 6, 7, 8, 9]
is_weekend = df4['in_is_wknd'] == 1
in_selected_month = df4['in_month'].isin(use_month)
df_1 = df4[is_weekend & in_selected_month]

df_1.shape

(11583, 59)

df_1['in_month'].value_counts()

7    4391
5    2520
6    2192
8    1656
9     824
Name: in_month, dtype: int64

holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']

df4[df4['date']=='2021-05-03']

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	INSERT_TIME	MODIFY_TIME	FIRM	GUEST_ID	BDATE	XZQH	IN_TIME	OUT_TIME	guest_sum	guest_sum_notnull	in_time	out_time	datetime	in_date	out_date	date	JYMJ	ROOM_NUM	BED_NUM	CZLY	CALLED	BUR_CODE	STA_CODE	SSX	ADDRESS	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_date	BG_date	DJ_gap	in_hotel_num	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	record_is_wknd	in_is_wknd

# Rows whose date1 is one of the listed holiday dates get holiday=1; the
# weekend rows selected earlier (df_1) get holiday=0.
# assign() returns new frames, so no column is written onto a filtered slice
# of df4 (the previous item assignments were chained assignments that pandas
# flags with SettingWithCopyWarning).
df_2 = df4[df4['date1'].isin(holiday)].assign(holiday=1)

df_1 = df_1.assign(holiday=0)

# Holiday rows are placed first in the concatenated frame.
df5 = pd.concat([df_2, df_1])

df5.shape

(14662, 61)

df5.drop(['CALLED', 'ADDRESS', 'INSERT_TIME', 'MODIFY_TIME', 'IN_TIME', 'OUT_TIME'], axis=1, inplace=True)

df5[:3]

	ORDER_ID	ORDER_PRIMARY_ID	HOTELID	ORDER_TIME	STATUS	FIRM	GUEST_ID	BDATE	XZQH	guest_sum	in_time	out_time	datetime	in_date	out_date	date	JYMJ	ROOM_NUM	BED_NUM	CZLY	BUR_CODE	STA_CODE	SSX	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_date	BG_date	DJ_gap	in_hotel_num	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	record_is_wknd	in_is_wknd	date1	holiday
9	3597092	CFD17F6BB142485DB3DEC6B5E2D0B664	100193	202105012046	1	3	NaN	19771008	132826	1	2021-05-01 20:46:00	2021-05-02 11:58:00	2021-05-01 20:46:00	2021-05-01	2021-05-02	2021-05-01	NaN	2	2	1	NaN	NaN	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	NaN	1249	2281	4727	NaN	NaN	150	2565	3429	NaN	NaN	1.0	2020-05-18	2020-07-15	58.0	1	2021	5	1	2	5	121	1	1	2021-05-01	1
11	3601837	5C96E986C1F04691A3EB07962BCC7B00	100195	202105021435	1	3	NaN	19951009	370203	1	2021-05-02 14:35:00	2021-05-03 11:58:00	2021-05-02 14:35:00	2021-05-02	2021-05-03	2021-05-02	NaN	2	3	1	NaN	NaN	371083	ROOM001	2020-05-18 10:33:55	2020-07-15 10:23:58	1	1	NaN	1249	895	4727	NaN	NaN	150	2565	3429	NaN	NaN	1.5	2020-05-18	2020-07-15	58.0	1	2021	5	2	2	6	122	1	1	2021-05-02	1
53	3314117	C0C5516462D3487C895DD654B836F855	10037	202104082130	1	3	NaN	19730313	120103	1	2021-04-30 13:58:00	2021-05-03 11:58:00	2021-05-01 13:58:00	2021-04-30	2021-05-03	2021-05-01	NaN	1	2	1	NaN	NaN	371002	ROOM001	2020-05-18 10:22:13	2020-07-15 10:23:58	2	2	NaN	3091	2281	4727	NaN	NaN	4661	2565	3429	NaN	NaN	2.0	2020-05-18	2020-07-15	58.0	1	2021	5	1	2	5	121	1	1	2021-05-01	1

# De-duplicate df5 in place on every column except 'holiday', keeping the
# first occurrence of each duplicated row.
drop_col = list(df5.columns)
drop_col.remove('holiday')
df5.drop_duplicates(subset=drop_col, keep='first', inplace=True)

df5.shape

(12708, 54)

df5['HOTELID'].nunique()

test = pd.read_csv('testb/submit_example_2.csv')

test[:2]

	HOTELID	DATE	ROOM_EMPTY
0	303760	2021-09-04	0.1
1	303760	2021-09-05	0.1

test.info()


RangeIndex: 3311 entries, 0 to 3310
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   HOTELID     3311 non-null   object 
 1   DATE        3311 non-null   object 
 2   ROOM_EMPTY  3311 non-null   float64
dtypes: float64(1), object(2)
memory usage: 77.7+ KB

hotel = test['HOTELID'].values.tolist()

df5 = df5[df5['HOTELID'].isin(hotel)]

df5.shape

(3708, 54)

d = df5[df5['in_month']==9]

d['date'].value_counts()

2021-09-04    42
2021-09-05    29
2021-09-11    16
2021-09-19    14
2021-09-20    13
2021-09-18    12
2021-09-12    11
2020-09-06     9
2020-09-13     9
2020-09-05     9
2021-09-21     7
2020-09-19     7
2021-09-26     6
2020-09-27     6
2021-09-25     6
2020-09-26     6
2020-09-20     6
2020-09-12     4
Name: date, dtype: int64

test['date1'] = test['DATE']

test1 = test.merge(df5, on=['HOTELID', 'date1'], how='left')

test.shape

(3311, 4)

test1.shape

(3311, 56)

test1.info()


Int64Index: 3311 entries, 0 to 3310
Data columns (total 56 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   HOTELID           3311 non-null   object        
 1   DATE              3311 non-null   object        
 2   ROOM_EMPTY        3311 non-null   float64       
 3   date1             3311 non-null   object        
 4   ORDER_ID          132 non-null    float64       
 5   ORDER_PRIMARY_ID  132 non-null    object        
 6   ORDER_TIME        132 non-null    object        
 7   STATUS            132 non-null    float64       
 8   FIRM              132 non-null    float64       
 9   GUEST_ID          16 non-null     object        
 10  BDATE             132 non-null    object        
 11  XZQH              132 non-null    object        
 12  guest_sum         132 non-null    float64       
 13  in_time           132 non-null    datetime64[ns]
 14  out_time          132 non-null    datetime64[ns]
 15  datetime          132 non-null    datetime64[ns]
 16  in_date           132 non-null    object        
 17  out_date          132 non-null    object        
 18  date              132 non-null    object        
 19  JYMJ              1 non-null      float64       
 20  ROOM_NUM          132 non-null    float64       
 21  BED_NUM           132 non-null    float64       
 22  CZLY              132 non-null    float64       
 23  BUR_CODE          86 non-null     float64       
 24  STA_CODE          86 non-null     object        
 25  SSX               132 non-null    float64       
 26  MPHM              132 non-null    object        
 27  DJSJ              132 non-null    datetime64[ns]
 28  BGSJ              132 non-null    datetime64[ns]
 29  CALLED_freq       132 non-null    float64       
 30  ADDRESS_freq      132 non-null    float64       
 31  JYMJ_freq         1 non-null      float64       
 32  ROOM_NUM_freq     132 non-null    float64       
 33  BED_NUM_freq      132 non-null    float64       
 34  CZLY_freq         132 non-null    float64       
 35  BUR_CODE_freq     86 non-null     float64       
 36  STA_CODE_freq     86 non-null     float64       
 37  SSX_freq          132 non-null    float64       
 38  MPHM_freq         132 non-null    float64       
 39  FIRM_freq         132 non-null    float64       
 40  room_ratio        1 non-null      float64       
 41  bed_ratio         1 non-null      float64       
 42  room_bed          132 non-null    float64       
 43  DJ_date           132 non-null    object        
 44  BG_date           132 non-null    object        
 45  DJ_gap            132 non-null    float64       
 46  in_hotel_num      132 non-null    float64       
 47  in_year           132 non-null    float64       
 48  in_month          132 non-null    float64       
 49  in_day            132 non-null    float64       
 50  in_quarter        132 non-null    float64       
 51  in_dayofweek      132 non-null    float64       
 52  in_dayofyear      132 non-null    float64       
 53  record_is_wknd    132 non-null    float64       
 54  in_is_wknd        132 non-null    float64       
 55  holiday           132 non-null    float64       
dtypes: datetime64[ns](5), float64(36), object(15)
memory usage: 1.4+ MB

# Show the first five rows of test1.
test1.head(5)

	HOTELID	DATE	ROOM_EMPTY	date1	ORDER_ID	ORDER_PRIMARY_ID	ORDER_TIME	STATUS	FIRM	GUEST_ID	BDATE	XZQH	guest_sum	in_time	out_time	datetime	in_date	out_date	date	JYMJ	ROOM_NUM	BED_NUM	CZLY	BUR_CODE	STA_CODE	SSX	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_date	BG_date	DJ_gap	in_hotel_num	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	record_is_wknd	in_is_wknd	holiday
0	303760	2021-09-04	0.1	2021-09-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	303760	2021-09-05	0.1	2021-09-05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	303760	2021-09-11	0.1	2021-09-11	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	303760	2021-09-12	0.1	2021-09-12	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	303760	2021-09-19	0.1	2021-09-19	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

# holiday1 is 1 only where 'holiday' is exactly 0, and 0 everywhere else.
# "Everywhere else" includes rows whose 'holiday' is NaN: NaN never compares
# equal to 0, so those rows get 0 just like rows with a non-zero value.
test1['holiday1'] = test1['holiday'].eq(0).astype('int64')

# Distribution of the new flag.
test1['holiday1'].value_counts()

0    3213
1      98
Name: holiday1, dtype: int64

# Show the first ten rows of test1, now including the holiday1 column.
test1.iloc[:10]

	HOTELID	DATE	ROOM_EMPTY	date1	ORDER_ID	ORDER_PRIMARY_ID	ORDER_TIME	STATUS	FIRM	GUEST_ID	BDATE	XZQH	guest_sum	in_time	out_time	datetime	in_date	out_date	date	JYMJ	ROOM_NUM	BED_NUM	CZLY	BUR_CODE	STA_CODE	SSX	MPHM	DJSJ	BGSJ	CALLED_freq	ADDRESS_freq	JYMJ_freq	ROOM_NUM_freq	BED_NUM_freq	CZLY_freq	BUR_CODE_freq	STA_CODE_freq	SSX_freq	MPHM_freq	FIRM_freq	room_ratio	bed_ratio	room_bed	DJ_date	BG_date	DJ_gap	in_hotel_num	in_year	in_month	in_day	in_quarter	in_dayofweek	in_dayofyear	record_is_wknd	in_is_wknd	holiday	holiday1
0	303760	2021-09-04	0.1	2021-09-04	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
1	303760	2021-09-05	0.1	2021-09-05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
2	303760	2021-09-11	0.1	2021-09-11	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
3	303760	2021-09-12	0.1	2021-09-12	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
4	303760	2021-09-19	0.1	2021-09-19	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
5	303760	2021-09-20	0.1	2021-09-20	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
6	303760	2021-09-21	0.1	2021-09-21	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
7	315471	2021-09-04	0.1	2021-09-04	4798603.0	A139D5C20BB44529A66957AA491FB88E	202108212327	1.0	3.0	NaN	20001101	331021	1.0	2021-09-03 13:58:00	2021-09-05 11:58:00	2021-09-04 13:58:00	2021-09-03	2021-09-05	2021-09-04	NaN	1.0	2.0	1.0	3.710980e+11	371000000007	371002.0	ROOM001	2021-05-05 13:58:58	2021-05-05 14:02:45	2.0	1.0	NaN	3091.0	2281.0	4727.0	852.0	433.0	4661.0	2565.0	3429.0	NaN	NaN	2.0	2021-05-05	2021-05-05	0.0	1.0	2021.0	9.0	4.0	3.0	5.0	247.0	1.0	1.0	0.0	1
8	315471	2021-09-05	0.1	2021-09-05	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0
9	315471	2021-09-11	0.1	2021-09-11	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaT	NaT	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0

# Replace the existing ROOM_EMPTY values with the holiday1 flag.
test1['ROOM_EMPTY'] = test1['holiday1']

# Keep only the key columns plus ROOM_EMPTY.
test2 = test1.loc[:, ['HOTELID', 'DATE', 'ROOM_EMPTY']]

test2.head(5)

	HOTELID	DATE
0	303760	2021-09-04
1	303760	2021-09-05
2	303760	2021-09-11
3	303760	2021-09-12
4	303760	2021-09-19

# Write test2 to disk without the index column.
test2.to_csv(path_or_buf='pre.csv', index=False)

# Load another result file; it is merged with test2 on HOTELID/DATE further
# down. NOTE(review): presumably an earlier prediction file - confirm origin.
pre = pd.read_csv(filepath_or_buffer='file41704172.csv')

pre.head(2)

	HOTELID	DATE	room
0	303760	2021-09-04	0.0
1	303760	2021-09-05	0.0

# Rename pre's value column so it does not clash with test2's ROOM_EMPTY
# when the two frames are merged.
pre.rename(columns={'ROOM_EMPTY': 'room'}, inplace=True)

# Left join: every test2 row is kept; rows without a match in pre get NaN
# in 'room'.
pr = pd.merge(test2, pre, how='left', on=['HOTELID', 'DATE'])

# Element-wise sum of the two columns. A NaN in 'room' propagates to the sum,
# and rows where both inputs are 1 end up as 2.
pr['ROOM_EMPTY'] = pr['ROOM_EMPTY'].add(pr['room'])

# The helper column is no longer needed.
pr.drop(columns='room', inplace=True)

pre['room'].value_counts()

0.0    2838
1.0     473
Name: room, dtype: int64

# Write the combined result to disk without the index column.
pr.to_csv(path_or_buf='pre1.csv', index=False)

# Count non-null ORDER_ID values for each (in_year, in_month) pair and turn
# the group keys back into ordinary columns.
dd = (
    df5.groupby(['in_year', 'in_month'])['ORDER_ID']
    .agg('count')
    .reset_index()
)

dd

	in_year	in_month	ORDER_ID
0	2020	7	12
1	2020	8	102
2	2020	9	56
3	2021	5	735
4	2021	6	833
5	2021	7	1279
6	2021	8	535
7	2021	9	156

# Fold year and month into one YYYYMM number (e.g. 2021 and 5 -> 202105).
dd['ym'] = dd['in_year'].mul(100).add(dd['in_month'])

# Occurrences of each ym value.
dd['ym'].value_counts()

202105    1
202007    1
202008    1
202009    1
202106    1
202107    1
202108    1
202109    1
Name: ym, dtype: int64

# Scatter of the per-month ORDER_ID counts held in dd.
# Points stay at dd's row positions so the months are evenly spaced, but the
# ticks are labelled with the matching 'ym' value; otherwise the x axis shows
# only bare row numbers and the points cannot be read as months.
plt.figure(figsize=(15,8))
plt.scatter(dd.index, dd['ORDER_ID'])
plt.xticks(dd.index, dd['ym'].astype(str))
plt.xlabel('ym')
plt.ylabel('ORDER_ID')

![png](output_270_1.png)