
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import os
import re
import gc
import warnings


# Matplotlib: use the SimHei font so Chinese labels render, and disable the
# Unicode minus so negative numbers are displayed correctly with that font.
# (These two settings were previously applied twice; once is enough.)
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Pandas display: show every column and row, and up to 100 characters per cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 100)

# Directory prefixes of the two CSV sets that are concatenated below.
path = 'traina/'
path1 = 'trainb/'


# Guest (check-in) table: the same file name exists under both data
# directories; the two parts are stacked into one frame.
_GUEST_CSV = '网约房平台入住人表.csv'
cust_info = pd.read_csv(path + _GUEST_CSV)
cust_info1 = pd.read_csv(path1 + _GUEST_CSV)
cust = pd.concat([cust_info, cust_info1])
1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 2.021071e+11 2.021071e+11
# Code-like columns are handled as text from here on.
for _col in ('BDATE', 'XZQH'):
    cust[_col] = cust[_col].astype('str')

# Check-in / check-out stamps: missing -> 0, stringify, keep the first 12
# characters (the sample rows show a YYYYMMDDHHMM stamp after this step).
for _col in ('IN_TIME', 'OUT_TIME'):
    cust[_col] = cust[_col].fillna(0).astype('str').str[:12]
1 3525D57CAE104A078E4962B2B89377B0 371099C301202107090001 19951025 370523 202107092103 202107111158

# Order the guests inside each order, then collapse to one row per order that
# carries two counters:
#   guest_sum          - number of non-null BDATE values in the order
#   guest_sum_notnull  - number of non-null GUEST_ID values in the order
cust = cust.sort_values(['ORDER_PRIMARY_ID', 'GUEST_ID'])
stat = (
    cust.groupby('ORDER_PRIMARY_ID')
    .agg(guest_sum=('BDATE', 'count'), guest_sum_notnull=('GUEST_ID', 'count'))
    .reset_index()
)
cust = cust.merge(stat, on='ORDER_PRIMARY_ID', how='left')
# After the sort above, "first" is the row with the smallest GUEST_ID.
cust = cust.drop_duplicates('ORDER_PRIMARY_ID', keep='first')
cust['IN_TIME'].max(), cust['IN_TIME'].min()
('202110041413', '0.0')


# Platform order information: same file name under both data directories.
_ORDER_CSV = '网约平台旅客订单信息.csv'
order_info = pd.read_csv(path + _ORDER_CSV)
order1 = pd.read_csv(path1 + _ORDER_CSV)
order = pd.concat([order_info, order1])

# Time columns are compared as strings further down.
for _col in ('PRE_IN_TIME', 'PRE_OUT_TIME', 'INSERT_TIME', 'MODIFY_TIME', 'CANCEL_TIME'):
    order[_col] = order[_col].astype('str')
order['ORDER_TIME'] = order['ORDER_TIME'].fillna(0).astype('str').str[:12]

# A missing CANCEL_TIME stringifies to 'nan'; only those rows are kept.
order = order[order['CANCEL_TIME'] == 'nan']
# Keep the row with the greatest MODIFY_TIME for every ORDER_ID.
order = order.sort_values(['ORDER_ID', 'MODIFY_TIME']).drop_duplicates('ORDER_ID', keep='last')

# Original author's note: the order time range covers the guest time range.
df = pd.merge(order, cust, on='ORDER_PRIMARY_ID')


# Reconcile the recorded stay times with the planned ones from the order.
#   1. '0.0' is what a missing stamp became after fillna(0).astype('str');
#      treat it as missing again and fall back to the planned time.
#   2. Keep whichever of (recorded, planned) is the later string.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on df[col]: that form is chained assignment and
# does not update df when pandas Copy-on-Write is active.
for _actual, _planned in (('OUT_TIME', 'PRE_OUT_TIME'), ('IN_TIME', 'PRE_IN_TIME')):
    _recorded = df[_actual].mask(df[_actual] == '0.0').fillna(df[_planned])
    df[_actual] = _recorded.where(_recorded >= df[_planned], df[_planned])

df = df.drop(columns=['PRE_IN_TIME', 'PRE_OUT_TIME', 'CANCEL_TIME'])


Int64Index: 29941 entries, 0 to 29940
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   ORDER_ID           29941 non-null  int64 
 1   ORDER_PRIMARY_ID   29941 non-null  object
 2   HOTELID            29941 non-null  object
 3   ORDER_TIME         29941 non-null  object
 4   STATUS             29941 non-null  int64 
 5   INSERT_TIME        29941 non-null  object
 6   MODIFY_TIME        29941 non-null  object
 7   FIRM               29941 non-null  int64 
 8   GUEST_ID           3561 non-null   object
 9   BDATE              29941 non-null  object
 10  XZQH               29941 non-null  object
 11  IN_TIME            29941 non-null  object
 12  OUT_TIME           29941 non-null  object
 13  guest_sum          29941 non-null  int64 
 14  guest_sum_notnull  29941 non-null  int64 
dtypes: int64(5), object(10)
memory usage: 3.7+ MB
# Parse the 12-character stamps; anything that does not match becomes NaT.
_STAMP_FORMAT = '%Y%m%d%H%M'
df['in_time'] = pd.to_datetime(df['IN_TIME'], format=_STAMP_FORMAT, errors='coerce')
df['out_time'] = pd.to_datetime(df['OUT_TIME'], format=_STAMP_FORMAT, errors='coerce')
# Rows without a usable check-out time are discarded.
df = df[df['out_time'].notna()]
# df['in_time_year'] = df['in_time'].dt.year.fillna(0).astype('int')                 #入住时间的年份
# df['out_time_year'] = df['out_time'].dt.year.fillna(0).astype('int')               #退房时间的年份
# df['in_time_month'] = df['in_time'].dt.month.fillna(0).astype('int')               #入住时间的月份
# df['out_time_month'] = df['out_time'].dt.month.fillna(0).astype('int')             #退房时间的月份
# df['in_time_day'] = df['in_time'].dt.day.fillna(0).astype('int')                   #入住时间的日期
# df['out_time_day'] = df['out_time'].dt.day.fillna(0).astype('int')                 #退房时间的日期
# df['in_time_hour'] = df['in_time'].dt.hour.fillna(0).astype('int')                 #入住时间的小时
# df['out_time_hour'] = df['out_time'].dt.hour.fillna(0).astype('int')               #退房时间的小时
# df['day_gap'] = df['out_time_day'] - df['in_time_day'] + (df['out_time_month'] - df['in_time_month']).apply(lambda x: x*30 if x>0 else 0)
# df['day_gap'] = df.apply(lambda x: x['day_gap']+31 if x['in_time_month']==12 and x['out_time_month']==1 else x['day_gap'], axis=1)
# month_ = [1, 3, 5, 7, 8, 10]
# df['day_gap'] = df.apply(lambda x: x['day_gap']+1 if x['in_time_month'] in month_ and x['out_time_month']>x['in_time_month'] else x['day_gap'], axis=1)
# df['day_gap'].sum()

# Scratch examples of shifting a timestamp by one day in either direction.
# The typographic quotes around the column name were a SyntaxError and are
# replaced with ASCII quotes.
# NOTE(review): no 'lock_time' column is created in the visible code - confirm
# these two lines are still wanted.
df['lock_time'][0] + pd.Timedelta(days=1)
df['lock_time'][0] + pd.Timedelta(days=-1)
# df['in_time'] - timedelta(days=1)  # idea: push rows whose day_gap is 0 back by one day; hourly rooms still to be handled

# df_0 = df[(df['day_gap']==0)]
# df_0['date'] = df_0['IN_TIME']
# 这里边包含了钟点房和当天入住并退房的。
# Expand every order into one row per calendar day of its stay.
dfs = []
for order_id, group in df.groupby('ORDER_PRIMARY_ID'):
    stat1 = pd.DataFrame()
    # `closed=` is omitted: None was its default and the argument no longer
    # exists in pandas 2.x.
    stat1['datetime'] = pd.date_range(start=group['IN_TIME'].values[0],
                                      end=group['OUT_TIME'].values[0], freq='D')
    stat1['datetime'] = stat1['datetime'].dt.date.fillna(0).astype('str')
    stat1['ORDER_PRIMARY_ID'] = order_id
    # This append was missing, which left `dfs` empty and made pd.concat raise.
    dfs.append(stat1)
df_date = pd.concat(dfs).reset_index(drop=True)
0 2021-06-30 00003FC18B254E86803C00F4BBA382E4
1 2021-07-01 00003FC18B254E86803C00F4BBA382E4
2 2021-07-02 00003FC18B254E86803C00F4BBA382E4
# Build the full day-by-day calendar (2020-06-01 .. 2021-08-30) for every hotel.
# The calendar is identical for all hotels, so it is computed once.
_calendar_days = pd.Series(pd.date_range(start='20200601', end='20210830', freq='D')).astype('str')
dfs1 = []
for hotel_id, _group in df.groupby('HOTELID'):
    stat = pd.DataFrame({'datetime': _calendar_days, 'HOTELID': hotel_id})
    # This append was missing, which left `dfs1` empty and made pd.concat raise.
    dfs1.append(stat)
df_date1 = pd.concat(dfs1).reset_index(drop=True)
0 2020-06-01 00003FC18B254E86803C00F4BBA382E4
1 2020-06-02 00003FC18B254E86803C00F4BBA382E4
2 2020-06-03 00003FC18B254E86803C00F4BBA382E4
3 2020-06-04 00003FC18B254E86803C00F4BBA382E4
4 2020-06-05 00003FC18B254E86803C00F4BBA382E4
5 2020-06-06 00003FC18B254E86803C00F4BBA382E4
6 2020-06-07 00003FC18B254E86803C00F4BBA382E4
7 2020-06-08 00003FC18B254E86803C00F4BBA382E4
8 2020-06-09 00003FC18B254E86803C00F4BBA382E4
9 2020-06-10 00003FC18B254E86803C00F4BBA382E4
# Assemble the per-hotel, per-day table.
# df11: every order repeated once per day of its stay.
df11 = df.merge(df_date, on=['ORDER_PRIMARY_ID'], how='left')
# _df1: the full hotel calendar with the orders attached where a day matches.
_df1 = df_date1.merge(df11, on=['HOTELID', 'datetime'], how='left') 
# df['guest_sum'] = 0
# df['guest_sum_notnull'] = 0
# ddf: calendar days on which the hotel has no order at all.
ddf = _df1[_df1['ORDER_ID'].isnull()]
df11['in_date'] = df11['in_time'].dt.date
df11['out_date'] = df11['out_time'].dt.date
df11['datetime'] = pd.to_datetime(df11['datetime'], errors='coerce', format='%Y-%m-%d')
df11['date'] = df11['datetime'].dt.date
# Keep a stay day unless it is the check-out day; a check-out day is still
# kept when check-in happened on that same day.
_df2 = df11[(df11['out_date']!=df11['date']) | ((df11['out_date']==df11['date']) & (df11['in_date']==df11['out_date']))]
_df2.drop(['in_time', 'out_time', 'in_date', 'out_date', 'datetime'], axis=1, inplace=True)
_df2.rename(columns={'date':'datetime'}, inplace=True)
# in_hotel_num: number of order rows per hotel and day.
in_num = _df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
_df2 = _df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
# Days without any order get an explicit count of 0.
ddf.drop(['in_time', 'out_time'], axis=1, inplace=True)
ddf['in_hotel_num'] = 0
df1 = pd.concat([_df2, ddf])
df1 = df1.sort_values(['HOTELID', 'datetime'])
# #增加特征:民宿单日的总订单量
# in_num = df2.groupby(['HOTELID', 'datetime'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'datetime', 'in_hotel_num']
# df2 = df2.merge(in_num, on=['HOTELID', 'datetime'], how='left')
def _add_calendar_features(frame):
    """Derive the ``in_*`` calendar columns from ``frame['datetime']`` in place.

    ``frame['datetime']`` must already be datetime64.  Dates that failed to
    parse (NaT) produce 0 in the integer columns.  Returns ``frame``.
    """
    stamp = frame['datetime'].dt
    integer_parts = (
        ('in_year', stamp.year),
        ('in_month', stamp.month),
        ('in_day', stamp.day),
        ('in_quarter', stamp.quarter),
        ('in_dayofweek', stamp.dayofweek),
        ('in_dayofyear', stamp.dayofyear),
        # Series.dt.weekofyear no longer exists in pandas 2.x;
        # isocalendar().week is its documented replacement.
        ('in_weekofyear', stamp.isocalendar().week),
    )
    for column, values in integer_parts:
        frame[column] = values.fillna(0).astype('int')
    # dayofweek is 0..6 (Mon..Sun), so integer division by 5 is 1 on Sat/Sun.
    frame['in_is_wknd'] = stamp.dayofweek // 5
    return frame


test['type'] = 'test'
df1['type'] = 'train'
df1['DATE'] = df1['datetime'].astype('str')

# Training rows: everything before September 2021.  `.copy()` so the column
# assignments below write to an independent frame, not a slice of df1.
df2 = df1[df1['DATE'] < '2021-09-01'].copy()
df2['datetime'] = pd.to_datetime(df2['datetime'], errors='coerce', format='%Y-%m-%d')
df2 = _add_calendar_features(df2)

# Only weekends of these months, plus the listed public holidays, are used.
use_month = [5, 6, 7, 8, 9]
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14',
           '2021-09-19', '2021-09-20', '2021-09-21']

# Positional boolean masks (numpy) so a non-unique index cannot misalign them.
_is_holiday = df2['DATE'].isin(holiday).to_numpy()
_is_used_weekend = ((df2['in_is_wknd'] == 1) & df2['in_month'].isin(use_month)).to_numpy()
_selected = _is_holiday | _is_used_weekend

# holiday = 1 on a listed holiday, 0 on an ordinary weekend.  Each selected row
# is emitted once: concatenating a "holiday" subset and a "weekend" subset
# duplicated every holiday that falls on a weekend with conflicting flags,
# and drop_duplicates could not remove those because the flag differed.
df3 = df2[_selected].copy()
df3['holiday'] = _is_holiday[_selected].astype('int')
df3 = df3.drop_duplicates(keep='first')

# Same calendar features and holiday flag for the test rows.
test['datetime'] = pd.to_datetime(test['DATE'], errors='coerce', format='%Y-%m-%d')
test = _add_calendar_features(test)
test['holiday'] = test['DATE'].isin(holiday).astype('int')
del test['datetime'], df3['datetime'], test['ROOM_EMPTY']
       'IN_TIME', 'OUT_TIME', 'guest_sum', 'guest_sum_notnull', 'in_hotel_num',
       'type', 'DATE', 'in_year', 'in_month', 'in_day', 'in_quarter',
       'in_dayofweek', 'in_dayofyear', 'in_weekofyear', 'in_is_wknd',
# The raw time strings are no longer needed once the calendar features exist.
df3.drop(['IN_TIME', 'OUT_TIME'], axis=1, inplace=True)
# Outer merge on every shared column.  'type' is one of the keys and differs
# between the two frames ('train' vs 'test'), so no rows pair up and the
# result is the train rows followed by the test rows.
df4 = df3.merge(test, on=['HOTELID', 'DATE', 'type', 'in_year', 'in_month', 'in_day', 'in_quarter', 'in_dayofweek', 
                          'in_dayofyear', 'in_weekofyear', 'in_is_wknd', 'holiday'], how='outer')
# Rows dated after 2021-09-01, kept for inspection.
dd = df4[df4['DATE']>'2021-09-01']
test    3311
# Registered-homestay table: one row per hotel.
room_info = pd.read_csv(path + '网约房注册民宿.csv')
# A JYMJ of 0 is treated as "not recorded".
room_info['JYMJ'] = room_info['JYMJ'].mask(room_info['JYMJ'] == 0)
room_use_col = ['HOTELID', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'FIRM', 'STATUS']

# Frequency encoding: <col>_freq = number of rows sharing the same value
# (NaN stays NaN).  Mapping the counts directly replaces the temporary frame
# that used to be called `st`, which clobbered the `scipy.stats as st` import.
freq_col = ['CALLED', 'ADDRESS', 'JYMJ', 'ROOM_NUM', 'BED_NUM', 'CZLY', 'BUR_CODE', 'STA_CODE', 'SSX', 'MPHM', 'FIRM']
for col in freq_col:
    room_info[col + '_freq'] = room_info[col].map(room_info[col].value_counts())

# Simple ratios between JYMJ, room count and bed count.
room_info['room_ratio'] = room_info['JYMJ'] / room_info['ROOM_NUM']
room_info['bed_ratio'] = room_info['JYMJ'] / room_info['BED_NUM']
room_info['room_bed'] = room_info['BED_NUM'] / room_info['ROOM_NUM']

# DJSJ / BGSJ timestamps; unparseable values become NaT.
room_info['DJSJ'] = pd.to_datetime(room_info['DJSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['BGSJ'] = pd.to_datetime(room_info['BGSJ'], format='%Y-%m-%d %H:%M:%S', errors='coerce')
room_info['DJ_date'] = room_info['DJSJ'].dt.date
room_info['BG_date'] = room_info['BGSJ'].dt.date
# Whole calendar days between the two dates.  Computed on the normalized
# datetime64 columns so the result is a timedelta64 Series that supports
# `.dt.days`; subtracting the object-dtype date columns does not guarantee that.
room_info['DJ_gap'] = (room_info['BGSJ'].dt.normalize() - room_info['DJSJ'].dt.normalize()).dt.days
room_info['DJSJ'].min(), room_info['DJSJ'].max(), room_info['BGSJ'].min(), room_info['BGSJ'].max()
(Timestamp('2020-05-12 13:16:24'),
 Timestamp('2021-10-19 16:05:41'),
 Timestamp('2020-07-15 10:23:06'),
 Timestamp('2021-10-19 16:09:15'))
# room['BG_dayofweek'].value_counts()
plt.figure(figsize=(20, 8))




# FIRM is removed from the hotel table before the join (the order side of df4
# already carries a FIRM column, see the df.info() listing above).
del room_info['FIRM']
# Attach the per-hotel attributes to every hotel/day row.
df = df4.merge(room_info, on=['HOTELID'], how='left')
# Row order matters for the lag / rolling features computed afterwards.
df = df.sort_values(by=['HOTELID', 'DATE'])
# in_num = df3.groupby(['HOTELID', 'date'])['ORDER_ID'].count().reset_index()
# in_num.columns = ['HOTELID', 'date', 'in_hotel_num']
# df4 = df3.merge(in_num, on=['HOTELID', 'date'], how='left')
# Drop every column with more than 180000 missing values.
missing = df.isnull().sum()
missing = missing[missing>180000]
missing_col = missing.index.tolist()
df.drop(columns=missing_col, inplace=True)
df['DATE'].min(), df['DATE'].max()
('2020-06-06', '2021-09-21')
df.drop(columns=['CALLED', 'ADDRESS'], inplace=True)
def qty_shift(df, val, lags=(1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 15, 20), prefix=''):
    """Add per-hotel lagged copies of ``df[val]`` as new columns (in place).

    For every ``k`` in ``lags`` the column ``<prefix>last_<k>_qty`` receives
    the value of ``val`` found ``k`` rows earlier inside the same ``HOTELID``
    group.  The current row order of ``df`` defines "earlier", so sort first.

    Gaps are forward-filled *within* each hotel only.  The first ``k`` rows of
    a hotel therefore stay NaN instead of inheriting the last values of
    whichever hotel happens to precede it in the frame, which is what the
    previous ungrouped forward-fill did.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``HOTELID`` and ``val``.
    val : str
        Name of the column to lag.
    lags : iterable of int, optional
        Row offsets to generate; defaults to the twelve offsets used so far.
    prefix : str, optional
        Prepended to every generated column name.  With the default ``''``
        the names do not depend on ``val``, so calling this function for
        several columns overwrites the earlier results; pass e.g.
        ``prefix=val + '_'`` to keep them apart.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    values = df[val]
    hotel = df['HOTELID']
    for lag in lags:
        lagged = values.groupby(hotel).shift(lag)
        df['{}last_{}_qty'.format(prefix, lag)] = lagged.groupby(hotel).ffill()
    return df
# Columns for which lag / rolling / EWM features are generated.
# NOTE(review): qty_shift is called with only (df, val) and writes to column
# names of the form 'last_<k>_qty' that do not contain `val`, so each pass of
# this loop overwrites the columns of the previous pass; after the loop only
# the lags of the last entry of `vals` remain.  Confirm whether that is intended.
vals = ['in_hotel_num', 'in_year', 'in_month', 'in_dayofyear', 'in_quarter']           
for val in vals:
    df = qty_shift(df, val)
# #昨天,上周,上个月, 去年
# df['yesterday_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(1).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_2_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(2).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_3_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(3).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_4_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(4).fillna(method='ffill').reset_index().sort_index().set_index('index')
# df['last_52_qty'] = df.groupby('HOTELID')['in_hotel_num'].shift(52).fillna(method='ffill').reset_index().sort_index().set_index('index')
def qty_rolling(df, window, val, keys):
    """Add trailing-window statistics of ``df[val]`` per ``keys`` group.

    Three columns are written (and returned on the same ``df``):
    ``qty_rolling<window>_mean`` (triangular-weighted mean),
    ``qty_rolling<window>_max`` and ``qty_rolling<window>_sum``.
    Every window is taken over the values shifted down by one row, so the
    current row never contributes to its own statistic, and at least three
    observations are required (``min_periods=3``).

    The column names depend on ``window`` only, not on ``val``.
    """
    stem = 'qty_rolling' + str(window)
    grouped = df.groupby(keys)[val]

    def _weighted_mean(series):
        return series.shift(1).rolling(window=window, min_periods=3, win_type="triang").mean()

    def _plain(stat_name):
        # Unweighted rolling statistic selected by method name.
        def _apply(series):
            roller = series.shift(1).rolling(window=window, min_periods=3)
            return getattr(roller, stat_name)()
        return _apply

    df[stem + '_mean'] = grouped.transform(_weighted_mean).to_numpy()
    df[stem + '_max'] = grouped.transform(_plain('max')).to_numpy()
    df[stem + '_sum'] = grouped.transform(_plain('sum')).to_numpy()
    return df

# Rolling statistics per hotel over windows of 3, 4 and 6 rows.
# (The earlier comment here said 7 and 14 days; those windows are not used.)
# NOTE(review): qty_rolling names its output columns after the window only,
# so every `val` writes to the same three columns per window and the last
# entry of `vals` wins.  Confirm whether that is intended.
keys = 'HOTELID'
for val in vals:
    df = qty_rolling(df, 3, val, keys)
    df = qty_rolling(df, 4, val, keys)
    df = qty_rolling(df, 6, val, keys)
# keys = ['HOTELID']
# df = qty_rolling(df, 2, 'in_hotel_num', keys)

def qty_ewm(df, alpha, val, keys):
    """Add exponentially weighted statistics of ``df[val]`` per ``keys`` group.

    Writes ``qty_ewm_mean``, ``qty_ewm_std`` and ``qty_ewm_corr`` on ``df`` and
    returns it.  Each statistic is computed on the values shifted down by one
    row, with smoothing factor ``alpha``.  ``corr`` is called without a second
    series.  The column names are fixed and do not depend on ``val``.
    """
    grouped = df.groupby(keys)[val]

    def _stat(method_name):
        # EWM statistic, selected by method name, over the lagged series.
        def _apply(series):
            weighted = series.shift(1).ewm(alpha=alpha)
            return getattr(weighted, method_name)()
        return _apply

    for suffix in ('mean', 'std', 'corr'):
        df['qty_ewm_' + suffix] = grouped.transform(_stat(suffix)).to_numpy()
    return df
# Exponentially weighted features with alpha = 0.95 for every column in `vals`.
# NOTE(review): qty_ewm always writes to 'qty_ewm_mean' / '_std' / '_corr',
# so each pass overwrites the previous one.  Confirm whether that is intended.
for val in vals:
    df = qty_ewm(df, 0.95, val, keys)
# Binary target: 1 when the hotel has no order on that day, else 0.
# Rows with a missing count (NaN == 0 is False) also become 0.
df['in_hotel_num'] = (df['in_hotel_num'] == 0).astype('int64')
df = df.fillna(0)

# Features: every numeric column EXCEPT the target.  Leaving 'in_hotel_num'
# in the feature list hands the label to the model (and breaks the later
# cells that drop the target before selecting `used_features`).
col_list = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
used_features = [col for col in col_list if col != 'in_hotel_num']
cate_cols = ['HOTELID', 'MPHM']

# Time-based split.  The validation window starts ON 2021-08-02; with strict
# '<' on one side and '>' on the other that day belonged to neither set.
_is_train = df["DATE"] < '2021-08-02'
_is_valid = (df["DATE"] >= '2021-08-02') & (df["DATE"] < '2021-09-01')

X_train = df.loc[_is_train, used_features].reset_index(drop=True)
y_train = df.loc[_is_train, "in_hotel_num"]
X_valid = df.loc[_is_valid, used_features].reset_index(drop=True)
y_valid = df.loc[_is_valid, "in_hotel_num"]
X_test = df.loc[df["type"] == 'test', used_features].reset_index(drop=True)

# Imported here because the file's own import of this name only appears in a
# later cell, after this first use.
from lightgbm import LGBMClassifier

# Gradient-boosted classifier evaluated on AUC.  The constructor call was
# never closed; the closing parenthesis is restored.
clf_1 = LGBMClassifier(
    num_leaves=256,
    n_estimators=20000,
    learning_rate=0.005,
    verbose=-1,
    max_bin=100,
    max_depth=10,
    feature_fraction_seed=66,
    feature_fraction=0.7,
    bagging_seed=66,
    bagging_freq=1,
    bagging_fraction=0.95,
    metric='auc',  # MultiAuc_score
    lambda_l1=0.1,
    lambda_l2=0.1,
    min_child_weight=30,
)

# NOTE(review): `early_stopping_rounds` / `verbose` as fit() arguments match
# the LightGBM 3.x log shown below; newer releases expect callbacks instead.
clf_1.fit(X_train, y_train,
          eval_set=[(X_valid, y_valid)],
          early_stopping_rounds=100, verbose=200)  # , categorical_feature = object_list

[LightGBM] [Warning] feature_fraction is set=0.7, colsample_bytree=1.0 will be ignored. Current value: feature_fraction=0.7
[LightGBM] [Warning] lambda_l1 is set=0.1, reg_alpha=0.0 will be ignored. Current value: lambda_l1=0.1
[LightGBM] [Warning] bagging_fraction is set=0.95, subsample=1.0 will be ignored. Current value: bagging_fraction=0.95
[LightGBM] [Warning] lambda_l2 is set=0.1, reg_lambda=0.0 will be ignored. Current value: lambda_l2=0.1
[LightGBM] [Warning] bagging_freq is set=1, subsample_freq=0 will be ignored. Current value: bagging_freq=1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[10]	valid_0's auc: 0.541449

oof_prob = clf_1.predict_proba(X_valid[used_features])[:, 1]  
oof_prob.min(), oof_prob.max()
(0.9610922808728843, 0.9625584615170373)
oof_prob1 = clf_1.predict_proba(X_test[used_features])[:, 1]  
oof_prob1.min(), oof_prob1.max()
(0.960959321386754, 0.9624418595928675)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import LGBMClassifier
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']

train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features] 
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Booster parameters for the AUC-evaluated run.  The dict literal was never
# closed; the closing brace is restored.
params = {
#     'objective': 'binary',
    'n_estimators': 20000,
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'auc'},
    'verbose': -1,
}

# NOTE(review): this cell was truncated on export (empty `if` body, unclosed
# calls) and called `LGBMClassifier.fit(params, trn_data, ...)` on the class
# itself with lgb.Dataset inputs.  It is rebuilt on `lgb.train`, mirroring the
# regression cell further down; confirm the restored arguments.
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Only the last of the five time-series splits is trained.
    if fold_n in [0, 1, 2, 3]:
        continue
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, valid_sets=[trn_data, val_data], verbose_eval=Verbose,
                    early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))
    print("ReTraining on all data")
    del trn_data, val_data
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
    # Retraining on all data stays disabled, as in the original cell:
    # clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
    # valid_sets=[trn_data], verbose_eval=Verbose)
    # pred = clf.predict(test[used_features])

Training with validation


df['HOTELID'] = df['HOTELID'].astype('category')
df['MPHM'] = df['MPHM'].astype('category')
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
import gc
cate_cols = []
drop_col = ['in_hotel_num', 'type']

train = df[df['type'] == 'train']
labels = np.array(train['in_hotel_num'].values.tolist())
train.drop(drop_col, axis=1, inplace=True)
train = train[used_features] 
test = df[df['type'] == 'test']
test_label = test['in_hotel_num'].values.tolist()
test.drop(drop_col, axis=1, inplace=True)
test = test[used_features]

# used_features = importance_fea
ts_folds = TimeSeriesSplit(n_splits = 5)
N_round = 20000
Verbose = 500
Early_Stopping_Rounds = 100
target = 'in_hotel_num'

# Booster parameters for the regression run (L1 and L2 are both reported).
# The dict literal was never closed; the closing brace is restored.
params = {
    'objective': 'regression',
    'boosting': 'gbdt',
    'learning_rate': 0.001,
    'num_leaves': 2 ** 5,
    'bagging_fraction': 0.95,
    'bagging_freq': 1,
    'bagging_seed': 66,
    'feature_fraction': 0.7,
    'feature_fraction_seed': 66,
    'max_bin': 100,
    'max_depth': 10,
    'metric': {'l2', 'l1'},
    'verbose': -1,
}

# NOTE(review): several lines of this cell were cut off at the right edge on
# export.  The missing pieces are restored from what the training log below
# shows (one fold trained, early stopping after 100 rounds, "training" and
# "valid_1" metrics) and from the complete Dataset call near the end of the
# loop; confirm against the original notebook.
for fold_n, (train_index, valid_index) in enumerate(ts_folds.split(train)):
    # Only the last of the five time-series splits is trained.
    if fold_n in [0, 1, 2, 3]:
        continue
    print('Training with validation')
    trn_data = lgb.Dataset(train.iloc[train_index], label=labels[train_index],
                           categorical_feature=cate_cols)
    val_data = lgb.Dataset(train.iloc[valid_index], label=labels[valid_index],
                           categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, num_boost_round=N_round, valid_sets=[trn_data, val_data],
                    verbose_eval=Verbose, early_stopping_rounds=Early_Stopping_Rounds)
    val = clf.predict(train.iloc[valid_index])
    mae_ = mean_absolute_error(labels[valid_index], val)
    print('MAE: {}'.format(mae_))
    print("ReTraining on all data")
    del trn_data, val_data
    Best_iteration = clf.best_iteration
    print("Best_iteration: ", Best_iteration)
    # Refit on every training row, for 20% more rounds than the best iteration.
    trn_data = lgb.Dataset(train, label=labels, categorical_feature=cate_cols)
    clf = lgb.train(params, trn_data, num_boost_round=int(Best_iteration * 1.2))
    # valid_sets=[trn_data], verbose_eval=Verbose)
    # pred = clf.predict(test[used_features])

Training with validation
Training until validation scores don't improve for 100 rounds
[500]	training's l2: 0.0357266	training's l1: 0.0733432	valid_1's l2: 0.0140039	valid_1's l1: 0.0424839
[1000]	training's l2: 0.034654	training's l1: 0.0715026	valid_1's l2: 0.0136241	valid_1's l1: 0.0362867
[1500]	training's l2: 0.0340906	training's l1: 0.0702526	valid_1's l2: 0.0134038	valid_1's l1: 0.0323129
[2000]	training's l2: 0.0337541	training's l1: 0.0694053	valid_1's l2: 0.0132898	valid_1's l1: 0.0298233
[2500]	training's l2: 0.0335207	training's l1: 0.0688021	valid_1's l2: 0.0131952	valid_1's l1: 0.028312
[3000]	training's l2: 0.0333523	training's l1: 0.068367	valid_1's l2: 0.0131123	valid_1's l1: 0.0273034
[3500]	training's l2: 0.033219	training's l1: 0.0680436	valid_1's l2: 0.0130372	valid_1's l1: 0.0264549
[4000]	training's l2: 0.033109	training's l1: 0.0678005	valid_1's l2: 0.0129586	valid_1's l1: 0.0259314
[4500]	training's l2: 0.0330175	training's l1: 0.0676386	valid_1's l2: 0.0129	valid_1's l1: 0.0256456
[5000]	training's l2: 0.0329394	training's l1: 0.0675283	valid_1's l2: 0.0128614	valid_1's l1: 0.0254616
[5500]	training's l2: 0.0328693	training's l1: 0.0674366	valid_1's l2: 0.0128206	valid_1's l1: 0.0253768
Early stopping, best iteration is:
[5586]	training's l2: 0.0328584	training's l1: 0.067422	valid_1's l2: 0.0128154	valid_1's l1: 0.0253677
MAE: 0.025367655553334097
ReTraining on all data
Best_iteration:  5586
pred1 = clf.predict(test)
pred1.min(), pred1.max()
(0.2106748391712024, 0.9860467852769609)
pre_d = df[df['type'] == 'test']
pre_d['ROOM_EMPTY'] = pred1.tolist()
pre_d1 = pre_d[['HOTELID', 'DATE', 'ROOM_EMPTY']]
pp = pre_d1[pre_d1['DATE']=='2021-09-21']
356552 10083 2021-09-21 0.726068
355831 10125 2021-09-21 0.825109
354872 10237 2021-09-21 0.841640
356181 10273 2021-09-21 0.841640
356825 104814 2021-09-21 0.726068
(2667, 3)
pre_d1['ROOM_EMPTY'] = pre_d1['ROOM_EMPTY'].apply(lambda x: 0 if x>0.807 else 1)
pre_d1.to_csv('pre_02_reg.csv', index=False)



(3311, 3)
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)

pred.min(), pred.max()
(0.03637936444857589, 1.8554153046371984)
# Attach the raw model output to the test rows and save it.
# `.copy()` makes the frames independent of `df`, so the column assignments
# (here and when the scores are thresholded later) act on real frames.
pre_df = df[df['type'] == 'test'].copy()
pre_df['ROOM_EMPTY'] = pred.tolist()
# The three-column frame must exist before it is written; the two statements
# below were in the opposite order.
pre_df1 = pre_df[['HOTELID', 'DATE', 'ROOM_EMPTY']].copy()
pre_df1.to_csv('origion.csv', index=False)
# 8月份500多
(2380, 3)
pre_df1['ROOM_EMPTY'] = pre_df1['ROOM_EMPTY'].apply(lambda x: 0 if x>=0.25 else 1)

# mean_absolute_error(labels[valid_index], val)  
import numpy as np
from sklearn import metrics
# y = np.array([1, 1, 2, 2])
# pred = np.array([0.1, 0.4, 0.35, 0.8])

fpr, tpr, thresholds = metrics.roc_curve(labels[valid_index], val, pos_label=2)
metrics.auc(fpr, tpr)
a = pd.DataFrame()
a['y'] = labels[valid_index].tolist()
a['val'] = val.tolist()
# _val = np.array(a['val'].values.tolist())
# _y = np.array(a['y'].values.tolist())
fpr, tpr, thresholds = metrics.roc_curve(a['y'], a['val'], pos_label=2)
metrics.auc(fpr, tpr)
array([0.98077955, 0.98028159, 0.98090901, ..., 0.99187334, 0.9917258 ,
a['y'] = a['y'].apply(lambda x: 1 if x==0 else 0)
a['val'] = a['val'].apply(lambda x: (1-x/(4.03212491)))
from tqdm import tqdm
from sklearn.metrics import *

def find_best_threshold(y_valid, oof_prob):
    """Scan cut-offs and return the one whose hard predictions score best.

    For each threshold ``th`` in 0.05, 0.0505, ..., 0.9995 the scores in
    ``oof_prob`` are binarized (``>= th`` -> 1, else 0) and compared with the
    true labels via the area under the ROC curve.

    Parameters
    ----------
    y_valid : array-like
        True labels; assumed to be 0/1 with 1 as the positive class.
    oof_prob : array-like
        Predicted scores for the same rows.

    Returns
    -------
    (best_th, best_auc)
        The winning threshold and its AUC; ``(0, 0)`` when no threshold
        produces an AUC above 0.

    Fixes relative to the previous version: ``roc_curve`` now receives the
    labels first and the predictions second, and uses ``pos_label=1``.  The
    old ``pos_label=2`` could never match the 0/1 array it was compared with.
    """
    y_true = np.asarray(y_valid)
    scores = np.asarray(oof_prob)
    best_th = 0
    best_auc = 0
    for th in tqdm([i/2000 for i in range(100, 2000)]):
        hard_pred = (scores >= th).astype(int)
        fpr, tpr, _ = metrics.roc_curve(y_true, hard_pred, pos_label=1)
        auc_at_th = metrics.auc(fpr, tpr)
        if auc_at_th > best_auc:
            best_th = th
            best_auc = auc_at_th
    return best_th, best_auc


val1 = val
y1 = labels[valid_index]
# Labels first, predictions second, matching the signature.  The call used to
# pass them the other way round, which thresholded the labels, not the scores.
best_th, aucs = find_best_threshold(y1, val1)
print("分界值", best_th)
# The printed label says F2, but the value is the AUC returned above.
print("F2评价分数", aucs)
# print("recall召回率", recall)
# print("precision精确度", precision)
分界值 0.05
F2评价分数 0.7065505364632024
# df4['in_year'] = df4['datetime'].dt.year.fillna(0).astype('int')
# df4['in_month'] = df4['datetime'].dt.month.fillna(0).astype('int')
# df4['in_day'] = df4['datetime'].dt.day.fillna(0).astype('int')
# df4['in_quarter'] = df4['datetime'].dt.quarter.fillna(0).astype('int')
# df4['in_dayofweek'] = df4['datetime'].dt.dayofweek.fillna(0).astype('int')
# df4['in_dayofyear'] = df4['datetime'].dt.dayofyear.fillna(0).astype('int')
# df4['in_is_wknd'] = df4['datetime'].dt.dayofweek // 5                 #是否周末
df4['date1'] = df4['date'].astype('str')
# 周末数据量15755
use_month = [5, 6, 7, 8, 9]
df_1 = df4[(df4['in_is_wknd']==1) & (df4['in_month'].isin(use_month))]
(11583, 59)
7    4391
5    2520
6    2192
8    1656
9     824
Name: in_month, dtype: int64
holiday = ['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04', '2021-05-05', '2021-06-12', '2021-06-13', '2021-06-14', 
           '2021-09-19', '2021-09-20', '2021-09-21']
drop_col = df5.columns.tolist()
df5.drop_duplicates(drop_col, keep='first', inplace=True)
hotel = test['HOTELID'].values.tolist()
df5 = df5[df5['HOTELID'].isin(hotel)]
test1['ROOM_EMPTY'] = test1['holiday1']
test2 = test1[['HOTELID', 'DATE', 'ROOM_EMPTY']]
pre = pd.read_csv('file41704172.csv')
pre.rename(columns={'ROOM_EMPTY':'room'}, inplace=True)
pr = test2.merge(pre, on=['HOTELID', 'DATE'], how='left')
pr['ROOM_EMPTY'] = pr['ROOM_EMPTY'] + pr['room']
del pr['room']
dd = df5.groupby(['in_year', 'in_month'])['ORDER_ID'].count().reset_index()
