# 【随机生成半小时时间缺口】机器学习 Python

# -*- coding: utf-8 -*-
"""
Created on Sun Nov 13 15:43:12 2022

@author: Lenovo
"""
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import random
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error


# Folder that the per-file loop further down lists and reads CSV files from.
path = r'C:\Users\Lenovo\Desktop\四大类\REALTRY'

# Working frames: every name starts as its own empty DataFrame.
ori, ori_lx, filled, test, test1 = (pd.DataFrame() for _ in range(5))

# Working lists: every name starts as its own (distinct) empty list.
(xticks, suijishu, suijishujianyi, sj,
 ori_index, ori_lx_index, ori_index_jianyi) = ([] for _ in range(7))

# Result accumulators, one entry appended per processed file:
# the two RMSE series and the number of test rows behind them.
rmse_list, rmse1_list, TEST_NUMBER = [], [], []
# For every CSV file in `path`: hide a random set of isolated half-hourly
# measured LE_F_MDS values, reconstruct them once with a random forest and
# once with linear interpolation, and record the RMSE of both approaches.

# Columns removed before fitting/predicting: the target, its QC flag, the
# two timestamps, NEE_VUT_REF and the stray 'Unnamed: 0' column.
non_driver_cols = ['LE_F_MDS', 'LE_F_MDS_QC', 'TIMESTAMP_START',
                   'TIMESTAMP_END', 'NEE_VUT_REF', 'Unnamed: 0']

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the result table reproducible. Non-CSV entries are skipped.
for i in sorted(os.listdir(path)):
    if not i.lower().endswith('.csv'):
        continue
    print(i)
    s = pd.read_csv(os.path.join(path, i))
    s['TIMESTAMP_START'] = pd.to_datetime(s['TIMESTAMP_START'])

    # Measured records (QC flag == 0) with a non-missing timestamp and value.
    # .copy() so the helper columns below are added to an independent frame.
    ori = s.loc[s['LE_F_MDS_QC'] == 0, ['TIMESTAMP_START', 'LE_F_MDS']].dropna().copy()
    ori['index'] = ori.index
    # Row distance (in s) to the previous measured record.
    ori['cha'] = ori['index'].diff()
    # Measured records whose directly preceding row in s was also measured.
    ori_lx = ori.loc[ori['cha'] == 1]

    ori_index = ori['index'].to_list()        # row labels of measured records
    ori_lx_index = ori_lx['index'].to_list()  # ... with a measured predecessor

    # Draw 30 % of the measured records. The draw is made from the row labels
    # of s (not from positions 0..len(ori)-1), because the result is used
    # below as row labels of s; sampling positions would confine every
    # artificial gap to the first len(ori) rows of the file.
    random.seed(0)
    suijishu = sorted(random.sample(ori_index, int(len(ori) * 0.3)))
    sampled = set(suijishu)
    # Keep only drawn rows whose next row was not drawn, so no two retained
    # gap rows are adjacent; then cap the count at 15 % of the measured records.
    # NOTE(review): the list is sorted, so the cap keeps the earliest rows -
    # confirm that this is intended.
    sj = [x for x in suijishu if x + 1 not in sampled]
    sj = sj[:int(len(ori) * 0.15)]

    # Evaluation rows: intersection of the retained gap rows with the measured
    # rows that have a measured predecessor.
    gap_rows = sorted(set(ori_lx_index) & set(sj))
    test_ori = s.loc[gap_rows]
    test = test_ori.dropna()
    # Training rows: every measured row that is not a retained gap row.
    train = s.loc[sorted(set(ori_index) - set(sj))].dropna()

    if train.empty or test.empty:
        # Nothing to fit or score; keep one row per file in the result table.
        print(i, 'has no usable training/test rows; RMSE recorded as NaN')
        rmse = np.nan
        rmse1 = np.nan
    else:
        # Random-forest reconstruction.
        rf = RandomForestRegressor(n_estimators=1100,
                                   max_depth=80,
                                   oob_score=True,
                                   random_state=0)
        LE = train['LE_F_MDS']
        Drivers = train.drop(non_driver_cols, axis=1)
        rf.fit(Drivers, LE)

        y_test = test['LE_F_MDS']
        x_test = test.drop(non_driver_cols, axis=1)
        rmse = np.sqrt(mean_squared_error(y_test, rf.predict(x_test)))

        # Linear-interpolation baseline: blank the gap rows in a copy of the
        # full series and let pandas interpolate across them.
        s0 = s.copy()
        s0.loc[gap_rows, 'LE_F_MDS'] = np.nan
        s0['LE_F_MDS'] = s0['LE_F_MDS'].interpolate()
        # Score on exactly the rows the random forest was scored on.
        rmse1 = np.sqrt(mean_squared_error(y_test, s0.loc[test.index, 'LE_F_MDS']))

    TEST_NUMBER.append(test.shape[0])
    rmse_list.append(rmse)
    rmse1_list.append(rmse1)

    # The table is rewritten after every file, so the CSV on disk always
    # holds the results obtained so far.
    DF = pd.DataFrame({'TEST_NUMBER': TEST_NUMBER, 'RF': rmse_list, 'Interpolate': rmse1_list})
    print(DF)
    DF.to_csv(os.path.join(r'D:\Fluxnet\OUTCOME', 'IN VS. RF' + '.csv'), index=False)
        

    
    
    
    
   

# 你可能感兴趣的:(python,开发语言)