天池大赛链接
我所用到的数据
1、income_gb_2 是我从天池原始数据 income_statement 中的 general business 表导出的,balance_gb_2 和 cash_gb_2 亦然。
2、 Macro为宏观数据,Market为市场数据
导入相关包,将工作目录改为数据所在目录
from pandas import DataFrame
from numpy import nan as NA
from pandas import Series
import os
import pandas as pd
import numpy as np
import random
import time
import threading as td
import multiprocessing as mp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA
# Switch the working directory to the folder that holds the data files;
# every relative CSV path below is resolved against it.
os.chdir('E://kaggle//tc2')
1、由于资产负债表、利润表和现金流量表内的各项数据之间存在等式关系,例如:资产=负债+股东权益,利润总额=营业收入-营业成本-各项费用等,因此根据这些内在逻辑剔除异常值。
2、由于内在等式关系, 对于缺失值则直接填充0,不改变原有等式关系
3、数据中有些列是前方列的和,为排除多重共线性,将这些列予以剔除
# Remove outlier rows and linearly dependent columns using accounting identities.
# Income statement
# Load the data
income_gb2 = pd.read_csv('income_gb_2.csv')
# Missing values become 0 so the additive identities checked below still hold
income_gb2 = income_gb2.fillna(0)

# Row-wise sums over positional column ranges (end-exclusive).
# NOTE(review): positions 9, 16 and 40 are presumably the reported totals and
# the ranges their components -- confirm against the CSV header.
income_part_a = income_gb2.iloc[:, 10:16].sum(axis=1)
income_part_b = income_gb2.iloc[:, 17:32].sum(axis=1)
income_part_c = (income_gb2.iloc[:, 32:34].sum(axis=1)
                 + income_gb2.iloc[:, 35:40].sum(axis=1))

income_gap_a = income_gb2.iloc[:, 9] - income_part_a
income_gap_b = income_gb2.iloc[:, 16] - income_part_b
income_gap_c = income_part_a - income_part_b + income_part_c - income_gb2.iloc[:, 40]

# A row is an outlier when any identity is off by more than 1000 in either
# direction. This replaces the per-row loop (and its progress printing).
income_is_outlier = ((income_gap_a.abs() > 1000)
                     | (income_gap_b.abs() > 1000)
                     | (income_gap_c.abs() > 1000))
# Row labels to discard
income_drop_index = income_gb2.index[income_is_outlier].tolist()

# Drop the outlier rows
income_gb2_drop = income_gb2.drop(income_drop_index, axis=0)
# Drop aggregate columns that are sums of other columns (multicollinearity).
# The columns are removed from the row-filtered frame; dropping them from
# income_gb2 again would silently undo the row filtering above.
income_gb2_drop = income_gb2_drop.drop(
    ['T_REVENUE', 'T_COGS', 'OPERATE_PROFIT', 'N_INCOME', 'T_COMPR_INCOME'], axis=1)
income_gb2_drop.to_csv('income_gb2_drop.csv', index=None)
# Balance sheet
# Same treatment as the income statement: fill gaps with 0, flag rows that
# break the sheet's additive identities, then drop the aggregate columns.
balance_gb2 = pd.read_csv('balance_gb_2.csv')
balance_gb2 = balance_gb2.fillna(0)
balance_gb2_drop = balance_gb2
balance_gb2_drop1 = balance_gb2.drop(['T_CA', 'T_NCA', 'T_ASSETS', 'T_CL', 'T_NCL', 'T_LIAB',
                                      'PREFERRED_STOCK_E', 'PREFERRED_STOCK_L', 'T_EQUITY_ATTR_P',
                                      'T_SH_EQUITY', 'T_LIAB_EQUITY'], axis=1)


def _balance_span_sum(frame, start, stop=None):
    """Return the row-wise sum of a contiguous run of columns.

    start: integer position or column label of the first column included.
    stop: column label of the first column excluded; None means "to the end".
    """
    names = list(frame.columns)
    first = start if isinstance(start, int) else names.index(start)
    last = len(names) if stop is None else names.index(stop)
    return frame.iloc[:, first:last].sum(axis=1)


def _balance_outliers(gap):
    """Return the row labels whose gap is larger than 10000 in absolute value."""
    return gap.index[gap.abs() > 10000].tolist()


# Columns from position 9 up to ST_BORR versus ST_BORR onwards, with the
# aggregate columns already removed.
# NOTE(review): this list is computed but not part of balance_drop_index_final,
# exactly as in the original -- confirm that is intended.
balance_drop_index_total = _balance_outliers(
    _balance_span_sum(balance_gb2_drop1, 9, 'ST_BORR')
    - _balance_span_sum(balance_gb2_drop1, 'ST_BORR'))

# T_ASSETS must equal T_LIAB_EQUITY
balance_drop_index_sum = _balance_outliers(
    balance_gb2_drop['T_ASSETS'] - balance_gb2_drop['T_LIAB_EQUITY'])

# Columns from position 9 up to T_CA must add up to T_CA
balance_drop_index_TCA = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 9, 'T_CA') - balance_gb2_drop['T_CA'])

# Columns DISBUR_LA .. T_NCA (exclusive) must add up to T_NCA
balance_drop_index_TNCA = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 'DISBUR_LA', 'T_NCA') - balance_gb2_drop['T_NCA'])

# Columns ST_BORR .. T_CL (exclusive) must add up to T_CL
balance_drop_index_T_CL = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 'ST_BORR', 'T_CL') - balance_gb2_drop['T_CL'])

# Columns LT_BORR .. T_NCL (exclusive), less PREFERRED_STOCK_L, must equal T_NCL
balance_drop_index_T_NCL = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 'LT_BORR', 'T_NCL')
    - balance_gb2_drop['T_NCL'] - balance_gb2_drop['PREFERRED_STOCK_L'])

# Columns PAID_IN_CAPITAL .. T_EQUITY_ATTR_P (exclusive), less PREFERRED_STOCK_E,
# must equal T_EQUITY_ATTR_P
balance_drop_index_T_EQUITY_ATTR_P = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 'PAID_IN_CAPITAL', 'T_EQUITY_ATTR_P')
    - balance_gb2_drop['T_EQUITY_ATTR_P'] - balance_gb2_drop['PREFERRED_STOCK_E'])

# Columns T_EQUITY_ATTR_P .. T_SH_EQUITY (exclusive) must add up to T_SH_EQUITY
balance_drop_index_T_SH_EQUITY = _balance_outliers(
    _balance_span_sum(balance_gb2_drop, 'T_EQUITY_ATTR_P', 'T_SH_EQUITY')
    - balance_gb2_drop['T_SH_EQUITY'])

# Union of all flagged rows (duplicates removed)
balance_drop_index_final = (balance_drop_index_sum + balance_drop_index_TCA
                            + balance_drop_index_TNCA + balance_drop_index_T_CL
                            + balance_drop_index_T_NCL + balance_drop_index_T_EQUITY_ATTR_P
                            + balance_drop_index_T_SH_EQUITY)
balance_drop_index_final = list(set(balance_drop_index_final))
balance_gb2_drop_final = balance_gb2.drop(balance_drop_index_final, axis=0)
balance_gb2_drop_final = balance_gb2_drop_final.drop(
    ['T_CA', 'T_NCA', 'T_ASSETS', 'T_CL', 'T_NCL', 'T_LIAB',
     'PREFERRED_STOCK_E', 'PREFERRED_STOCK_L', 'T_EQUITY_ATTR_P',
     'T_SH_EQUITY', 'T_LIAB_EQUITY'], axis=1)
balance_gb2_drop_final.to_csv('balance_gb2_drop.csv', index=None)
# Cash flow statement
# Same treatment as above: fill gaps with 0, flag rows that break the
# statement's additive identities, then drop the aggregate columns.
cash_gb2 = pd.read_csv('cash_gb_2.csv')
cash_gb2 = cash_gb2.fillna(0)


def _cash_span_sum(frame, start, stop):
    """Return the row-wise sum of the columns from label start up to, but excluding, label stop."""
    names = list(frame.columns)
    return frame.iloc[:, names.index(start):names.index(stop)].sum(axis=1)


def _cash_outliers(gap):
    """Return the row labels whose gap is larger than 10000 in absolute value."""
    return gap.index[gap.abs() > 10000].tolist()


# Operating activities: inflow items - outflow items + ANOCF must equal N_CF_OPERATE_A
cash_drop_index_OPERATE_A = _cash_outliers(
    _cash_span_sum(cash_gb2, 'C_FR_SALE_G_S', 'C_INF_FR_OPERATE_A')
    - _cash_span_sum(cash_gb2, 'C_PAID_G_S', 'C_OUTF_OPERATE_A')
    + cash_gb2['ANOCF']
    - cash_gb2['N_CF_OPERATE_A'])

# Investing activities: inflow items - outflow items + ANICF must equal N_CF_FR_INVEST_A
cash_drop_index_INVEST_A = _cash_outliers(
    _cash_span_sum(cash_gb2, 'PROC_SELL_INVEST', 'C_INF_FR_INVEST_A')
    - _cash_span_sum(cash_gb2, 'PUR_FIX_ASSETS_OTH', 'C_OUTF_FR_INVEST_A')
    + cash_gb2['ANICF']
    - cash_gb2['N_CF_FR_INVEST_A'])

# Financing activities: as above, additionally removing C_FR_MINO_S_SUBS and
# adding back DIV_PROF_SUBS_MINO_S before comparing with N_CF_FR_FINAN_A
cash_drop_index_FINAN_A = _cash_outliers(
    _cash_span_sum(cash_gb2, 'C_FR_CAP_CONTR', 'C_INF_FR_FINAN_A')
    - _cash_span_sum(cash_gb2, 'C_PAID_FOR_DEBTS', 'C_OUTF_FR_FINAN_A')
    + cash_gb2['ANFCF']
    - cash_gb2['N_CF_FR_FINAN_A']
    - cash_gb2['C_FR_MINO_S_SUBS']
    + cash_gb2['DIV_PROF_SUBS_MINO_S'])

# Columns N_CHANGE_IN_CASH .. N_CE_END_BAL (exclusive) must add up to N_CE_END_BAL
cash_drop_index_BAL = _cash_outliers(
    _cash_span_sum(cash_gb2, 'N_CHANGE_IN_CASH', 'N_CE_END_BAL')
    - cash_gb2['N_CE_END_BAL'])

# Union of all flagged rows (duplicates removed)
cash_drop_index_final = (cash_drop_index_OPERATE_A + cash_drop_index_INVEST_A
                         + cash_drop_index_FINAN_A + cash_drop_index_BAL)
cash_drop_index_final = list(set(cash_drop_index_final))
cash_gb2_drop_final = cash_gb2.drop(cash_drop_index_final, axis=0)
cash_gb2_drop_final = cash_gb2_drop_final.drop(
    ['C_INF_FR_OPERATE_A', 'C_OUTF_OPERATE_A', 'N_CF_OPERATE_A',
     'C_INF_FR_INVEST_A', 'C_OUTF_FR_INVEST_A', 'N_CF_FR_INVEST_A',
     'C_INF_FR_FINAN_A', 'C_OUTF_FR_FINAN_A', 'N_CF_FR_FINAN_A',
     'N_CHANGE_IN_CASH', 'N_CE_END_BAL'], axis=1)
cash_gb2_drop_final.to_csv('cash_gb2_drop.csv', index=False)
1、整理数据为后面形成训练集做准备。如同一个数据有多个值选取最近更新的数据
def _select_latest_reports(statement, snapshot_csv):
    """Keep, for every (TICKER_SYMBOL, END_DATE), only the most recent filing.

    Rows are ordered by ticker, END_DATE, PUBLISH_DATE and END_DATE_REP; within
    each (ticker, END_DATE) group the rows that share the last row's
    PUBLISH_DATE and END_DATE_REP are kept (so exact duplicates of the latest
    filing all survive) and get PARTY_ID = -1, as in the original loop.

    statement: frame read from one of the *_gb2_drop.csv files; it is not modified.
    snapshot_csv: path of the intermediate CSV in which discarded rows are
        written as all-missing rows.

    Returns (snapshot, final): the blanked intermediate frame and the cleaned
    frame indexed by (TICKER_SYMBOL, END_DATE).
    """
    sort_keys = ['TICKER_SYMBOL', 'END_DATE_mktime', 'PUBLISH_DATE_mktime', 'END_DATE_REP_mktime']
    stamped = statement.copy()
    # Turn the three date strings into integer timestamps so they sort chronologically
    for source, target in (('PUBLISH_DATE', 'PUBLISH_DATE_mktime'),
                           ('END_DATE_REP', 'END_DATE_REP_mktime'),
                           ('END_DATE', 'END_DATE_mktime')):
        stamped[target] = stamped[source].map(
            lambda text: int(time.mktime(time.strptime(text, '%Y-%m-%d'))))
    stamped = stamped.sort_values(sort_keys).reset_index(drop=True)
    # Key columns first, matching the column order the original produced
    other_columns = [name for name in stamped.columns if name not in sort_keys]
    stamped = stamped[sort_keys + other_columns]

    # Values of the last (i.e. newest) row of each (ticker, END_DATE) group
    per_period = stamped.groupby(['TICKER_SYMBOL', 'END_DATE_mktime'], sort=False)
    newest_publish = per_period['PUBLISH_DATE_mktime'].transform('last')
    newest_restate = per_period['END_DATE_REP_mktime'].transform('last')
    is_latest = ((stamped['PUBLISH_DATE_mktime'] == newest_publish)
                 & (stamped['END_DATE_REP_mktime'] == newest_restate))

    latest = stamped.loc[is_latest].copy()
    latest['PARTY_ID'] = -1
    # Re-expanding to the full row range leaves discarded rows entirely missing
    snapshot = latest.reindex(stamped.index)
    snapshot.to_csv(snapshot_csv)

    final = snapshot.dropna(how='all')
    final = final.set_index(['TICKER_SYMBOL', 'END_DATE'])
    # Remove bookkeeping columns that are not model features
    final = final.drop(['PARTY_ID', 'EXCHANGE_CD', 'REPORT_TYPE', 'FISCAL_PERIOD', 'MERGED_FLAG',
                        'PUBLISH_DATE', 'END_DATE_REP', 'END_DATE_mktime',
                        'PUBLISH_DATE_mktime', 'END_DATE_REP_mktime'], axis=1)
    # sort_index(level=0) replaces the removed DataFrame.sortlevel(0)
    final = final.sort_index(level=0)
    return snapshot, final


# Cash flow statement
cash_gb0 = pd.read_csv('cash_gb2_drop.csv')
cash_gb1, cash_gb2 = _select_latest_reports(cash_gb0, 'cash_gb1.csv')
cash_gb2.to_csv('cash_data.csv')

# Balance sheet
balance_gb0 = pd.read_csv('balance_gb2_drop.csv')
balance_gb1, balance_gb2 = _select_latest_reports(balance_gb0, 'balance_gb1.csv')
balance_gb2.to_csv('balance_data.csv')

# Income statement
income_gb0 = pd.read_csv('income_gb2_drop.csv')
income_gb1, income_gb2 = _select_latest_reports(income_gb0, 'income_gb1.csv')
income_gb2.to_csv('income_data.csv')
1、字符串日期列通过函数分开为年列和月列。
2、将资产负债表,利润表和现金流量表根据证券代码和日期索引进行内联结合并。
# Combine the balance sheet, income statement and cash flow statement.
# Each file is read back with (TICKER_SYMBOL, END_DATE) as its index.
balance_gb3 = pd.read_csv('balance_data.csv', index_col=['TICKER_SYMBOL', 'END_DATE'])
cash_gb3 = pd.read_csv('cash_data.csv', index_col=['TICKER_SYMBOL', 'END_DATE'])
income_gb3 = pd.read_csv('income_data.csv', index_col=['TICKER_SYMBOL', 'END_DATE'])
# Inner joins on the index: only keys present in all three statements survive
merge1 = income_gb3.merge(balance_gb3, how='inner', left_index=True, right_index=True)
merge2 = merge1.merge(cash_gb3, how='inner', left_index=True, right_index=True)
# Financial statement data, second version: index levels back to ordinary columns
merge3 = merge2.reset_index()
def f1(x):
    """Return the first four characters of the date string x as an int (the year)."""
    year_text = x[0:4]
    return int(year_text)
def f2(x):
    """Return the month of the date string x as an int.

    Accepts 'YYYY-MM-DD', 'YYYY/M/D' and the mixed-width variants in between,
    with either '-' or '/' as separator. The previous length-based slicing
    raised ValueError on non-padded dash dates such as '2018-1-15'.

    A string without any separator is read as compact 'YYYYMMDD'.
    Raises ValueError when the month field is not numeric.
    """
    fields = x.replace('/', '-').split('-')
    if len(fields) >= 2:
        return int(fields[1])
    # No separator at all: take the two characters after the year
    return int(x[4:6])
# Split the END_DATE string into a YEAR column and a MONTH column
merge3['YEAR'] = merge3['END_DATE'].map(f1)
merge3['MONTH'] = merge3['END_DATE'].map(f2)
merge3.drop(['END_DATE'], axis=1, inplace=True)
merge3 = merge3.set_index(['TICKER_SYMBOL', 'YEAR', 'MONTH'])
# sort_index(level=0) replaces DataFrame.sortlevel(0), which no longer exists in pandas
merge3 = merge3.sort_index(level=0)
# Save the data
merge3.to_csv('merge_data(7.26).csv')
市场数据进行预处理
# Market data, second version
market=pd.read_csv('Market.csv')
# Count missing values per column (the result is only visible when run interactively)
market.isnull().sum()
#利用元素级函数将日期分为年月
def f1(x):
    """Return the first four characters of the date string x as an int (the year)."""
    year_text = x[0:4]
    return int(year_text)
def f2(x):
    """Return the month of the date string x as an int.

    Accepts 'YYYY-MM-DD', 'YYYY/M/D' and the mixed-width variants in between,
    with either '-' or '/' as separator. The previous length-based slicing
    raised ValueError on non-padded dash dates such as '2018-1-15'.

    A string without any separator is read as compact 'YYYYMMDD'.
    Raises ValueError when the month field is not numeric.
    """
    fields = x.replace('/', '-').split('-')
    if len(fields) >= 2:
        return int(fields[1])
    # No separator at all: take the two characters after the year
    return int(x[4:6])
# Split END_DATE_ into YEAR and MONTH, then index the market data by them
market['YEAR'] = market['END_DATE_'].map(f1)
market['MONTH'] = market['END_DATE_'].map(f2)
market.drop(['SECURITY_ID', 'TYPE_ID', 'TYPE_NAME_CN', 'END_DATE_'], axis=1, inplace=True)
market = market.set_index(['TICKER_SYMBOL', 'YEAR', 'MONTH'])
# sort_index(level=0) replaces DataFrame.sortlevel(0), which no longer exists in pandas
market = market.sort_index(level=0)
market.to_csv('market_final.csv')
对宏观数据进行预处理
# Macro data
macro = pd.read_csv('Macro.csv')
# Count missing values per column (only visible interactively), then drop
# every row that has any missing value
macro.isnull().sum()
macro.dropna(how='any', inplace=True)
# Index by reporting frequency so each frequency can be selected as a group
macro = macro.set_index('FREQUENCY_CD')
# sort_index replaces DataFrame.sortlevel(0), which no longer exists in pandas
macro = macro.sort_index()
def f1(x):
    """Return the first four characters of the date string x as an int (the year)."""
    year_text = x[0:4]
    return int(year_text)
def f2(x):
    """Return the month of the date string x as an int.

    Accepts 'YYYY-MM-DD', 'YYYY/M/D' and the mixed-width variants in between,
    with either '-' or '/' as separator. The previous length-based slicing
    raised ValueError on non-padded dash dates such as '2018-1-15'.

    A string without any separator is read as compact 'YYYYMMDD'.
    Raises ValueError when the month field is not numeric.
    """
    fields = x.replace('/', '-').split('-')
    if len(fields) >= 2:
        return int(fields[1])
    # No separator at all: take the two characters after the year
    return int(x[4:6])
# Years every indicator is expected to cover
year_test = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
# Annual indicators (FREQUENCY_CD == 'A').
# .loc with a list key always yields a DataFrame; .copy() makes the column
# assignment below act on an independent frame rather than a slice of macro.
macro_A = macro.loc[['A']].copy()
macro_A['YEAR'] = macro_A['PERIOD_DATE'].map(f1)
macro_A.drop(['name_cn', 'PERIOD_DATE'], axis=1, inplace=True)
macro_A = macro_A.set_index(['indic_id', 'YEAR'])
# sort_index(level=0) replaces DataFrame.sortlevel(0), which no longer exists in pandas
macro_A = macro_A.sort_index(level=0)
macro_A.to_csv('macro_A_final.csv')
# Convert monthly indicators (FREQUENCY_CD == 'M') to quarterly values
macro_M = macro.loc[['M']].copy()
# Split the date string into a YEAR column and a MONTH column
macro_M['YEAR'] = macro_M['PERIOD_DATE'].map(f1)
macro_M['MONTH'] = macro_M['PERIOD_DATE'].map(f2)
macro_M.drop(['PERIOD_DATE'], axis=1, inplace=True)
macro_M = macro_M.reset_index(drop=True)
# Key columns first; the remaining columns keep their file order.
# NOTE(review): the padding rows below assume the remaining columns are
# exactly (name_cn, value column), as the original positional code did.
macro_M = macro_M[['indic_id', 'YEAR', 'MONTH']
                  + [name for name in macro_M.columns if name not in ('indic_id', 'YEAR', 'MONTH')]]

# Indicators that lack data for at least one year of year_test
required_years = set(year_test)
macro_M_na_year = [indicator
                   for indicator, years in macro_M.groupby('indic_id')['YEAR']
                   if not required_years.issubset(set(years))]
# NOTE(review): the original probed only (indicator, year, month) keys read
# from rows that already exist, so this list appears to always stay empty.
# The padding and mean-fill below are kept for when detection is implemented.
macro_M_na_month = []

# An indicator missing a whole year is dropped entirely
macro_M = macro_M[~macro_M['indic_id'].isin(macro_M_na_year)].copy()
# An indicator missing single months gets placeholder rows, filled further
# down with the mean of that indicator's year
if macro_M_na_month:
    padding = DataFrame([[entry[0], entry[1], entry[2], 0, NA] for entry in macro_M_na_month],
                        columns=list(macro_M.columns))
    macro_M = pd.concat([macro_M, padding], ignore_index=True)
# Discard everything before 2006.
# Rows are filtered out rather than blanked and dropped, so YEAR/MONTH keep
# their integer dtype in the output.
macro_M = macro_M[macro_M['YEAR'] >= 2006].copy()

value_columns = [name for name in macro_M.columns
                 if name not in ('indic_id', 'YEAR', 'MONTH', 'name_cn')]
yearly_mean = macro_M.groupby(['indic_id', 'YEAR'])[value_columns].transform('mean')
macro_M[value_columns] = macro_M[value_columns].fillna(yearly_mean)

# Map each month to its quarter-end month (3, 6, 9, 12), reusing the name_cn
# column as the quarter key. Only years in year_test are relabelled.
in_scope = macro_M['YEAR'].isin(year_test).values
month_values = macro_M['MONTH'].values
quarter_end = np.select([month_values <= 3, month_values <= 6, month_values <= 9],
                        [3, 6, 9], default=12)
macro_M.loc[in_scope, 'name_cn'] = quarter_end[in_scope]

macro_M = macro_M.set_index(['indic_id', 'YEAR', 'name_cn'])
# Quarterly value per indicator and year; groupby(level=...).sum() replaces
# the removed DataFrame.sum(level=...)
macro_M = macro_M.groupby(level=[0, 1, 2]).sum()
macro_M.drop(['MONTH'], axis=1, inplace=True)
macro_M.to_csv('macro_M_final.csv')
# Convert weekly indicators (FREQUENCY_CD == 'W') to quarterly values;
# same approach as for the monthly data
macro_W = macro.loc[['W']].copy()
macro_W['YEAR'] = macro_W['PERIOD_DATE'].map(f1)
macro_W['MONTH'] = macro_W['PERIOD_DATE'].map(f2)
macro_W.drop(['PERIOD_DATE'], axis=1, inplace=True)
# name_cn is reused below as the quarter key
macro_W['name_cn'] = 0
# Aggregate the weekly observations to one row per indicator, year and month;
# groupby(...).sum() replaces the removed DataFrame.sum(level=...)
macro_W = macro_W.groupby(['indic_id', 'YEAR', 'MONTH']).sum().reset_index()
# Discard the years 2002-2005. Filtering does not fail when one of those
# years is absent, unlike dropping them by label.
macro_W = macro_W[~macro_W['YEAR'].isin(list(range(2002, 2006, 1)))].copy()

# Report (indicator, year) pairs without data, as the original loop did
present_pairs = set(zip(macro_W['indic_id'], macro_W['YEAR']))
for i in sorted(set(macro_W['indic_id'])):
    for j in year_test:
        if (i, j) not in present_pairs:
            print(i, j)

# Map each month to its quarter-end month (3, 6, 9, 12); only years in
# year_test are relabelled, other rows keep name_cn == 0
in_scope = macro_W['YEAR'].isin(year_test).values
month_values = macro_W['MONTH'].values
quarter_end = np.select([month_values <= 3, month_values <= 6, month_values <= 9],
                        [3, 6, 9], default=12)
macro_W.loc[in_scope, 'name_cn'] = quarter_end[in_scope]

macro_W = macro_W.set_index(['indic_id', 'YEAR', 'name_cn'])
macro_W = macro_W.groupby(level=[0, 1, 2]).sum()
macro_W.drop(['MONTH'], axis=1, inplace=True)
macro_W.to_csv('macro_W_final.csv')
# Convert daily indicators (FREQUENCY_CD == 'D') to quarterly values;
# same approach as for the monthly data
macro_D = macro.loc[['D']].copy()
macro_D['YEAR'] = macro_D['PERIOD_DATE'].map(f1)
macro_D['MONTH'] = macro_D['PERIOD_DATE'].map(f2)
macro_D.drop(['PERIOD_DATE'], axis=1, inplace=True)
# Discard the years 1995-2005. Filtering does not fail when one of those
# years is absent, unlike dropping them by label.
macro_D = macro_D[~macro_D['YEAR'].isin(list(range(1995, 2006, 1)))].reset_index(drop=True)

# Indicators that lack data for at least one year of year_test
required_years = set(year_test)
macro_D_na_year = [indicator
                   for indicator, years in macro_D.groupby('indic_id')['YEAR']
                   if not required_years.issubset(set(years))]
# NOTE(review): the original computed the per-month row count (with an unused
# ">= 20" comparison) inside a try block and appended only on an exception, so
# this list appears to always stay empty -- confirm the intended check.
macro_D_na_month = []

# An indicator missing a whole year is dropped entirely
macro_D = macro_D[~macro_D['indic_id'].isin(macro_D_na_year)].copy()

# Map each month to its quarter-end month (3, 6, 9, 12), reusing the name_cn
# column as the quarter key. Only years in year_test are relabelled.
in_scope = macro_D['YEAR'].isin(year_test).values
month_values = macro_D['MONTH'].values
quarter_end = np.select([month_values <= 3, month_values <= 6, month_values <= 9],
                        [3, 6, 9], default=12)
macro_D.loc[in_scope, 'name_cn'] = quarter_end[in_scope]

macro_D = macro_D.set_index(['indic_id', 'YEAR', 'name_cn'])
# groupby(level=...).sum() replaces the removed DataFrame.sum(level=...)
macro_D = macro_D.groupby(level=[0, 1, 2]).sum()
macro_D.drop(['MONTH'], axis=1, inplace=True)
macro_D.to_csv('macro_D_final.csv')
构建训练集并提取列名
# Containers and counters used while the training set is assembled
train2 = DataFrame()
sum_count_lost = 0
list_count_lost = []
sum_count_lost1 = 0
list_count_lost1 = []
sum_na = 0
list_na = []
year_test = [2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
ticker_list = list(merge3.index.levels[0])

# Column names, generated in the same order in which the features are
# concatenated when the rows are built
train_columns = []

# Financial statements: the first merge3 column of the current half-year
# report, then every merge3 column for each "<years back>_<month>_" prefix
statement_names = list(merge3.columns)
merge_columns = ['0_6_' + merge3.columns[0]]
for prefix in ['0_3_', '1_12_', '1_9_', '1_6_', '1_3_', '2_12_', '2_9_', '2_6_', '2_3_',
               '3_12_', '3_9_', '3_6_', '3_3_']:
    merge_columns.extend(prefix + name for name in statement_names)

# Market data: months 5..1 for prefix '0_', months 12..1 for the other prefixes
market_names = list(market.columns)
market_columns = []
for prefix in ['0_', '1_', '2_', '3_']:
    months = range(5, 0, -1) if prefix == '0_' else range(12, 0, -1)
    for month in months:
        market_columns.extend(str(prefix) + str(month) + '_' + name for name in market_names)

# Annual macro indicators: one column per indicator and prefix
macro_A_columns = [prefix + str(indicator)
                   for indicator in list(macro_A.index.levels[0])
                   for prefix in ['1_', '2_', '3_']]

# Monthly macro indicators (as quarters): quarter 3 only for prefix '0_',
# every quarter in descending order for the other prefixes
quarters_M = sorted(list(macro_M.index.levels[2]), reverse=True)
macro_M_columns = []
for indicator in list(macro_M.index.levels[0]):
    for prefix in ['0_', '1_', '2_', '3_']:
        quarters = [3] if prefix == '0_' else quarters_M
        macro_M_columns.extend(prefix + str(quarter) + '_' + str(indicator) for quarter in quarters)

# Weekly macro indicator (as quarters); the indicator id is fixed
quarters_W = sorted(list(macro_W.index.levels[2]), reverse=True)
macro_W_columns = [prefix + str(quarter) + '_' + '2160000101'
                   for prefix in ['1_', '2_', '3_']
                   for quarter in quarters_W]

# Daily macro indicators (as quarters)
quarters_D = sorted(list(macro_D.index.levels[2]), reverse=True)
macro_D_columns = [prefix + str(quarter) + '_' + str(indicator)
                   for indicator in list(macro_D.index.levels[0])
                   for prefix in ['1_', '2_', '3_']
                   for quarter in quarters_D]

train_columns = (train_columns + merge_columns + market_columns + macro_A_columns
                 + macro_M_columns + macro_W_columns + macro_D_columns)
# The test set has no '0_6_' column, so its names start one position later
test_columns = train_columns[1:]
# Build the training set: one row per (ticker, year), holding the first
# merge3 column of that year's half-year report followed by the features.
# .loc/.iloc replace the removed .ix indexer throughout.
for ticker_pos, i in enumerate(ticker_list):
    # Years with at least one report for this ticker, newest first
    year_list = sorted(set(merge3.loc[i].index.get_level_values(0)), reverse=True)
    year_len = len(year_list)
    for j in year_list:
        # Only years that still have three earlier years available
        if year_len > 3:
            year_len = year_len - 1
            if j < 2018:
                train = DataFrame()
                try:  # reports or market rows may be missing (e.g. removed as outliers)
                    # Half-year value and Q1 report of year j, then all four
                    # quarterly reports of each of the three preceding years
                    for l in [j, j - 1, j - 2, j - 3]:
                        if l == j:
                            for k in [6, 3]:
                                if k == 6:
                                    train = pd.concat([train, Series(merge3.loc[(i, l, k), :].iloc[0, 0], index=[i])], axis=1, ignore_index=True)
                                else:
                                    train = pd.concat([train, DataFrame(merge3.loc[(i, l, k), :].iloc[0]).T.unstack().unstack()], axis=1, ignore_index=True)
                        else:
                            for k in [12, 9, 6, 3]:
                                train = pd.concat([train, DataFrame(merge3.loc[(i, l, k), :].iloc[0]).T.unstack().unstack()], axis=1, ignore_index=True)
                    # Market data: first five months of year j, all twelve
                    # months of each preceding year
                    for l in [j, j - 1, j - 2, j - 3]:
                        if l == j:
                            for o in list(range(5, 0, -1)):
                                train = pd.concat([train, DataFrame(market.loc[(int(i), l, o), :]).T.unstack().unstack()], axis=1, ignore_index=True)
                        else:
                            for o in list(range(12, 0, -1)):
                                train = pd.concat([train, DataFrame(market.loc[(int(i), l, o), :]).T.unstack().unstack()], axis=1, ignore_index=True)
                except Exception as e:
                    # Incomplete history: record the sample and skip it
                    print(e)
                    sum_na += 1
                    list_na.append([i, l, k])
                    print(i, l, k, sum_na)
                    continue
                # Annual macro data of the three preceding years
                for m in list(macro_A.index.levels[0]):
                    for l in [j - 1, j - 2, j - 3]:
                        train = pd.concat([train, Series(macro_A.loc[(m, l), :].iloc[0], index=[i])], axis=1, ignore_index=True)
                # Quarterly values derived from monthly macro data
                for m in list(macro_M.index.levels[0]):
                    for l in [j, j - 1, j - 2, j - 3]:
                        if l == j:
                            for k in [3]:
                                train = pd.concat([train, Series(macro_M.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
                        else:
                            for k in sorted(list(macro_M.index.levels[2]), reverse=True):
                                train = pd.concat([train, Series(macro_M.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
                # Quarterly values derived from weekly macro data
                for l in [j - 1, j - 2, j - 3]:
                    for k in sorted(list(macro_W.index.levels[2]), reverse=True):
                        train = pd.concat([train, Series(macro_W.loc[(2160000101, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
                # Quarterly values derived from daily macro data
                for m in list(macro_D.index.levels[0]):
                    for l in [j - 1, j - 2, j - 3]:
                        for k in sorted(list(macro_D.index.levels[2]), reverse=True):
                            train = pd.concat([train, Series(macro_D.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
                train2 = pd.concat([train2, train], axis=0, ignore_index=True)
    # Progress, relative to the actual number of tickers
    complieted = ((ticker_pos + 1) / len(ticker_list)) * 100
    print('已完成:', complieted, '%')
train2.to_csv('train_set.csv', header=False, index=False)
形成测试集
# Build the test set
submit_nes = pd.read_csv('submit_nes.csv')
submit_bank = pd.read_csv('submit_bank.csv')
submit_sec = pd.read_csv('submit_sec.csv')
submit_ins = pd.read_csv('submit_ins.csv')
submit_bank = list(submit_bank['TICKER_SYMBOL2'])
submit_sec = list(submit_sec['TICKER_SYMBOL2'])
submit_ins = list(submit_ins['TICKER_SYMBOL2'])
# Numeric ticker = text before the first '.', e.g. '000001.XSHE' -> 1.
# This no longer depends on the first 872 rows carrying one exchange suffix
# and the rest the other, as the strip()-based version did.
submit_nes_change = [int(row[0].split('.')[0]) for row in submit_nes.values.tolist()]
# Tickers listed in the bank, securities or insurance files are left out
excluded_ids = set(submit_bank) | set(submit_sec) | set(submit_ins)
submit_gb_id = []
no_count = 0
for i in submit_nes_change:
    if i in excluded_ids:
        no_count += 1
        continue
    else:
        submit_gb_id.append(i)
list_test_na = []
sum_test_na = 0
test = DataFrame()
# .loc/.iloc replace the removed .ix indexer throughout
for ticker_pos, i in enumerate(submit_gb_id):
    j = 2018
    train = DataFrame()
    try:  # reports or market rows may be missing (e.g. removed as outliers)
        # Q1 report of year j, then all four quarterly reports of each of the
        # three preceding years
        for l in [j, j - 1, j - 2, j - 3]:
            for k in ([3] if l == j else [12, 9, 6, 3]):
                train = pd.concat([train, DataFrame(merge3.loc[(i, l, k), :].iloc[0]).T.unstack().unstack()], axis=1, ignore_index=True)
        # Market data: first five months of year j, all twelve months of each
        # preceding year
        for l in [j, j - 1, j - 2, j - 3]:
            if l == j:
                for o in list(range(5, 0, -1)):
                    train = pd.concat([train, DataFrame(market.loc[(int(i), l, o), :]).T.unstack().unstack()], axis=1, ignore_index=True)
            else:
                for o in list(range(12, 0, -1)):
                    train = pd.concat([train, DataFrame(market.loc[(int(i), l, o), :]).T.unstack().unstack()], axis=1, ignore_index=True)
    except Exception as e:
        # Incomplete history: record the sample and skip it
        print(e)
        sum_test_na += 1
        list_test_na.append([i, l, k])
        print(i, l, k, sum_test_na)
        continue
    # Annual macro data of the three preceding years
    for m in list(macro_A.index.levels[0]):
        for l in [j - 1, j - 2, j - 3]:
            train = pd.concat([train, Series(macro_A.loc[(m, l), :].iloc[0], index=[i])], axis=1, ignore_index=True)
    # Quarterly values derived from monthly macro data
    for m in list(macro_M.index.levels[0]):
        for l in [j, j - 1, j - 2, j - 3]:
            if l == j:
                for k in [3]:
                    train = pd.concat([train, Series(macro_M.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
            else:
                for k in sorted(list(macro_M.index.levels[2]), reverse=True):
                    train = pd.concat([train, Series(macro_M.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
    # Quarterly values derived from weekly macro data
    for l in [j - 1, j - 2, j - 3]:
        for k in sorted(list(macro_W.index.levels[2]), reverse=True):
            train = pd.concat([train, Series(macro_W.loc[(2160000101, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
    # Quarterly values derived from daily macro data
    for m in list(macro_D.index.levels[0]):
        for l in [j - 1, j - 2, j - 3]:
            for k in sorted(list(macro_D.index.levels[2]), reverse=True):
                train = pd.concat([train, Series(macro_D.loc[(m, l, k), :].iloc[0], index=[i])], axis=1, ignore_index=True)
    test = pd.concat([test, train], axis=0, ignore_index=True)
    # Progress, relative to the actual number of tickers
    complieted = ((ticker_pos + 1) / len(submit_gb_id)) * 100
    print('已完成:', complieted, '%')
test.to_csv('test_set.csv', header=False, index=False)
1、构建拥有列名的完整训练集数据
2、对训练集与测试集剔除重复列
3、将分类变量用 LabelEncoder 编码为整数标签
# Attach column names to the raw training and test matrices
train_df = pd.read_csv('train_set.csv', header=None)
test_df = pd.read_csv('test_set.csv', header=None)
train_df = DataFrame(np.array(train_df), columns=train_columns)
test_df = DataFrame(np.array(test_df), columns=test_columns)
# Of all the TYPE_NAME_EN columns only '0_5_TYPE_NAME_EN' is kept
drop_columns = [name for name in train_columns
                if 'TYPE_NAME_EN' in name and name != '0_5_TYPE_NAME_EN']
train_df = train_df.drop(drop_columns, axis=1)
test_df = test_df.drop(drop_columns, axis=1)
# Encode the categorical column as integer labels.
# One encoder is fitted on the categories of both sets; two independently
# fitted encoders can assign different integers to the same category, which
# makes the encoded test column inconsistent with the training column.
lbl = LabelEncoder()
lbl.fit(list(train_df['0_5_TYPE_NAME_EN'].values) + list(test_df['0_5_TYPE_NAME_EN'].values))
# Kept as a name for compatibility; it is the same fitted encoder
tlbl = lbl
train_df['0_5_TYPE_NAME_EN'] = lbl.transform(list(train_df['0_5_TYPE_NAME_EN'].values))
test_df['0_5_TYPE_NAME_EN'] = tlbl.transform(list(test_df['0_5_TYPE_NAME_EN'].values))
train_df.to_csv('train_final.csv', index=False)
test_df.to_csv('test_final.csv', index=False)
1、将彼此相关系数绝对值不低于0.99的特征剔除其一
2、利用PCA进行降维
# Remove near-duplicate features by pairwise correlation, then reduce the
# remaining numeric features with PCA.
train_df = pd.read_csv('train_final.csv')
test_df = pd.read_csv('test_final.csv')
# Column 0 is used as the target; all later columns are features.
y_train = train_df.iloc[:, 0:1]
x_train = train_df.iloc[:, 1:]

drop_corr_columns = []
check_corr_columns = []
thresh_hold = 0.99
x_train_corr = x_train.corr().abs()
# Names are taken from the correlation matrix itself so positions and labels
# always agree, and the raw ndarray is scanned instead of per-cell lookups.
corr_names = list(x_train_corr.columns)
corr_values = x_train_corr.values
n_features = len(corr_names)
already_dropped = set()
for i in range(n_features):
    # Only columns to the right of i are inspected, so each pair is seen once.
    partners = np.flatnonzero(corr_values[i, i + 1:] >= thresh_hold) + i + 1
    for j in partners:
        # The earlier column of each correlated pair is the one discarded.
        if corr_names[i] not in already_dropped:
            already_dropped.add(corr_names[i])
            drop_corr_columns.append(corr_names[i])
        check_corr_columns.append(
            [str(corr_names[i]) + '+' + str(corr_names[j]) + '='
             + str(round(corr_values[i, j], 2))]
        )
    # Progress is reported against the real feature count.
    print('已完成:', ((i + 1) / n_features) * 100, '%')
print('有%d个多余特征' % len(drop_corr_columns))

# The encoded category column bypasses PCA and is re-attached afterwards.
# It is set aside before dropping so the script still works if that column
# was itself flagged as highly correlated.
train_list_tempo = x_train['0_5_TYPE_NAME_EN']
test_list_tempo = test_df['0_5_TYPE_NAME_EN']
columns_to_remove = [c for c in drop_corr_columns if c != '0_5_TYPE_NAME_EN']
columns_to_remove.append('0_5_TYPE_NAME_EN')
x_train_afcorr = x_train.drop(columns_to_remove, axis=1)
test_df_afcorr = test_df.drop(columns_to_remove, axis=1)

pca = PCA(n_components=92)
# Fit the projection on the training features only.
x_train_new = pca.fit_transform(x_train_afcorr)
pca_var_rat = pca.explained_variance_ratio_
pca_var = pca.explained_variance_
print(pca.explained_variance_ratio_)
print(pca.explained_variance_)
x_train_new = pd.concat([DataFrame(x_train_new), train_list_tempo], axis=1)
# The test set is projected with the components learned from the training
# set; refitting on the test set would place it in a different basis.
# NOTE(review): this requires test_df_afcorr to have the same feature columns
# as x_train_afcorr - confirm against test_columns.
test_new = pca.transform(test_df_afcorr)
test_new = pd.concat([DataFrame(test_new), test_list_tempo], axis=1)
x_train_new.to_csv('x_train_pca.csv', index=False)
test_new.to_csv('test_pca.csv', index=False)
1、运用XGBOOST算法训练数据
2、使用cross_val_score寻找最佳的学习器数量
3、使用GridSearchCV调整决策树的深度、叶子节点最小样本权重和(min_child_weight)、每棵树所用到的样本比例、每棵树所用到的特征比例、正则参数等
# Load the model inputs for XGBoost.
train_df = pd.read_csv('train_final.csv')
x_train = pd.read_csv('x_train_pca.csv')
test = pd.read_csv('test_pca.csv')
# Column 0 of the final training table is used as the target. A one-column
# frame is kept (not a Series) so later `y_train.values` calls see the same
# shape as before.
y_train = train_df.iloc[:, 0:1]

# Sweep the number of boosting rounds and record the 5-fold CV score of each.
k_estimators = list(range(1, 1000, 2))
k_score_mean = []
k_score_std = []
for n_trees in k_estimators:
    xgb3 = XGBRegressor(objective='reg:linear',
                        learning_rate=0.1,
                        max_depth=8,
                        min_child_weight=1,
                        subsample=0.3,
                        colsample_bytree=0.8,
                        colsample_bylevel=0.7,
                        seed=3,
                        eval_metric='rmse',
                        reg_alpha=2,
                        reg_lambda=0.1,
                        n_estimators=n_trees)
    score = cross_val_score(xgb3, x_train.values, y_train.values,
                            scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    print(n_trees)
    print(score.mean())
    print(score.std())
    k_score_mean.append(score.mean())
    k_score_std.append(score.std())

# Plot mean CV score against the number of estimators.
plt.plot(k_estimators, k_score_mean)
plt.xlabel('value of k for xgb2')
plt.ylabel('neg__mean_squared_error')
plt.show()
# Grid-search the tree depth and the minimum child weight.
_depth_search_base = {
    'objective': 'reg:linear',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.3,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.7,
    'seed': 3,
    'eval_metric': 'rmse',
    'n_estimators': 216,
}
xgb2 = XGBRegressor(**_depth_search_base)
# Candidate values: max_depth 6..9, min_child_weight 1..2.
param_test = {
    'max_depth': [6, 7, 8, 9],
    'min_child_weight': [1, 2],
}
clf = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test,
    cv=5,
    scoring='neg_mean_squared_error',
)
clf.fit(x_train.values, y_train.values)
# Bare attribute accesses kept for interactive inspection of the results.
clf.grid_scores_
clf.best_params_
clf.best_score_
# Grid-search the row subsampling ratio and the per-tree column sampling ratio.
_sampling_search_base = {
    'objective': 'reg:linear',
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 1,
    'subsample': 0.3,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.7,
    'seed': 3,
    'eval_metric': 'rmse',
    'n_estimators': 401,
}
xgb2 = XGBRegressor(**_sampling_search_base)
# subsample in 0.3..0.8 and colsample_bytree in 0.6..0.9, in steps of 0.1.
param_test = {
    'subsample': [tenths / 10 for tenths in range(3, 9)],
    'colsample_bytree': [tenths / 10 for tenths in range(6, 10)],
}
clf = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test,
    cv=5,
    scoring='neg_mean_squared_error',
)
clf.fit(x_train.values, y_train.values)
# Bare attribute accesses kept for interactive inspection of the results.
clf.grid_scores_
clf.best_params_
clf.best_score_
# Grid-search the L1 / L2 regularisation strengths.
reg_alpha = [2, 2.5, 3]  # previously tried: [0.1, 1, 1.5, 2]
reg_lambda = [0, 0.05, 0.1]  # previously tried: [0.1, 0.5, 1, 2]
_reg_search_base = {
    'objective': 'reg:linear',
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 1,
    'subsample': 0.3,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.7,
    'seed': 3,
    'eval_metric': 'rmse',
    'n_estimators': 401,
}
xgb2 = XGBRegressor(**_reg_search_base)
param_test = {
    'reg_alpha': reg_alpha,
    'reg_lambda': reg_lambda,
}
clf = GridSearchCV(
    estimator=xgb2,
    param_grid=param_test,
    cv=5,
    scoring='neg_mean_squared_error',
)
clf.fit(x_train.values, y_train.values)
# Bare attribute accesses kept for interactive inspection of the results.
clf.grid_scores_
clf.best_params_
clf.best_score_
# Train the final regressor with the chosen hyper-parameters and predict on
# the PCA-transformed test features.
# NOTE(review): min_child_weight=3 lies outside the grid searched earlier in
# this file (1-2) - confirm it is intentional.
_final_params = {
    'objective': 'reg:linear',
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_child_weight': 3,
    'subsample': 0.3,
    'colsample_bytree': 0.8,
    'colsample_bylevel': 0.7,
    'seed': 3,
    'eval_metric': 'rmse',
    'reg_alpha': 2,
    'reg_lambda': 0.1,
    'n_estimators': 466,
}
xgb2 = XGBRegressor(**_final_params)
xgb2.fit(x_train.values, y_train.values)
pred = xgb2.predict(test.values)