数据描述:
招商银行的Fintech比赛2019年第五题,数据有34个数值型特征,7000多个样本,其中只有100多个正例,这个做的很基础,写下来就是自己备忘一下。
1.特征工程
我认为的特征工程主要的步骤是,特征观察,特征分析,特征提取,特征处理和特征选择。
接下来用代码叙述:
首先,要将训练数据和测试数据合并处理,当然,要先把标签提取出来
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
train_df = pd.read_csv('./data/FT_Camp_5/Train.csv',index_col=0)
test_df = pd.read_csv('./data/FT_Camp_5/X_test.csv',index_col=0)
train_data.shape,test_data.shape
train_y = train_data.pop('fake')
y_plot = sns.distplot(train_y)
#观察特征
var = 'Sale_grop_ratio'
data = pd.concat([train_y, train_data[var]], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=var, y="Sale_grop_ratio", data=data)
fig.axis(ymin=0, ymax=800000);plt.xticks(rotation=90);
plt.show()
y_train = train_df.pop('fake')#提取标签
all_df = pd.concat((train_df, test_df), axis=0)#合并训练数据和测试数据
var = 'Hold_Top5'
data = pd.concat([train_df['fake'],train_df[var]],axis=1)
data.plot.scatter(x=var,y='fake')
通过这一步,可以找到一些离群值。我是一个点一个点地去除的,去除的方法是:
train_df=train_df[~train_df['Sale_grop_ratio'].isin([-5913.72])]
对特征进行一些处理,包括填充缺失值,将类别数据通过独热编码转化(在我的另一篇预测博客中有用到),以及数据的标准化。
# Missing-value handling and standardisation on the merged train+test frame.
# NOTE(review): the original re-bound all_df = train_df here, which silently
# undid the train/test merge performed above; that line is dropped so the
# merged frame is the one processed.
all_df.isnull().sum().sort_values(ascending=False).head()  # inspect missing counts

# Fill missing values with each column's mean.
mean_cols = all_df.mean()
mean_cols.head(10)
all_df = all_df.fillna(mean_cols)
all_df.isnull().sum().sum()  # should now be 0

# Standardise (z-score) every numeric column.
# (numeric_cols was computed twice in the original; once is enough.)
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
当然,特征也能一条一条的标准化
# Per-column standardisation with sklearn (alternative to the manual z-score).
from sklearn.preprocessing import StandardScaler

# reshape(-1, 1): a single feature must be a 2-D column vector for sklearn
# (-1 lets numpy infer the row count).  .ravel() flattens the (n, 1) result
# back to 1-D before assigning it as a DataFrame column — the original
# assigned the 2-D array directly, which newer pandas rejects.
data['norm_Sale_grop_ratio'] = StandardScaler().fit_transform(
    data['Sale_grop_ratio'].values.reshape(-1, 1)).ravel()
data = data.drop(['Sale_grop_ratio'], axis=1)
data['norm_Delta_Sale_grop_ratio'] = StandardScaler().fit_transform(
    data['Delta_Sale_grop_ratio'].values.reshape(-1, 1)).ravel()
data = data.drop(['Delta_Sale_grop_ratio'], axis=1)
将处理好的训练数据和测试数据再次分开
# Split the preprocessed frame back into train/test using the original indices.
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]
将测试数据和标签重新拼接起来
train_df_xy = pd.concat((dummy_train_df,y_train), axis=1)
这里有一点挺重要的,就是上文中说到,这个数据相当的不均衡,所以有两种方法可以采用:1.对少的样本过采样。2.对多的样本欠采样。
如下所示:
1.用SMOTE方法过采样(SMOTE的python具体实现见其他博客)
# 1. Oversample the minority class with SMOTE.
from imblearn.over_sampling import SMOTE

oversampler = SMOTE(random_state=0)
# was train_data / train_lable: undefined names (typos for the processed
# training frame and its label series).
os_features, os_labels = oversampler.fit_sample(dummy_train_df, y_train)

# ratio pins the per-class sample count after resampling; here class 1 -> 300.
# NOTE(review): newer imbalanced-learn renamed ratio -> sampling_strategy
# and fit_sample -> fit_resample; confirm against the installed version.
smo = SMOTE(ratio={1: 300}, random_state=42)
print(dummy_train_df.shape)
print(y_train.shape)
x, y = smo.fit_sample(dummy_train_df, y_train)
2.下采样
(1)方法1
def down_sample(df, frac=0.6):
    """Undersample the majority class (fake == 0).

    Keeps every positive (fake == 1) row plus a random fraction *frac*
    of the negative rows, and returns their concatenation with a fresh
    0..n-1 index.  (Promoted from the author's commented-out sketch;
    frac is now a parameter instead of the hard-coded 0.6.)
    """
    positives = df[df['fake'] == 1]
    negatives = df[df['fake'] == 0].sample(frac=frac)
    return pd.concat([positives, negatives], ignore_index=True)
(2)方法2
# Method 2: random undersampling of the majority class.
# Count how many samples each class has and visualise the imbalance.
count_classes = pd.value_counts(data['fake'], sort=True).sort_index()
count_classes.plot(kind='bar')
plt.title("fake class histogram")
plt.xlabel("fake")
plt.ylabel("Frequency")

# Split features and label.  (.ix was deprecated and removed in pandas 1.0;
# label-based .loc is the drop-in replacement here.)
X = data.loc[:, data.columns != 'fake']
y = data.loc[:, data.columns == 'fake']

# Number and indices of the minority (fake == 1) samples.
number_records_fake = len(data[data.fake == 1])
fake_indices = np.array(data[data.fake == 1].index)

# Indices of the majority (fake == 0) samples; undersampling draws from these.
normal_indices = data[data.fake == 0].index

# Randomly pick as many majority rows as there are minority rows.
random_normal_indices = np.random.choice(normal_indices, number_records_fake, replace=False)
random_normal_indices = np.array(random_normal_indices)

# Combine both index sets and materialise the balanced subset.
under_sample_indices = np.concatenate([fake_indices, random_normal_indices])
under_sample_data = data.loc[under_sample_indices, :]
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'fake']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'fake']
上面的两种方法我都尝试过,但最终都没有使用。最后采用的方法是:直接复制少数类样本,然后拼接到原先的数据集:
# Fight class imbalance by stacking three extra copies of the positive rows
# onto the training frame (row order: copy, original, copy, copy).
positive_rows = train_df_xy[train_df_xy['fake'] == 1]
train_df_xy = pd.concat((positive_rows, train_df_xy, positive_rows, positive_rows), axis=0)
打乱数据的代码(效果不好,我并没有用)
train_df_xy = train_df_xy.sample(frac = 1)
好,现在就是最后的一点对数据集的处理了
# Final split of the (oversampled) training frame into features and label.
y_train = train_df_xy.pop('fake')
dummy_train_df = train_df_xy
def get_id(path):
    """Read the first column (sample ids) of the csv at *path*.

    Returns a 1-D numpy array of strings; index 0 is the header cell,
    which is why callers offset by one when pairing ids with predictions.
    """
    id_list = []
    # with-statement guarantees the file is closed (the original passed an
    # open() result straight to csv.reader and leaked the handle).
    with open(path, newline='') as f:
        for row in csv.reader(f):
            id_list.append(row[0])
    return np.array(id_list)
# Convert frames to plain numpy arrays for sklearn, and load test-set ids.
y_train = np.array(y_train,dtype = float)
x_train = np.array(dummy_train_df,dtype = float)
x_test = np.array(dummy_test_df,dtype = float)
id_test = get_id(PATH_TEST)
对于特征的处理,还有几种方法:
(1)通过加减乘除计算添加新特征
# (1) Derive new ratio/product features from existing columns.
train_df['AR1/'] = train_df['AR_to_Reve'] / train_df['AR_to_Cur']
# (stray markdown ``` fused onto the original line removed — it was a syntax error)
train_df['AR2*'] = train_df['AR_to_Reve'] * train_df['AR_TO_ratio']
(2)用随机森林算法查看特征的重要性排序
# (2) Rank feature importance with a random forest.
from sklearn.ensemble import RandomForestClassifier

print('随机森林训练中...')
feat_labels = dummy_train_df.columns
forest = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=1)
forest.fit(dummy_train_df, y_train)
importance = forest.feature_importances_
imp_result = np.argsort(importance)[::-1]  # feature indices, most important first
imp_result[0:5]
print('特征排序中...')
# Print features in descending importance order.
# was feat_labels[i]: paired labels in original column order with sorted
# scores, so names and importances did not correspond.
for i in range(dummy_train_df.shape[1]):
    print("%2d. %-*s %f" % (i + 1, 30, feat_labels[imp_result[i]], importance[imp_result[i]]))
(3)用PCA降维进行特征筛选
(4)尝试删除冗余特征
# (4) Optionally drop redundant/correlated features (kept disabled by the author).
#all_df.pop('A_to_L_ratio')
#all_df.pop('AR_to_Cur')
2.模型选择与调参
模型选择看auc,调参的话,要自己写一个好的交叉验证代码。我第一次做相关的东西,写出来的和用的都比较low,建议大家多看看kaggle比赛相关的内容。
(1)SVM+BAGGING
def svm_c(x_train, x_test, y_train, id_test):
    """Grid-search a bagged RBF-SVM and write test predictions to csv.

    Output is a two-column csv (stockcode, fake).  id_test[0] is the
    header cell returned by get_id, hence the i+1 offset when pairing
    ids with prediction rows.
    """
    print('训练中...')
    svc = SVC(kernel='rbf', class_weight='balanced')
    clf = BaggingClassifier(base_estimator=svc, n_estimators=20, max_samples=1.0,
                            max_features=1.0, bootstrap=True,
                            bootstrap_features=False, n_jobs=1, random_state=1)
    # param_grid: list of dicts mapping hyper-parameter names to candidates.
    param_grid = [{'base_estimator': [SVC(kernel='rbf', class_weight='balanced', C=4.5)],
                   'n_estimators': [20], 'max_samples': [0.95],
                   'max_features': [0.8]}]
    grid = GridSearchCV(clf, param_grid, cv=6, n_jobs=-1)
    # Fit the whole grid, then predict with the best estimator found.
    clf = grid.fit(x_train, y_train)
    best_estimator = grid.best_estimator_
    predict = best_estimator.predict(x_test)
    print(grid.best_score_)   # best cross-validated score
    print(grid.best_params_)  # best parameter combination
    predict = predict.tolist()
    id_test = id_test.tolist()
    # newline='' prevents blank lines between rows on Windows; the
    # with-statement closes the file (the original leaked it on error).
    with open('csvFile56.csv', 'w', newline='') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(['stockcode', 'fake'])
        print('文件生成中...')
        for i in range(len(predict)):
            writer.writerow([int(id_test[i + 1]), int(predict[i])])
2.SVM+ADABOOST
def svm_c(x_train, x_test, y_train, id_test):
    """SVM + AdaBoost wrapped in bagging; grid-searched, then writes a csv.

    NOTE(review): x1_test / y1_test (the validation fold) are read from
    module-level globals rather than passed as parameters — confirm they
    are defined before calling.
    """
    print('训练中...')
    svc = SVC(kernel='rbf', class_weight='balanced', C=4, probability=True)
    ada = AdaBoostClassifier(svc, n_estimators=50, learning_rate=1)
    ada_model = BaggingClassifier(base_estimator=ada, n_estimators=20, max_samples=0.8,
                                  max_features=0.8, bootstrap=True,
                                  bootstrap_features=False, n_jobs=1, random_state=1)
    param_grid = [{'n_estimators': [20], 'max_samples': [0.9],
                   'max_features': [0.8]}]
    grid = GridSearchCV(ada_model, param_grid, cv=3, n_jobs=-1)
    grid.fit(x_train, y_train)
    best_estimator = grid.best_estimator_
    predict = best_estimator.predict(x_test)
    # Validation-fold f1 (globals, see docstring note).
    predict_val = best_estimator.predict(x1_test)
    f1score_val = f1_score(y1_test, predict_val, average='binary')
    print('f1 score val:', f1score_val)
    print(grid.best_score_)
    print(grid.best_params_)
    predict = predict.tolist()
    id_test = id_test.tolist()
    # newline='' prevents blank lines between rows on Windows; with closes the file.
    with open('csvFile2_adaboost.csv', 'w', newline='') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(['stockcode', 'fake'])
        print('文件生成中...')
        for i in range(len(predict)):
            writer.writerow([int(id_test[i + 1]), int(predict[i])])
3.XGBT
# 3. XGBoost classifier configuration.
XGBClassifier(learning_rate=0.1,
              n_estimators=1000,       # number of boosting trees
              max_depth=6,             # maximum tree depth
              min_child_weight=1,      # minimum summed instance weight per leaf
              gamma=0.,                # complexity penalty on adding leaves
              # was colsample_btree: a typo that xgboost silently ignores,
              # so no column subsampling actually happened.
              colsample_bytree=0.8,
              subsample=0.8,           # row subsampling per tree
              # NOTE(review): this is a binary task — 'binary:logistic' is
              # the conventional objective; 'multi:softmax' with num_class=2
              # works but returns hard labels only.
              objective='multi:softmax',
              scale_pos_weight=1,      # class-imbalance weighting
              random_state=27,
              num_class=2)
4.RF+BAGGING
def RF(x_train, x_test, y_train, id_test):
    """Random forest + bagging with grid search; prints metrics, writes a csv.

    NOTE(review): x1_test / y1_test are read from module-level globals —
    confirm they are defined before calling.
    """
    print('x_train', x_train)
    print('x_test', x_test)
    print('y_train', y_train)
    print('id_test', id_test)
    print('训练中...')
    # was RandomForestRegressor: a regressor cannot serve as the base
    # estimator of a BaggingClassifier on a classification task.
    # (Also renamed: the original local 'RF' shadowed this function's name.)
    base_rf = RandomForestClassifier(n_estimators=50, max_features=0.8)
    clf = BaggingClassifier(base_estimator=base_rf, n_estimators=20, max_samples=1.0,
                            max_features=1.0, bootstrap=True,
                            bootstrap_features=False, n_jobs=1, random_state=1)
    # param_grid: candidate hyper-parameters for the bagging wrapper.
    param_grid = [{'n_estimators': [20, 50], 'max_samples': [1.0, 0.9, 0.8],
                   'max_features': [1.0, 0.9, 0.8]}]
    grid = GridSearchCV(clf, param_grid, cv=6, n_jobs=-1)
    clf = grid.fit(x_train, y_train)
    best_estimator = grid.best_estimator_
    predict = best_estimator.predict(x_test)
    predict_train = best_estimator.predict(x_train)
    predict_val = best_estimator.predict(x1_test)
    f1score_val = f1_score(y1_test, predict_val, average='binary')
    print('f1 score val:', f1score_val)
    # Training-set metrics (optimistic; sanity check only).
    p = precision_score(y_train, predict_train, average='binary')
    r = recall_score(y_train, predict_train, average='binary')
    f1score = f1_score(y_train, predict_train, average='binary')
    print('precision score:', p)
    print('recall score:', r)
    print('f1 score:', f1score)
    predict = predict.tolist()
    id_test = id_test.tolist()
    # newline='' prevents blank lines between rows on Windows; with closes the file.
    with open('csvFile_rf_BAGGING.csv', 'w', newline='') as csvFile2:
        writer = csv.writer(csvFile2)
        writer.writerow(['stockcode', 'fake'])
        print('文件生成中...')
        for i in range(len(predict)):
            writer.writerow([int(id_test[i + 1]), int(predict[i])])
最后,自己写了一个比较搓的不交叉验证的代码
import numpy as np
import pandas as pd
import csv
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")
from pandas import Series, DataFrame
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import BaggingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
%matplotlib inline
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
# Data preprocessing: load, pop label, merge, mean-impute, z-score standardise.
train_df = pd.read_csv('./data/FT_Camp_5/Train.csv',index_col=0)
test_df = pd.read_csv('./data/FT_Camp_5/X_test.csv',index_col=0)
y_train = train_df.pop('fake')
all_df = pd.concat((train_df, test_df), axis=0)
all_df.isnull().sum().sort_values(ascending=False).head()# inspect missing-value counts
# Fill missing values with each column's mean.
mean_cols = all_df.mean()
mean_cols.head(10)
all_df = all_df.fillna(mean_cols)
all_df.isnull().sum().sum()
# Standardise (z-score) all numeric columns on the merged frame.
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std
# Split back into train/test by index and re-attach the label to train.
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]
train_df_xy = pd.concat((dummy_train_df,y_train), axis=1)
x_test = np.array(dummy_test_df,dtype = float)
def get_id(path):
    """Read the first column (sample ids) of the csv at *path*.

    Returns a 1-D numpy array of strings; index 0 is the header cell.
    """
    id_list = []
    # with-statement guarantees the file is closed (the original passed an
    # open() result straight to csv.reader and leaked the handle).
    with open(path, newline='') as f:
        for row in csv.reader(f):
            id_list.append(row[0])
    return np.array(id_list)
# Disabled alternative: undersample the majority class instead of duplicating
# positives.  Left as an inert string literal (a no-op) by the author.
'''
def down_sample(df):
    df1=df[df['fake']==1]
    df2=df[df['fake']==0]
    df3=df2.sample(frac=0.5)
    return pd.concat([df1,df3],ignore_index=True)
'''
def val1(train_df_xy):
    """Fold 1 of a hand-rolled 6-fold CV: rows [0:1275) validate, rest train.

    Positives in the training part are duplicated (3 extra copies) to fight
    class imbalance.  Returns (id_test, x_train, y_train, x1_test, y1_test).
    (Inert commented-out SMOTE variant removed from the body.)
    """
    train1_df = train_df_xy.iloc[:1275, :]    # validation fold
    train_df_xy = train_df_xy.iloc[1275:, :]  # training part
    train1_label = train1_df.pop('fake')
    # Stack three extra copies of the positive rows onto the training part.
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def val2(train_df_xy):
    """Fold 2: rows [1275:2550) validate, the rest train (positives x3 extra).

    Returns (id_test, x_train, y_train, x1_test, y1_test).
    (Inert commented-out SMOTE variant removed from the body.)
    """
    train2_df = train_df_xy.iloc[:1275, :]
    train3_df = train_df_xy.iloc[2550:, :]
    train1_df = train_df_xy.iloc[1275:2550, :]  # validation fold
    train_df_xy = pd.concat((train2_df, train3_df), axis=0)
    train1_label = train1_df.pop('fake')
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def val3(train_df_xy):
    """Fold 3: rows [2550:3825) validate, the rest train (positives x2 extra).

    NOTE(review): folds 1-2 add three extra positive copies but folds 3-6
    add only two — presumably unintentional; confirm with the author.
    """
    train2_df = train_df_xy.iloc[:2550, :]
    train3_df = train_df_xy.iloc[3825:, :]
    train1_df = train_df_xy.iloc[2550:3825, :]  # validation fold
    train_df_xy = pd.concat((train2_df, train3_df), axis=0)
    train1_label = train1_df.pop('fake')
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def val4(train_df_xy):
    """Fold 4: rows [3825:5100) validate, the rest train (positives x2 extra)."""
    train2_df = train_df_xy.iloc[:3825, :]
    train3_df = train_df_xy.iloc[5100:, :]
    train1_df = train_df_xy.iloc[3825:5100, :]  # validation fold
    train_df_xy = pd.concat((train2_df, train3_df), axis=0)
    train1_label = train1_df.pop('fake')
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def val5(train_df_xy):
    """Fold 5: rows [5100:6375) validate, the rest train (positives x2 extra)."""
    train2_df = train_df_xy.iloc[:5100, :]
    train3_df = train_df_xy.iloc[6375:, :]
    train1_df = train_df_xy.iloc[5100:6375, :]  # validation fold
    train_df_xy = pd.concat((train2_df, train3_df), axis=0)
    train1_label = train1_df.pop('fake')
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def val6(train_df_xy):
    """Fold 6: rows [6375:] validate, rows [:6375] train (positives x2 extra).

    Fixed: the original never re-bound train_df_xy to the training slice
    (unlike val1-val5), so the model was trained on the full frame —
    validation rows included, i.e. data leakage into this fold's score.
    """
    train2_df = train_df_xy.iloc[:6375, :]  # training part
    train1_df = train_df_xy.iloc[6375:, :]  # validation fold
    train1_label = train1_df.pop('fake')
    train_df_xy = train2_df  # was missing: exclude the validation rows
    add_df = train_df_xy[train_df_xy['fake'].isin([1])]
    train_df_xy = pd.concat((add_df, add_df, train_df_xy), axis=0)
    y_train = np.array(train_df_xy.pop('fake'), dtype=int)
    x_train = np.array(train_df_xy, dtype=float)
    id_test = get_id(PATH_TEST)
    x1_test = np.array(train1_df, dtype=float)
    y1_test = np.array(train1_label, dtype=float)
    return id_test, x_train, y_train, x1_test, y1_test
def svm_c(x_train, x_test, y_train, id_test, x1_test, y1_test):
    """Grid-search a bagged XGBoost ensemble; return the validation-fold f1.

    x_test / id_test are accepted for interface compatibility with the
    earlier csv-writing variants but are unused here: prediction and csv
    output were disabled by the author, so only the held-out fold
    (x1_test, y1_test) is scored.
    """
    print('训练中...')
    lr = XGBClassifier(learning_rate=0.1,
                       n_estimators=1000,       # number of boosting trees
                       max_depth=6,             # maximum tree depth
                       min_child_weight=1,      # minimum summed leaf weight
                       gamma=0.,                # complexity penalty on leaves
                       subsample=0.8,           # row subsampling per tree
                       # was colsample_btree: typo silently ignored by
                       # xgboost, so no column subsampling happened.
                       colsample_bytree=0.8,
                       objective='multi:softmax',  # NOTE(review): binary task; 'binary:logistic' is conventional
                       scale_pos_weight=1,      # class-imbalance weighting
                       random_state=27,
                       num_class=2)
    clf = BaggingClassifier(base_estimator=lr, n_estimators=20, max_samples=1.0,
                            max_features=1.0, bootstrap=True,
                            bootstrap_features=False, n_jobs=1, random_state=1)
    # param_grid: candidate hyper-parameters for the bagging wrapper.
    param_grid = [{'n_estimators': [20], 'max_samples': [1.0],
                   'max_features': [0.9]}]
    grid = GridSearchCV(clf, param_grid, cv=10, n_jobs=-1)
    clf = grid.fit(x_train, y_train)
    best_estimator = grid.best_estimator_
    # Score the held-out validation fold only.
    predict_val = best_estimator.predict(x1_test)
    f1score_val = f1_score(y1_test, predict_val, average='binary')
    print('f1 score val:', f1score_val)
    return f1score_val
# Run the six hand-rolled CV folds and average their validation f1 scores.
id_test1,x_train1,y_train1,x1_test1,y1_test1 = val1(train_df_xy)
print('第1次')
s1 = svm_c(x_train1,x_test,y_train1,id_test1,x1_test1,y1_test1)
id_test2,x_train2,y_train2,x1_test2,y1_test2 = val2(train_df_xy)
print('第2次')
s2 = svm_c(x_train2,x_test,y_train2,id_test2,x1_test2,y1_test2)
id_test3,x_train3,y_train3,x1_test3,y1_test3 = val3(train_df_xy)
print('第3次')
s3 = svm_c(x_train3,x_test,y_train3,id_test3,x1_test3,y1_test3)
id_test4,x_train4,y_train4,x1_test4,y1_test4 = val4(train_df_xy)
print('第4次')
s4 = svm_c(x_train4,x_test,y_train4,id_test4,x1_test4,y1_test4)
id_test5,x_train5,y_train5,x1_test5,y1_test5 = val5(train_df_xy)
print('第5次')
s5 = svm_c(x_train5,x_test,y_train5,id_test5,x1_test5,y1_test5)
id_test6,x_train6,y_train6,x1_test6,y1_test6 = val6(train_df_xy)
print('第6次')
s6 = svm_c(x_train6,x_test,y_train6,id_test6,x1_test6,y1_test6)
# Mean validation f1 across the six folds.
s = (s1+s2+s3+s4+s5+s6)/6
print('last:',s)