import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
import math
import matplotlib.pyplot as plt
import seaborn as sns
# Input file paths: a labelled training set and an unlabelled "run" set
# that the final models predict on.
train_file = 'D:\\train_1.csv' # data file
test_file = 'D:\\test_1.csv' # data file
data =pd.read_csv(train_file) # load the training set
run =pd.read_csv(test_file) # load the run (prediction) set
# Data exploration
# Summary statistics for every feature
# print(data.describe())
# print(run.describe())
# Box plots
# plt.show(data.ix[:,[0]].boxplot())
# plt.show(data.ix[:,1:8].boxplot())
# plt.show(data.ix[:,[8]].boxplot())
# plt.show(run.ix[:,[0]].boxplot())
# plt.show(run.ix[:,1:8].boxplot())
# plt.show(run.ix[:,[8]].boxplot())
# Data cleaning
# Replacing labels 0,1 with -1,1 gave a tiny accuracy gain over the 93.8% baseline and slightly reduced the variance
# data['label'][data['label'] == 0] = -1
# Take log10, hoping to reshape skewed features towards a normal distribution
# After per-class NaN filling (below), enabling the log transform cost about 0.6% accuracy
# for i in data.columns: # accuracy improved from 89% to 90.5%
# if i != 'label':
# data[i] = np.log10(data[i])
# run=run.apply(np.log10)
# Outlier handling: Tukey fences (Q1/Q3 -/+ 1.5 * IQR), computed per column.
nan = float('nan')
q1_train = data.quantile(0.25)   # lower quartile of the training set
q3_train = data.quantile(0.75)   # upper quartile of the training set
iqr_train = q3_train - q1_train
up_limit = q3_train + 1.5 * iqr_train    # upper fence
low_limit = q1_train - 1.5 * iqr_train   # lower fence
q1_run = run.quantile(0.25)      # lower quartile of the run set
q3_run = run.quantile(0.75)      # upper quartile of the run set
iqr_run = q3_run - q1_run
run_up_limit = q3_run + 1.5 * iqr_run    # upper fence
run_low_limit = q1_run - 1.5 * iqr_run   # lower fence
# Replacing outliers with the upper fence gave 81% accuracy; with the median, 89%.
# Best result: set outliers to NaN here, then fill per-class medians later.
# Use .loc for masked assignment: the old chained form (data[i][mask] = nan)
# triggers SettingWithCopyWarning and silently fails to write under the
# pandas copy-on-write behaviour.
for i in data.columns:
    if i != 'label':
        data.loc[data[i] > up_limit[i], i] = nan
        data.loc[data[i] < low_limit[i], i] = nan
for i in run.columns:  # the run set has no outliers, so this is a no-op
    if i != 'label':
        run.loc[run[i] > run_up_limit[i], i] = nan
        run.loc[run[i] < run_low_limit[i], i] = nan
# Feature-correlation inspection: > 99% -> keep only one of the pair; < 1% -> drop the feature.
# print(data.corr())  # correlation matrix of all columns (scale-free)
# print(data.cov())   # covariance matrix of all columns (scale-dependent)
# Fill missing values.
# data = data.fillna(data.median())  # plain column-median fill of the training set
run = run.fillna(run.median())  # fill the run set with its column medians
# Per-class (label 0 vs 1) median fill raised accuracy from 90.5% to 93.8%.
d0 = data[data["label"] == 0]
d1 = data[data["label"] == 1]
d0 = d0.fillna(d0.median())
d1 = d1.fillna(d1.median())
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported form.
d = pd.concat([d0, d1])
data = d.sort_index()  # restore the original row order
# Min-max scaling into (0, 1) using the Tukey fences as each column's range.
# (A (-1, 1) variant was also tried and performed identically.)
feature_cols = [c for c in data.columns if c != 'label']
for col in feature_cols:
    span = up_limit[col] - low_limit[col]
    data[col] = (data[col] - low_limit[col]) / span
for col in [c for c in run.columns if c != 'label']:
    run_span = run_up_limit[col] - run_low_limit[col]
    run[col] = (run[col] - run_low_limit[col]) / run_span
# Random resampling will generate train/test splits; per-round accuracies
# are collected in the float buffers below.
test_count = 10  # number of random resampling rounds
# np.zeros gives clean float64 buffers. The old np.arange(test_count) filled
# them with 0..9 and only worked because every slot was overwritten, and it
# relied on a float literal to avoid an integer dtype that would truncate
# the accuracy fractions to 0.
right_rate = np.zeros(test_count)
right_rate_logistic = np.zeros(test_count)
right_rate_knn = np.zeros(test_count)
right_rate_rf = np.zeros(test_count)
print('训练',test_count, '次,计算平均正确率')
# The run set never changes across rounds, so convert it once up front.
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
new_run = run.to_numpy()
for i in range(int(test_count)):
    new_data = data.sample(frac=1)  # shuffle all rows (frac=1 is a permutation)
    new_data = new_data.to_numpy()  # as_matrix() removed in pandas 1.0
    # 80/20 split into training and test partitions.
    data_train = new_data[:int(0.8 * len(new_data)), :]
    data_test = new_data[int(0.8 * len(new_data)):, :]
    # Columns 0-8 are features; column 9 holds the label.
    x_train = data_train[:, 0:9]
    y_train = data_train[:, 9].astype(int)
    x_test = data_test[:, 0:9]
    y_test = data_test[:, 9].astype(int)
    # SVM training
    model = svm.SVC(C=1.5)  # (C=1.0, kernel='rbf', gamma='auto') kernels were also tried
    model.fit(x_train, y_train)
    # Logistic-regression training
    logistic_regression_model = LogisticRegression()
    logistic_regression_model.fit(x_train, y_train)
    # KNN training
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(x_train, y_train)
    # Random-forest training.
    # NOTE(review): a regressor is fit on a binary target and its scores are
    # thresholded at 0.5 below; RandomForestClassifier would be the
    # conventional choice — kept as-is to preserve the reported numbers.
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=10)
    rf_model.fit(x_train, y_train)
    # Predictions from each model on the held-out split
    y_pred = model.predict(x_test)
    y_pred_logistic = logistic_regression_model.predict(x_test)
    y_pred_knn = knn_model.predict(x_test)
    y_pred_rf = rf_model.predict(x_test)
    # Round the regressor's continuous scores to hard 0/1 labels.
    y_pred_rf[y_pred_rf >= 0.5] = 1
    y_pred_rf[y_pred_rf < 0.5] = 0
    y_pred_rf = y_pred_rf.astype(int)
    # SVM accuracy: column 2 = truth - prediction, so 0 means correct.
    df_result = pd.DataFrame(np.array([y_test, y_pred]).T)
    df_result[2] = df_result[0] - df_result[1]
    print('y_test集正确数:', len(df_result[2][df_result[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result[2][df_result[2] == 0]) / len(df_result[2]) * 100))
    right_rate[i] = len(df_result[2][df_result[2] == 0]) / len(df_result[2])
    # Logistic-regression accuracy
    df_result_logistic = pd.DataFrame(np.array([y_test, y_pred_logistic]).T)
    df_result_logistic[2] = df_result_logistic[0] - df_result_logistic[1]
    print('y_test_logistic集正确数:', len(df_result_logistic[2][df_result_logistic[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_logistic[2][df_result_logistic[2] == 0]) / len(df_result_logistic[2]) * 100))
    right_rate_logistic[i] = len(df_result_logistic[2][df_result_logistic[2] == 0]) / len(df_result_logistic[2])
    # KNN accuracy
    df_result_knn = pd.DataFrame(np.array([y_test, y_pred_knn]).T)
    df_result_knn[2] = df_result_knn[0] - df_result_knn[1]
    print('y_test_knn集正确数:', len(df_result_knn[2][df_result_knn[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_knn[2][df_result_knn[2] == 0]) / len(df_result_knn[2]) * 100))
    right_rate_knn[i] = len(df_result_knn[2][df_result_knn[2] == 0]) / len(df_result_knn[2])
    # Random-forest accuracy
    df_result_rf = pd.DataFrame(np.array([y_test, y_pred_rf]).T)
    df_result_rf[2] = df_result_rf[0] - df_result_rf[1]
    print('y_test_rf集正确数:', len(df_result_rf[2][df_result_rf[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_rf[2][df_result_rf[2] == 0]) / len(df_result_rf[2]) * 100))
    right_rate_rf[i] = len(df_result_rf[2][df_result_rf[2] == 0]) / len(df_result_rf[2])
# Mean accuracy and standard deviation over all resampling rounds.
print('SVM平均正确率:', "%.2f%%" % (right_rate.mean() * 100), '标准差', "%.2f%%" % (right_rate.std() * 100))
print('逻辑回归平均正确率:', "%.2f%%" % (right_rate_logistic.mean() * 100), '标准差', "%.2f%%" % (right_rate_logistic.std() * 100))
print('KNN平均正确率:', "%.2f%%" % (right_rate_knn.mean() * 100), '标准差', "%.2f%%" % (right_rate_knn.std() * 100))
print('随机森林平均正确率:', "%.2f%%" % (right_rate_rf.mean() * 100), '标准差', "%.2f%%" % (right_rate_rf.std() * 100))
# The final models are fit on the full (shuffled) training data.
x_final = new_data[:, 0:9]
y_final = new_data[:, 9].astype(int)
# --- SVM: final fit and prediction on the run set ---
svm_final = svm.SVC(C=1.5)  # (C=1.0, kernel='rbf', gamma='auto')
svm_final.fit(x_final, y_final)
run_final = svm_final.predict(new_run)
ones_svm = len(run_final[run_final == 1])
print('SVM run_final集:总计:', len(run_final), ",为1的个数是:", ones_svm, ",占比:",
      "%.2f%%" % (ones_svm / len(run_final) * 100))
print(run_final)
# --- Logistic regression: final fit and prediction on the run set ---
lr_final = LogisticRegression()
lr_final.fit(x_final, y_final)
run_final_logistic = lr_final.predict(new_run)
ones_lr = len(run_final_logistic[run_final_logistic == 1])
print('逻辑回归run_final集:总计:', len(run_final_logistic), ",为1的个数是:", ones_lr,
      ",占比:",
      "%.2f%%" % (ones_lr / len(run_final_logistic) * 100))
print(run_final_logistic)
# --- KNN: final fit and prediction on the run set ---
knn_final = KNeighborsClassifier(n_neighbors=5)
knn_final.fit(x_final, y_final)
run_final_knn = knn_final.predict(new_run)
ones_knn = len(run_final_knn[run_final_knn == 1])
print('KNN run_final_knn集:总计:', len(run_final_knn), ",为1的个数是:", ones_knn,
      ",占比:",
      "%.2f%%" % (ones_knn / len(run_final_knn) * 100))
print(run_final_knn)
# --- Random forest (regressor + 0.5 threshold): final fit and prediction ---
rf_final = RandomForestRegressor(n_estimators=100, max_depth=10)
rf_final.fit(x_final, y_final)
run_final_rf = rf_final.predict(new_run)
# Round the continuous scores to hard 0/1 integer labels.
run_final_rf = np.where(run_final_rf >= 0.5, 1, 0).astype(int)
ones_rf = len(run_final_rf[run_final_rf == 1])
print('随机森林run_final_rf集:总计:', len(run_final_rf), ",为1的个数是:", ones_rf,
      ",占比:",
      "%.2f%%" % (ones_rf / len(run_final_rf) * 100))
print(run_final_rf)
# Export the SVM predictions (the best-performing model) as the final answer.
result_p = pd.DataFrame(np.array(run_final))
# 1-based ids sized from the actual prediction count; the previous hard-coded
# np.arange(1, 68) broke whenever the run set did not have exactly 67 rows.
result_p.index = np.arange(1, len(result_p) + 1)
result_p.index.names = ['id']
result_p.columns = ['result_p']
result_p.to_csv('D:\\sd_result.csv')
print(result_p)