# Preliminary study on binary-classification prediction

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor
import math
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files
train_file = 'D:\\train_1.csv'  # training data file
test_file = 'D:\\test_1.csv'    # evaluation data file
data = pd.read_csv(train_file)  # training set
run = pd.read_csv(test_file)    # evaluation ("run") set

# Data exploration
# 各种特征值
# print(data.describe())
# print(run.describe())
# 箱图
# plt.show(data.ix[:,[0]].boxplot())
# plt.show(data.ix[:,1:8].boxplot())
# plt.show(data.ix[:,[8]].boxplot())
# plt.show(run.ix[:,[0]].boxplot())
# plt.show(run.ix[:,1:8].boxplot())
# plt.show(run.ix[:,[8]].boxplot())

# 数据清洗
# 将Label从0,1替换为-1,1,替换后正确率在93.8%基础上有微小提升,方差微小减少
# data['label'][data['label'] == 0] = -1

# 取Log,期望能将偏态数据改善为正态分布的数据
# 区分0,1填充缺失值后,再打开取log功能,对正确率有0.6%的负面影响
# for i in data.columns:  # 正确率从89%提升到90.5%
#     if i != 'label':
#         data[i] = np.log10(data[i])
# run=run.apply(np.log10)

# Outlier handling: compute Tukey (1.5 * IQR) fences for both datasets.
nan = float('nan')
data_fenwei14 = data.quantile(0.25)  # lower quartile
data_fenwei34 = data.quantile(0.75)  # upper quartile
up_limit = data_fenwei34 + 1.5 * (data_fenwei34 - data_fenwei14)  # upper fence
low_limit = data_fenwei14 - 1.5 * (data_fenwei34 - data_fenwei14)  # lower fence
run_fenwei14 = run.quantile(0.25)  # lower quartile
run_fenwei34 = run.quantile(0.75)  # upper quartile
run_up_limit = run_fenwei34 + 1.5 * (run_fenwei34 - run_fenwei14)  # upper fence
run_low_limit = run_fenwei14 - 1.5 * (run_fenwei34 - run_fenwei14)  # lower fence
# Replacing outliers with the fence value gave 81% accuracy; with the median, 89%.
# Best result: set outliers to NaN first, then fill per-label (0/1) with the median.
for i in data.columns:
    if i != 'label':
        data[i][data[i] > up_limit[i]] = nan  # data[i].median()  # up_limit[i]
        data[i][data[i] < low_limit[i]] = nan  # data[i].median()  # low_limit[i]
for i in run.columns:  # no outliers in the run set, so this has no effect
    if i != 'label':
        run[i][run[i] > run_up_limit[i]] = nan  # run[i].median()  # run_up_limit[i]
        run[i][run[i] < run_low_limit[i]] = nan  # run[i].median()  # run_low_limit[i]
# Feature correlation: for pairs >99% correlated keep only one feature; <1% may be dropped.
# print(data.corr())  # correlation matrix (scale-free)
# print(data.cov())   # covariance matrix (scale-dependent)

# Fill missing values
# data = data.fillna(data.median())  # fill training set with its overall median
run = run.fillna(run.median())  # fill the run set with its median
# Filling per label (0/1) raised accuracy from 90.5% to 93.8%.
d0 = data[data["label"] == 0]
d1 = data[data["label"] == 1]
d0 = d0.fillna(d0.median())
d1 = d1.fillna(d1.median())
d = d0.append(d1)
data = d.sort_index()  # restore the original row order

# Min-max normalization using the IQR fences as the range.
for i in data.columns:
    if i != 'label':
        data[i] = (data[i] - low_limit[i]) / (up_limit[i] - low_limit[i])  # scale to (0, 1)
        # data[i] = 2 * (data[i] - (up_limit[i] + low_limit[i]) / 2) / (up_limit[i] - low_limit[i])
        # scaling to (-1, 1) performed the same as (0, 1)
for i in run.columns:
    if i != 'label':
        run[i] = (run[i] - run_low_limit[i]) / (run_up_limit[i] - run_low_limit[i])  # scale to (0, 1)
# Repeated random sub-sampling: build train/test splits and average the accuracy.
test_count = 10.0  # must be a float so the accuracy arrays hold fractions, not truncated ints
right_rate = np.arange(test_count)
right_rate_logistic = np.arange(test_count)
right_rate_knn = np.arange(test_count)
right_rate_rf = np.arange(test_count)
print('训练', test_count, '次,计算平均正确率')
for i in range(int(test_count)):
    new_data = data.sample(frac=1)  # shuffle all rows
    # convert to plain matrices
    new_data = new_data.as_matrix()
    new_run = run.as_matrix()
    # 8:2 split into training and test sets
    data_train = new_data[:int(0.8 * len(new_data)), :]
    data_test = new_data[int(0.8 * len(new_data)):, :]
    # x/y training and test variables (columns 0-8 are features, column 9 is the label)
    x_train = data_train[:, 0:9]
    y_train = data_train[:, 9].astype(int)
    x_test = data_test[:, 0:9]
    y_test = data_test[:, 9].astype(int)

    # SVM training
    model = svm.SVC(C=1.5)  # (C=1.0, kernel='rbf', gamma='auto')  # kernel experiments
    model.fit(x_train, y_train)
    # logistic-regression training
    logistic_regression_model = LogisticRegression()
    logistic_regression_model.fit(x_train, y_train)
    # KNN training
    knn_model = KNeighborsClassifier(n_neighbors=5)
    knn_model.fit(x_train, y_train)
    # random-forest training (a regressor; its output is thresholded below)
    rf_model = RandomForestRegressor(n_estimators=100, max_depth=10)
    rf_model.fit(x_train, y_train)
    # SVM predictions
    y_pred = model.predict(x_test)
    # logistic-regression predictions
    y_pred_logistic = logistic_regression_model.predict(x_test)
    # KNN predictions
    y_pred_knn = knn_model.predict(x_test)
    # random-forest predictions
    y_pred_rf = rf_model.predict(x_test)
    # round the regressor's fractional predictions to 0/1
    y_pred_rf[y_pred_rf >= 0.5] = 1
    y_pred_rf[y_pred_rf < 0.5] = 0
    y_pred_rf = y_pred_rf.astype(int)
    # SVM accuracy on the held-out split (column 2 = truth - prediction; 0 means correct)
    df_result = pd.DataFrame(np.array([y_test, y_pred]).T)
    df_result[2] = df_result[0] - df_result[1]
    print('y_test集正确数:', len(df_result[2][df_result[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result[2][df_result[2] == 0]) / len(df_result[2]) * 100))
    right_rate[i] = len(df_result[2][df_result[2] == 0]) / len(df_result[2])
    # logistic-regression accuracy
    df_result_logistic = pd.DataFrame(np.array([y_test, y_pred_logistic]).T)
    df_result_logistic[2] = df_result_logistic[0] - df_result_logistic[1]
    print('y_test_logistic集正确数:', len(df_result_logistic[2][df_result_logistic[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_logistic[2][df_result_logistic[2] == 0]) / len(df_result_logistic[2]) * 100))
    right_rate_logistic[i] = len(df_result_logistic[2][df_result_logistic[2] == 0]) / len(df_result_logistic[2])
    # KNN accuracy
    df_result_knn = pd.DataFrame(np.array([y_test, y_pred_knn]).T)
    df_result_knn[2] = df_result_knn[0] - df_result_knn[1]
    print('y_test_knn集正确数:', len(df_result_knn[2][df_result_knn[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_knn[2][df_result_knn[2] == 0]) / len(df_result_knn[2]) * 100))
    right_rate_knn[i] = len(df_result_knn[2][df_result_knn[2] == 0]) / len(df_result_knn[2])
    # random-forest accuracy
    df_result_rf = pd.DataFrame(np.array([y_test, y_pred_rf]).T)
    df_result_rf[2] = df_result_rf[0] - df_result_rf[1]
    print('y_test_rf集正确数:', len(df_result_rf[2][df_result_rf[2] == 0]), ',正确率:',
          "%.2f%%" % (len(df_result_rf[2][df_result_rf[2] == 0]) / len(df_result_rf[2]) * 100))
    right_rate_rf[i] = len(df_result_rf[2][df_result_rf[2] == 0]) / len(df_result_rf[2])
print('SVM平均正确率:', "%.2f%%" % (right_rate.mean() * 100), '标准差', "%.2f%%" % (right_rate.std() * 100))
print('逻辑回归平均正确率:', "%.2f%%" % (right_rate_logistic.mean() * 100), '标准差', "%.2f%%" % (right_rate_logistic.std() * 100))
print('KNN平均正确率:', "%.2f%%" % (right_rate_knn.mean() * 100), '标准差', "%.2f%%" % (right_rate_knn.std() * 100))
print('随机森林平均正确率:', "%.2f%%" % (right_rate_rf.mean() * 100), '标准差', "%.2f%%" % (right_rate_rf.std() * 100))

# Final models are trained on the full (last-shuffled) dataset and applied to the run set.
x_final = new_data[:, 0:9]
y_final = new_data[:, 9].astype(int)
# final SVM training
model_final = svm.SVC(C=1.5)  # (C=1.0, kernel='rbf', gamma='auto')
model_final.fit(x_final, y_final)
run_final = model_final.predict(new_run)
print('SVM run_final集:总计:', len(run_final), ",为1的个数是:", len(run_final[run_final == 1]), ",占比:",
      "%.2f%%" % (len(run_final[run_final == 1]) / len(run_final) * 100))
print(run_final)
# final logistic-regression training
logistic_regression_model_final = LogisticRegression()
logistic_regression_model_final.fit(x_final, y_final)
run_final_logistic = logistic_regression_model_final.predict(new_run)
print('逻辑回归run_final集:总计:', len(run_final_logistic), ",为1的个数是:", len(run_final_logistic[run_final_logistic == 1]),
      ",占比:",
      "%.2f%%" % (len(run_final_logistic[run_final_logistic == 1]) / len(run_final_logistic) * 100))
print(run_final_logistic)
# final KNN training
knn_model_final = KNeighborsClassifier(n_neighbors=5)
knn_model_final.fit(x_final, y_final)
run_final_knn = knn_model_final.predict(new_run)
print('KNN run_final_knn集:总计:', len(run_final_knn), ",为1的个数是:", len(run_final_knn[run_final_knn == 1]),
      ",占比:",
      "%.2f%%" % (len(run_final_knn[run_final_knn == 1]) / len(run_final_knn) * 100))
print(run_final_knn)
# final random-forest training (regressor output thresholded at 0.5)
rf_model_final = RandomForestRegressor(n_estimators=100, max_depth=10)
rf_model_final.fit(x_final, y_final)
run_final_rf = rf_model_final.predict(new_run)
run_final_rf[run_final_rf >= 0.5] = 1
run_final_rf[run_final_rf < 0.5] = 0
run_final_rf = run_final_rf.astype(int)
print('随机森林run_final_rf集:总计:', len(run_final_rf), ",为1的个数是:", len(run_final_rf[run_final_rf == 1]),
      ",占比:",
      "%.2f%%" % (len(run_final_rf[run_final_rf == 1]) / len(run_final_rf) * 100))
print(run_final_rf)

# Export the final result using the SVM predictions.
result_p = pd.DataFrame(np.array(run_final))
# 1-based id index; derived from the actual prediction count instead of a hard-coded 67 rows
result_p.index = np.arange(1, len(result_p) + 1)
result_p.index.names = ['id']
result_p.columns = ['result_p']
result_p.to_csv('D:\\sd_result.csv')
print(result_p)

# (blog footer) 你可能感兴趣的: 二分类预测初步研究