最近在处理数据不均衡的问题,写了一段简单的代码。程序目前还在运行,但自己觉得写得还不错(以前都是从上到下一路写下去,这次终于知道把功能封装成函数来调用了,惭愧),先保存下来,方便以后借鉴。
0 数据
标签 数据量(样本数*特征数)
2 1000*39
3 1000*39
4 10000*39
5 10000*39
1 代码
'''
=======================================================================
本程序功能:
step1: 使用不同的过采样算法 对少数类样本采样,使之成为均衡样本集
step2: 通过基本的svm分类器进行多分类
Author = "Bing"
Date = "2019-7-17"
=======================================================================
'''
# English summary of the banner above (the banner text itself is emitted at
# start-up by the print below, so it is left untouched):
#   step 1: over-sample the minority classes with several different
#           over-sampling algorithms so that the data set becomes balanced;
#   step 2: run a basic SVM classifier on the result (multi-class task).
print(__doc__)
# plan1: RandomOverSampler -- random over-sampling.
# Minority-class samples are re-drawn at random until the classes are
# balanced (1:1:1:1 for the four classes used in this experiment).
def Ros(data, label):
    """Balance the data with random over-sampling, then evaluate with ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    from imblearn.over_sampling import RandomOverSampler

    ros = RandomOverSampler(random_state=0)
    # ``fit_sample`` is the legacy method name; imbalanced-learn renamed it
    # to ``fit_resample`` and later removed the old alias.
    data_ros, label_ros = ros.fit_resample(data, label)
    svms(data_ros, label_ros)
# plan2: SMOTE.
# For a minority-class sample a, pick one of its nearest neighbours b at
# random, then take a random point c on the segment a-b as a new synthetic
# minority-class sample.
def Smote(data, label):
    """Balance the data with SMOTE, then evaluate with ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=0)
    # ``fit_sample`` is the legacy method name; imbalanced-learn renamed it
    # to ``fit_resample`` and later removed the old alias.
    data_smote, label_smote = smote.fit_resample(data, label)
    svms(data_smote, label_smote)
# plan3: ADASYN.
# New minority-class samples are generated from the original samples that a
# k-nearest-neighbours classifier gets wrong (i.e. the hard-to-learn ones).
def Adasyn(data, label):
    """Balance the data with ADASYN, then evaluate with ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    from imblearn.over_sampling import ADASYN

    adasyn = ADASYN(random_state=0)
    # ``fit_sample`` is the legacy method name; imbalanced-learn renamed it
    # to ``fit_resample`` and later removed the old alias.
    data_adasyn, label_adasyn = adasyn.fit_resample(data, label)
    svms(data_adasyn, label_adasyn)
# plan4: SMOTE variant "borderline-1".
# A minority sample is "in danger" when at least half of its nearest
# neighbours belong to other classes. Only such samples are used as seeds.
# NOTE(review): in the Borderline-SMOTE1 definition the neighbour b used for
# interpolation belongs to the SAME class as the seed a; the earlier comment
# here said "different class" -- confirm against the library documentation.
def Smote_bd1(data, label):
    """Balance the data with Borderline-SMOTE (type 1), then run ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    # ``SMOTE(kind='borderline1')`` is no longer accepted by imbalanced-learn;
    # the borderline variants live in the dedicated ``BorderlineSMOTE`` class.
    from imblearn.over_sampling import BorderlineSMOTE

    smote = BorderlineSMOTE(kind='borderline-1', random_state=0)
    # ``fit_sample`` is the legacy name of ``fit_resample``.
    data_smote_bd1, label_smote_bd1 = smote.fit_resample(data, label)
    svms(data_smote_bd1, label_smote_bd1)
# plan5: SMOTE variant "borderline-2".
# Same "in danger" seed selection as borderline-1, but the randomly chosen
# nearest neighbour b may belong to any class.
def Smote_bd2(data, label):
    """Balance the data with Borderline-SMOTE (type 2), then run ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    # ``SMOTE(kind='borderline2')`` is no longer accepted by imbalanced-learn;
    # the borderline variants live in the dedicated ``BorderlineSMOTE`` class.
    from imblearn.over_sampling import BorderlineSMOTE

    smote = BorderlineSMOTE(kind='borderline-2', random_state=0)
    # ``fit_sample`` is the legacy name of ``fit_resample``.
    data_smote_bd2, label_smote_bd2 = smote.fit_resample(data, label)
    svms(data_smote_bd2, label_smote_bd2)
# plan6: SMOTE variant "svm".
# An SVM classifier is fitted to find support vectors, and new
# minority-class samples are generated from them.
def Smote_svm(data, label):
    """Balance the data with SVM-SMOTE, then evaluate with ``svms``.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. Results are printed by ``svms``.
    """
    # ``SMOTE(kind='svm')`` is no longer accepted by imbalanced-learn; the
    # SVM-based variant lives in the dedicated ``SVMSMOTE`` class.
    from imblearn.over_sampling import SVMSMOTE

    smote = SVMSMOTE(random_state=0)
    # ``fit_sample`` is the legacy name of ``fit_resample``.
    data_smote_svm, label_smote_svm = smote.fit_resample(data, label)
    svms(data_smote_svm, label_smote_svm)
#==============================================================================#
# Train and evaluate an RBF-kernel SVM classifier.
def svms(data, label):
    """Standardise, split, tune and evaluate an RBF SVM; print the results.

    The function scales the features, makes a 70/30 train/test split, scans
    ``gamma`` and then ``C`` (each over 50 candidate values), and prints a
    classification report for the best-scoring model plus the elapsed time.

    Args:
        data: feature matrix, one row per sample.
        label: class label for each row of ``data``.

    Returns:
        None. All output goes to stdout.

    NOTE(review): the scaler is fitted on the full data set before the split
    and both hyper-parameters are selected by their score on the test split,
    so the reported accuracy is optimistic. Left as in the original so the
    numbers stay comparable; consider a separate validation split.
    """
    import time

    # Imported locally so the function does not depend on names that are
    # only bound when the file is executed as a script.
    import numpy as np
    import pandas as pd
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    s_train_time = time.time()  # start of the timed section

    # step1: standardise the features (zero mean, unit variance)
    data = StandardScaler().fit_transform(data)
    data = pd.DataFrame(data)

    # step2: train / test split
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=420)

    # step3: parameter search + training
    # --- scan gamma (C stays at the SVC default) ---
    score = []
    gamma_range = np.logspace(-10, 1, 50)  # evenly spaced on a log scale
    for i in gamma_range:
        clf = SVC(kernel="rbf", gamma=i, cache_size=10000).fit(train_data, train_label)
        score.append(clf.score(test_data, test_label))
        print("score:", score[-1], "gamma=", i)  # score[-1] is the newest entry
    gamma_best = gamma_range[score.index(max(score))]
    print("The best accuracy is %f, The best gamma is %f" % (max(score), gamma_best))

    # --- scan C with the best gamma ---
    score = []
    best_clf = None
    C_range = np.linspace(0.01, 30, 50)
    for i in C_range:
        clf = SVC(kernel="rbf", C=i, gamma=gamma_best, cache_size=10000).fit(train_data, train_label)
        score.append(clf.score(test_data, test_label))
        print("score:", score[-1], "gamma=", gamma_best, "c=", i)
        # Keep the first model that reaches the highest score; this matches
        # the ``score.index(max(score))`` tie-breaking used for C_best below.
        if best_clf is None or score[-1] > max(score[:-1]):
            best_clf = clf
    C_best = C_range[score.index(max(score))]
    print("The best accuracy is %f, The best C is %f" % (max(score), C_best))

    # step4: predict with the best model. Previously the model from the
    # final loop iteration (largest C) was used regardless of its score.
    predict_label = best_clf.predict(test_data)

    # step5: classification report
    # NOTE(review): the names assume exactly four classes (labels 2-5).
    target_names = ['class 2', 'class 3', 'class 4', 'class 5']
    print(classification_report(test_label, predict_label, target_names=target_names))

    e_train_time = time.time()  # end of the timed section
    # step6: elapsed time (covers scaling, search, prediction and report)
    print("模型训练时间:", e_train_time - s_train_time)
if __name__ == "__main__":
    # Script entry point: load the data set once, then run the baseline and
    # every over-sampling experiment in turn.
    import scipy.io as sci
    # pandas / numpy are bound at module level here on purpose: functions in
    # this file may reference ``pd`` / ``np`` as globals.
    import pandas as pd  # noqa: F401
    import numpy as np  # noqa: F401

    path = "C:/Users/bing/Desktop/gan_svms/taskdata_part.mat"  # original samples
    # Parse the .mat file once instead of once per slice.
    taskdata = sci.loadmat(path)['taskdata_part']
    data = taskdata[:, 1:40]  # columns 1..39 are the features
    label = taskdata[:, 0]    # column 0 is the class label

    # (banner printed before the run, experiment function)
    experiments = [
        ("1、原始样本:", svms),
        ("2、原始样本 + ROS生成样本:", Ros),
        ("3、原始样本 + SMOTE生成样本:", Smote),
        ("4、原始样本 + ADASYN生成样本:", Adasyn),
        ("5、原始样本 + SMOTE_BD1生成样本:", Smote_bd1),
        ("6、原始样本 + SMOTE_BD2生成样本:", Smote_bd2),
        ("7、原始样本 + SMOTE_SVM生成样本:", Smote_svm),
    ]
    for title, run in experiments:
        print(title)
        run(data, label)