学习记录--- 几种Oversampling(过采样)方法

最近在处理数据不均衡的问题,写了一段简单的代码。程序还在跑,不过觉得这次写得蛮不错的(因为之前都是从上到下一直写下去,这次居然知道调用函数了,羞愧啊),保存下来以后借鉴。

0 数据

  	   标签  数据量
		2	1000*39
		3	1000*39
		4	10000*39
		5	10000*39

1 代码

'''
=======================================================================
本程序功能:

step1: 使用不同的过采样算法 对少数类样本采样,使之成为均衡样本集
step2: 通过基本的svm分类器进行多分类

Author = "Bing"
Date = "2019-7-17"
=======================================================================
'''

# Echo the module docstring as a start-up banner. The docstring text is
# therefore part of the program output, not just documentation.
print(__doc__)

# Plan 1: RandomOverSampler (random over-sampling).
# Randomly re-samples the minority classes so that every class ends up with
# the same number of samples (a 1:1:1:1 ratio for the four classes used here).
def Ros(data,label):
    """Balance the data with random over-sampling, then run the SVM pipeline.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    from imblearn.over_sampling import RandomOverSampler

    ros = RandomOverSampler(random_state=0)
    # ``fit_sample`` was deprecated in imbalanced-learn 0.4 and removed in
    # later releases; ``fit_resample`` returns the same (X, y) pair.
    data_ros, label_ros = ros.fit_resample(data, label)

    svms(data_ros, label_ros)
    

# Plan 2: SMOTE.
# For a minority-class sample a, pick one of its nearest neighbours b at
# random, then take a random point c on the segment a-b as a new minority
# sample.
def Smote(data,label):
    """Balance the data with SMOTE, then run the SVM pipeline.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    from imblearn.over_sampling import SMOTE

    smote = SMOTE(random_state=0)
    # ``fit_sample`` was deprecated in imbalanced-learn 0.4 and removed in
    # later releases; ``fit_resample`` returns the same (X, y) pair.
    data_smote, label_smote = smote.fit_resample(data, label)

    svms(data_smote, label_smote)
    

# Plan 3: ADASYN.
# Generates new minority-class samples next to the original samples that a
# k-nearest-neighbours classifier would misclassify.
def Adasyn(data,label):
    """Balance the data with ADASYN, then run the SVM pipeline.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    from imblearn.over_sampling import ADASYN

    adasyn = ADASYN(random_state=0)
    # ``fit_sample`` was deprecated in imbalanced-learn 0.4 and removed in
    # later releases; ``fit_resample`` returns the same (X, y) pair.
    data_adasyn, label_adasyn = adasyn.fit_resample(data, label)

    svms(data_adasyn, label_adasyn)
    
    
# Plan 4: SMOTE variant "borderline-1".
# Only "in danger" minority samples are used as seeds, i.e. samples for which
# at least half of the nearest neighbours belong to other classes.
# NOTE(review): the original note said the random neighbour b comes from a
# different class than a; the current imbalanced-learn user guide describes
# borderline-1 as picking b from the same class as a -- confirm.
def Smote_bd1(data,label):
    """Balance the data with Borderline-SMOTE (variant 1), then run the SVM.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    # ``SMOTE(kind='borderline1')`` was deprecated in imbalanced-learn 0.4 and
    # the ``kind`` argument was later removed from ``SMOTE``; the variant now
    # lives in its own ``BorderlineSMOTE`` class.
    from imblearn.over_sampling import BorderlineSMOTE

    smote = BorderlineSMOTE(kind='borderline-1', random_state=0)
    data_smote_bd1, label_smote_bd1 = smote.fit_resample(data, label)

    svms(data_smote_bd1, label_smote_bd1)
    
    
# Plan 5: SMOTE variant "borderline-2".
# Like borderline-1, but the random neighbour b of an "in danger" sample may
# belong to any class.
def Smote_bd2(data,label):
    """Balance the data with Borderline-SMOTE (variant 2), then run the SVM.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    # ``SMOTE(kind='borderline2')`` was deprecated in imbalanced-learn 0.4 and
    # the ``kind`` argument was later removed from ``SMOTE``; the variant now
    # lives in its own ``BorderlineSMOTE`` class.
    from imblearn.over_sampling import BorderlineSMOTE

    smote = BorderlineSMOTE(kind='borderline-2', random_state=0)
    data_smote_bd2, label_smote_bd2 = smote.fit_resample(data, label)

    svms(data_smote_bd2, label_smote_bd2)
    
    
# Plan 6: SMOTE variant "svm".
# Uses a support-vector-machine classifier to find support vectors and then
# generates new minority-class samples from them.
def Smote_svm(data,label):
    """Balance the data with SVM-SMOTE, then run the SVM pipeline.

    Args:
        data: feature matrix passed to the sampler (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. The resampled data is handed to ``svms``, which prints results.
    """
    # ``SMOTE(kind='svm')`` was deprecated in imbalanced-learn 0.4 and the
    # ``kind`` argument was later removed from ``SMOTE``; the variant now
    # lives in its own ``SVMSMOTE`` class.
    from imblearn.over_sampling import SVMSMOTE

    smote = SVMSMOTE(random_state=0)
    data_smote_svm, label_smote_svm = smote.fit_resample(data, label)

    svms(data_smote_svm, label_smote_svm)
    

#==============================================================================#
# Train an SVM classifier, tune it and report its predictions

def svms(data,label):
    """Standardise the data, tune an RBF SVM and print a classification report.

    Steps: standardise the features, split 70/30 into train/test, sweep
    ``gamma`` (with the default ``C``), sweep ``C`` at the best ``gamma``,
    then print a classification report for the best-scoring model and the
    elapsed wall-clock time.

    NOTE(review): the scaler is fitted on the full data set before the split
    and both hyper-parameters are selected by their score on the test split,
    so the reported accuracy is optimistic. Left unchanged here to keep the
    printed numbers comparable with earlier runs.

    Args:
        data: feature matrix (rows are samples).
        label: class label for each row of ``data``.

    Returns:
        None. All results are printed.
    """
    import time

    # Imported locally so the function does not depend on ``np``/``pd``
    # happening to exist as module globals (they were only imported inside
    # the ``__main__`` guard, which breaks when this module is imported).
    import numpy as np
    import pandas as pd
    from sklearn.metrics import classification_report
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    s_train_time = time.time()  # start of the timed section

    # step1: standardise the features (zero mean, unit variance)
    data = StandardScaler().fit_transform(data)
    data = pd.DataFrame(data)

    # step2: train / test split
    train_data, test_data, train_label, test_label = train_test_split(
        data, label, test_size=0.3, random_state=420)

    # step3: hyper-parameter search + training
    # tune gamma
    score = []
    gamma_range = np.logspace(-10, 1, 50)  # 50 values evenly spaced on a log scale
    for i in gamma_range:
        clf = SVC(kernel="rbf", gamma=i, cache_size=10000).fit(train_data, train_label)
        score.append(clf.score(test_data, test_label))
        print("score:",score[-1],"gamma=",i)  # score[-1] is the score just appended

    gamma_best = gamma_range[score.index(max(score))]
    print("The best accuracy is %f, The best gamma is %f" % (max(score), gamma_best))

    # tune C
    score = []
    best_clf = None
    best_score = None
    C_range = np.linspace(0.01, 30, 50)
    for i in C_range:
        clf = SVC(kernel="rbf", C=i, gamma=gamma_best, cache_size=10000).fit(train_data, train_label)
        score.append(clf.score(test_data, test_label))
        print("score:",score[-1],"gamma=",gamma_best,"c=",i)
        # Keep the first model that reaches the best score; the strict ``>``
        # matches ``score.index(max(score))`` used for ``C_best`` below.
        if best_clf is None or score[-1] > best_score:
            best_score = score[-1]
            best_clf = clf

    C_best = C_range[score.index(max(score))]
    print("The best accuracy is %f, The best C is %f" % (max(score), C_best))

    # step4: predict with the best model. Previously the prediction used
    # ``clf``, i.e. whatever model the loop fitted last (C=30), not the one
    # trained with ``C_best``.
    predict_label = best_clf.predict(test_data)

    # step5: classification report
    # NOTE(review): the names assume exactly four classes (2, 3, 4, 5) -- confirm
    # before using this function on data with a different label set.
    target_names = ['class 2', 'class 3', 'class 4', 'class 5']
    print(classification_report(test_label, predict_label, target_names=target_names))

    e_train_time = time.time()  # end of the timed section

    # step6: elapsed time (covers scaling, both sweeps and the report)
    print("模型训练时间:",e_train_time-s_train_time)

    
if __name__ == "__main__":
    # Script entry point: load the data set, then evaluate the SVM on the raw
    # data and on each over-sampled variant in turn.
    import scipy.io as sci
    # ``pd`` and ``np`` are kept at module scope because code elsewhere in
    # this file may reference them as globals.
    import pandas as pd
    import numpy as np

    path = "C:/Users/bing/Desktop/gan_svms/taskdata_part.mat"   # raw samples
    # Read the .mat file once and slice it, instead of loading it twice.
    taskdata = sci.loadmat(path)['taskdata_part']
    data = taskdata[:,1:40]   # columns 1..39 are the features
    label = taskdata[:,0]     # column 0 is the class label

    print("1、原始样本:")
    svms(data,label)

    print("2、原始样本 + ROS生成样本:")
    Ros(data,label)

    print("3、原始样本 + SMOTE生成样本:")
    Smote(data,label)

    print("4、原始样本 + ADASYN生成样本:")
    Adasyn(data,label)

    print("5、原始样本 + SMOTE_BD1生成样本:")
    Smote_bd1(data,label)

    print("6、原始样本 + SMOTE_BD2生成样本:")
    Smote_bd2(data,label)

    print("7、原始样本 + SMOTE_SVM生成样本:")
    Smote_svm(data,label)

你可能感兴趣的:(大四暑假)