机器学习分类—SMOTE算法

先粘下来代码,原理之后再补充,训练数据的背景还是之前二分类博客里的那个第五题

# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from numpy import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import pandas as pd
import csv
from pandas import Series, DataFrame
# 导入数据集切割训练与测试数据
%matplotlib inline
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'
#数据预处理
train_df = pd.read_csv('./data/FT_Camp_5/Train.csv',index_col=0)

test_df = pd.read_csv('./data/FT_Camp_5/X_test.csv',index_col=0)
print(test_df.columns)
y_train = train_df.pop('fake')
all_df = pd.concat((train_df, test_df), axis=0)

mean_cols = all_df.mean()
all_df = all_df.fillna(mean_cols)
all_df.isnull().sum().sum()

numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
all_df.loc[:, numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std

dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]

train_df_xy = pd.concat((dummy_train_df,y_train), axis=1)

#print(train_df_xy.iloc[1259:1262,:])
def get_id(path):
    """Return the first field of every row of a CSV file as a NumPy array.

    The header row is not treated specially, so its first field is part of
    the result. Blank lines (which ``csv.reader`` yields as empty rows) are
    skipped instead of raising ``IndexError``.

    Args:
        path: Path of the CSV file to read.

    Returns:
        A 1-D ``numpy.ndarray`` holding the first field of each non-empty row.
    """
    # The context manager closes the file even if parsing fails; newline=''
    # is what the csv module asks for on files handed to csv.reader.
    with open(path, newline='') as csv_file:
        id_list = [row[0] for row in csv.reader(csv_file) if row]
    return np.array(id_list)

#dummy_train_df = train_df_xy
#y_train = dummy_train_df.pop('fake')

#x1_test = np.array(train1_df,dtype = float)
#y1_test = np.array(train1_label,dtype = int)
#y_train = np.array(y_train,dtype = int)
#x_train = np.array(dummy_train_df,dtype = float)
#x_test = np.array(dummy_test_df,dtype = float)
#id_test = get_id(PATH_TEST)


# Simple preprocessing: work on the processed training frame, which carries
# the 'fake' label column.
test_date = train_df_xy#pd.concat([data['label'], data.iloc[:, 7:10]], axis=1)
'''

Out[25]: 
   label  max_date_diff  max_pay  cnt_time
0      0           23.0  43068.0        15
1      0           10.0   1899.0         2
2      0          146.0   3299.0        21
3      0           30.0  31959.0        35
4      0            3.0  24165.0        98
test_date['label'][test_date['label']==0].count()/test_date['label'][test_date['label']==1].count()
Out[37]: 67'''

# Keep only the target class (rows whose 'fake' label is 1).
aimed_date = test_date[test_date['fake'] == 1]  # 109 rows in the author's data
# ---------------------------------------------------------------------------
# Randomly pick the minority samples used as expansion centres: a 10% sample
# of the index labels, with a fixed seed so the selection is reproducible.
index = pd.DataFrame(aimed_date.index).sample(frac=0.1, random_state=1)

index.columns = ['stockcode']

number = len(index)

# Select the sampled rows by index label. `.ix` was removed in pandas 1.0;
# `.loc` is the label-based indexer.
aimed_date_new = aimed_date.loc[index.values.ravel(), :]

# Feature standardisation is disabled here because the columns were already
# standardised during preprocessing.
#sc = StandardScaler().fit(aimed_date_new)
#aimed_date_new = pd.DataFrame(sc.transform(aimed_date_new))
#print(aimed_date_new)
#sc1 = StandardScaler().fit(aimed_date)
#aimed_date = pd.DataFrame(sc1.transform(aimed_date))
# Euclidean distance over every value passed in. The distance-matrix loop in
# this file passes whole rows, so the label column is part of the distance.
def dist(a, b):
    """Return the Euclidean distance between two equal-length vectors.

    Works for any number of dimensions instead of exactly 35.

    Args:
        a: Sequence, array or pandas Series of numeric values.
        b: Same kind of object with the same number of values as ``a``.

    Returns:
        The distance as a NumPy float.

    Raises:
        ValueError: If ``a`` and ``b`` do not hold the same number of values.
    """
    vec_a = np.asarray(a, dtype=float).ravel()
    vec_b = np.asarray(b, dtype=float).ravel()
    # Refuse mismatched lengths explicitly so NumPy broadcasting cannot turn
    # them into a silently wrong result.
    if vec_a.shape != vec_b.shape:
        raise ValueError(
            'dist() needs vectors of equal length, got %d and %d'
            % (vec_a.size, vec_b.size)
        )
    return np.sqrt(np.sum((vec_a - vec_b) ** 2))

#print('aimed_date_new.iloc[:, 0]',aimed_date_new.iloc[:, 0])
# Number of sampled centres and of minority samples (non-null count of the
# first column of each frame).
row_l1 = aimed_date_new.iloc[:, 0].count()
row_l2 = aimed_date.iloc[:, 0].count()

a = zeros((row_l1, row_l2))  # 11 rows x 109 columns in the author's run
a = pd.DataFrame(a)

# Distance matrix: a[i, j] is the distance from sampled centre i to minority
# sample j.
for i in range(row_l1):
    for j in range(row_l2):
        d = dist(aimed_date_new.iloc[i, :], aimed_date.iloc[j, :])
        # `.ix` was removed in pandas 1.0; `a` has a default integer index on
        # both axes, so positional `.iloc` addresses the same cell.
        a.iloc[i, j] = d

# Smallest distance in every row of `a`, i.e. one value per sampled centre.
b = a.T.apply(lambda x:x.min())
# NOTE(review): every centre was drawn from `aimed_date`, so its distance to
# itself is 0 and this minimum presumably always selects the centre itself
# rather than a true neighbour -- confirm whether self-matches should be
# excluded.

# Record the (centre, sample) positions whose distance equals the row minimum.
h = []
z = []
#print(b)
#number=11
#len(a.iloc[i,:]=109)

for i in range(number):
    for j in range(len(a.iloc[i, :])):
        ai = a.iloc[i, j]
        bi = b[i]
        if ai == bi:
            h.append(i)
            z.append(j)

# Collect the matched rows as columns. The zero column is only a seed for the
# concatenation and is dropped again afterwards.
new_point = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
new_point = pd.DataFrame(new_point)
for i in range(len(h)):
    index_a = z[i]
    new = aimed_date.iloc[index_a, :]
    # Each match is prepended, so the columns end up in reverse match order.
    new_point = pd.concat([new, new_point], axis=1)

# Drop the last column (the zero seed).
new_point = new_point.iloc[:, range(len(new_point.columns) - 1)]
import random
# One random interpolation factor in [0, 1] per matched column.
r1 = []
for i in range(len(new_point.columns)):
    r1.append(random.uniform(0, 1))
new_point_last = pd.DataFrame([])
# New point: new_x = old_x + rand() * (append_x - old_x)
for i in range(len(new_point.columns)):
    # Column i is paired with centre `number - 1 - i` because of the reversed
    # column order above.
    # NOTE(review): this pairing assumes exactly one match per centre; with
    # tied minima len(h) exceeds `number` and the pairing shifts -- confirm.
    new_x = (new_point.iloc[0:35, i] - aimed_date_new.iloc[number - 1 - i, 0:35]) * r1[i] + aimed_date_new.iloc[number - 1 - i, 0:35]
    new_point_last = pd.concat([new_point_last, new_x], axis=1)

# Transpose so that every generated point is one row.
guodu_df = new_point_last.T

# Select the rows labelled 0.
# NOTE(review): presumably relies on the generated Series being unnamed so
# that concat labels every column 0 -- verify against the pandas version used.
add_df = guodu_df.loc[0,:]

m = len(add_df)
# Append the generated rows to the output CSV. newline='' lets the csv module
# control line endings (Linux: \n, Windows: \r\n, old Mac: \r).
with open('./添加的SMOTE数据.csv', "a+", newline='') as file:
    writer = csv.writer(file)
    for i in range(m):
        writer.writerow(add_df.iloc[i,:])


# Disabled alternative writer, kept as a bare string literal so it never runs:
# it would overwrite the output CSV with a header row followed by the rows of
# add_df instead of appending to the existing file.
'''
csvFile2 = open('添加的SMOTE数据.csv', 'w', newline='')  # 设置newline,否则两行之间会空一行
writer = csv.writer(csvFile2)
writer.writerow(['Sale_grop_ratio', 'Delta_Sale_grop_ratio', 'A_to_L_ratio','Curr_ratio','Quick_ratio','CA_ratio',
                 'ROE','ROA','Cash_to_Reve','Gp_growth rate','Reve_growth_rate','NetCash_OA_growth_rate',
                 'Sale_pro_ratio','AR_TO_ratio','Inv_TO_ratio','TA_TO_ratio','OtherRec_to_Cur','Pre_np',
                 'Pre_to_Cur','Opinion','Neg_Dednp_times','AR_to_Reve','TReve_to_np','Intangible_ass_ratio',
                 'Monetary_to_Cur','Inventory_to_Cur','AR_to_Cur','Nonb_to_np','Np_growth_rate','Mainb_pro_ratio',
                 'Mainb_gp_ratio','CIP_ratio','Hold_Top5','Invest_to_NA','fake'])
m = len(add_df)
print('文件生成中...')
for i in range(m):
    writer.writerow(add_df.iloc[i,:])
csvFile2.close()'''

你可能感兴趣的:(机器学习分类—SMOTE算法)