先把代码贴在这里,原理之后再补充;训练数据的背景仍是之前二分类博客里的第五题。
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from numpy import *
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
import pandas as pd
import csv
from pandas import Series, DataFrame
# 导入数据集切割训练与测试数据
%matplotlib inline
# Locations of the raw data; both readers below use these constants.
PATH_TEST = './data/FT_Camp_5/X_test.csv'
PATH_TRAIN = './data/FT_Camp_5/Train.csv'

# --- Data preprocessing ---
# The first CSV column becomes the row index of each frame.
train_df = pd.read_csv(PATH_TRAIN, index_col=0)
test_df = pd.read_csv(PATH_TEST, index_col=0)
print(test_df.columns)

# Split the label off, then process train and test features together so both
# receive the same imputation and scaling.
y_train = train_df.pop('fake')
all_df = pd.concat((train_df, test_df), axis=0)

# Fill missing values with the column mean. numeric_only=True keeps this
# working when the frame contains object columns (recent pandas versions no
# longer drop them silently).
mean_cols = all_df.mean(numeric_only=True)
all_df = all_df.fillna(mean_cols)
# Notebook-style sanity check: total number of remaining missing values.
# The value is not stored or printed here.
all_df.isnull().sum().sum()

# Standardize every non-object column to zero mean and unit variance.
numeric_cols = all_df.columns[all_df.dtypes != 'object']
numeric_col_means = all_df.loc[:, numeric_cols].mean()
numeric_col_std = all_df.loc[:, numeric_cols].std()
# Plain column assignment replaces the columns, so integer columns become
# float instead of having floats written into an integer block.
all_df[numeric_cols] = (all_df.loc[:, numeric_cols] - numeric_col_means) / numeric_col_std

# Split back into train / test by index label and re-attach the label column.
# NOTE(review): this assumes train and test index labels do not overlap -- confirm.
dummy_train_df = all_df.loc[train_df.index]
dummy_test_df = all_df.loc[test_df.index]
train_df_xy = pd.concat((dummy_train_df, y_train), axis=1)
#print(train_df_xy.iloc[1259:1262,:])
def get_id(path):
    """Collect the first field of every row of a CSV file.

    The header row, if the file has one, is not treated specially: its first
    field is returned as the first element. Completely blank lines are skipped.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        One-dimensional array holding the first field of each non-empty row,
        in file order.
    """
    # The context manager guarantees the handle is closed; newline='' is the
    # mode the csv module expects for files it parses.
    with open(path, newline='') as csv_file:
        id_list = [row[0] for row in csv.reader(csv_file) if row]
    return np.array(id_list)
#dummy_train_df = train_df_xy
#y_train = dummy_train_df.pop('fake')
#x1_test = np.array(train1_df,dtype = float)
#y1_test = np.array(train1_label,dtype = int)
#y_train = np.array(y_train,dtype = int)
#x_train = np.array(dummy_train_df,dtype = float)
#x_test = np.array(dummy_test_df,dtype = float)
#id_test = get_id(PATH_TEST)
#简单的预处理
# Work on the standardized training frame that still carries the 'fake' label.
test_date = train_df_xy  # pd.concat([data['label'], data.iloc[:, 7:10]], axis=1)

# Output of an earlier exploratory session, kept for reference only:
#   Out[25]:
#      label  max_date_diff  max_pay  cnt_time
#   0      0           23.0  43068.0        15
#   1      0           10.0   1899.0         2
#   2      0          146.0   3299.0        21
#   3      0           30.0  31959.0        35
#   4      0            3.0  24165.0        98
#   test_date['label'][test_date['label']==0].count()/test_date['label'][test_date['label']==1].count()
#   Out[37]: 67

# Keep only the rows labelled fake == 1 (109 rows according to the original
# author's note).
aimed_date = test_date[test_date['fake'] == 1]

# ---------------------------------------------------------------------------
# Randomly pick 10% of those rows as the seeds for oversampling. The fixed
# random_state makes the selection reproducible.
index = pd.DataFrame(aimed_date.index).sample(frac=0.1, random_state=1)
index.columns = ['stockcode']
number = len(index)

# Look the sampled index labels up in the minority frame. `.ix` no longer
# exists in pandas; `.loc` is the label-based accessor.
aimed_date_new = aimed_date.loc[index.values.ravel(), :]
# 自变量标准化#由于我的变量已经进行过标准化,所以这里隐去
#sc = StandardScaler().fit(aimed_date_new)
#aimed_date_new = pd.DataFrame(sc.transform(aimed_date_new))
#print(aimed_date_new)
#sc1 = StandardScaler().fit(aimed_date)
#aimed_date = pd.DataFrame(sc1.transform(aimed_date))
# 定义欧式距离计算#所有类别,包括标签在内得欧式距离
def dist(a, b):
    """Return the Euclidean distance between two equal-length vectors.

    Every element of both vectors takes part in the distance, so when complete
    rows of a labelled frame are passed in, the label column is included too.

    Parameters
    ----------
    a, b : array-like of numbers
        One-dimensional vectors of the same length (any length).

    Returns
    -------
    numpy.float64
        ``sqrt(sum((a - b) ** 2))``.

    Raises
    ------
    ValueError
        If the two vectors do not have the same shape.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            'dist() needs vectors of equal shape, got %s and %s' % (a.shape, b.shape)
        )
    diff = a - b
    return np.sqrt(np.sum(diff * diff))
#print('aimed_date_new.iloc[:, 0]',aimed_date_new.iloc[:, 0])
# 统计所有检验距离样本个数
# Number of seed rows and of minority-class rows (non-null entries of the
# first column of each frame).
row_l1 = aimed_date_new.iloc[:, 0].count()
row_l2 = aimed_date.iloc[:, 0].count()

# Distance from every seed (rows) to every minority sample (columns).
# Filled as a NumPy array first; `.ix` item assignment no longer exists in
# pandas.
distances = np.zeros((row_l1, row_l2))
for i in range(row_l1):
    seed_row = aimed_date_new.iloc[i, :]
    seed_label = aimed_date_new.index[i]
    for j in range(row_l2):
        if aimed_date.index[j] == seed_label:
            # The seeds were drawn from aimed_date, so each seed meets itself
            # here at distance 0. Mask that entry, otherwise the "nearest
            # neighbour" is always the seed itself and interpolating towards
            # it only reproduces the seed.
            # NOTE(review): assumes index labels identify samples uniquely -- confirm.
            distances[i, j] = np.inf
            continue
        distances[i, j] = dist(seed_row, aimed_date.iloc[j, :])
a = pd.DataFrame(distances)

# Smallest distance found for each seed (one value per row of `a`).
b = a.min(axis=1)
# 找到同类点位置
# Locate, for each of the first `number` seeds, every column of `a` whose
# distance equals that seed's minimum in `b`.
#   h[k] -> row of the k-th hit in `a` (position of the seed)
#   z[k] -> column of the k-th hit in `a` (position of the matching sample)
# Hits are listed seed by seed, columns ascending. A seed with tied minima
# contributes several hits.
hit_mask = a.iloc[:number, :].values == b.iloc[:number].values[:, None]
hit_rows, hit_cols = np.nonzero(hit_mask)
h = hit_rows.tolist()
z = hit_cols.tolist()
# Put the matched rows of aimed_date side by side, one column per hit.
# The hits are laid out in reverse order: column k holds the row of hit
# len(z) - 1 - k.
neighbour_rows = [aimed_date.iloc[index_a, :] for index_a in reversed(z)]
if neighbour_rows:
    new_point = pd.concat(neighbour_rows, axis=1)
else:
    # No hit at all: keep a frame without columns so later loops do nothing.
    new_point = pd.DataFrame()
import random

# One random interpolation factor per matched neighbour column.
hit_count = len(new_point.columns)
r1 = [random.uniform(0, 1) for _ in range(hit_count)]

new_point_last = pd.DataFrame([])
# Number of values per sample (all columns of the seed frame).
n_values = aimed_date_new.shape[1]

# New sample: new_x = old_x + rand() * (neighbour_x - old_x)
for i in range(hit_count):
    # Column i of new_point belongs to hit hit_count - 1 - i (the columns are
    # in reverse hit order); h gives the seed that produced that hit. Looking
    # the seed up through h keeps each neighbour paired with its own seed even
    # when a seed has several tied hits.
    seed = aimed_date_new.iloc[h[hit_count - 1 - i], 0:n_values]
    neighbour = new_point.iloc[0:n_values, i]
    new_x = (neighbour - seed) * r1[i] + seed
    # Appended column by column, as in the original flow.
    new_point_last = pd.concat([new_point_last, new_x], axis=1)
# Transpose so that every generated sample is one row.
guodu_df = new_point_last.T
# Export every generated row. Selecting rows by the label 0 only works when
# all generated columns happen to be labelled 0, and fails otherwise.
add_df = guodu_df
m = len(add_df)
# "a+" appends, so running the script again adds the rows a second time.
# newline='' lets the csv module control line endings on every platform
# (Linux: \n, Windows: \r\n, classic Mac: \r).
with open('./添加的SMOTE数据.csv', "a+", newline='') as file:
    writer = csv.writer(file)
    writer.writerows(add_df.values.tolist())
'''
csvFile2 = open('添加的SMOTE数据.csv', 'w', newline='') # 设置newline,否则两行之间会空一行
writer = csv.writer(csvFile2)
writer.writerow(['Sale_grop_ratio', 'Delta_Sale_grop_ratio', 'A_to_L_ratio','Curr_ratio','Quick_ratio','CA_ratio',
'ROE','ROA','Cash_to_Reve','Gp_growth rate','Reve_growth_rate','NetCash_OA_growth_rate',
'Sale_pro_ratio','AR_TO_ratio','Inv_TO_ratio','TA_TO_ratio','OtherRec_to_Cur','Pre_np',
'Pre_to_Cur','Opinion','Neg_Dednp_times','AR_to_Reve','TReve_to_np','Intangible_ass_ratio',
'Monetary_to_Cur','Inventory_to_Cur','AR_to_Cur','Nonb_to_np','Np_growth_rate','Mainb_pro_ratio',
'Mainb_gp_ratio','CIP_ratio','Hold_Top5','Invest_to_NA','fake'])
m = len(add_df)
print('文件生成中...')
for i in range(m):
writer.writerow(add_df.iloc[i,:])
csvFile2.close()'''