import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Feature column names of the raw CSV; the first CSV column is the class
# label ('target'), followed by 22 categorical features x1..x22.
_FEATURES = ['x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x10',
             'x11', 'x12', 'x13', 'x14', 'x15', 'x16', 'x17', 'x18', 'x19',
             'x20', 'x21', 'x22']


def load_data():
    """Load the raw dataset and return it as a DataFrame.

    Returns:
        pd.DataFrame with columns ['target', 'x1', ..., 'x22'].
    """
    # Raw string: backslashes in the Windows path must not be interpreted
    # as escape sequences ('\d' and '\l' are invalid escapes otherwise).
    data = pd.read_csv(r'D:\data\lgdata.csv', index_col=False, header=None,
                       names=['target'] + _FEATURES)
    return data


def deal_feature(data):
    """Feature engineering: impute missing 'x11' values, then encode.

    Rows whose 'x11' equals '?' are treated as missing; a 5-NN classifier
    trained on the remaining rows (one-hot encoded) predicts a replacement
    value. Afterwards every feature is one-hot encoded and the target is
    label-encoded.

    Args:
        data: raw DataFrame as returned by ``load_data``.
    Returns:
        (x, y): x is the one-hot encoded feature DataFrame, y is a 1-D
        ndarray of integer class labels.
    """
    other_cols = ['target'] + [c for c in _FEATURES if c != 'x11']
    # Split into the rows that need imputation and the rows that train it.
    missing = data.loc[data['x11'] == '?']
    known = data.loc[data['x11'] != '?']

    # One-hot encode so KNN can operate on the categorical columns.
    train_x = pd.get_dummies(known.loc[:, other_cols])
    x11_le = LabelEncoder()
    # Pass a 1-D Series, not a single-column DataFrame, so LabelEncoder
    # gets the shape it expects (avoids a data-conversion warning).
    train_y = x11_le.fit_transform(known['x11'])

    miss_sub = missing.loc[:, other_cols]
    miss_x = pd.get_dummies(miss_sub)

    # Because the split is not random, some category levels appear in only
    # one subset, so the dummy columns differ; restrict both matrices to
    # their shared columns. (Index.intersection replaces the '&' operator,
    # which was removed in pandas 2.0.)
    shared = train_x.columns.intersection(miss_x.columns)
    train_x = train_x.loc[:, shared]
    miss_x = miss_x.loc[:, shared]

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(train_x, train_y)
    # Predict the encoded x11 labels and map them back to their strings.
    x11_pred = x11_le.inverse_transform(knn.predict(miss_x))

    # Re-insert the imputed column at its original position (after x10).
    miss_sub.insert(11, 'x11', x11_pred)
    # pd.concat replaces DataFrame.append, removed in pandas 2.0.
    full = pd.concat([known, miss_sub])

    # Label-encode the target, one-hot encode every feature column.
    target_le = LabelEncoder()
    y = target_le.fit_transform(full['target'].values)
    x = pd.get_dummies(full[_FEATURES])
    return x, y


def deal_pca(data, n_components=70):
    """Project ``data`` onto its first ``n_components`` principal components.

    Args:
        data: 2-D array-like of shape (n_samples, n_features); must have at
            least ``n_components`` features.
        n_components: number of components to keep (default 70, matching
            the original pipeline).
    Returns:
        ndarray of shape (n_samples, n_components).
    """
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(data)
    print(pca.explained_variance_)
    return reduced


def test_L(x, y):
    """Fit an L1-regularised logistic regression and print its test accuracy.

    Splits (x, y) 70/30 with a fixed seed, prints the split shapes, fits the
    model, and prints the accuracy on the held-out 30%.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, train_size=0.7, random_state=2)
    print(x_train.shape)
    print(x_test.shape)
    # 'liblinear' is required here: the default 'lbfgs' solver does not
    # support the L1 penalty (scikit-learn >= 0.22 raises otherwise).
    lr = LogisticRegression(C=1.0, penalty='l1', tol=0.01, solver='liblinear')
    lr.fit(x_train, y_train)
    print(lr.score(x_test, y_test))


if __name__ == '__main__':
    # Feature engineering, then a baseline on the full one-hot feature set.
    x, y = deal_feature(load_data())
    test_L(x, y)
    # Repeat the evaluation after PCA dimensionality reduction.
    x = deal_pca(x)
    test_L(x, y)