svm算法python代码_SVM算法之代码实现

import csv

import numpy as np

import matplotlib.pyplot as plt

import copy

from time import sleep

import random

import types

def loadDataset(filename):

with open(filename, 'r') as f:

lines = csv.reader(f)

data_set = list(lines)

if filename != 'titanic.csv':

for i in range(len(data_set)):

del(data_set[i][0])

# 整理数据

for i in range(len(data_set)):

del(data_set[i][0])

del(data_set[i][2])

data_set[i][4] += data_set[i][5]

del(data_set[i][5])

del(data_set[i][5])

del(data_set[i][6])

del(data_set[i][-1])

category = data_set[0]

del (data_set[0])

# 转换数据格式

for data in data_set:

data[0] = int(data[0])

data[1] = int(data[1])

if data[3] != '':

data[3] = float(data[3])

else:

data[3] = None

data[4] = float(data[4])

data[5] = float(data[5])

# 补全缺失值 转换记录方式 分类

for data in data_set:

if data[3] is None:

data[3] = 28

# male : 1, female : 0

if data[2] == 'male':

data[2] = 1

else:

data[2] = 0

# 经过测试,如果不将数据进行以下处理,分布会过于密集,处理后,数据的分布变得稀疏了

# age <25 为0, 25<=age<31为1,age>=31为2

if data[3] < 25:

data[3] = 0

elif data[3] >= 21 and data[3] < 60: # 但是测试得60分界准确率最高???!!!

data[3] = 1

else:

data[3] = 2

# sibsp&parcg以2为界限,小于为0,大于为1

if data[4] < 2:

data[4] = 0

else:

data[4] = 1

# fare以64为界限

if data[-1] < 64:

data[-1] = 0

else:

data[-1] = 1

return data_set, category

def split_data(data):

data_set = copy.deepcopy(data)

data_mat = []

label_mat = []

for i in range(len(data_set)):

if data_set[i][0] == 0:

data_set[i][0] = -1

label_mat.append(data_set[i][0])

del(data_set[i][0])

data_mat.append(data_set[i])

print(data_mat)

print(label_mat)

return data_mat, label_mat

def select_j_rand(i ,m):

# 选取alpha

j = i

while j == i:

j = int(random.uniform(0, m))

return j

def clip_alptha(aj, H, L):

# 修剪alpha

if aj > H:

aj = H

if L > aj:

aj = L

return aj

def smo(data_mat_In, class_label, C, toler, max_iter):

# 转化为numpy的mat存储

data_matrix = np.mat(data_mat_In)

label_mat = np.mat(class_label).transpose()

# data_matrix = data_mat_In

# label_mat = class_label

# 初始化b,统计data_matrix的纬度

b = 0

m, n = np.shape(data_matrix)

# 初始化alpha,设为0

alphas = np.mat(np.zeros((m, 1)))

# 初始化迭代次数

iter_num = 0

# 最多迭代max_iter次

while iter_num < max_iter:

alpha_pairs_changed = 0

for i in range(m):

# 计算误差Ei

fxi = float(np.multiply(alphas, label_mat).T*(data_matrix*data_matrix[i, :].T)) + b

Ei = fxi - float(label_mat[i])

# 优化alpha,松弛向量

if (label_mat[i]*Ei < -toler and alphas[i] < C) or (label_mat[i]*Ei > toler and alphas[i] > 0):

# 随机选取另一个与alpha_j成对优化的alpha_j

j = select_j_rand(i, m)

# 1.计算误差Ej

fxj = float(np.multiply(alphas, label_mat).T*(data_matrix*data_matrix[j, :].T)) + b

Ej = fxj - float(label_mat[j])

# 保存更新前的alpha,deepcopy

alpha_i_old = copy.deepcopy(alphas[i])

alpha_j_old = copy.deepcopy(alphas[j])

# 2.计算上下界L和H

if label_mat[i] != label_mat[j]:

L = max(0, alphas[j] - alphas[i])

H = min(C, C + alphas[j] - alphas[i])

else:

L = max(0, alphas[j] + alphas[i] - C)

H = min(C, alphas[j] + alphas[i])

if L == H:

print("L == H")

continue

# 3.计算eta

eta = 2.0 * data_matrix[i, :]*data_matrix[j, :].T - data_matrix[i, :]*data_matrix[i, :].T - data_matrix[j, :]*data_matrix[j, :].T

if eta >= 0:

print("eta >= 0")

continue

# 4.更新alpha_j

alphas[j] -= label_mat[j]*(Ei - Ej)/eta

# 5.修剪alpha_j

alphas[j] = clip_alptha(alphas[j], H, L)

if abs(alphas[j] - alphas[i]) < 0.001:

print("alpha_j变化太小")

continue

# 6.更新alpha_i

alphas[i] += label_mat[j]*label_mat[i]*(alpha_j_old - alphas[j])

# 7.更新b_1和b_2

b_1 = b - Ei - label_mat[i]*(alphas[i] - alpha_i_old)*data_matrix[i, :]*data_matrix[i, :].T - label_mat[j]*(alphas[j] - alpha_j_old)*data_matrix[i, :]*data_matrix[j, :].T

b_2 = b - Ej - label_mat[i]*(alphas[i] - alpha_i_old)*data_matrix[i, :]*data_matrix[j, :].T - label_mat[j]*(alphas[j] - alpha_j_old)*data_matrix[j, :] * data_matrix[j, :].T

# 8.根据b_1和b_2更新b

if 0 < alphas[i] and C > alphas[i]:

b = b_1

elif 0 < alphas[j] and C > alphas[j]:

b = b_2

else:

b = (b_1 + b_2)/2

# 统计优化次数

alpha_pairs_changed += 1

# 打印统计信息

print("第%d次迭代 样本:%d, alpha优化次数:%d" % (iter_num, i, alpha_pairs_changed))

# 更新迭代次数

if alpha_pairs_changed == 0:

iter_num += 1

else:

iter_num = 0

print("迭代次数:%d" % iter_num)

return b, alphas

def caluelate_w(data_mat, label_mat, alphas):

# 计算w

alphas = np.array(alphas)

data_mat = np.array(data_mat)

label_mat = np.array(label_mat)

# numpy.tile(A, reps):通过重复A给出的次数来构造数组。

# numpy中reshape函数的三种常见相关用法

# reshape(1, -1)转化成1行:

# reshape(2, -1)转换成两行:

# reshape(-1, 1)转换成1列:

# reshape(-1, 2)转化成两列

w = np.dot((np.tile(label_mat.reshape(1, -1).T, (1, 5))*data_mat).T, alphas)

return w.tolist()

def prediction(test, w, b):

test = np.mat(test)

result = []

for i in test:

if i*w+b > 0:

result.append(1)

else:

result.append(-1)

print(result)

return result

if __name__ == "__main__":

test_set, category = loadDataset('titanic_test.csv')

data_set, category = loadDataset('titanic_train.csv')

test_mat, test_label = split_data(test_set)

data_mat, label_mat = split_data(data_set)

b, alphas = smo(data_mat, list(label_mat), 0.6, 0.001, 40)

print(b)

print(alphas)

w = caluelate_w(data_mat, label_mat, alphas)

print(w)

print(test_mat)

print(test_label)

result = prediction(test_mat, w, b)

count = 0

for i in range(len(result)):

if result[i] == test_label[i]:

count += 1

print(count)

print(count/len(result))

你可能感兴趣的:(svm算法python代码)