BP网络识别钓鱼网站

BP网络识别钓鱼网站的应用

声明
(1)该博文为个人的学习总结以及学习成果的应用,具体引用的资料请看参考文献。
(2)数据来源:https://archive.ics.uci.edu/ml/datasets/Website+Phishing

代码
1、定义相关函数

import numpy as np

# sigmoid函数
def sig(x):
    """Apply the logistic sigmoid ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A scalar or a NumPy array/matrix of pre-activation values.

    Returns:
        The sigmoid of every element of ``x``.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)

# sigmoid函数的一阶导数
def partial_sig(x):
    """Return the element-wise first derivative of the sigmoid at ``x``.

    Uses the identity ``sig'(x) = sig(x) * (1 - sig(x))``.

    Args:
        x: NumPy array or matrix of pre-activation values; it must expose a
            ``.shape`` attribute.

    Returns:
        The derivative evaluated at every element of ``x``.
    """
    activation = sig(x)
    complement = np.ones(x.shape) - activation
    # np.multiply is the element-wise product, also for np.matrix operands
    # (where the ``*`` operator would be a matrix product instead).
    return np.multiply(activation, complement)
 
# 计算隐含层的输入
def hidden_in(feature, w0, b0):
    """Compute the input of the hidden layer: ``feature * w0 + b0``.

    ``*`` is a matrix product when either operand is an ``np.matrix`` (the
    weights created by ``bp_train`` are); with plain ndarrays on both sides
    it is an element-wise product.

    Args:
        feature: Input samples.
        w0: Input-to-hidden weights.
        b0: Hidden-layer bias, broadcast over the rows of the product.

    Returns:
        The weighted sum fed into the hidden-layer activation.
    """
    weighted = feature * w0
    return weighted + b0

# 计算隐含层的输出
def hidden_out(hidden_in):
    """Compute the hidden layer's output by applying the sigmoid activation.

    Args:
        hidden_in: Hidden-layer input values. Inside this body the parameter
            name shadows the module-level ``hidden_in`` function.

    Returns:
        ``sig(hidden_in)``.
    """
    activated = sig(hidden_in)
    return activated

# 计算输出层的输入
def predict_in(hidden_out, w1, b1):
    """Compute the input of the output layer: ``hidden_out * w1 + b1``.

    ``*`` is a matrix product when either operand is an ``np.matrix``; with
    plain ndarrays on both sides it is an element-wise product.

    Args:
        hidden_out: Activations of the hidden layer. Inside this body the
            parameter name shadows the module-level ``hidden_out`` function.
        w1: Hidden-to-output weights.
        b1: Output-layer bias, broadcast over the rows of the product.

    Returns:
        The weighted sum fed into the output-layer activation.
    """
    weighted = hidden_out * w1
    return weighted + b1

def predict_out(predict_in):
    """Compute the network output by applying the sigmoid activation.

    Args:
        predict_in: Output-layer input values. Inside this body the parameter
            name shadows the module-level ``predict_in`` function.

    Returns:
        ``sig(predict_in)``.
    """
    activated = sig(predict_in)
    return activated

模型训练函数

def bp_train(feature, label, n_hidden, maxCycle, alpha, n_output):
    """Train a single-hidden-layer BP (back-propagation) network.

    Both layers use the sigmoid activation and the whole data set is used
    for every update (batch gradient descent on the squared error).

    Args:
        feature: 2-D sample matrix of shape (m, n).
        label: Target matrix; it is subtracted from the (m, n_output)
            network output, so it must be compatible with that shape.
        n_hidden: Number of hidden units.
        maxCycle: Integer index of the last iteration. The loop runs for
            ``i = 0 .. maxCycle`` inclusive, i.e. ``maxCycle + 1`` updates.
        alpha: Learning rate.
        n_output: Number of output units.

    Returns:
        Tuple ``(w0, w1, b0, b1, history_cost)``: input-to-hidden weights,
        hidden-to-output weights, hidden bias, output bias, and the list of
        costs (as computed by ``get_cost``) recorded after every update.
    """
    m, n = np.shape(feature)

    # 1. Initialise weights and biases from N(0, 1) with a fixed seed so runs
    # are reproducible. np.asmatrix replaces np.mat, which was removed in
    # NumPy 2.0; both produce the same np.matrix, so ``*`` below is a matrix
    # product.
    np.random.seed(2019)
    w0 = np.asmatrix(np.random.normal(0, 1, size=(n, n_hidden)))
    b0 = np.asmatrix(np.random.normal(0, 1, size=(1, n_hidden)))
    w1 = np.asmatrix(np.random.normal(0, 1, size=(n_hidden, n_output)))
    b1 = np.asmatrix(np.random.normal(0, 1, size=(1, n_output)))

    # 2. Training loop.
    history_cost = []
    for i in range(maxCycle + 1):
        # 2.1 Forward pass.
        hidden_input = hidden_in(feature, w0, b0)
        hidden_output = hidden_out(hidden_input)
        output_in = predict_in(hidden_output, w1, b1)
        output_out = predict_out(output_in)

        # 2.2 Back-propagate the error through both sigmoid layers.
        delta_output = -np.multiply((label - output_out), partial_sig(output_in))
        delta_hidden = np.multiply((delta_output * w1.T), partial_sig(hidden_input))

        # 2.3 Gradient step. NOTE(review): the bias gradients are averaged
        # over the m samples while the weight gradients are summed; kept
        # as-is to preserve the training behaviour.
        w1 = w1 - alpha * (hidden_output.T * delta_output)
        b1 = b1 - alpha * np.sum(delta_output, axis=0) * (1.0 / m)
        w0 = w0 - alpha * (feature.T * delta_hidden)
        b0 = b0 - alpha * np.sum(delta_hidden, axis=0) * (1.0 / m)

        # Evaluate the updated network once and reuse the value for both the
        # history and the progress log (the log shows half of the cost).
        cost = get_cost(get_predict(feature, w0, w1, b0, b1) - label)
        history_cost.append(cost)
        if i % 100 == 0:
            print("\t-------- iter: {:5}".format(i),
                  " ,cost: {:<.20}".format((1.0/2) * cost), "--------")
    return w0, w1, b0, b1, history_cost

计算误差,预测,准确性的相关函数

# 计算cost
def get_cost(cost):
    """Return the sum of squared errors divided by the number of rows.

    Args:
        cost: Array or matrix of residuals (prediction minus label).

    Returns:
        ``sum(cost ** 2) / cost.shape[0]`` as a scalar.
    """
    squared = np.multiply(cost, cost)  # element-wise square
    n_rows = np.shape(cost)[0]
    return np.sum(squared) / n_rows

# 预测
def get_predict(feature, w0, w1, b0, b1):
    """Run a forward pass and return the output-layer activations.

    Args:
        feature: Input samples.
        w0: Input-to-hidden weights.
        w1: Hidden-to-output weights.
        b0: Hidden-layer bias.
        b1: Output-layer bias.

    Returns:
        The sigmoid outputs of the network for ``feature``.
    """
    hidden_activation = hidden_out(hidden_in(feature, w0, b0))
    output_input = predict_in(hidden_activation, w1, b1)
    return predict_out(output_input)

# 计算准确率
def accuracy(label, pre):
    """Return the fraction of samples whose prediction equals the label.

    The original implementation indexed ``label[i, 0]`` and therefore raised
    IndexError for 1-D inputs such as ``np.argmax(ndarray, axis=1)``. Both
    layouts are accepted now.

    Args:
        label: True class indices, either 1-D (length m) or 2-D with the
            classes in column 0 (e.g. the (m, 1) result of ``np.argmax`` on
            an ``np.matrix``).
        pre: Predicted class indices in either layout. Only the first m
            entries are compared.

    Returns:
        float: number of matching entries divided by m.

    Raises:
        IndexError: If ``pre`` holds fewer entries than ``label``.
        ZeroDivisionError: If ``label`` is empty.
    """
    def _class_column(values):
        # Reduce a 1-D vector or a 2-D column layout to a flat 1-D array.
        arr = np.asarray(values)
        return arr if arr.ndim == 1 else arr[:, 0]

    truth = _class_column(label)
    guess = _class_column(pre)
    m = truth.shape[0]
    if guess.shape[0] < m:
        raise IndexError("pre has fewer entries than label")
    hits = np.count_nonzero(truth == guess[:m])
    return float(hits) / m

数据预处理

import pandas as pd
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Load the phishing data set and shift the "Result" column by +1 so that its
# values can be used as class indices by to_categorical below.
data = pd.read_csv("E:/data/PhishingData.csv")
data["Result"] = data["Result"] + 1

# Features are every column except "Result"; labels are one-hot encoded into
# 3 classes.
x_data = np.array(data.drop(columns=["Result"]))
y_data = np.array(data["Result"])
Y_data = to_categorical(y_data, 3)

# Hold out 20% of the samples for evaluation, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, Y_data, test_size=0.2, random_state=2019
)

主函数

if __name__ == "__main__":
    # Step 1: train the network on the training split.
    print("--------- 1.training ------------")
    w0, w1, b0, b1, hist = bp_train(
        x_train, y_train, n_hidden=100, maxCycle=1000, alpha=0.001, n_output=3
    )

    # Step 2: predict on the held-out split and compare the arg-max classes.
    # NOTE: the printed label reads "training accuracy", but the score is
    # computed on x_test / y_test.
    print("--------- 2.get prediction ------------")
    result = get_predict(x_test, w0, w1, b0, b1)
    true_classes = np.argmax(y_test, axis=1)
    predicted_classes = np.argmax(result, axis=1)
    print("训练准确性为:", accuracy(true_classes, predicted_classes))

结果
BP网络识别钓鱼网站_第1张图片

绘制cost变化图

import matplotlib.pyplot as plt
# Plot the cost recorded after every training iteration (hist comes from
# bp_train) against the iteration index.
epochs = np.arange(len(hist))
plt.plot(epochs, hist)
plt.title("history cost")
plt.xlabel("epoch")
plt.ylabel("cost")
plt.show()

如下图所示:
BP网络识别钓鱼网站_第2张图片

由于本模型主要用于识别钓鱼网站,下面单独取出数据集中标签为钓鱼网站(即 Result 加 1 后等于 0)的样本进行预测,查看模型对这一类别的识别准确性。

测试代码

# Evaluate the trained network only on the rows whose shifted "Result" is 0
# (per the surrounding article, the samples labelled as phishing sites).
test = data[data["Result"] == 0]

x_test_1 = np.array(test.drop(columns=["Result"]))
y_test_1 = to_categorical(np.array(test["Result"]), 3)

result_1 = get_predict(x_test_1, w0, w1, b0, b1)
true_classes_1 = np.argmax(y_test_1, axis=1)
predicted_classes_1 = np.argmax(result_1, axis=1)
print("训练准确性为:", accuracy(true_classes_1, predicted_classes_1))

结果
>>训练准确性为: 0.9173789173789174

参考文献:
[1]:赵志勇.Python机器学习算法[M].北京:电子工业出版社,2017.07

你可能感兴趣的:(个人学习应用)