#encoding=utf8
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from io import StringIO
#构建感知机算法
class Perceptron(object):
    """Binary linear classifier trained with the classic perceptron rule.

    The decision function is ``sign(w . x + b)``.  Training labels may use
    any two-valued encoding (for example -1/+1 or 0/1): they are mapped to
    -1/+1 internally and predictions are reported in the original encoding.
    """

    def __init__(self, learning_rate = 0.01, max_iter = 200):
        self.lr = learning_rate
        self.max_iter = max_iter
        # (negative, positive) labels emitted by predict(); fit() replaces
        # this pair when the training labels use another binary encoding.
        self.classes_ = np.array([-1, 1])

    def fit(self, data, label):
        """Learn the weights ``self.w`` and the bias ``self.b``.

        Args:
            data: 2-D array-like of training features, one sample per row.
            label: 1-D array-like of training labels with at most two
                distinct values.  For two values the smaller one is the
                negative class and the larger one the positive class.

        Returns:
            None.  The learned parameters are stored on ``self.w`` (shape
            ``(n_features,)``) and ``self.b`` (shape ``(1,)``).

        Raises:
            ValueError: if ``label`` contains more than two distinct values.
        """
        data = np.asarray(data, dtype=float)
        label = np.asarray(label)

        classes = np.unique(label)
        if classes.size > 2:
            raise ValueError(
                "Perceptron is a binary classifier, got {0} classes".format(
                    classes.size))
        if set(classes.tolist()) <= {-1, 1}:
            # Labels already follow the -1/+1 convention (this also covers
            # empty input and a training set holding only one of the two).
            self.classes_ = np.array([-1, 1])
        elif classes.size == 2:
            self.classes_ = classes
        else:
            # A single class in another encoding: treat it as the positive
            # class and always predict it.
            self.classes_ = np.array([classes[0], classes[0]])

        # The update rule below only works with targets in {-1, +1}; a
        # target of 0 would make every update a no-op.
        signs = np.where(label == self.classes_[1], 1.0, -1.0)

        # Weights and bias both start at 1.
        self.w = np.ones(data.shape[1])
        self.b = np.ones(1)

        for _ in range(self.max_iter):
            has_wrong = False
            for x, y in zip(data, signs):
                # A sample is misclassified (or on the boundary) when its
                # signed margin is not strictly positive.
                if y * (self.w.dot(x) + self.b[0]) <= 0:
                    has_wrong = True
                    self.w += self.lr * y * x
                    self.b += self.lr * y
            if not has_wrong:
                # A full pass without mistakes: the data is separated.
                break
        return None

    def predict(self, data):
        """Predict the label of every row of ``data``.

        Args:
            data: 2-D array-like of features, one sample per row.

        Returns:
            1-D ndarray of labels in the encoding seen by ``fit``: the
            positive label where ``w . x + b >= 0``, the negative label
            elsewhere.
        """
        data = np.asarray(data, dtype=float)
        scores = data.dot(self.w) + self.b
        negative, positive = self.classes_
        return np.where(scores >= 0, positive, negative)
# 读入数据
def createDataSet():
    """Load the embedded watermelon data set.

    Returns:
        pandas.DataFrame with 17 rows and the columns 编号, 色泽, 根蒂, 敲声,
        纹理, 脐部, 触感 and 好瓜, parsed from the CSV text embedded below.
    """
    # The CSV rows must stay at column 0: any leading whitespace would become
    # part of the first field of each row.
    rawData = StringIO(
        """编号,色泽,根蒂,敲声,纹理,脐部,触感,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,是
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,是
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,是
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,是
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,是
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,是
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,是
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,是
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,否
10,青绿,硬挺,清脆,清晰,平坦,软粘,否
11,浅白,硬挺,清脆,模糊,平坦,硬滑,否
12,浅白,蜷缩,浊响,模糊,平坦,软粘,否
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,否
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,否
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,否
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,否
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,否
""")
    df = pd.read_csv(rawData, sep=",")
    return df
# 基于LabelEncoder的类别变量重编码
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Each selected column is encoded independently with its own fresh
    ``LabelEncoder``; nothing is remembered between calls.
    """

    def __init__(self, columns=None):
        # Names of the columns to encode; None means every column.
        self.columns = columns

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (encoders are fitted in transform)."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        Encodes the columns named in ``self.columns``, or every column of
        ``X`` when ``self.columns`` is None.  ``X`` itself is not modified.
        """
        output = X.copy()
        if self.columns is not None:
            columns = self.columns
        else:
            # Snapshot the column labels instead of relying on
            # DataFrame.iteritems(), which pandas 2.0 removed.
            columns = list(output.columns)
        for col in columns:
            output[col] = LabelEncoder().fit_transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Shorthand for ``fit(X, y).transform(X)``."""
        return self.fit(X, y).transform(X)
# Load the raw data set and show it.
df = createDataSet()
print(type(df))
# A bare `df` expression only renders in a notebook; print it so the data is
# also shown when this file runs as a script.
print(df)
# 为什么要对类别变量重新编码?
# 答:大多数机器学习算法要求输入和输出变量均为数值。因此,如果数据中包含类别型变量,必须先将其编码为数字,然后才能拟合和评估模型。
# Re-encode every categorical column (features and target) as integer codes.
df = MultiColumnLabelEncoder(columns=['色泽', '根蒂', '敲声', '纹理', '脐部', '触感', '好瓜']).fit_transform(df)
print(type(df))
# A bare `df` expression only renders in a notebook; print it explicitly.
print(df)

feature_names = ['色泽', '根蒂', '敲声', '纹理', '脐部', '触感']
# LabelEncoder numbers classes in sorted order, so '否' gets code 0 and '是'
# gets code 1; the display names below follow that order.
target_names = ['否', '是']
feature = df[feature_names]
label = df['好瓜']

x = feature
print('df[[\'色泽\', \'根蒂\', \'敲声\', \'纹理\', \'脐部\', \'触感\']]')
print(x)

# The perceptron update rule is formulated for targets in {-1, +1}, so map
# the codes 0/1 to -1/+1 ('否' -> -1, '是' -> +1) before training.
y = label * 2 - 1
print('df[\'好瓜\']')
print(y)

# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, random_state=1, test_size=0.2)

myPerceptron = Perceptron()
myPerceptron.fit(np.array(x_train), np.array(y_train))
y_te_pred = myPerceptron.predict(np.array(x_test))
print('感知模型的参数为w={0}, b={1}'.format(myPerceptron.w, myPerceptron.b))
print('y的真实值')
print(y_test)

# Evaluate with precision, recall and F1.  `labels` is passed explicitly so
# that target_names stays aligned with the -1/+1 targets even when one class
# is missing from the test split or from the predictions.
print("myPerceptron 模型评估")
print(classification_report(np.array(y_test), y_te_pred, labels=[-1, 1], target_names=target_names))
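# myPerceptron 模型评估
#                  查准率   查全率   f1-score   样本数(当前行的类别在测试数据中的样本总量)
# 否               0.00     0.00     0.00       1
# 是               0.75     1.00     0.86       3
# 准确率                             0.75       4
# 宏平均           0.38     0.50     0.43       4
# weighted avg     0.56     0.75     0.64       4
# 说明:查准率的分母是"预测为该类"的样本数,查全率的分母是"真实为该类"的样本数。
# 否:查准率 = 0/0,查全率 = 0/1
# 是:查准率 = 3/4,查全率 = 3/3
# 由于"否"的查准率分母为 0(没有任何样本被预测为"否"),该值无法计算,所以 sklearn 会给出警告(UndefinedMetricWarning)并将其记为 0。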
# Overall accuracy on the held-out samples, followed by the raw predictions
# and the true labels for a side-by-side look.
print("myPerceptron 模型评估")
accuracy = accuracy_score(np.array(y_test), y_te_pred)
print('accuracy:' + str(round(accuracy, 2)))
print(y_te_pred)
print(y_test)