In [8]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
def iris_type(s):
    """Map an iris species label to its integer class code.

    Intended as a per-column converter for ``np.loadtxt``.

    Parameters
    ----------
    s : str or bytes
        Species label, e.g. ``'Iris-setosa'``. Bytes are decoded and
        surrounding whitespace is ignored, because a text-file converter may
        be handed raw bytes or a field with a trailing newline/space.

    Returns
    -------
    int
        0 for Iris-setosa, 1 for Iris-versicolor, 2 for Iris-virginica.

    Raises
    ------
    KeyError
        If the label is not one of the three known species.
    """
    if isinstance(s, (bytes, bytearray)):
        # errors='replace' keeps the failure mode a KeyError for odd input
        s = bytes(s).decode('utf-8', errors='replace')
    it = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
    return it[s.strip()]
In [15]:
# Read the comma-separated iris file into a float array; the text label in
# column 4 is turned into a numeric class code by iris_type.
data = np.loadtxt(
    'D:\\mlInAction\\8.iris.data',
    delimiter=',',
    dtype=float,
    converters={4: iris_type},
    encoding='utf-8',
)
data
Out[15]:
In [16]:
# The first four columns are the features (x); everything after them is the label (y).
x = data[:, :4]
y = data[:, 4:]
x
Out[16]:
In [17]:
# Display the label array y produced by the split.
y
Out[17]:
In [18]:
x = x[:, :2] # keep only the first two columns as x
x
Out[18]:
In [19]:
# Two-step pipeline: standardise the features, then classify with Gaussian naive Bayes.
gnb = Pipeline(steps=[
    ('sc', StandardScaler()),  # rescale each feature to zero mean and unit variance
    ('clf', GaussianNB()),     # models the features as Gaussian-distributed
])
In [20]:
y.ravel() # flatten y to a 1-D array (only displayed here; y itself is not reassigned)
Out[20]:
In [21]:
# Fit the pipeline on x; the labels are passed flattened via ravel().
gnb.fit(x, y.ravel())
Out[21]:
In [23]:
# Predict on the same x the pipeline was fitted on.
y_hat = gnb.predict(x)
y_hat
Out[23]:
In [24]:
y = y.reshape(-1) # flatten y to 1-D so it lines up with y_hat; gives the same values as y.ravel()
y
Out[24]:
In [25]:
# Element-wise comparison: True where the prediction equals the true label.
result = np.equal(y_hat, y)
result
Out[25]:
In [27]:
# Averaging the boolean flags (True counts as 1, False as 0) gives the accuracy.
acc = result.mean()
acc
Out[27]:
以下为版本2
In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.naive_bayes import GaussianNB, MultinomialNB #高斯贝叶斯和多项式朴素贝叶斯
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# Configure matplotlib so Chinese text is not rendered as garbled glyphs
mpl.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})

# Feature names: sepal length, sepal width, petal length, petal width
iris_feature_E = ('sepal length', 'sepal width', 'petal length', 'petal width')
iris_feature_C = (u'花萼长度', u'花萼宽度', u'花瓣长度', u'花瓣宽度')
iris_class = ('Iris-setosa', 'Iris-versicolor', 'Iris-virginica')
features = [2, 3]  # column labels later used to subset x

# Load the data
path = 'D:\\mlInAction\\8.iris.data'  # data file path
data = pd.read_csv(path, header=None)
data
Out[29]:
In [35]:
# data is a pandas DataFrame here, so the first four columns are picked by a
# list of column labels rather than a NumPy-style slice.
x = data[[0, 1, 2, 3]]
x
Out[35]:
In [36]:
# Keep only the columns listed in `features`.
x = x.loc[:, features]
x
Out[36]:
In [37]:
y = pd.Categorical(data[4]).codes # encode the class labels in column 4 directly as integer codes (0, 1, 2)
y
Out[37]:
In [38]:
# Report x.shape: number of rows (samples) and number of columns (features).
print("总样本数目:%d;特征属性数目:%d" % x.shape)
In [40]:
# Hold out 20% of the samples as a test set; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=14,
)
print("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (len(x_train), len(x_test)))
In [41]:
# Pipeline: standardise -> polynomial feature expansion -> Gaussian naive Bayes.
clf = Pipeline([
    # Rescale each feature to zero mean and unit variance. This only shifts and
    # scales the values; it does not change the shape of their distribution.
    ('sc', StandardScaler()),
    # include_bias=False: the default bias term would add a constant all-ones
    # column, i.e. a zero-variance feature that carries no information for
    # GaussianNB. With degree=1 the step then passes the features through as is.
    ('poly', PolynomialFeatures(degree=1, include_bias=False)),
    # GaussianNB is used because standardised features can be negative, which
    # MultinomialNB does not accept (it requires non-negative feature values).
    ('clf', GaussianNB())])
# Train the model
clf.fit(x_train, y_train)
Out[41]:
In [42]:
# Predict on both splits first, then report the accuracy of each as a percentage.
y_train_hat = clf.predict(x_train)
y_test_hat = clf.predict(x_test)
print('训练集准确度: %.2f%%' % (accuracy_score(y_train, y_train_hat) * 100))
print('测试集准确度:%.2f%%' % (accuracy_score(y_test, y_test_hat) * 100))