12、朴素贝叶斯实战

文本数据的分类

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB #多项式朴素贝叶斯
# Bag-of-words text classification with multinomial naive Bayes.
# A token is a run of ASCII letters and/or CJK ideographs (U+4E00-U+9FA5).
# Inside a character class "|" is a literal pipe rather than alternation,
# so it is not part of the class.
tfCoder = CountVectorizer(token_pattern=r"[a-zA-Z\u4e00-\u9fa5]+")  # term-frequency model
X = df["words"]
Y = df["Y"]
X = tfCoder.fit_transform(X)  # learn the vocabulary and build the term-count matrix
# scikit-learn 1.2 removed get_feature_names(); prefer the replacement when it
# exists and fall back to the old name on older installations.
list_vocabulary = getattr(tfCoder, "get_feature_names_out", None) or tfCoder.get_feature_names
print(list_vocabulary())
print(X.toarray())
X_ = ["Chinese Chinese Chinese Tokyo Japan"]  # test document
X_ = tfCoder.transform(X_).toarray()  # dense term counts for the test document
model = MultinomialNB()
model.fit(X, Y)
print(model.predict(X_))
print('-----------')
# Posterior probability of every class for the test document; this is the
# evidence behind the predicted label printed above.
print(dict(zip(model.classes_, model.predict_proba(X_)[0])))

鸢尾花数据的分类

鸢尾花数据集使用朴素贝叶斯进行分类操作


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.naive_bayes import GaussianNB, MultinomialNB#高斯贝叶斯和多项式朴素贝叶斯
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

## Matplotlib settings that keep Chinese text from rendering as garbled glyphs
mpl.rcParams.update({
    'font.sans-serif': [u'SimHei'],
    'axes.unicode_minus': False,
})
对列名进行处理,选择鸢尾花的两个特征(第 0 列花萼长度和第 2 列花瓣长度)进行数据分析


# Column order: sepal length, sepal width, petal length, petal width
iris_feature_E = (
    'sepal length',
    'sepal width',
    'petal length',
    'petal width',
)
# The same four column names in Chinese, used for the axis labels.
iris_feature_C = (
    u'花萼长度',
    u'花萼宽度',
    u'花瓣长度',
    u'花瓣宽度',
)
iris_class = (
    'Iris-setosa',
    'Iris-versicolor',
    'Iris-virginica',
)
# Indices of the two columns used for training and plotting.
features = [0, 2]
将数据进行标签化处理


## Load the data
path = '../datas/iris.data'  # location of the data file
data = pd.read_csv(path, header=None)
# Columns 0-3 are taken as the measurements; keep only those listed in `features`.
x = data[list(range(4))][features]
# Column 4 is taken as the class label and encoded as integer category codes.
y = pd.Categorical(data[4]).codes
print ("总样本数目:%d;特征属性数目:%d" % x.shape)
总样本数目:150;特征属性数目:2

## 0. Split the data into a training part and a test part
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=14)
# Keep the original split reachable under the *1 names as well.
x_train1, x_test1, y_train1, y_test1 = x_train, x_test, y_train, y_test
print ("训练数据集样本数目:%d, 测试数据集样本数目:%d" % (x_train.shape[0], x_test.shape[0]))
训练数据集样本数目:120, 测试数据集样本数目:30
对数据进行特征化处理


## Build the Gaussian naive Bayes pipeline
# MultinomialNB is not used here: it requires non-negative feature values,
# which zero-mean standardized features do not satisfy.
clf = Pipeline(steps=[
    ('sc', StandardScaler()),  # standardize: centre each feature and scale it to unit variance
    ('poly', PolynomialFeatures(degree=4)),  # polynomial feature expansion up to degree 4
    ('clf', GaussianNB()),
])
## Train the model
clf.fit(x_train, y_train)
Pipeline(steps=[('sc', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly', PolynomialFeatures(degree=4, include_bias=True, interaction_only=False)), ('clf', GaussianNB(priors=None))])
计算准确度


# Predict on both splits first, then report the accuracy of each as a percentage.
y_train_hat = clf.predict(x_train)
y_test_hat = clf.predict(x_test)
print ('训练集准确度: %.2f%%' % (accuracy_score(y_train, y_train_hat) * 100))
print ('测试集准确度:%.2f%%' % (accuracy_score(y_test, y_test_hat) * 100))
​
训练集准确度: 87.50%
测试集准确度:93.33%
产生区域图


N, M = 500, 500     # number of grid samples along the horizontal / vertical axis

# Bounding box of the plot: per-feature min and max over both the training
# and the test split, so every sample falls inside the drawn region.
x1_min1, x2_min1 = x_train.min()
x1_max1, x2_max1 = x_train.max()
x1_min2, x2_min2 = x_test.min()
x1_max2, x2_max2 = x_test.max()
x1_min = np.min((x1_min1, x1_min2))
x1_max = np.max((x1_max1, x1_max2))
x2_min = np.min((x2_min1, x2_min2))
x2_max = np.max((x2_max1, x2_max2))

t1 = np.linspace(x1_min, x1_max, N)
t2 = np.linspace(x2_min, x2_max, M)  # vertical axis uses its own sample count
x1, x2 = np.meshgrid(t1, t2)  # grid of sampling points
# One row per grid point, columns = (feature 1, feature 2).
x_show = np.column_stack((x1.ravel(), x2.ravel()))

# Light colours for the predicted regions, saturated ones for the samples.
cm_light = mpl.colors.ListedColormap(['#77E0A0', '#FF8080', '#A0A0FF'])
cm_dark = mpl.colors.ListedColormap(['g', 'r', 'b'])
y_show_hat = clf.predict(x_show)                  # predicted class of every grid point
y_show_hat = y_show_hat.reshape(x1.shape)         # back to the grid layout for plotting
画图验证分类效果的优劣


## Plot
plt.figure(facecolor='w')
# Background: the predicted class of every grid point.
plt.pcolormesh(x1, x2, y_show_hat, cmap=cm_light)
# Foreground: samples coloured by their true label; the test split is drawn
# with larger triangle markers, the training split with the default marker.
col_x, col_y = features
for samples, labels, extra in (
        (x_train, y_train, {'s': 50}),
        (x_test, y_test, {'marker': '^', 's': 120}),
):
    plt.scatter(samples[col_x], samples[col_y], c=labels, edgecolors='k', cmap=cm_dark, **extra)
plt.xlabel(iris_feature_C[col_x], fontsize=13)
plt.ylabel(iris_feature_C[col_y], fontsize=13)
plt.xlim(x1_min, x1_max)
plt.ylim(x2_min, x2_max)
# The title reports the test-set accuracy as a percentage.
test_accuracy_pct = 100 * accuracy_score(y_test, y_test_hat)
plt.title(u'高斯贝叶斯对鸢尾花数据的分类结果, 正确率:%.3f%%' % test_accuracy_pct, fontsize=18)
plt.grid(True)
plt.show()

你可能感兴趣的:(数据分析实战篇,机器学习,tensorflow,python,分类算法,深度学习)