本文主要通过调用sklearn库中的svm/knn/决策树/随机森林,实现简单的鸢尾花数据集分类,主要目的是熟悉处理流程。
1.svm分类鸢尾花数据集:
# 文件功能:svm分类鸢尾花数据集
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
# [1] Load the iris dataset
data = load_iris()

# [2] Separate features from labels; only the first two feature columns are used
x = data.data[:, :2]
y = data.target
train_data, test_data, train_label, test_label = train_test_split(
    x,
    y,
    random_state=1,
    train_size=0.6,
    test_size=0.4,
)
print(train_data.shape)

# [3] Train the SVM classifier
classifier = svm.SVC(
    C=2,
    kernel='rbf',
    gamma=10,
    # 'ovo' requests the one-vs-one decision function; 'ovr' would be one-vs-rest
    decision_function_shape='ovo',
)
# ravel() flattens the label array in row-major order by default
classifier.fit(train_data, train_label.ravel())

# [4] Accuracy reported by the classifier itself
print("训练集:", classifier.score(train_data, train_label))
print("测试集:", classifier.score(test_data, test_label))

# [5] The same accuracy obtained through accuracy_score on explicit predictions
tra_label = classifier.predict(train_data)  # predicted labels for the training set
tes_label = classifier.predict(test_data)  # predicted labels for the test set
print("训练集:", accuracy_score(train_label, tra_label))
print("测试集:", accuracy_score(test_label, tes_label))

# [6] Inspect the decision function (the original author notes its shape as (90, 3))
print('train_decision_function:\n', classifier.decision_function(train_data))
print('predict_result:\n', classifier.predict(train_data))
svm的相关资料详见支持向量机相关博客。
2.knn分类鸢尾花数据集:
# 文件功能:knn实现鸢尾花数据集分类
from sklearn import datasets # 引入sklearn包含的众多数据集
from sklearn.model_selection import train_test_split # 将数据分为测试集和训练集
from sklearn.neighbors import KNeighborsClassifier # 利用knn方式训练数据
from sklearn.metrics import accuracy_score

# [1] Load the data
iris = datasets.load_iris()  # iris dataset; the original note says it has 4 feature variables
iris_X = iris.data  # feature matrix
iris_y = iris.target  # class labels
# Split into training and test sets. The documented intent is a 30% test
# share, so test_size is 0.3 (the previous value 0.8 contradicted that note).
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
print(y_train)  # labels of the training samples

# [2] Train
knn = KNeighborsClassifier()  # default k-nearest-neighbours settings
knn.fit(X_train, y_train)

# [3] Predict once and reuse the result below
y_pred = knn.predict(X_test)
print(y_pred)  # predicted labels
print(y_test)  # true labels

# [4] Accuracy via accuracy_score, called as (y_true, y_pred)
print("测试准确度:", accuracy_score(y_test, y_pred))
3.随机森林分类鸢尾花数据集:
# 文件功能:随机森林分类鸢尾花数据集
"""
随机森林主要应用于回归和分类两种场景,侧重于分类。随机森林是指利用多棵树对样本数据进行训练、分类并预测的一种方法。
它在对数据进行分类的同时,还可以给出各个变量的重要性评分,评估各个变量在分类中所起的作用。
"""
"""
随机森林的构建:
1.首先利用bootstrap方法有放回地从原始训练集中随机抽取n个样本集,并构建n棵决策树;
2.然后假设在训练样本数据中有m个特征,那么每次分裂时选择最好的特征进行分裂,每棵树都一直这样分裂下去,直到该节点的所有训练样例都属于同一类;
3.接着让每棵决策树在不做任何修剪的前提下最大限度地生长;
4.最后将生成的多棵分类树组成随机森林,用随机森林分类器对新的数据进行分类与回归。对于分类问题,按多棵树分类器投票决定最终分类结果;对于回归问题,则由多棵树预测值的均值决定最终预测结果。
"""
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
# Fit a random forest on the first two iris features and plot its decision regions.
# oob_score=True is requested here, but the out-of-bag estimate is never read below.
RF = RandomForestClassifier(n_estimators=100, n_jobs=4, oob_score=True)
iris = load_iris()
x = iris.data[:, :2]
y = iris.target
RF.fit(x, y)

h = .02  # step of the evaluation grid
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])  # region colours
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])  # sample colours

# The earlier `for weight in ['uniform', 'distance']` wrapper was dropped:
# `weight` was never used, so the loop only repeated identical work and figures.
x_min, x_max = x[:, 0].min() - 1, x[:, 0].max() + 1
y_min, y_max = x[:, 1].min() - 1, x[:, 1].max() + 1
xx, yy = np.meshgrid(
    np.arange(x_min, x_max, h),
    np.arange(y_min, y_max, h),
)
# Predict a class for every grid point, then restore the grid shape for plotting
z = RF.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)

plt.figure()
plt.pcolormesh(xx, yy, z, cmap=cmap_light)
plt.scatter(x[:, 0], x[:, 1], c=y, cmap=cmap_bold, edgecolors='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())  # bound the y axis the same way as the x axis
plt.title('RandomForestClassifier')
plt.show()

# Note: this score is measured on the same samples the forest was fitted on
print('RandomForestClassifier:', RF.score(x, y))
本段代码摘抄自博客使用随机森林算法实现鸢尾花案例。
4.决策树分类鸢尾花数据集:
(1)调用sklearn库分类鸢尾花数据集:
from sklearn import datasets # 导入方法类
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
# [1] Load the iris dataset and pull out features and labels
iris = datasets.load_iris()
iris_feature, iris_target = iris.data, iris.target

# [2] Split into training and test portions (33% held out, fixed seed)
feature_train, feature_test, target_train, target_test = train_test_split(
    iris_feature,
    iris_target,
    test_size=0.33,
    random_state=42,
)

# [3] Train a decision tree with every parameter left at its default
dt_model = DecisionTreeClassifier()
dt_model.fit(feature_train, target_train)
predict_results = dt_model.predict(feature_test)  # predictions for the test set

# [4] Evaluate on the test set
scores = dt_model.score(feature_test, target_test)
print(scores)
(2)自定义函数分类鸢尾花数据集:
这段代码摘抄整理自博客决策树分类鸢尾花数据,实现思路很清晰,搭配如西瓜书等其他原理类决策树资料可以清楚了解决策树的实现机制。
# 文件功能:决策树分类鸢尾花数据集
# 代码整体思路:
# 1 . 先处理数据,shuffle函数随机抽取80%样本做训练集。
# 2 . 特征值离散化
# 3 . 用信息熵来递归地构造树
# 4 . 用构造好的树来判断剩下20%的测试集,求算法做分类的正确率
from sklearn import datasets
import math
import numpy as np
# [1] Information entropy of a vector of class counts
def getInformationEntropy(arr, leng):
    """Return the natural-log entropy of the class counts in *arr*.

    Args:
        arr: iterable of per-class sample counts (any number of classes).
        leng: total number of samples the counts are normalised by.

    Returns:
        ``-sum(p * log(p))`` over the classes with a positive count, where
        ``p = count / leng``.  Returns 0.0 when ``leng`` is not positive, so
        an empty segment contributes nothing instead of dividing by zero.
    """
    if leng <= 0:
        return 0.0
    total = 0.0
    for count in arr:
        # Classes with no samples contribute nothing (lim p->0 of p*log(p) is 0)
        if count > 0:
            p = count / leng
            total += p * math.log(p)
    return -total
# [2] Discretise one feature into three intervals
def discretization(index):
    """Find the two cut values that split feature *index* into three intervals.

    The samples of ``iris`` are sorted by the chosen feature and every pair of
    positions ``i < j`` is tried as the ends of the low and middle segments.
    The pair with the smallest size-weighted sum of segment entropies wins.

    Args:
        index: column of ``iris.data`` to discretise.

    Returns:
        ``[low_cut, high_cut]``: the feature values at the two chosen positions.

    Raises:
        ValueError: if there are too few samples to form three non-empty segments.
    """
    samples = np.array([iris.data[:, index], iris.target]).T
    samples = samples[samples[:, 0].argsort()]
    labels = samples[:, 1].astype(int)
    total = len(samples)
    # Class totals come from the data itself rather than from module globals,
    # so the function does not depend on state set up by the script entry point.
    classTotals = np.bincount(labels, minlength=3)
    lowCounts = np.array([0] * len(classTotals))
    bestEntropy = math.inf
    best = None
    for i in range(total):
        lowCounts[labels[i]] += 1
        lowMidCounts = np.copy(lowCounts)  # counts of samples 0..j, grown below
        for j in range(i + 1, total):
            lowMidCounts[labels[j]] += 1
            if j == total - 1:
                continue  # the high segment would be empty
            weighted = (
                (i + 1) * getInformationEntropy(lowCounts, i + 1)
                + (j - i) * getInformationEntropy(lowMidCounts - lowCounts, j - i)
                + (total - j - 1) * getInformationEntropy(classTotals - lowMidCounts, total - j - 1)
            )
            if weighted < bestEntropy:
                bestEntropy = weighted
                best = np.array([i, j])
    if best is None:
        raise ValueError("need at least three samples to choose two cut points")
    res_value = [samples[best[0], 0], samples[best[1], 0]]
    print(best, bestEntropy, res_value)
    return res_value
# [3] Compute the cut values for every feature
def getRazors():
    """Return an array with one row of cut values per feature of ``iris``.

    Each row is whatever ``discretization`` returns for that feature index;
    the index is printed before each feature is processed.
    """
    cut_points = []
    for feature_index, _ in enumerate(iris.feature_names):
        print(feature_index)
        cut_points.append(discretization(feature_index))
    return np.array(cut_points)
# [4] Randomly split the samples into 80% training and 20% test data
def divideData():
    """Shuffle the labelled samples and return ``[trainData, testData]``.

    The label column is appended to ``iris.data`` as the last column, the rows
    are shuffled in place, and the first ``int(length * 0.8)`` rows become the
    training set while the remaining rows become the test set.
    """
    completeData = np.c_[iris.data, iris.target.T]
    np.random.shuffle(completeData)
    cutoff = int(length * 0.8)
    train_rows = range(cutoff)
    test_rows = range(cutoff, length)
    return [completeData[train_rows, :], completeData[test_rows, :]]
# [5] Entropy of a list of class counts
def getEntropy(counter):
    """Return the natural-log entropy of the class counts in *counter*.

    The counts are normalised by their own sum. An all-zero (or empty)
    counter yields 0, and zero entries are skipped.
    """
    total = np.sum(counter)
    if total == 0:
        return 0
    accumulated = 0
    for count in counter:
        if count != 0:
            share = count / total
            # Non-positive inputs fall back to log(1) == 0
            accumulated += share * math.log(share if count > 0 and total > 0 else 1)
    return -accumulated
# [6] Index of the largest value
def findMaxIndex(dataSet):
    """Return the index of the first maximum in *dataSet* (0 when it is empty).

    The running maximum starts unset instead of at -1, so inputs whose values
    are all <= -1 also return the true position of their maximum.
    """
    maxIndex = 0
    maxValue = None
    for index, value in enumerate(dataSet):
        # Strict comparison keeps the first occurrence on ties
        if maxValue is None or value > maxValue:
            maxIndex, maxValue = index, value
    return maxIndex
# [7] Recursive tree construction
def _split_on_feature(feature, dataSet, classCount):
    """Partition *dataSet* into low / middle / high buckets using the cut values of *feature*.

    Returns ``(subsets, counters)``: three sample lists and, for each, the
    per-class counts (the class index is read from the last column of a sample).
    """
    low, high = razors[feature][0], razors[feature][1]
    subsets = ([], [], [])
    counters = ([0] * classCount, [0] * classCount, [0] * classCount)
    for sample in dataSet:
        value = sample[feature]
        if value < low:
            bucket = 0
        elif value <= high:
            bucket = 1
        else:
            bucket = 2
        subsets[bucket].append(sample)
        counters[bucket][int(sample[-1])] += 1
    return subsets, counters


def recursion(featureSet, dataSet, counterSet):
    """Build a three-way decision tree over the discretised features.

    Args:
        featureSet: indices of the features still available for splitting.
            The list is not modified.
        dataSet: samples reaching this node; the last column is the class index.
        counterSet: per-class sample counts of *dataSet*.

    Returns:
        A class name from ``iris.target_names`` for a leaf, ``[]`` for a node
        that no sample reaches, or ``[feature, low, middle, high]`` where the
        three children are built the same way.
    """
    present = [label for label, count in enumerate(counterSet) if count != 0]
    if len(present) == 1:
        # Every sample belongs to one class: pure leaf
        return iris.target_names[present[0]]
    if len(featureSet) == 0:
        # Nothing left to split on: fall back to the majority class
        return iris.target_names[findMaxIndex(counterSet)]
    if len(dataSet) == 0:
        return []

    bestEntropy = float("inf")
    bestFeature = None
    bestSplit = None
    for feature in featureSet:
        subsets, counters = _split_on_feature(feature, dataSet, len(counterSet))
        weighted = sum(
            len(subset) * getEntropy(counter)
            for subset, counter in zip(subsets, counters)
        ) / len(dataSet)
        if weighted < bestEntropy:
            bestEntropy = weighted
            bestFeature = feature
            # Keep the partition of the winning feature; the children must be
            # built from it, not from whichever feature was examined last.
            bestSplit = (subsets, counters)

    # Every child gets its own copy of the remaining features, so one subtree
    # consuming a feature does not hide it from its siblings or from the caller.
    remaining = [feature for feature in featureSet if feature != bestFeature]
    subsets, counters = bestSplit
    child = [bestFeature]
    for subset, counter in zip(subsets, counters):
        child.append(recursion(list(remaining), subset, counter))
    return child
# [8] Classify one sample with a built tree
def _is_empty_branch(node):
    """Return True when *node* is the empty-list placeholder of a branch without samples."""
    return isinstance(node, list) and len(node) == 0


def judge(data, tree):
    """Walk *tree* for the sample *data* and return the predicted class name.

    Args:
        data: one sample, indexable by feature index.
        tree: a class-name string, ``[]``, or ``[feature, low, middle, high]``.

    Returns:
        The class name at the leaf that is reached. When the branch matching
        the sample's value is empty, the nearest non-empty sibling is used
        instead. ``"unknow"`` is returned when *tree* is empty or every branch
        of a node is empty.

    Raises:
        TypeError: if a node's first element is neither a string nor an integer.
    """
    root = "unknow"
    while len(tree) > 0:
        if isinstance(tree, str):
            return tree
        root = tree[0]
        if isinstance(root, str):
            return root
        if not isinstance(root, (int, np.integer)):
            raise TypeError("tree node must start with a feature index or a class name")
        value = data[root]
        # Preferred child first, then the fallbacks used when it is empty
        if value < razors[root][0]:
            preference = (1, 2, 3)
        elif value <= razors[root][1]:
            preference = (2, 3, 1)
        else:
            preference = (3, 2, 1)
        for position in preference:
            if not _is_empty_branch(tree[position]):
                tree = tree[position]
                break
        else:
            # No child holds any sample, so there is nothing to predict from
            return "unknow"
    return root
# [9] Script entry point: build the tree on 80% of the data and score it on the rest
if __name__ == '__main__':
    iris = datasets.load_iris()
    # Per-class sample totals, tallied from the labels. The earlier code
    # indexed by int(row[-1]) of iris.data, i.e. by the truncated value of the
    # last feature column, which is not the class of the sample.
    num = [0, 0, 0]
    for label in iris.target:
        num[int(label)] += 1
    length = len(iris.target)
    trainData, testData = divideData()
    razors = getRazors()
    trainLabels = trainData[:, -1]
    tree = recursion(
        list(range(len(iris.feature_names))),
        trainData,
        [np.sum(trainLabels == 0), np.sum(trainLabels == 1), np.sum(trainLabels == 2)],
    )
    print("本次选取的训练集构建出的树: ", tree)
    right = 0
    for sample in testData:
        result = judge(sample, tree)
        truth = iris.target_names[int(sample[-1])]
        print("result is ", result, " truth is ", truth)
        if result == truth:
            right += 1
    print("正确率 : ", right / len(testData))
5.参考资料
(1)使用随机森林算法实现鸢尾花案例
(2)使用随机森林算法实现鸢尾花案例
(3)决策树分类鸢尾花数据
(4)python之sklearn使用教程