数据集
- 数据集：MNIST 数据，图片大小为 28×28，共 10 个类别；使用数据的原始特征，所以每个样本有 28×28=784 个特征。
- 图片中的每个像素值都经过二值化：像素值大于 50 的置为 0，其余置为 1。
- 剪枝使用的是预剪枝。
代码
import cv2
import time
import logging
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def binaryzation(img):
    """Binarize a batch of images in place with an inverted threshold.

    Every pixel is cast to ``uint8`` and then mapped to 1 when it is less
    than or equal to 50 and to 0 otherwise, which is the mapping produced by
    ``cv2.threshold(src, 50, 1, cv2.THRESH_BINARY_INV)``.

    Args:
        img: Mutable array of pixel intensities, one image per row. It is
            overwritten with the binarized values.

    Returns:
        None. The result is written back into ``img``.
    """
    # Cast to uint8 before thresholding so the comparison is done on 8-bit
    # pixel values.
    pixels = np.asarray(img).astype(np.uint8)
    # One vectorized comparison handles all rows at once.
    img[:] = (pixels <= 50).astype(np.uint8)
class Tree(object):
    """Node of a decision tree.

    A node whose ``node_type`` is ``'leaf'`` carries the predicted class in
    ``Class``. Any other node tests the feature stored in ``feature`` and
    holds one child per observed feature value in ``Child``.
    """

    def __init__(self, node_type, Class=None, feature=None):
        """Create a node with no children."""
        self.feature = feature
        self.Class = Class
        self.Child = {}  # maps a feature value to the child Tree for it
        self.node_type = node_type

    def add_tree(self, val, tree):
        """Register ``tree`` as the child followed when the feature equals ``val``."""
        self.Child[val] = tree

    def predict(self, features):
        """Return the class of the leaf reached by following ``features``.

        Raises:
            KeyError: if a node on the path has no child for the sample's
                value of that node's feature.
        """
        node = self
        # Walk down from this node until a leaf is reached.
        while node.node_type != 'leaf':
            branch_value = features[node.feature]
            node = node.Child[branch_value]
        return node.Class
def calc_ent(x):
    """Return the empirical Shannon entropy (base 2) of the values in ``x``.

    Args:
        x: One-dimensional array-like of discrete values (e.g. class labels).

    Returns:
        ``-sum(p * log2(p))`` over the distinct values of ``x`` as a float,
        where ``p`` is each value's relative frequency. Returns ``0.0`` for
        empty input.
    """
    values = np.asarray(x)
    if values.size == 0:
        return 0.0
    # A single pass yields the count of every distinct value, instead of one
    # boolean-mask scan of the whole array per distinct value.
    _, counts = np.unique(values, return_counts=True)
    probs = counts / values.size
    # Subtract from 0.0 (rather than negate) so a pure sample yields 0.0
    # and not -0.0.
    return float(0.0 - np.sum(probs * np.log2(probs)))
def calc_condition_ent(x, y):
    """Return the empirical conditional entropy H(y | x).

    Args:
        x: Array of feature values, aligned element-wise with ``y``.
        y: Array of labels.

    Returns:
        The entropy of ``y`` within each group of equal ``x`` values,
        weighted by the fraction of samples in that group and summed.
    """
    cond_ent = 0.0
    for value in set(x):
        # Labels of the samples whose feature equals this value.
        y_given_value = y[x == value]
        group_ent = calc_ent(y_given_value)
        weight = float(y_given_value.shape[0]) / y.shape[0]
        cond_ent += weight * group_ent
    return cond_ent
def Create_tree(train_set, train_label, features, epsilon):
    """Recursively build a decision tree using information gain and pre-pruning.

    Args:
        train_set: 2-D array of samples; column ``f`` holds feature ``f``.
        train_label: 1-D array of class labels aligned with the rows of
            ``train_set``.
        features: List of column indices still available for splitting.
            It is not modified.
        epsilon: Pair ``(min_gain, min_samples)``. A node becomes a leaf
            labelled with the majority class when the best information gain
            is below ``min_gain`` or when it holds at most ``min_samples``
            samples.

    Returns:
        The root ``Tree`` node of the (sub)tree built from the given samples.
    """
    LEAF = 'leaf'
    INTERNAL = 'internal'

    # All samples share one class: nothing left to split.
    label_set = list(set(train_label))
    if len(label_set) == 1:
        return Tree(LEAF, Class=label_set[0])

    max_class, _ = Counter(train_label).most_common(1)[0]

    # Pre-pruning: no features left, or too few samples to split further.
    if len(features) == 0 or len(train_label) <= epsilon[1]:
        return Tree(LEAF, Class=max_class)

    # Default to a feature that is actually available, so that a node where
    # no feature has positive gain never selects (and later tries to remove)
    # a column index that is not in ``features``.
    max_feature = features[0]
    max_inf_gain = 0
    HD = calc_ent(train_label)
    for feature in features:
        gda = HD - calc_condition_ent(train_set[:, feature], train_label)
        if gda > max_inf_gain:
            max_inf_gain, max_feature = gda, feature

    # Pre-pruning: the best split does not reduce entropy enough.
    if max_inf_gain < epsilon[0]:
        return Tree(LEAF, Class=max_class)

    sub_features = features.copy()
    sub_features.remove(max_feature)

    tree = Tree(INTERNAL, feature=max_feature)
    split_column = train_set[:, max_feature]
    # tolist() turns the values into plain Python scalars for use as child keys.
    for feature_value in set(split_column.tolist()):
        # Compute the row mask once and reuse it for samples and labels.
        mask = split_column == feature_value
        sub_tree = Create_tree(train_set[mask], train_label[mask], sub_features, epsilon)
        tree.add_tree(feature_value, sub_tree)
    return tree
def predict(test_set, tree):
    """Classify every sample in ``test_set`` with ``tree``.

    Args:
        test_set: Iterable of feature vectors.
        tree: Object exposing ``predict(features)`` for a single sample.

    Returns:
        A NumPy array with one prediction per sample, in input order.
    """
    return np.array([tree.predict(sample) for sample in test_set])
if __name__ == '__main__':
    # Script entry point: load the CSV, binarize the pixels, train a decision
    # tree on a train/test split and report the test accuracy.
    print('Start read data')
    S = time.time()

    raw_data = pd.read_csv('../data/train.csv')
    data = raw_data.values
    print("data shape:", data.shape)

    # Column 0 is used as the label; the remaining columns are the pixels.
    imgs = data[:, 1:]
    labels = data[:, 0]
    binaryzation(imgs)  # modifies imgs in place
    print("imgs shape:", imgs.shape)
    print("labels shape:", labels.shape)

    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    print("train data count :%d" % len(train_labels))
    print("test data count :%d" % len(test_labels))
    print('read data cost ', time.time() - S, ' second')

    print('Start training')
    S = time.time()
    # Take the candidate feature indices from the data itself instead of
    # hard-coding the column count.
    n_features = train_features.shape[1]
    # epsilon = (minimum information gain, minimum sample count) for pre-pruning.
    tree = Create_tree(train_features, train_labels, list(range(n_features)), (0.1, 10))
    print('training cost ', time.time() - S, ' second')

    print('Start predicting')
    S = time.time()
    test_predict = predict(test_features, tree)
    print('predicting cost ', time.time() - S, ' second')

    score = accuracy_score(test_labels, test_predict)
    print("The accruacy socre is ", score)