k-nearest neighbors (kNN)
Advantages: high accuracy, insensitive to outliers, no assumptions about the input data.
Disadvantages: high computational complexity, high space complexity.
Applicable data types: numeric and nominal.
from numpy import *
import operator

def create_data_set():
    # four sample points with two features each, plus their class labels
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0.1], [0.1, 0.1]])
    labels = ['a', 'a', 'b', 'b']
    return group, labels

def classify(pre_data, data_set, labels, k):
    data_set_size = data_set.shape[0]
    # Euclidean distance from pre_data to every training sample
    diff_mat = tile(pre_data, (data_set_size, 1)) - data_set
    sq_diff_mat = diff_mat ** 2
    sq_distances = sq_diff_mat.sum(axis=1)
    distances = sq_distances ** 0.5
    sorted_dist_indices = distances.argsort()
    # majority vote among the k nearest neighbors
    class_count = {}
    for i in range(k):
        vote_label = labels[sorted_dist_indices[i]]
        class_count[vote_label] = class_count.get(vote_label, 0) + 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]

def main():
    group, labels = create_data_set()
    # print the predicted label for each query point
    print(classify([0, 0], group, labels, 2))
    print(classify([1, 0], group, labels, 2))

if __name__ == "__main__":
    main()
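The two features above happen to share a similar scale, but kNN distances are dominated by whichever feature has the largest numeric range, so inputs are commonly min-max scaled before classifying. A minimal sketch of that step (the name auto_norm is mine, not part of the notes above):

def auto_norm(data_set):
    # rescale every column into [0, 1] via (x - min) / (max - min);
    # assumes no column is constant (ranges would be zero)
    min_vals = data_set.min(0)
    ranges = data_set.max(0) - min_vals
    return (data_set - min_vals) / ranges, ranges, min_vals

numpy broadcasting handles the per-row subtraction here, which is the same work the classify function above spells out with tile.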
Decision trees
Advantages: low computational cost, results are easy to interpret, insensitive to missing values, can handle irrelevant features.
Disadvantages: prone to overfitting.
Applicable data types: numeric and nominal.
from math import log   # math.log takes an explicit base; numpy's log does not
import operator

def create_data_set():
    # plain Python lists, not a numpy array: the last column mixes floats with
    # string class labels, and split_data_set below relies on list slicing/extend
    data_set = [[1.0, 1.1, 'no'],
                [1.0, 1.0, 'no'],
                [0, 0.1, 'yes'],
                [0.1, 0.1, 'yes']]
    labels = ['f0', 'f1']   # one name per feature column, used as tree node labels
    return data_set, labels
def calc_shannon_ent(data_set):
    # Shannon entropy of the class-label column (the last entry of every row)
    num_entries = len(data_set)
    label_counts = {}
    for v in data_set:
        current_label = v[-1]
        if current_label not in label_counts:
            label_counts[current_label] = 0
        label_counts[current_label] += 1
    shannon_ent = 0.0
    for key in label_counts:
        prob = float(label_counts[key]) / num_entries
        shannon_ent -= prob * log(prob, 2)
    return shannon_ent
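calc_shannon_ent computes H(D) = -Σ p_i · log2(p_i), where p_i is the fraction of rows in class i. On the sample data the classes 'no' and 'yes' each cover half the rows, so the entropy is exactly 1 bit:

>>> calc_shannon_ent([[1.0, 1.1, 'no'], [1.0, 1.0, 'no'], [0, 0.1, 'yes'], [0.1, 0.1, 'yes']])
1.0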
def split_data_set(data_set, axis, value):
    # keep the rows whose feature `axis` equals `value`,
    # and drop that feature column from each kept row
    ret_data_set = []
    for v in data_set:
        if v[axis] == value:
            reduce_feat_vec = v[:axis]
            reduce_feat_vec.extend(v[axis + 1:])
            ret_data_set.append(reduce_feat_vec)
    return ret_data_set
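For example, splitting on feature 0 with value 1 keeps the two matching rows and removes that column:

>>> split_data_set([[1, 'yes'], [1, 'no'], [0, 'no']], 0, 1)
[['yes'], ['no']]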
def choose_best_feature_to_split(data_set):
    # pick the feature whose split yields the largest information gain
    num_features = len(data_set[0]) - 1   # last column is the class label
    base_entropy = calc_shannon_ent(data_set)
    best_info_gain = 0.0
    best_feature = -1
    for i in range(num_features):
        feat_values = set([e[i] for e in data_set])
        new_entropy = 0.0
        for value in feat_values:
            sub_data_set = split_data_set(data_set, i, value)
            prob = len(sub_data_set) / float(len(data_set))
            new_entropy += prob * calc_shannon_ent(sub_data_set)
        info_gain = base_entropy - new_entropy
        if info_gain > best_info_gain:
            best_info_gain = info_gain
            best_feature = i
    return best_feature
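On the sample data either feature already separates the two classes perfectly (information gain 1.0 for both), and the strict > comparison keeps the first winner:

>>> choose_best_feature_to_split([[1.0, 1.1, 'no'], [1.0, 1.0, 'no'], [0, 0.1, 'yes'], [0.1, 0.1, 'yes']])
0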
def majority_cnt(class_list):
    # most frequent class label; used when no features are left to split on
    class_count = {}
    for v in class_list:
        if v not in class_count:
            class_count[v] = 0
        class_count[v] += 1
    sorted_class_count = sorted(class_count.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_class_count[0][0]
def create_tree(data_set, labels):
    class_list = [i[-1] for i in data_set]
    # stop if all rows share one class, or if only the label column remains
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]
    if len(data_set[0]) == 1:
        return majority_cnt(class_list)
    best_feat = choose_best_feature_to_split(data_set)
    best_feat_label = labels[best_feat]
    my_tree = {best_feat_label: {}}
    del labels[best_feat]   # this column is consumed by the split below
    uniq_feat_values = set([i[best_feat] for i in data_set])
    for value in uniq_feat_values:
        sub_labels = labels[:]   # copy, so sibling branches see the same labels
        my_tree[best_feat_label][value] = create_tree(split_data_set(data_set, best_feat, value), sub_labels)
    return my_tree
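The notes build the tree but stop short of using it to label a new sample. A minimal sketch of that lookup, assuming the nested-dict shape produced by create_tree (the name classify_tree is mine, not from the notes):

def classify_tree(tree, feat_labels, test_vec):
    # the root key names the feature this node tests
    root = next(iter(tree))
    feat_index = feat_labels.index(root)
    subtree = tree[root][test_vec[feat_index]]
    # an inner dict is another decision node; anything else is a leaf label
    if isinstance(subtree, dict):
        return classify_tree(subtree, feat_labels, test_vec)
    return subtree

With the tree built from the sample data, e.g. {'f0': {1.0: 'no', 0: 'yes', 0.1: 'yes'}} (key order may vary), classify_tree(my_tree, ['f0', 'f1'], [0, 0.1]) returns 'yes'. Note that create_tree deletes entries from the labels list it is given, so pass it a copy if the feature names are needed afterwards.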
def main():
    data_set, labels = create_data_set()
    my_tree = create_tree(data_set, labels[:])   # pass a copy: create_tree mutates labels
    print(my_tree)

if __name__ == "__main__":
    main()