Apriori, ID3, Naive_Bayes, etc. (Data Mining)

  • Apriori Algorithm
  • Linear Regression
  • KNN Classification on a UCI Data Set
  • Decision Tree
  • Naive_Bayes
  • K-Means Image Segmentation

Apriori Algorithm

Workflow

Source Code

# coding=utf-8


def load_data_set():
    """
    Return a list of transactions; each transaction is the list of
    items bought together.
    """
    data_set = [['i1', 'i2', 'i5'], ['i2', 'i4'], ['i2', 'i3'],
                ['i1', 'i2', 'i4'], ['i1', 'i3'], ['i2', 'i3'],
                ['i1', 'i3'], ['i1', 'i2', 'i3', 'i5'], ['i1', 'i2', 'i3'],
                ['i1', 'i4'], ['i2', 'i3']]
    return data_set


def create_C1(data_set):
    """
    创建只有一个项目的锁定候选集合
    """
    C1 = set()
    for t in data_set:
        for item in t:
            item_set = frozenset([item])
            C1.add(item_set)
    return C1


def is_apriori(Ck_item, Lksub1):
    """
    评价这个候选集合是不是满足apriori算法

    """
    for item in Ck_item:
        sub_Ck = Ck_item - frozenset([item])
        if sub_Ck not in Lksub1:
            return False
    return True


def create_Ck(Lksub1, k):
    """
    创建候选集合
    """
    Ck = set()
    len_Lksub1 = len(Lksub1)
    list_Lksub1 = list(Lksub1)
    for i in range(len_Lksub1):
        for j in range(i + 1, len_Lksub1):  # join each unordered pair once
            l1 = list(list_Lksub1[i])
            l2 = list(list_Lksub1[j])
            l1.sort()
            l2.sort()
            if l1[0:k-2] == l2[0:k-2]:
                Ck_item = list_Lksub1[i] | list_Lksub1[j]
                # pruning
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck


def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """
    用删除策略从候选集合中选出频繁集
    """
    Lk = set()
    item_count = {}
    # count how many transactions contain each candidate
    for t in data_set:
        for item in Ck:
            if item.issubset(t):
                if item not in item_count:
                    item_count[item] = 1
                else:
                    item_count[item] += 1
    t_num = float(len(data_set))
    for item in item_count:
        if (item_count[item] / t_num) >= min_support:
            Lk.add(item)
            support_data[item] = item_count[item] / t_num
    return Lk


def generate_L(data_set, k, min_support):
    '''
    Generate all frequent itemsets.
    :param data_set: a list of transactions; each transaction contains several items.
    :param k: the maximum number of items in a frequent itemset.
    :param min_support: the minimum support threshold.
    :return: L, a list of the frequent itemsets of each size, and
             support_data, a dict mapping each frequent itemset to its support.
    '''
    support_data = {}
    C1 = create_C1(data_set)
    L1 = generate_Lk_by_Ck(data_set, C1, min_support, support_data)
    Lksub1 = L1.copy()
    L = []
    L.append(Lksub1)
    for i in range(2, k+1):
        Ci = create_Ck(Lksub1, i)
        Li = generate_Lk_by_Ck(data_set, Ci, min_support, support_data)
        Lksub1 = Li.copy()
        L.append(Lksub1)
    return L, support_data


def generate_big_rules(L, support_data, min_conf):
    """
    产生关联规则
    Args:
        L: 项目列表
        support_data: 频繁集的支持度
        min_conf: 最小置信度
    Returns:
        big_rule_list: 三元组列表。
    """
    big_rule_list = []
    sub_set_list = []
    for i in range(0, len(L)):
        for freq_set in L[i]:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    # confidence = support(freq_set) / support(antecedent)
                    conf = support_data[freq_set] / support_data[freq_set - sub_set]
                    big_rule = (freq_set - sub_set, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        # print freq_set-sub_set, " => ", sub_set, "conf: ", conf
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list


if __name__ == "__main__":

    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.2)  # minimum support
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.6)  # minimum confidence
    for Lk in L:
        if not Lk:  # no frequent itemsets of this size
            break
        print("="*50)
        print("frequent " + str(len(list(Lk)[0])) + "-itemsets\t\tsupport")
        print("="*50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    print()
    print("Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])

Results

==================================================
frequent 1-itemsets support
==================================================
frozenset({'i2'}) 0.7272727272727273
frozenset({'i1'}) 0.6363636363636364
frozenset({'i3'}) 0.6363636363636364
frozenset({'i4'}) 0.2727272727272727
==================================================
frequent 2-itemsets support
==================================================
frozenset({'i1', 'i2'}) 0.36363636363636365
frozenset({'i3', 'i2'}) 0.45454545454545453
frozenset({'i1', 'i3'}) 0.36363636363636365
Rules
frozenset({'i3'}) => frozenset({'i2'}) conf: 0.7142857142857143
frozenset({'i2'}) => frozenset({'i3'}) conf: 0.625
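
As a cross-check of the rule output, the confidence of {'i3'} => {'i2'} is support({'i2', 'i3'}) divided by support({'i3'}); plugging in the supports printed above reproduces the printed confidence:

support_i2_i3 = 0.45454545454545453   # support({'i3', 'i2'}) from the table above
support_i3 = 0.6363636363636364       # support({'i3'}) from the table above
print(support_i2_i3 / support_i3)     # 0.7142857142857143, matching the rule output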

Linear Regression

Workflow

Source Code

from matplotlib import pyplot as plt
import numpy as np
from sklearn.preprocessing import minmax_scale
from sklearn.model_selection import KFold
import os
os.chdir('./2_LR/')
def generate_data(N):
    import random
    student=[]
    delta=(10-0.4)/N
    y=lambda x:int(((-5/8)*x**2)+(25/2)*x+305/8)
    lt=[]
    for i in range(N):
        learn_time=0.+delta*i
        lt.append(learn_time)
        # print(learn_time)
        score=y(learn_time)
        # score=random.random(0,5)
        # print(score)
        student.append(score)
    lt,student=np.array(lt)[:,np.newaxis],np.array(student)[:,np.newaxis]
    return lt,student
def predict(theta, x):
    '''
    theta: (d, 1) parameter vector
    x:     (n, d) design matrix (the last column is the bias term)
    '''
    return x.dot(theta).reshape((-1, 1))
def train(X,Y):
    def predict(theta, x):
        '''
        theta: (d, 1) parameter vector
        x:     (n, d) design matrix (the last column is the bias term)
        '''
        return x.dot(theta).reshape((-1, 1))

    def const_error(h, y):
        # residual between the prediction h and the target y
        return h - y

    # mean squared error (h is the predicted y, con the residual)
    def cost(h, y, con):
        return (np.mean(con ** 2)) / 2

    def grad(x, con):
        # gradient of the cost with respect to theta
        return np.mean(con * x, axis=0, keepdims=True).transpose()
    gamma=1e-3
    epoch = 10000
    epsilon = 1e-8
    XY=np.concatenate((X,np.ones((len(X),1)),Y),axis=1)
    np.random.seed(2)
    np.random.shuffle(XY)
    X, Y = XY[:, :2], XY[:, 2:]
    kfold = KFold(5)
    thetas = []  # fitted parameters from each fold
    for j, (train_index, test_index) in enumerate(kfold.split(X)):
        # print(train_index)
        train_x, test_x, train_y, test_y = X[train_index], X[test_index], Y[train_index], Y[test_index]
        sc = []  # training-cost history
        vc = []  # validation-cost history
        theta = np.random.randn(2,1)  # random initialization
        this_time_con = 10000  # previous validation cost, used for the stopping test
        for i in range(epoch):
            h = predict(theta, train_x)
            con = const_error(h, train_y)
            g = grad(train_x, con)
            pre_y = predict(theta, test_x)
            if gamma >= 500:  # note: gamma starts at 1e-3, so this decay branch never fires
                gamma *= 0.95
            theta = theta - gamma * g  # gradient-descent update
            # record the losses
            validation_cost = cost(pre_y, test_y, const_error(pre_y, test_y))
            sc.append(cost(h, train_y, con))
            vc.append(validation_cost)
            if abs(this_time_con - validation_cost) <= epsilon:
                break
            else:
                this_time_con = validation_cost
        plt.plot(np.arange(len(sc)), np.array(sc), label="training_cost")
        plt.legend()
        plt.plot(np.arange(len(vc)), np.array(vc), label="validating_cost")
        plt.legend()
        plt.xlabel("epoch")
        plt.ylabel("cost")
        plt.title(str(j) + 'time_cost.png')
        plt.text(i, validation_cost, 'val_cost:\n(%d,%.3f)' % (i, validation_cost), fontsize=8)
        # plt.show()
        plt.savefig(str(j) + 'time_cost.png')
        plt.clf()
        thetas.append(theta)
        plt.scatter(X[:,0], Y, marker='x',c='red')
        plt.grid()
        x = np.arange(-0.2, 8, 0.1)[:,np.newaxis]
        x=np.concatenate((x,np.ones((len(x),1))),axis=1)
        y = predict(theta, x)
        plt.plot(x[:,0], y,c='blue')
        plt.title("%dtime_predict" % (j))
        plt.savefig("%dtime_predict" % (j))
        plt.clf()
    return thetas[-1]

if __name__ == '__main__':
    learn_time, score=generate_data(100)

    theta=train(learn_time,score)
    plt.plot(learn_time, score)
    x=np.arange(0,10,0.1)[:,np.newaxis]
    x = np.concatenate((x, np.ones((len(x), 1))), axis=1)
    plt.plot(x[:,0],predict(theta,x))
    # plt.show()
    plt.savefig('curve.tiff')
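
As a sanity check on the gradient-descent fit (not part of the original script; run it in the same session so generate_data is available), the same design matrix can be solved in closed form with NumPy's least-squares routine and compared with the theta returned by train():

import numpy as np

lt, score = generate_data(100)
X = np.concatenate((lt, np.ones((len(lt), 1))), axis=1)  # append the bias column
theta_ls, *_ = np.linalg.lstsq(X, score, rcond=None)
print(theta_ls)  # should be close to the theta from gradient descent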

Results



KNN Classification on a UCI Data Set

Workflow

step.1 --- initialize the distance to the maximum value
step.2 --- compute the distance dist between the unknown sample and each training sample
step.3 --- find the largest distance maxdist among the current K nearest samples
step.4 --- if dist is smaller than maxdist, add this training sample to the K nearest neighbours
step.5 --- repeat steps 2, 3 and 4 until the distances to all training samples have been computed
step.6 --- count how often each class label occurs among the K nearest neighbours
step.7 --- pick the most frequent class label as the label of the unknown sample
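
A minimal NumPy sketch of the brute-force procedure described above (independent of the sklearn script below; the names X_train, y_train and knn_predict are illustrative):

import numpy as np

def knn_predict(X_train, y_train, x, k=3):
    # step 2: distances from the unknown sample x to every training sample
    dist = np.linalg.norm(X_train - x, axis=1)
    # steps 3-5: indices of the K smallest distances
    nearest = np.argsort(dist)[:k]
    # steps 6-7: majority vote among the K nearest labels
    labels, counts = np.unique(y_train[nearest], return_counts=True)
    return labels[np.argmax(counts)]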

Source Code

# coding: utf-8
# Author:
# ## Show plots inline
# In[7]: get_ipython().magic('matplotlib inline')
# ## Import libraries
# In[17]:
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
# ## Load the data
# In[2]:
iris = load_iris()
X = iris.data
y = iris.target
# ## Visualize two dimensions at a time
# In[5]:
X_sepal = X[:, :2]
plt.scatter(X_sepal[:, 0], X_sepal[:, 1], c=y, cmap=plt.cm.gnuplot)
plt.xlabel('Sepal length')
plt.ylabel('Sepal width')
# In[6]:
X_petal = X[:, 2:4]
plt.scatter(X_petal[:, 0], X_petal[:, 1], c=y, cmap=plt.cm.gnuplot)
plt.xlabel('Petal length')
plt.ylabel('Petal width')
# ## Initialize the classifier
# ### Nearest neighbour (K=1)
# In[18]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X, y)
y_pred = knn1.predict(X)
print(metrics.accuracy_score(y, y_pred))
# Why is the training accuracy 1 when K=1? KNN looks up the closest observation in
# the training set, and because we evaluate on the very same data it always finds the
# identical observation. In other words, the model has simply memorized the training set.
# ### Split X and y into training and test sets
# In[15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=4)
# Can we find a better value of K?
# In[19]:
# Try K = 1 to 25 and record the test accuracy
k_range = list(range(1, 26))
test_accuracy = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    test_accuracy.append(metrics.accuracy_score(y_test, y_pred))
# In[20]:
plt.plot(k_range, test_accuracy)
plt.xlabel("Value of K for KNN")
plt.ylabel("Testing Accuracy")
# K = 9 looks like a good choice
# ## Use cross-validation instead
# In[21]:
from sklearn.model_selection import KFold
import numpy as np

def cv_estimate(k, kfold=5):
    cv = KFold(n_splits=kfold)
    clf = KNeighborsClassifier(n_neighbors=k)
    score = 0
    for train, test in cv.split(X):
        clf.fit(X[train], y[train])
        score += clf.score(X[test], y[test])
    score /= kfold
    return score

# In[22]:
k_range = list(range(1, 26))
test_accuracy = []
for k in k_range:
    test_accuracy.append(cv_estimate(k, 5))
# In[23]:
plt.plot(k_range, test_accuracy)
plt.xlabel("Value of K for KNN")
plt.ylabel("Average Accuracy of Kfold CV")
# Anything up to about 10 gives good results here.

Results




Decision Tree

Workflow

  1. Build the data set
  2. Compute the information entropy of the data set (worked example below)
  3. Go through every feature and pick the one with the smallest conditional entropy, i.e. the largest information gain; this is the best splitting feature
  4. Split the data set on that feature and remove the feature from the label list
  5. Recurse: return to step 3 and keep splitting the data set until classification is finished
  6. Use the decision tree to classify a sample and return the result (see the classify sketch after the source code)
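
As a quick worked example of step 2: the toy data set built in the source code below has 8 samples, 3 labelled '男' and 5 labelled '女', so its entropy is about 0.954 bits; a two-line check using only the standard library:

from math import log

# entropy of the 8-sample toy data set: 3 '男' and 5 '女'
entropy = -(3/8) * log(3/8, 2) - (5/8) * log(5/8, 2)
print(entropy)  # ≈ 0.954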

Source Code

from math import log
import operator

def calcShannonEnt(dataSet):  # compute the Shannon entropy of the data set
    numEntries=len(dataSet)  # number of samples
    labelCounts={}
    for featVec in dataSet:
        currentLabel=featVec[-1]  # the last field of each row is the class label
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel]=0
        labelCounts[currentLabel]+=1  # count the classes and how often each occurs
    shannonEnt=0
    for key in labelCounts:
        prob=float(labelCounts[key])/numEntries  # probability of a single class
        shannonEnt-=prob*log(prob,2)  # accumulate -p*log2(p) over the classes
    return shannonEnt

def createDataSet1():    # build the toy data set
    dataSet = [['长', '粗', '男'],
               ['短', '粗', '男'],
               ['短', '粗', '男'],
               ['长', '细', '女'],
               ['短', '细', '女'],
               ['短', '粗', '女'],
               ['长', '粗', '女'],
               ['长', '粗', '女']]
    labels = ['头发','声音']  # the two features: hair length and voice
    return dataSet,labels

def splitDataSet(dataSet,axis,value):  # rows where the given feature equals value, with that feature removed
    retDataSet=[]
    for featVec in dataSet:
        if featVec[axis]==value:
            reducedFeatVec =featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataSet.append(reducedFeatVec)
    return retDataSet

def chooseBestFeatureToSplit(dataSet):  # choose the best feature to split on
    numFeatures = len(dataSet[0])-1
    baseEntropy = calcShannonEnt(dataSet)  # entropy of the whole data set
    bestInfoGain = 0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet,i,value)
            prob =len(subDataSet)/float(len(dataSet))
            newEntropy +=prob*calcShannonEnt(subDataSet)  # entropy after splitting on this feature
        infoGain = baseEntropy - newEntropy  # information gain: reduction in entropy from the split
        if (infoGain>bestInfoGain):   # the feature that reduces entropy the most is the best split
            bestInfoGain=infoGain
            bestFeature = i
    return bestFeature

def majorityCnt(classList):    # majority vote, e.g. a leaf with 2 '男' and 1 '女' is labelled '男'
    classCount={}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote]=0
        classCount[vote]+=1
    sortedClassCount = sorted(classCount.items(),key=operator.itemgetter(1),reverse=True)
    return sortedClassCount[0][0]

def createTree(dataSet,labels):
    classList=[example[-1] for example in dataSet]  # class labels: '男' or '女'
    if classList.count(classList[0])==len(classList):
        return classList[0]
    if len(dataSet[0])==1:
        return majorityCnt(classList)
    bestFeat=chooseBestFeatureToSplit(dataSet)  # pick the best splitting feature
    bestFeatLabel=labels[bestFeat]
    myTree={bestFeatLabel:{}}  # the tree is stored as a nested dict
    del(labels[bestFeat])
    featValues=[example[bestFeat] for example in dataSet]
    uniqueVals=set(featValues)
    for value in uniqueVals:
        subLabels=labels[:]
        myTree[bestFeatLabel][value]=createTree(splitDataSet\
                            (dataSet,bestFeat,value),subLabels)
    return myTree


if __name__=='__main__':
    dataSet, labels=createDataSet1()  # build the toy data
    print(createTree(dataSet, labels))  # print the learned decision tree
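
The script above only builds the tree; step 6 of the workflow (classifying a new sample) is not implemented. A minimal hypothetical classify helper for the nested-dict tree could look like the sketch below; featLabels must be the original feature list ['头发', '声音'] (createTree modifies the labels list it is given, so pass a fresh copy):

def classify(tree, featLabels, testVec):
    # walk down the nested dict until a leaf (a plain class label) is reached
    featName = list(tree.keys())[0]
    featIndex = featLabels.index(featName)
    subTree = tree[featName][testVec[featIndex]]
    if isinstance(subTree, dict):
        return classify(subTree, featLabels, testVec)
    return subTree

# usage sketch: classify(myTree, ['头发', '声音'], ['短', '粗'])  ->  '男'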

Results

{'声音': {'粗': {'头发': {'长': '女', '短': '男'}}, '细': '女'}}

Naive_Bayes

Workflow

Source Code

import pandas as pd
import numpy as np
import math
from functools import reduce
# from scipy import stats

'''
Weather factors include temperature, humidity, wind and so on. Given the data,
learn a Naive Bayes classifier and output the probability relation between whether
a person exercises and the weather.
# prior probability p(c); there are two classes here
# the evidence p(f1, f2, ...) is the same for every class, so it can be ignored
# likelihood p(f1, f2, ... | c) = p(f1|c) * p(f2|c) * ...
## continuous features are assumed to be Gaussian
'''
class Naive_Bayes():
    def __init__(self):
        self.data=pd.read_csv('3_data.csv')
        self.P_category()
        self.likelihood()
    def category_extract(self,values):
        # prior
        labels = set(values)
        self.__labels_Lenth=len(labels)
        for s in labels:
            indices = np.where(values == s)[0]
            yield s,indices

    def P_category(self):
        self.prior = dict()
        L = len(self.data['运动'].values)
        for s,indices in self.category_extract(self.data['运动'].values):
            self.prior[s] = len(indices) / L
        # {'不适合': 0.35714285714285715, '适合': 0.6428571428571429}

    def likelihood(self):
        # extract the row indices of each class
        # two cases: object (categorical) columns and int64 (continuous) columns

        self.Pfeature = dict()  # likelihood tables
        for info in self.data.columns.values[:-1]:
            self.Pfeature[info]=dict()
            for c,indices in self.category_extract(self.data['运动'].values):
                temp_values=self.data[info].values[indices]  # values of column <info> for rows of class <c>, e.g. <天气> given <不适合>

                if (self.data[info].dtype==np.int64):
                    # integer column: assume a Gaussian distribution
                    u=np.mean(temp_values)
                    theta=np.var(temp_values)  # note: np.var is the variance, while __gauss uses theta as the standard deviation
                    self.Pfeature[info][c]=[u,theta]  # e.g. <温度> given <不适合>: Gaussian parameters [u, theta]
                elif (self.data[info].dtype==object):
                    # text column: discrete values, counted directly with add-one smoothing
                    self.Pfeature[info][c] = dict()
                    L = len(temp_values)+self.__labels_Lenth
                    for s, inds in self.category_extract(temp_values):
                        self.Pfeature[info][c][s] =  (len(inds)+1)/ L  # e.g. likelihood of <天气>=<多云> given <不适合>
                else:
                    print(self.data[info].dtype, 'unsupported dtype')
                    exit()

    def __gauss(self,x,u,theta):
        # score x under N(u, theta): sum the Gaussian density at 0.1-wide steps
        # over [x-0.5, x+0.5); theta is used as the standard deviation here
        x=np.arange(x-0.5,x+0.5,0.1)
        return np.sum(1 / math.sqrt(2 * math.pi * theta ** 2) * np.exp(-(x - u) ** 2 / (2 * theta ** 2)))
    def predict(self,n=-1):
        # features and true label of the n-th row
        x = list(self.data[info].values[n] for info in self.data.columns.values[:-1])
        y = self.data[self.data.columns.values[-1]].values[n]
        max_p=[0,0,0]  # [best score, best class, sum of the scores over all classes]
        for c,d in self.prior.items():
            temp_possibility=[]  # factors of the product prior * likelihoods
            temp_possibility.append(d)
            for i,info in enumerate(self.data.columns.values[:-1]):  # feature columns only
                if (self.data[info].dtype==np.int64):
                    temp_possibility.append(self.__gauss(x[i],self.Pfeature[info][c][0],self.Pfeature[info][c][1]))
                elif (self.data[info].dtype==object):
                    try:
                        temp_possibility.append(self.Pfeature[info][c][x[i]])
                    except KeyError:
                        temp_possibility.append(0.00001)  # unseen feature value: tiny pseudo-likelihood
            temp=reduce(lambda a,b:a*b,temp_possibility)
            max_p[2]+=temp
            if temp>max_p[0]:
                max_p[:2]=temp,c

        # print(self.Pfeature)
        '''
        {'天气': 
                {'不适合': {'有雨': 0.42857142857142855, '晴': 0.5714285714285714}, 
                '适合': {'有雨': 0.36363636363636365, '多云': 0.45454545454545453, '晴': 0.2727272727272727}}, 
        '温度': 
                {'不适合': [74.599999999999994, 49.839999999999996], 
                '适合': [73.0, 33.777777777777779]}, 
        '湿度': 
                {'不适合': [84.0, 74.0], 
                '适合': [78.222222222222229, 86.839506172839506]}, 
        '风况': 
            {'不适合': {'有': 0.5714285714285714, '无': 0.42857142857142855}, 
            '适合': {'有': 0.36363636363636365, '无': 0.6363636363636364}}}
        '''

        print(x,max_p[1],max_p[0]/max_p[2])
        return max_p[1]
        pass

if __name__ == '__main__':
    myNB=Naive_Bayes()
    # myNB.predict(-4)
    y=list(map(myNB.predict,list(range(14))))
    # print(y)
    # print(np.where(y==myNB.data['运动'].values)[0])
    print(len(np.where(y==myNB.data['运动'].values)[0])/len(y))
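
A quick worked check of the numbers in the comment inside P_category (the 14-row weather data set has 9 rows labelled 适合 and 5 labelled 不适合), together with the decision rule that predict applies; this is illustration only, not part of the script:

# priors from the class counts, matching the comment inside P_category
prior_fit = 9 / 14      # 0.6428571428571429  ('适合')
prior_unfit = 5 / 14    # 0.35714285714285715 ('不适合')

# decision rule used in predict: choose the class c that maximizes
#     score(c) = P(c) * product_i P(feature_i | c)
# the number printed next to each prediction is score(best c) / sum_c score(c)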

Results

['晴', 85, 85, '无'] 适合 0.605515228032
['晴', 80, 90, '有'] 不适合 0.597763996605
['多云', 83, 78, '无'] 适合 0.999993280941
['有雨', 70, 96, '无'] 适合 0.738902944045
['有雨', 68, 80, '无'] 适合 0.740164275592
['有雨', 65, 70, '有'] 适合 0.550906281645
['多云', 64, 65, '有'] 适合 0.99998485259
['晴', 72, 95, '无'] 适合 0.614379338515
['晴', 69, 70, '无'] 适合 0.618965026052
['有雨', 75, 80, '无'] 适合 0.74028592022
['晴', 75, 70, '有'] 不适合 0.589771190225
['多云', 72, 90, '有'] 适合 0.999984648989
['多云', 81, 75, '无'] 适合 0.999993369057
['有雨', 71, 80, '有'] 适合 0.550487083686
0.6428571428571429

K-Means Image Segmentation

Workflow

Source Code

# Image segmentation based on the K-means algorithm
import cv2
import numpy as np
# read the input image
start=0
img0=cv2.imread('./kmean/'+str(start)+'.tiff')
zz=np.load('./kmean/zz.npy')  # centres saved by a previous run (re-initialized inside kmean below)
C = 3  # number of clusters


def kmean(img0):
    img = np.copy(img0)
    img=np.array(img,dtype=np.float32)
    shape = img.shape[:2]
    # zz=np.array([[60,80,100],[140,160,180],[220,240,260]])  # alternative hand-picked centres
    zz=np.array([[ 107.42873407,165.75729793,149.6303364],[51.03919928,54.9071066,45.18418758],[304.08849809,230.56291292,161.19507833]])  # initial cluster centres

    # zz=np.random.randint(0,255,(3,3))  # or random initialization
    L=shape[0]*shape[1]  # number of pixels
    kinds=np.empty((L,),dtype=np.uint8)  # cluster label of each pixel
    data=np.reshape(img,(L,3))  # flatten the image to (L, 3) pixel rows
    for time in range(start,5+start):
        print(time)
        # assignment step: give every pixel the label of its nearest centre
        for i in range(L):
            mdzz = np.linalg.norm(data[i] - zz,axis=1)
            min_mdzz_index = np.argmin(mdzz)
            kinds[i] = min_mdzz_index
        # update step: move each centre to the mean of its pixels
        for k in range(C):
            tmp_where = np.where(kinds == k)[0]
            zz[k] = np.mean(data[tmp_where],axis=0)
        print('zz\n',zz)
        # paint every pixel with its cluster centre (note: this overwrites data,
        # so later iterations cluster the already-quantized pixels)
        for k in range(C):
            tmp_where = np.where(kinds == k)[0]
            data[tmp_where] = zz[k]
        # reshape back into an image and save this iteration's result
        temp = np.reshape(data, (shape[0],shape[1],3))
        temp=np.uint8(temp)
        print(temp.shape)
        print('saving %d.tiff'%(time+1))
        cv2.imwrite('./kmean/' + str(time+1) + '.tiff', temp)
    np.save('./kmean/zz.npy',zz)
    return kinds

if __name__ == '__main__':
    kinds=kmean(img0)
    gray=[0,178,255]
    img_gray=cv2.cvtColor(img0,cv2.COLOR_BGR2GRAY)
    shape=img_gray.shape
    img_gray=np.reshape(img_gray,(img_gray.size,))
    for k in range(C):
        tmp_where = np.where(kinds == k)[0]
        img_gray[tmp_where]=gray[k]
    img_gray=np.reshape(img_gray,shape)
    cv2.imwrite('./kmean/gray.tiff', img_gray)
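
For comparison (not part of the original script), OpenCV's built-in k-means gives a similar segmentation; a minimal sketch assuming the same ./kmean/0.tiff input, with opencv_kmeans.tiff as an illustrative output name:

import cv2
import numpy as np

img = cv2.imread('./kmean/0.tiff')
data = np.float32(img.reshape((-1, 3)))  # one row per pixel
criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 5, 1.0)
_, labels, centers = cv2.kmeans(data, 3, None, criteria, 10, cv2.KMEANS_RANDOM_CENTERS)
segmented = np.uint8(centers)[labels.flatten()].reshape(img.shape)
cv2.imwrite('./kmean/opencv_kmeans.tiff', segmented)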

Results


