sklearn DBSCAN聚类自定义距离函数DTW

2019独角兽企业重金招聘Python工程师标准>>>

某些特殊场景下，普通的欧式距离、曼哈顿距离等并不能满足我们的需求。例如在语音识别中，常使用DTW距离（Dynamic Time Warping，动态时间规整）。如下图，实线和虚线分别是同一个词“pen”的两个语音波形（在y轴上拉开了，以便观察）。可以看到它们整体上的波形形状很相似，但在时间轴上却是不对齐的。

因此需要把“同时间度量”（逐时刻比较）转变为“同模式度量”（先按波形模式对齐再比较），才能更好地反映2个语音波形的相似性：

（图片转载自：https://www.cnblogs.com/Daringoo/p/4095508.html）

而在sklearn中，我们可以自定义部分机器学习模型的距离函数，例如聚类算法DBSCAN就可以自定义距离：

# Plug the DTW distance into DBSCAN through the `metric` callable.
# The callable receives two feature rows and must return a single distance;
# eps=14 is the threshold chosen for the sample curves in this article.
dbscan = DBSCAN(eps=14,
                min_samples=2,
                metric=lambda a, b: DTW.distance(a, b))

使用metric参数即可。那么算法对这个距离函数有什么要求呢？

1、给出2组feature，它们的类型都是np.ndarray

2、返回一个距离，数据类型是float

为了使得距离适应DTW的特性（即长短不一致），又符合同类相比的距离标准（即所有feature长度一致），我们使用一个特殊数字-9999来填充所有的曲线，使它们长度一致。在运算中，我们用return_center_data函数把这个填充数字去掉，使得DTW算法能够正确地对原始数据进行距离计算。

由于函数输入变量的类型为np.ndarray，我们为了后续方便操作，全部转化为list类型

    @staticmethod
    def distance(s1, s2, signal_num=-9999):
        # type: (np.ndarray, np.ndarray, int) -> float
        """Return the DTW cost between two padded feature rows.

        Both inputs are copied into plain lists, the ``signal_num`` padding
        is trimmed with ``DTW.return_center_data`` and the remaining values
        are compared with ``DTW.dtw`` using ``DTW.dist_for_float`` as the
        point-wise distance.  Only the accumulated cost (first element of
        the ``dtw`` result) is returned.
        """
        trimmed_first = DTW.return_center_data(list(s1), signal_num)
        trimmed_second = DTW.return_center_data(list(s2), signal_num)
        dtw_result = DTW.dtw(trimmed_first, trimmed_second, DTW.dist_for_float)

        return dtw_result[0]

这样就能正确计算距离，并且聚类了：

聚类结果为：

[0 0 1 1 1]

结果显示曲线可分成两类。详细两两对比的距离结果如下：

(14.0, [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0], [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0])
(41.0, [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0], [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0])
(40.0, [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0], [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0])
(42.0, [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0], [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0])
(33.0, [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0], [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0])
(30.0, [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0], [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0])
(32.0, [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0], [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0])
(13.0, [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0], [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0])
(17.0, [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0], [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0])
(13.0, [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0], [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0])
(0.0, [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0], [1.0, 2.0, 3.0, 11.0, 11.0, 6.0, 6.0, 6.0, 6.0])
(0.0, [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0], [2.0, 2.0, 3.0, 11.0, 4.0, 6.0, 6.0, 6.0, 7.0])
(0.0, [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0], [3.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0])
(0.0, [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0], [4.0, 8.0, 3.0, 1.0, 2.0, 3.0, 4.0])
(0.0, [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0], [5.0, 8.0, 3.0, 1.0, 2.0, 3.0, 3.0, 4.0])

我们发现，在DBSCAN聚类中，如何调整区分不同类别曲线的距离阈值（即eps参数）是关键。

完整代码如下：

# coding=utf-8
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import DBSCAN


class ToolFuncOfDTW:
    """Debug helpers for inspecting the matrices built by the DTW code."""

    def __init__(self):
        pass

    @staticmethod
    def print_matrix(mat):
        """Print the dimensions of ``mat`` followed by one row per line.

        :param mat: a sequence of rows (for example a list of lists).
            The width reported is the length of the first row; an empty
            matrix is reported as width 0 instead of raising IndexError.
        """
        height = len(mat)
        # Guard the first-row lookup so an empty matrix can still be printed.
        width = len(mat[0]) if height > 0 else 0
        print('[matrix] width : %d height : %d' % (width, height))
        print('-----------------------------------')
        for row in mat:
            print(row)


class DTW:
    """Dynamic Time Warping (DTW) helpers for padded 1-D sequences.

    Every method is static; the class only serves as a namespace.
    """

    def __init__(self):
        pass

    @staticmethod
    def numpy_num_to_python_num(p1):
        """Return ``p1`` as a built-in number when it is a NumPy scalar.

        Any NumPy integer scalar becomes ``int`` and any NumPy floating
        scalar becomes ``float`` (not only ``np.int32`` / ``np.float64``,
        so ``np.int64`` or ``np.float32`` elements are handled as well).
        Every other value is returned unchanged.
        """
        if isinstance(p1, np.integer):
            return int(p1)
        if isinstance(p1, np.floating):
            return float(p1)
        return p1

    @staticmethod
    def dist_for_float(p1, p2):
        """Point-wise distance used by :meth:`dtw`.

        :param p1: a number, or an indexable vector of numbers.
        :param p2: same kind of value as ``p1``.
        :return: ``abs(p1 - p2)`` as a float when both are scalars,
            otherwise the Euclidean distance over the indices of ``p1``.
        """
        p1 = DTW.numpy_num_to_python_num(p1)
        p2 = DTW.numpy_num_to_python_num(p2)
        if isinstance(p1, (int, float)) and isinstance(p2, (int, float)):
            return float(abs(p1 - p2))

        # Vector case: iterate over the indices of p1, as the original did,
        # so a shorter p2 still raises instead of being silently truncated.
        sum_val = 0.0
        for i in range(len(p1)):
            sum_val += pow(p1[i] - p2[i], 2)
        return pow(sum_val, 0.5)

    @staticmethod
    def dtw(s1, s2, dist_func):
        """Compute the DTW cost and warping path between ``s1`` and ``s2``.

        :param s1: first sequence (indexed along x).
        :param s2: second sequence (indexed along y).
        :param dist_func: callable ``(a, b) -> number`` giving the local
            distance between one element of ``s1`` and one of ``s2``.
        :return: ``(cost, path)`` where ``cost`` is the accumulated cost of
            the bottom-right cell and ``path`` is the sorted list of
            ``(x, y)`` cells on the back-traced warping path.
        :raises IndexError: if either sequence is empty.

        Weighting note: the start cell counts its local distance twice and
        the diagonal candidate is twice the *accumulated* cost of the
        diagonal neighbour.  This is kept exactly as written because the
        distances published with this code depend on it.
        """
        w = len(s1)
        h = len(s2)

        # mat[y][x] = [local distance, accumulated cost, prev x, prev y]
        mat = [[[dist_func(s1[x], s2[y]), 0, 0, 0] for x in range(w)]
               for y in range(h)]

        start_cell = mat[0][0]
        start_cell[1] = start_cell[0] * 2

        # First row: the only way in is from the left.
        for x in range(1, w):
            cell = mat[0][x]
            cell[1] = cell[0] + mat[0][x - 1][1]
            cell[2] = x - 1
            cell[3] = 0

        # First column: the only way in is from above.
        for y in range(1, h):
            cell = mat[y][0]
            cell[1] = cell[0] + mat[y - 1][0][1]
            cell[2] = 0
            cell[3] = y - 1

        for y in range(1, h):
            for x in range(1, w):
                # Candidate order (left, up, diagonal) fixes the tie-break:
                # list.index returns the first minimal candidate.
                distlist = [mat[y][x - 1][1],
                            mat[y - 1][x][1],
                            2 * mat[y - 1][x - 1][1]]
                predecessors = ((x - 1, y), (x, y - 1), (x - 1, y - 1))
                mindist = min(distlist)
                prev_x, prev_y = predecessors[distlist.index(mindist)]
                cell = mat[y][x]
                cell[1] = cell[0] + mindist
                cell[2] = prev_x
                cell[3] = prev_y

        retval = mat[h - 1][w - 1][1]

        # Walk the predecessor links back to the origin.  Testing before
        # stepping avoids recording (0, 0) twice for 1x1 input.
        x, y = w - 1, h - 1
        path = [(x, y)]
        while (x, y) != (0, 0):
            x, y = mat[y][x][2], mat[y][x][3]
            path.append((x, y))
        return retval, sorted(path)

    @staticmethod
    def distance(s1, s2, signal_num=-9999):
        # type: (np.ndarray, np.ndarray, int) -> float
        """Return the DTW cost between two padded feature rows.

        :param s1: first row; ``signal_num`` entries at either end are padding.
        :param s2: second row, padded the same way.
        :param signal_num: the padding value to strip before comparing.
        :return: the accumulated DTW cost of the trimmed sequences.  When
            both rows contain only padding the distance is ``0.0``; when
            exactly one does it is ``float('inf')``.

        The result and the trimmed inputs are printed on every call so the
        pairwise distances can be inspected.
        """
        s1_in = DTW.return_center_data(list(s1), signal_num)
        s2_in = DTW.return_center_data(list(s2), signal_num)
        if len(s1_in) == 0 or len(s2_in) == 0:
            # dtw() cannot align an empty sequence; treat two empty rows as
            # identical and an empty row as infinitely far from a real one.
            result = 0.0 if len(s1_in) == len(s2_in) else float('inf')
        else:
            result = DTW.dtw(s1_in, s2_in, DTW.dist_for_float)[0]
        print(result, s1_in, s2_in)
        return result

    @staticmethod
    def return_center_data(list_data, signal_num=-9999):
        # type: (list, int) -> list
        """Strip leading and trailing ``signal_num`` padding from ``list_data``.

        :param list_data: sequence that may be padded at both ends.
        :param signal_num: the padding value.
        :return: the slice from the first to the last non-padding element
            (padding values in between are kept); an empty slice when the
            input is empty or consists of padding only.
        """
        start = next(
            (i for i, value in enumerate(list_data) if value != signal_num),
            None)
        if start is None:
            return list_data[0:0]

        # Scan back down to index 0 inclusive, so a sequence whose only data
        # element is the first one still loses its trailing padding.
        end = start
        for i in range(len(list_data) - 1, start - 1, -1):
            if list_data[i] != signal_num:
                end = i
                break
        return list_data[start:end + 1]


class TestDTW:
    """Manual demos for the DTW helpers and for custom scikit-learn metrics.

    Every method is a static, run-by-hand scenario that prints and/or plots
    its results; nothing here is asserted automatically.
    """

    def __init__(self):
        pass

    @staticmethod
    def display(s1, s2):
        """Plot the DTW warping path of ``s1`` against ``s2`` with both curves."""
        val, path = DTW.dtw(s1, s2, DTW.dist_for_float)

        n_cols = len(s1)
        n_rows = len(s2)

        # 1 everywhere, 0 on the cells visited by the warping path.
        grid = [[1] * n_cols for _ in range(n_rows)]
        for col, row in path:
            grid[row][col] = 0
        grid = np.array(grid)

        # Top-right panel: the path grid, titled with the DTW cost.
        plt.subplot(2, 2, 2)
        plt.pcolor(grid, edgecolors='k', linewidths=4)
        plt.title('Dynamic Time Warping (%f)' % val)

        # Top-left panel: s2 drawn along the vertical axis.
        plt.subplot(2, 2, 1)
        plt.plot(s2, range(n_rows), 'g')

        # Bottom-right panel: s1 drawn along the horizontal axis.
        plt.subplot(2, 2, 4)
        plt.plot(range(n_cols), s1, 'r')

        plt.show()

    @staticmethod
    def test_path():
        """Run DTW on two short integer sequences, plot and print the result."""
        first = [1, 2, 3, 4, 5, 5, 5, 4]
        second = [3, 4, 5, 5, 5, 4]
        cost, warp_path = DTW.dtw(first, second, DTW.dist_for_float)
        TestDTW.display(first, second)
        print(cost, warp_path)

    @staticmethod
    def test_remove_signal():
        """Compute the distance of two -9999-padded sequences and show them.

        Note that ``display`` is called with the padded lists, exactly as
        the distance call is given the padded arrays.
        """
        padded_a = [1, 2, 3, 4, 5, -9999, -9999, -9999, -9999, -9999]
        padded_b = [-9999, -9999, -9999, 1, 2, 3, 4, 5,
                    -9999, -9999, -9999, -9999, -9999]
        dist = DTW.distance(np.array(padded_a), np.array(padded_b))
        TestDTW.display(padded_a, padded_b)
        print(padded_a)
        print(padded_b)
        print(dist)

    @staticmethod
    def test_cluster_effect():
        """Show a custom callable metric being used by ``NearestNeighbors``."""
        from sklearn.neighbors import NearestNeighbors
        LL = 3

        def d(a, b, l):
            # type: (np.ndarray, np.ndarray, int) -> float
            # Squared Euclidean distance over the indices of ``b``, shifted
            # by the constant ``l``.
            values_a = list(a.tolist())
            values_b = list(b.tolist())

            squared_sum = 0.0
            for idx in range(len(values_b)):
                delta = values_a[idx] - values_b[idx]
                squared_sum += delta * delta
            return squared_sum + l

        knn = NearestNeighbors(n_neighbors=2,
                               algorithm='auto',
                               metric=lambda a, b: d(a, b, LL)
                               )
        X = np.array([[-1, -1],
                      [-2, -1],
                      [-3, -2],
                      [1, 1],
                      [2, 1],
                      [3, 2]])
        knn.fit(X)

        # Neighbours of the training points themselves.
        distances, indices = knn.kneighbors(X)
        print(distances)
        print(indices)
        print(knn.kneighbors_graph(X).toarray())
        print("---------------------------")

        # Neighbours of a single new query point.
        query = np.array([[-3, -3]])
        distances, indices = knn.kneighbors(query)
        print(distances)
        print(indices)
        print(knn.kneighbors_graph(X).toarray())

    @staticmethod
    def test_cluster_effect_agg():
        """Cluster five padded curves with DBSCAN using the DTW distance."""
        s = [[1, 2, 3, 11, 11, 6, 6, 6, 6, -9999, -9999, -9999, -9999],
             [-9999, -9999, -9999, 2, 2, 3, 11, 4, 6, 6, 6, 7, -9999],
             [3, 8, 3, 1, 2, 3, 3, -9999, -9999, -9999, -9999, -9999, -9999],
             [4, 8, 3, 1, 2, 3, 4, -9999, -9999, -9999, -9999, -9999, -9999],
             [-9999, -9999, 5, 8, 3, 1, 2, 3, 3, 4, -9999, -9999, -9999]]
        X = np.array(s)

        # The metric callable is where the custom distance is plugged in.
        # NOTE(review): DTW is presumably not a true metric; check whether
        # DBSCAN should be forced to algorithm='brute' here - TODO confirm.
        dbscan = DBSCAN(eps=14,
                        min_samples=2,
                        metric=lambda a, b: DTW.distance(a, b))
        cluster = dbscan.fit_predict(X)
        print(cluster)

        plt.rcParams.update({'figure.autolayout': True})
        n_rows = len(s) + 1
        for position, curve in enumerate(s, start=1):
            # Three-digit subplot code: rows, one column, 1-based position.
            plt.subplot(n_rows * 100 + 10 + position)
            plt.plot(DTW.return_center_data(curve))
            plt.xticks([0, 1, 2, 3, 4, 5, 6, 7, 8])
            plt.ylabel(str(position))
        plt.xlabel('step')
        plt.show()


if __name__ == "__main__":
    # Run the DBSCAN + DTW clustering demo when executed as a script.
    TestDTW.test_cluster_effect_agg()

sklearn DBSCAN聚类自定义距离函数DTW

你可能感兴趣的:(人工智能,python,matlab)