对需要聚类的数据使用canopy做初步的计算

使用K-means(K均值)聚类时,需要事先自行指定cluster(簇)的数目K。

这个cluster数目一般是通过canopy算法进行预处理来确定的。

canopy具体描述可以参考这里。

对需要聚类的数据使用canopy做初步的计算_第1张图片

 

下面是用Go语言写的一个实现(以经纬度之间的球面距离作为度量,对坐标点进行canopy聚类)。

package main

import (
    "fmt"
    "math"
)

const (
    // EARTH_RADIUS is the sphere radius that GreatCircleDistance multiplies
    // the central angle by. GreatCircleDistance reports kilometers, so this
    // value is expressed in kilometers as well.
    EARTH_RADIUS = 6371
)

// Point is a geographic position described by a latitude and a longitude.
// GreatCircleDistance converts both fields from degrees to radians, so they
// are expected to be given in degrees.
type Point struct {
	lat, lng float64
}

// Pop splits points into its first element and the remainder.
//
// For an empty (or nil) slice it returns the zero Point and a nil slice.
// The returned remainder is a sub-slice of points, not a copy, so it shares
// the same backing array.
func Pop(points []Point) (p Point, newPoints []Point) {
	if len(points) == 0 {
		return Point{}, nil
	}
	return points[0], points[1:]
}

// Push appends p to the end of points and returns the resulting slice.
// As with the built-in append, the result may or may not share its backing
// array with points, so callers must use the returned slice.
func Push(p Point, points []Point) []Point {
	return append(points, p)
}

// GreatCircleDistance returns the great-circle distance between p1 and p2 in
// kilometers, computed with the haversine formula on a sphere of radius
// EARTH_RADIUS. The coordinates of both points are interpreted as degrees.
//
// The intermediate haversine term is clamped to [0, 1]: floating-point
// rounding can push it marginally above 1 for (nearly) antipodal points,
// which would otherwise make math.Sqrt(1-a) — and therefore the result — NaN.
//
// Original implementation from: http://www.movable-type.co.uk/scripts/latlong.html
func GreatCircleDistance(p1, p2 Point) float64 {
	const degToRad = math.Pi / 180.0

	lat1 := p1.lat * degToRad
	lat2 := p2.lat * degToRad
	dLat := (p2.lat - p1.lat) * degToRad
	dLon := (p2.lng - p1.lng) * degToRad

	sinHalfLat := math.Sin(dLat / 2)
	sinHalfLon := math.Sin(dLon / 2)

	a1 := sinHalfLat * sinHalfLat
	a2 := sinHalfLon * sinHalfLon * math.Cos(lat1) * math.Cos(lat2)
	a := a1 + a2

	// Keep a inside the domain where both square roots below are defined.
	if a > 1 {
		a = 1
	} else if a < 0 {
		a = 0
	}

	c := 2 * math.Atan2(math.Sqrt(a), math.Sqrt(1-a))
	return EARTH_RADIUS * c
}

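/*
Canopy clustering in pseudocode (x1 is the loose threshold, x2 the tight one, normally x1 > x2):

while (some data point has no strong mark) {
    pick a data point p that has no strong mark
    make p the center of a new canopy c
    every point within distance x1 of p joins c and gets a weak mark    // it may still join other canopies
    every point within distance x2 of p joins c and gets a strong mark  // it will not join any other canopy
}
*/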

//目前只实现了经纬度以及经纬度的距离计算,这里可以是一个向量 func CanopyCluster(points []Point, x1, x2 float64) { var tmp []Point var cluster [][]Point for len(points) > 0 { var center Point center, points = Pop(points) index := len(cluster) var cpList []Point cpList = append(cpList, center) cluster = append(cluster, cpList) var cur Point for len(points) > 0 { cur, points = Pop(points) distance := GreatCircleDistance(center, cur) if distance <= x1 { cluster[index] = append(cluster[index], cur) if distance > x2 { tmp = Push(cur, tmp) } } else { tmp = Push(cur, tmp) } } fmt.Printf("current number of items in this canopy %d\n", center) var t []Point points = tmp tmp = t } for k, c := range cluster { fmt.Println("canopy", k, "has", len(c), "items:") for _, v := range c { fmt.Println("\t", v.lat, v.lng) } } } func main() { pointsList := []Point{ {34.28637, -110.12059}, {34.28638, -110.1206}, {34.29077, -110.12078}, {34.29111, -110.11941}, {34.29113, -110.11938}, {34.29116, -110.1194}, {34.29145, -110.12043}, {34.29146, -110.12063}, {34.29154, -110.11873}, {34.3141, -110.11556}, {34.31411, -110.11557}, {34.31411, -110.11556}, {34.31412, -110.11556}, {34.31412, -110.11557}, {34.31415, -110.11552}, {34.31415, -110.11556}, } CanopyCluster(pointsList, 1.0, 0.8) }

 

你可能感兴趣的:(对需要聚类的数据使用canopy做初步的计算)