k-means算法


import os, sys
import argparse
import numpy as np
import time
import matplotlib.pyplot as plt


# Command-line interface: one optional integer option selecting the random seed.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--seed",
    default=42,
    type=int,
    help="Random seed.",
)
args = parser.parse_args()

# Seed NumPy's global generator so a given --seed always yields the same draws.
np.random.seed(seed=args.seed)
# Sample data points from three different Gaussians.
# Component means.
u1 = np.asarray([1.0, 1.0])
u2 = np.asarray([-1.0, 1.0])
u3 = np.asarray([5.0, 5.0])

# Random 2x2 factors, drawn in the order 1, 2, 3 so the random stream is
# consumed in a fixed sequence.
sigma1 = np.random.rand(2, 2)
sigma2 = np.random.rand(2, 2)
sigma3 = np.random.rand(2, 2)
# A.dot(A.T) is symmetric positive semi-definite, hence a valid covariance;
# each component then gets its own scale factor.
sigma1 = 1.0 * sigma1.dot(sigma1.T)
sigma2 = 5.0 * sigma2.dot(sigma2.T)
sigma3 = 2.0 * sigma3.dot(sigma3.T)

# Number of points drawn from each component.
num_pts = 100
# The generator is consumed left to right, so the draws happen in the order
# samples1, samples2, samples3.
samples1, samples2, samples3 = (
    np.random.multivariate_normal(mean, cov * np.ones((2, 2)), size=num_pts)
    for mean, cov in ((u1, sigma1), (u2, sigma2), (u3, sigma3))
)

# Plot for visualization.
# The figure is saved under ./examples; savefig does not create missing
# directories, so make sure the directory exists before writing to it.
if not os.path.isdir("./examples"):
    os.makedirs("./examples")
plt.figure()
# One scatter layer per true component: red, blue, green for components 1, 2, 3.
for pts, color in ((samples1, "r"), (samples2, "b"), (samples3, "g")):
    plt.scatter(pts[:, 0], pts[:, 1], s=40, c=color, alpha=0.5)
plt.grid(True)
plt.title("Ground Truth Clustering")
plt.savefig("./examples/clustering_{}.pdf".format(args.seed), bbox_inches="tight")
plt.show()

# Vanilla K-means clustering.
# Stack the three sample sets into one array with one 2-D point per row.
samples = np.vstack([samples1, samples2, samples3])
# Shuffle the rows so they are no longer grouped by source component.
# np.random.shuffle permutes its argument in place and returns None, so the
# index array must not be rebound to the call's result.
rorder = np.arange(num_pts * 3)
np.random.shuffle(rorder)
samples = samples[rorder, :]
# Lloyd's algorithm, with random initialization.
k = 3
centers = np.random.rand(k, 2)
num_iters = 10
# Loss recorded once per iteration, before the centers are updated.
losses = []
# Squared norm of every sample; it never changes, so compute it once.
xdist = np.sum(samples * samples, axis=1)
for _ in range(num_iters):
    # Squared distance from every sample to every center, using
    # ||x - c||^2 = ||x||^2 + ||c||^2 - 2 * x.c  (one column per center).
    cdist = np.sum(centers * centers, axis=1)
    consts = xdist[:, np.newaxis] + cdist
    dists = consts - 2 * np.dot(samples, centers.T)
    # Compute cluster assignment: index of the nearest center for each sample.
    ids = np.argmin(dists, axis=1)
    # Loss is the sum of squared distances to the assigned centers.
    losses.append(np.sum(np.min(dists, axis=1)))
    # Move each center to the mean of the samples assigned to it.
    for i in range(k):
        members = samples[ids == i]
        # The mean of an empty cluster is NaN, and a NaN center would then be
        # picked by every later argmin; keep the previous center instead.
        if len(members) > 0:
            centers[i, :] = np.mean(members, axis=0)

# Plot loss function.
# X axis: iteration index 0 .. num_iters - 1; Y axis: the recorded losses.
iterations = np.arange(num_iters)
plt.figure()
plt.plot(iterations, losses, "bo-", markersize=10, linewidth=4)
plt.title("K-means loss function")
plt.xlabel("Iteration")
plt.ylabel("Loss")
plt.grid(True)
# The output file name carries the seed.
loss_pdf = "./examples/loss_{}.pdf".format(args.seed)
plt.savefig(loss_pdf, bbox_inches="tight")
plt.show()

# Plot cluster assignment.
plt.figure()
# Colors are indexed by K-means cluster id.
colors = ["r", "b", "g"]
# range works on both Python 2 and 3 (xrange exists only on Python 2).
for i in range(k):
    # Samples currently assigned to cluster i.
    members = samples[ids == i]
    # Wrap around the palette so a larger k cannot index past its end.
    plt.scatter(members[:, 0], members[:, 1], c=colors[i % len(colors)], s=40, alpha=0.5)
plt.grid(True)
plt.title("K-means Clustering")
plt.savefig("./examples/kmeans_{}.pdf".format(args.seed), bbox_inches="tight")
plt.show()

你可能感兴趣的:(python,算法,kmeans,numpy)