第五章 数据建模

  (一)聚类分析

  1、主要方法

  2、距离分析

  度量样本之间的相似性,采用距离算法:

  文档相似性度量

  3、K-means分类

  #-*- coding: utf-8 -*-

  # 使用K-Means算法聚类消费行为特征数据

  import pandas as pd

  # 参数初始化

  inputfile = '../data/consumption_data.xls' # 销量及其他属性数据

  outputfile = '../tmp/data_type.xls' # 保存结果的文件名

  k = 3 # 聚类的类别

  iteration = 500 # 聚类最大循环次数

  data = pd.read_excel(inputfile, index_col='Id') # 读取数据

  data_zs = 1.0 * (data - data.mean()) / data.std() # 数据标准化

  from sklearn.cluster import KMeans

  model = KMeans(n_clusters=k, n_jobs=1, max_iter=iteration) # 分为k类,并发数4

  model.fit(data_zs) # 开始聚类

  # 简单打印结果

  r1 = pd.Series(model.labels_).value_counts() # 统计各个类别的数目

  r2 = pd.DataFrame(model.cluster_centers_) # 找出聚类中心

  r = pd.concat([r2, r1], axis=1) # 横向连接(0是纵向),得到聚类中心对应的类别下的数目

  r.columns = list(data.columns) + [u'类别数目'] # 重命名表头

  print(r)

  # 详细输出原始数据及其类别

  r = pd.concat([data, pd.Series(model.labels_, index=data.index)],

  axis=1) # 详细输出每个样本对应的类别

  r.columns = list(data.columns) + [u'聚类类别'] # 重命名表头

  r.to_excel(outputfile) # 保存结果

  def density_plot(data): # 自定义作图函数

  import matplotlib.pyplot as plt

  plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签

  plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

  p = data.plot(kind='kde', linewidth=2, subplots=True, sharex=False)

  [p[i].set_ylabel(u'密度') for i in range(k)]

  plt.legend()

  return plt

  pic_output = '../tmp/pd_' # 概率密度图文件名前缀

  for i in range(k):

  density_plot(data[r[u'聚类类别'] == i]).savefig(u'%s%s.png' % (pic_output, i))

  #利用TSNE绘图

  #-*- coding: utf-8 -*-

  # 接k_means.py

  from sklearn.manifold import TSNE

  tsne = TSNE()

  tsne.fit_transform(data_zs) # 进行数据降维

  tsne = pd.DataFrame(tsne.embedding_, index=data_zs.index) # 转换数据格式

  import matplotlib.pyplot as plt

  plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签

  plt.rcParams['axes.unicode_minus'] = False # 用来正常显示负号

  # 不同类别用不同颜色和样式绘图

  d = tsne[r[u'聚类类别'] == 0]

  plt.plot(d[0], d[1], 'r.')

  d = tsne[r[u'聚类类别'] == 1]

  plt.plot(d[0], d[1], 'go')

  d = tsne[r[u'聚类类别'] == 2]

  plt.plot(d[0], d[1], 'b*')

  plt.show()

  聚类分析算法评价 P111

  (1)purity评价法

  (2)RI评价法

  (3)F值评价法

  4、Meanshift

  与kmeans算法不同,mean shift 算法可自动决定类别的数目。与kmeans算法一样的是,两者都是用集合内数据点的均值进行中心点的移动。

  算法核心:算法的关键操作是通过感兴趣区域内的数据密度变化计算中心点的漂移向量,从而移动中心点进行下一次迭代,直到到达密度最大处(中心点不变)。从每个数据点出发都可以进行该操作,在这个过程,统计出现在感兴趣区域内的数据的次数。该参数将在最后作为分类的依据。

  mean shift 算法中,bandwidth(带宽)是重要参数。

  案例来源:AI with python(mean_shift.py)

  import numpy as np

  import matplotlib.pyplot as plt

  from sklearn.cluster import MeanShift, estimate_bandwidth

  from itertools import cycle

  # Load data from input file

  X = np.loadtxt('../code/data_clustering.txt', delimiter=',')

  # Estimate the bandwidth of X

  bandwidth_X = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

  # Cluster data with MeanShift

  meanshift_model = MeanShift(bandwidth=bandwidth_X, bin_seeding=True)

  meanshift_model.fit(X)

  # Extract the centers of clusters

  cluster_centers = meanshift_model.cluster_centers_

  print('\nCenters of clusters:\n', cluster_centers)

  # Estimate the number of clusters

  labels = meanshift_model.labels_

  num_clusters = len(np.unique(labels))

  print("\nNumber of clusters in input data =", num_clusters)

  # Plot the points and cluster centers

  plt.figure()

  markers = 'o*xvs'

  for i, marker in zip(range(num_clusters), markers):

  # Plot points that belong to the current cluster

  plt.scatter(X[labels==i, 0], X[labels==i, 1], marker=marker, color='black')

  # Plot the cluster center

  cluster_center = cluster_centers[i]

  plt.plot(cluster_center[0], cluster_center[1], marker='o',

  markerfacecolor='black', markeredgecolor='black',

  markersize=15)

  plt.title('Clusters')

  plt.show()

  5、GMM算法

  GMM算法主要利用EM算法来估计高斯混合模型中的参数,然后根据计算得到的 概率进行聚类。

  GMM分布

  高斯混合分布是假设总体的分布有多个不同的高斯分布混合而成,其中每一个高斯分布所占的权重不相同。

  GMM和K-means直观对比

  最后我们比较GMM和K-means两个算法的步骤。

  GMM:

  先计算所有数据对每个分模型的响应度

  根据响应度计算每个分模型的参数

  迭代

  K-means:

  先计算所有数据对于K个点的距离,取距离最近的点作为自己所属于的类

  根据上一步的类别划分更新点的位置(点的位置就可以看做是模型参数)

  迭代

  可以看出GMM和K-means还是有很大的相同点的。GMM中数据对高斯分量的响应度就相当于K-means中的距离计算,GMM中的根据响应度计算高斯分量参数就相当于K-means中计算分类点的位置。然后它们都通过不断迭代达到最优。不同的是:GMM模型给出的是每一个观测点由哪个高斯分量生成的概率,而K-means直接给出一个观测点属于哪一类。

  案例来源AI with python (gmm_classifier.py)

  import numpy as np

  import pandas as pd

  import matplotlib.pyplot as plt

  from matplotlib import patches #

  from sklearn import datasets

  from sklearn.mixture import GaussianMixture #GMM更换为GaussianMixture

  from sklearn.model_selection import StratifiedKFold

  # Load the iris dataset

  iris = datasets.load_iris()

  #print(iris) #数据分为data和target两组

  # Split dataset into training and testing (80/20 split)

  skf = StratifiedKFold(n_splits=5) #将数据分为5组。

  indices=skf.split(iris.data,iris.target) #将数据分为4组train,1组test

  # Take the first fold

  train_index, test_index = next(iter(indices))

  # Extract training data and labels

  X_train = iris.data[train_index]

  y_train = iris.target[train_index]

  # Extract testing data and labels

  X_test = iris.data[test_index]

  y_test = iris.target[test_index]

  # Extract the number of classes

  num_classes = len(np.unique(y_train))

  # Build GMM

  classifier = GaussianMixture(n_components=num_classes, covariance_type='full', #n_components指的是下层分布由几个构成,本项目中指的是num_classes. covariance_type指一致性算法的类别

  init_params='kmeans', max_iter=20) #init_params中w代表weights,c代表covariance在迭代中进行更新;n_iter迭代次数

  # Initialize the GMM means

  classifier.means_ = np.array([X_train[y_train == i].mean(axis=0)

  for i in range(num_classes)])

  # Train the GMM classifier

  classifier.fit(X_train)

  # Draw boundaries

  plt.figure()

  colors = 'bgr'

  for i, color in enumerate(colors):

  # Extract eigenvalues and eigenvectors

  eigenvalues, eigenvectors = np.linalg.eigh(classifier.covariances_[i][:2, :2])

  #参照GaussianMixture的属性修改为covariances_。在covariances_()时报错,希望通过dataframe的类对象的方法得到#numpy数组。不应带括号,他是属性,不是方法。

  # Normalize the first eigenvector

  norm_vec = eigenvectors[0] / np.linalg.norm(eigenvectors[0])

  # Extract the angle of tilt

  angle = np.arctan2(norm_vec[1], norm_vec[0])

  angle = 180 * angle / np.pi

  # Scaling factor to magnify the ellipses

  # (random value chosen to suit our needs)

  scaling_factor = 8

  eigenvalues *= scaling_factor

  # Draw the ellipse

  ellipse = patches.Ellipse(classifier.means_[i, :2],

  eigenvalues[0], eigenvalues[1], 180 + angle,

  color=color)

  axis_handle = plt.subplot(1, 1, 1)

  ellipse.set_clip_box(axis_handle.bbox)

  ellipse.set_alpha(0.6)

  axis_handle.add_artist(ellipse)

  # Plot the data

  colors = 'bgr'

  for i, color in enumerate(colors):

  cur_data = iris.data[iris.target == i]

  plt.scatter(cur_data[:,0], cur_data[:,1], marker='o',

  facecolors='none', edgecolors='black', s=40,

  label=iris.target_names[i])

  test_data = X_test[y_test == i]

  plt.scatter(test_data[:,0], test_data[:,1], marker='s',

  facecolors='black', edgecolors='black', s=40,

  label=iris.target_names[i])

  # Compute predictions for training and testing data

  y_train_pred = classifier.predict(X_train)

  accuracy_training = np.mean(y_train_pred.ravel() == y_train.ravel()) * 100

  print('Accuracy on training data =', accuracy_training)

  y_test_pred = classifier.predict(X_test)

  accuracy_testing = np.mean(y_test_pred.ravel() == y_test.ravel()) * 100

  print('Accuracy on testing data =', accuracy_testing)

  plt.title('GMM classifier')

  plt.xticks(())

  plt.yticks(())

  plt.show()

  存在问题 :生成图形有差别,预测精度也有问题。特征值、特征向量的用法

  6、近邻传播算法

  Affinity Propagation 聚类算法的通俗解释。下面案例是网上找的AP算法的一个案例。亲测可用。

  https://blog.csdn.net/notHeadache/article/details/89003044

  print(__doc__)

  from sklearn.cluster import AffinityPropagation

  from sklearn import metrics

  from sklearn.datasets.samples_generator import make_blobs

  # #############################################################################

  # Generate sample data

  centers = [[1, 1], [-1, -1], [1, -1]]

  X, labels_true = make_blobs(n_samples=300, centers=centers, cluster_std=0.5,

  random_state=0)

  # #############################################################################

  # Compute Affinity Propagation

  af = AffinityPropagation(preference=-50).fit(X)

  cluster_centers_indices = af.cluster_centers_indices_

  labels = af.labels_

  n_clusters_ = len(cluster_centers_indices) #

  print('Estimated number of clusters: %d' % n_clusters_)

  print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))

  print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))

  print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))

  print("Adjusted Rand Index: %0.3f"

  % metrics.adjusted_rand_score(labels_true, labels))

  print("Adjusted Mutual Information: %0.3f"

  % metrics.adjusted_mutual_info_score(labels_true, labels))

  print("Silhouette Coefficient: %0.3f"

  % metrics.silhouette_score(X, labels, metric='sqeuclidean'))

  # #############################################################################

  # Plot result

  import matplotlib.pyplot as plt

  from itertools import cycle

  plt.close('all')

  plt.figure(1)

  plt.clf()

  colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')

  for k, col in zip(range(n_clusters_), colors):

  class_members = labels == k

  cluster_center = X[cluster_centers_indices[k]]

  plt.plot(X[class_members, 0], X[class_members, 1], col + '.')

  plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,

  markeredgecolor='k', markersize=14)

  for x in X[class_members]:

  plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col)

  plt.title('Estimated number of clusters: %d' % n_clusters_)

  plt.show()无锡人流多少钱 http://www.bhnnk120.com/

  (二)问题要点

  1、数据连接

  r = pd.concat([r2, r1], axis=1) # 横向连接(0是纵向),得到聚类中心对应的类别下的数目

  r.columns = list(data.columns) + [u'类别数目'] # 重命名表头

  2、分别绘图

  pic_output = '../tmp/pd_' # 概率密度图文件名前缀

  for i in range(k):

  density_plot(data[r[u'聚类类别'] == i]).savefig(u'%s%s.png' % (pic_output, i))

  3、数据标准化

  data_zs = 1.0 * (data - data.mean()) / data.std() # 数据标准化

  4、kmeans++算法

  kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10) #k-means++采用智能方法找出初始聚类中心,n_init为迭代次数

  5、最优分类次数计算(采用轮廓系数判定)

  import numpy as np

  import matplotlib.pyplot as plt

  from sklearn import metrics

  from sklearn.cluster import KMeans

  # Load data from input file

  X = np.loadtxt('../code/data_quality.txt', delimiter=',')

  # Plot input data

  plt.figure()

  plt.scatter(X[:,0], X[:,1], color='black', s=80, marker='o', facecolors='none')

  x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1

  y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1

  plt.title('Input data')

  plt.xlim(x_min, x_max)

  plt.ylim(y_min, y_max)

  plt.xticks(())

  plt.yticks(())

  # Initialize variables

  scores = []

  values = np.arange(2, 10)

  # Iterate through the defined range

  for num_clusters in values: #从2到10里面选择

  # Train the KMeans clustering model

  kmeans = KMeans(init='k-means++', n_clusters=num_clusters, n_init=10)

  kmeans.fit(X)

  score = metrics.silhouette_score(X, kmeans.labels_,metric='euclidean', sample_size=len(X)) #轮廓系数

  print("\nNumber of clusters =", num_clusters)

  print("Silhouette score =", score)

  scores.append(score)

  # Plot silhouette scores

  plt.figure()

  plt.bar(values, scores, width=0.7, color='black', align='center')

  plt.title('Silhouette score vs number of clusters')

  # Extract best score and optimal number of clusters

  num_clusters = np.argmax(scores) + values[0]

  print('\nOptimal number of clusters =', num_clusters)

  plt.show()