KMeans与深度学习自编码AutoEncoder结合提高聚类效果

特征是用户的消费商品的消费金额,原始数据(部分)是这样的:

id,goods_name,goods_amount
1,男士手袋,1882.0
2,淑女装,2491.0
3,淑女装,2492.0
2,女士手袋,345.0
4,基础内衣,328.0
5,商务正装,4985.0
5,时尚,969.0
5,女饰品,86.0
6,专业运动,399.0
6,童装(中大童),2033.0
6,男士配件,38.0

可以看到,同一个 id 下面有多条不同的消费记录,这样的数据不能直接拿来使用,因此写了一个 Python 程序 datadeal.py 对其进行预处理:

# !/usr/bin/python
#coding:utf-8
#author:wuyy
'''

数据预处理
'''
import pandas as pd
import  numpy as np
import time
import re

# Load the raw purchase records; the columns are read by position as
# (user id, goods category name, amount). Rows with missing values are dropped.
x = pd.read_table('info.txt', sep=",")
x = x.dropna(axis=0)
a1 = list(x.iloc[:, 0])
a2 = list(x.iloc[:, 1])
a3 = list(x.iloc[:, 2])
print("数据表:",x)

# A is the list of distinct goods categories, in order of first appearance.
# dicta is only used for its keys (and the debug print below).
dicta = dict(zip(a2, zip(a1, a3)))
print("dicta:",dicta)
A = list(dicta.keys())
# B is the list of distinct user ids.
B = list(set(a1))

# Map every goods category to a column index of the output matrix.
a = np.arange(len(A))
lista = list(a)
dict_class = dict(zip(A, lista))

# Persist the category -> column index mapping; the context manager closes
# the file even if a write fails.
with open('class.txt', 'w') as f:
    for k, v in dict_class.items():
        f.write(str(k)+'\t'+str(v)+'\n')

# time.clock() was removed in Python 3.8; perf_counter() is the replacement
# for measuring elapsed time.
start = time.perf_counter()
# Build one row per user: a list with one slot per category holding the
# amount spent in that category (0 when the user bought nothing there).
# NOTE(review): a repeated (user, category) pair overwrites the earlier
# amount rather than adding to it -- confirm this is intended.
dictall = {}
for user_id, goods_name, amount in zip(a1, a2, a3):
    if user_id not in dictall:
        dictall[user_id] = list(np.zeros(len(A)))
    dictall[user_id][dict_class[goods_name]] = amount
print('dictall:',dictall)

# Convert the dictionary to a DataFrame. Keys become columns, so transpose
# to get one row per user and one column per category.
dictall1 = pd.DataFrame(dictall)
dictall_matrix = dictall1.T
print("dictall_matrix:",dictall_matrix)
# The user id is written as the first field of every line; no header row.
dictall_matrix.to_csv("data_matrix.txt",index=True,header=None)
end = time.perf_counter()
print ("赋值过程运行时间是:%f s"%(end-start))

df = pd.DataFrame(columns=['id','id1'])
# Use the column name 'id'; the bare name `id` is the builtin function and
# would create a third column keyed by that function object.
df['id'] = 1
print(df)

数据处理完成之后,使用 AE.py 进行 AutoEncoder 编码(降维):

# !/usr/bin/python
#coding:utf-8
#author:wuyy
'''

AE模型(Auto-encoder)
主要是能够把数据缩放,如果你输入的维数比较大,譬如实际的特征是几千维的,全部拿到算法里跑,效果不见得好,
因为并不是所有特征都是有用的,用AE模型后,你可以压缩成m维(就是隐含层的节点数)

'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing


class AutoEncoder():
    """Fully connected sigmoid network trained with batch gradient descent.

    The network maps each sample of ``X`` to the matching target in ``Y``;
    passing the same array for both trains it as an auto-encoder. The
    activations of the first hidden layer are the compressed representation
    written out by ``ValidateAutoEncoder``.

    Index layout of the per-layer lists (network layers numbered 1..L):

    layer      1     2    ...    ...    L-1    L
      W        0     1    ...    ...    L-2
      B        0     1    ...    ...    L-2
      Z              0     1     ...    L-3    L-2
      A              0     1     ...    L-3    L-2
    """

    def __init__(self, X, Y, nNodes):
        """Allocate parameters for a network with ``nNodes[k]`` nodes in layer k.

        Args:
            X: training inputs, indexable by sample; ``X[i]`` is used as a
                1-D vector of length ``nNodes[0]``.
            Y: training targets; ``Y[i]`` is compared with the output layer,
                so it has length ``nNodes[-1]``.
            nNodes: sequence of layer sizes (at least input and one more layer).
        """
        # training samples
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # layers of networks
        self.nLayers = len(nNodes)
        # nodes at layers
        self.nNodes = nNodes
        # parameters of networks
        self.W = list()
        self.B = list()
        self.dW = list()
        self.dB = list()
        self.A = list()
        self.Z = list()
        self.delta = list()
        for iLayer in range(self.nLayers - 1):
            n_in = nNodes[iLayer]
            n_out = nNodes[iLayer + 1]
            # weights and biases start as uniform random values in [0, 1)
            self.W.append(np.random.rand(n_in * n_out).reshape(n_in, n_out))
            self.B.append(np.random.rand(n_out))
            self.dW.append(np.zeros([n_in, n_out]))
            self.dB.append(np.zeros(n_out))
            self.A.append(np.zeros(n_out))
            self.Z.append(np.zeros(n_out))
            self.delta.append(np.zeros(n_out))

        # value of cost function
        self.Jw = 0.0
        # active function (logistic function)
        self.sigmod = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = 2.5
        # steps of iteration
        self.steps = 10000

    def _forward(self, sample):
        """Run one sample through the network, filling ``self.Z`` and ``self.A``.

        ``Z[k]`` stores the weighted input without the bias; the bias is
        added when the activation ``A[k]`` is computed.
        """
        layer_input = sample
        for iLayer in range(self.nLayers - 1):
            self.Z[iLayer] = np.dot(layer_input, self.W[iLayer])
            self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
            layer_input = self.A[iLayer]

    def BackPropAlgorithm(self):
        """Perform one batch gradient-descent step over all M samples.

        Accumulates the gradients of 0.5 * ||Y[i] - output||^2 in ``dW`` and
        ``dB``, stores the mean squared reconstruction error in ``Jw`` and
        then updates ``W`` and ``B`` with step ``alpha / M``.
        """
        # clear values accumulated by the previous step
        self.Jw = 0.0
        for iLayer in range(self.nLayers - 1):
            self.dW[iLayer].fill(0.0)
            self.dB[iLayer].fill(0.0)

        last = self.nLayers - 2
        # propagation (iteration over M samples)
        for i in range(self.M):
            self._forward(self.X[i])
            # Back propagation, from the output layer towards the input
            for iLayer in range(last, -1, -1):
                activation = self.A[iLayer]
                # derivative of the logistic function at this layer
                slope = activation * (1 - activation)
                if iLayer == last:
                    # The error is measured against the target Y[i], the
                    # same quantity the cost Jw is computed from.
                    error = self.Y[i] - activation
                    self.delta[iLayer] = -error * slope
                    self.Jw += np.dot(error, error) / self.M
                else:
                    # The error flows back through the weights that connect
                    # this layer to the NEXT one, i.e. W[iLayer + 1].
                    self.delta[iLayer] = np.dot(
                        self.W[iLayer + 1], self.delta[iLayer + 1]) * slope
                # calculate dW and dB
                layer_input = self.X[i] if iLayer == 0 else self.A[iLayer - 1]
                self.dW[iLayer] += np.outer(layer_input, self.delta[iLayer])
                self.dB[iLayer] += self.delta[iLayer]

        # update
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        """Train for ``self.steps`` iterations, printing the cost after each."""
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self):
        """Print every layer's activations per sample and save the encoding.

        Writes "jaingwei.txt" as CSV: the column 'weidu' numbers the hidden
        dimensions from 1, and column str(i + 1) holds the first hidden
        layer's activations for sample i. Requires at least two layers.
        """
        # The number of rows follows the first hidden layer's size instead of
        # being fixed at 5, so other network shapes can be saved as well.
        hidden_size = int(self.nNodes[1])
        columns = {'weidu': np.arange(1, hidden_size + 1)}
        for i in range(self.M):
            print( self.X[i])
            self._forward(self.X[i])
            for iLayer in range(self.nLayers - 1):
                print("\t layer=%d" % iLayer, self.A[iLayer])
            columns[str(i + 1)] = self.A[0]

        # Build the frame once rather than inserting one column per sample.
        pd.DataFrame(columns).to_csv("jaingwei.txt",index=False)




# Read the user-by-category matrix from data_matrix.txt: the first field of
# each line is the row label, the remaining fields are the feature values.
data = []
index = []
with open('./data_matrix.txt', 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        if not line.strip():
            # skip blank lines (e.g. a trailing newline at the end of the file)
            continue
        ss = line.split(',')
        index.append(ss[0])
        data.append([float(value) for value in ss[1:]])

x = np.array(data)
# Standardize every feature column.
xx = preprocessing.scale(x)
# Input and output layers follow the number of feature columns in the data
# instead of a fixed 10; the hidden layer compresses them to 5 dimensions.
n_features = x.shape[1]
nNodes = np.array([n_features, 5, n_features])
ae3 = AutoEncoder(xx, xx, nNodes)
ae3.PlainAutoEncoder()
ae3.ValidateAutoEncoder()

print("ae结果:",ae3.A[0])

# # 这是个例子,输出的结果也是这个
# xx = np.array([[0,0,0,0,0,0,0,1], [0,0,0,0,0,0,1,0], [0,0,0,0,0,1,0,0], [0,0,0,0,1,0,0,0],[0,0,0,1,0,0,0,0], [0,0,1,0,0,0,0,0]])
# nNodes = np.array([ 8, 3, 8 ])
# ae2 = AutoEncoder(xx,xx,nNodes)
# ae2.PlainAutoEncoder()
# ae2.ValidateAutoEncoder()

使用sklearn 的Kmeans 进行聚类

# !/usr/bin/python
# coding:utf-8
# Author :wuyy

from matplotlib import pyplot
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster   import KMeans
from scipy import sparse
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle
from sklearn.externals import joblib


# Load the encoded data: in jaingwei.txt each column is one sample and the
# first column ('weidu') numbers the encoded dimensions, so transpose to get
# one row per sample.
data = pd.read_table('jaingwei.txt',sep = ",")
data = data.T
# DataFrame.ix was removed in pandas 1.0. The equivalent explicit selection
# is: skip the first row by position (the 'weidu' row) and take the integer
# column labels 0..5 (label slices are inclusive).
x = data.iloc[1:].loc[:, 0:5]
print(x)
card = data.loc[:, 0]

x1 = np.array(x)
print("x1:",x1)
xx = preprocessing.scale(x1)

print("preprocessing.scale xx:",xx)
num_clusters = 3

# The n_jobs argument was removed from KMeans in scikit-learn 1.0; the
# original passed n_jobs=1 (no parallelism), so dropping it keeps the behavior.
clf = KMeans(n_clusters=num_clusters, n_init=1, verbose=1)
clf.fit(xx)
print("label:",clf.labels_)
labels = clf.labels_
# score is the silhouette coefficient
score = metrics.silhouette_score(xx, labels)
# clf.inertia_ helps judge whether the number of clusters is suitable:
# the smaller the distance, the tighter the clusters.
print ("clf.inertia_",clf.inertia_)
print (score)

github地址:https://github.com/wu-yy/Kmeans

转载自:http://www.cnblogs.com/charlotte77/p/5366578.html

你可能感兴趣的:(机器学习)