特征是每个用户在各类商品上的消费金额,原始数据(部分)如下:
id,goods_name,goods_amount
1,男士手袋,1882.0
2,淑女装,2491.0
3,淑女装,2492.0
2,女士手袋,345.0
4,基础内衣,328.0
5,商务正装,4985.0
5,时尚,969.0
5,女饰品,86.0
6,专业运动,399.0
6,童装(中大童),2033.0
6,男士配件,38.0
可以看到同一个id下面有多条不同商品的消费记录,这样的数据不能直接拿来用,所以写了一个Python程序datadeal.py,把它整理成“每个id一行、每类商品一列”的金额矩阵:
# !/usr/bin/python
#coding:utf-8
#author:wuyy
'''
数据预处理
'''
import pandas as pd
import numpy as np
import time
import re
# Pivot the raw purchase log (one row per id / goods category / amount) into
# one fixed-length amount vector per user id, and save it as data_matrix.txt.

# Load the comma-separated records and drop rows with missing values.
x = pd.read_table('info.txt', sep=",")
x = x.dropna(axis=0)
a1 = list(x.iloc[:, 0])  # first column: user id
a2 = list(x.iloc[:, 1])  # second column: goods category name
a3 = list(x.iloc[:, 2])  # third column: amount spent
print("数据表:", x)

# A is the list of distinct goods categories, in order of first appearance.
# dicta maps each category to the last (id, amount) pair seen for it; only
# its keys are used below.
dicta = dict(zip(a2, zip(a1, a3)))
print("dicta:", dicta)
A = list(dicta.keys())
# B is the list of distinct user ids.
B = list(set(a1))

# Assign every goods category a column index in the output matrix.
dict_class = {name: column for column, name in enumerate(A)}

# Write the category -> column index mapping, one tab-separated pair per line.
# The category names are non-ASCII, so the encoding is fixed explicitly rather
# than left to the platform default.
with open('class.txt', 'w', encoding='utf-8') as f:
    for k, v in dict_class.items():
        f.write(str(k) + '\t' + str(v) + '\n')

# time.clock() was removed in Python 3.8; perf_counter() is its replacement
# for measuring elapsed time.
start = time.perf_counter()

# dictall maps each user id to a list with one slot per goods category.
# NOTE(review): a repeated (id, category) pair overwrites the earlier amount
# instead of adding to it -- confirm the input is already aggregated.
dictall = {}
for user_id, goods_name, amount in zip(a1, a2, a3):
    if user_id not in dictall:
        dictall[user_id] = list(np.zeros(len(A)))
    dictall[user_id][dict_class[goods_name]] = amount
print('dictall:', dictall)

# Build a DataFrame with ids as columns, then transpose so that each row is
# one user id and each column is one goods category.
dictall1 = pd.DataFrame(dictall)
dictall_matrix = dictall1.T
print("dictall_matrix:", dictall_matrix)
# The row label (user id) is written as the first field; no header row.
dictall_matrix.to_csv("data_matrix.txt", index=True, header=None)

end = time.perf_counter()
print("赋值过程运行时间是:%f s" % (end - start))

# Scratch check of column assignment on an empty frame. The key must be the
# string 'id'; the bare name `id` is the builtin function.
df = pd.DataFrame(columns=['id', 'id1'])
df['id'] = 1
print(df)
数据处理完成之后,用AutoEncoder对金额矩阵进行编码(降维),程序为AE.py:
# !/usr/bin/python
#coding:utf-8
#author:wuyy
'''
AE模型(Auto-encoder)
主要是能够把数据缩放,如果你输入的维数比较大,譬如实际的特征是几千维的,全部拿到算法里跑,效果不见得好,
因为并不是所有特征都是有用的,用AE模型后,你可以压缩成m维(就是隐含层的节点数)
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
class AutoEncoder():
    """Fully connected sigmoid network trained by batch gradient descent.

    For a network with L layers (``L = len(nNodes)``) the per-layer lists
    ``W``, ``B``, ``dW``, ``dB``, ``Z``, ``A`` and ``delta`` each hold
    L-1 entries; entry ``k`` belongs to the transition from layer ``k`` to
    layer ``k + 1``:

        layer  1    2    ...    L-1    L
          W    0    1    ...    L-2
          B    0    1    ...    L-2
          Z         0    ...    L-3    L-2
          A         0    ...    L-3    L-2
    """

    def __init__(self, X, Y, nNodes, alpha=2.5, steps=10000):
        """Store the training data and randomly initialise the parameters.

        Args:
            X: training inputs, indexable by sample; ``X[i]`` must have
                ``nNodes[0]`` values.
            Y: training targets, indexable by sample; ``Y[i]`` must have
                ``nNodes[-1]`` values. Pass ``X`` again for an auto-encoder.
            nNodes: number of nodes in every layer, input layer first.
            alpha: learning rate used by ``BackPropAlgorithm``.
            steps: number of iterations run by ``PlainAutoEncoder``.
        """
        # training samples
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # number of layers in the network
        self.nLayers = len(nNodes)
        # nodes per layer
        self.nNodes = nNodes
        # parameters, accumulated gradients and per-sample working buffers
        self.W = []
        self.B = []
        self.dW = []
        self.dB = []
        self.A = []
        self.Z = []
        self.delta = []
        for n_in, n_out in zip(nNodes[:-1], nNodes[1:]):
            # weights and biases start uniformly distributed in [0, 1)
            self.W.append(np.random.rand(n_in * n_out).reshape(n_in, n_out))
            self.B.append(np.random.rand(n_out))
            self.dW.append(np.zeros([n_in, n_out]))
            self.dB.append(np.zeros(n_out))
            self.A.append(np.zeros(n_out))
            self.Z.append(np.zeros(n_out))
            self.delta.append(np.zeros(n_out))
        # value of the cost function after the latest pass
        self.Jw = 0.0
        # activation function (logistic function); attribute name kept as-is
        self.sigmod = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = alpha
        # number of training iterations
        self.steps = steps

    def _forward(self, x, verbose=False):
        """Propagate one sample through the network, refreshing Z and A.

        ``Z[k]`` holds the weighted sum without the bias; the bias is added
        when ``A[k]`` is computed. With ``verbose`` set, every layer's
        activation is printed as soon as it is computed.
        """
        layer_input = x
        for iLayer in range(self.nLayers - 1):
            self.Z[iLayer] = np.dot(layer_input, self.W[iLayer])
            self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
            if verbose:
                print("\t layer=%d" % iLayer, self.A[iLayer])
            layer_input = self.A[iLayer]

    def BackPropAlgorithm(self):
        """Run one batch gradient-descent step over all M samples.

        Accumulates the gradients of the summed half squared error into
        ``dW``/``dB``, stores the mean squared error of this pass in ``Jw``
        and then updates ``W``/``B`` in place using ``alpha / M``.
        """
        # clear the values accumulated by the previous pass
        self.Jw = 0.0
        for grad_w, grad_b in zip(self.dW, self.dB):
            grad_w.fill(0.0)
            grad_b.fill(0.0)

        last = self.nLayers - 2  # index of the output transition
        for i in range(self.M):
            self._forward(self.X[i])

            # back propagation, output transition first
            for iLayer in range(last, -1, -1):
                act = self.A[iLayer]
                sigmoid_grad = act * (1 - act)
                if iLayer == last:
                    # The error is measured against the target Y[i], the same
                    # quantity the cost uses.
                    err = self.Y[i] - act
                    self.delta[iLayer] = -err * sigmoid_grad
                    self.Jw += np.dot(err, err) / self.M
                else:
                    # The error flows back through the weights that consumed
                    # this layer's activation, i.e. W[iLayer + 1].
                    self.delta[iLayer] = np.dot(
                        self.W[iLayer + 1], self.delta[iLayer + 1]) * sigmoid_grad

                # accumulate dW and dB
                layer_input = self.X[i] if iLayer == 0 else self.A[iLayer - 1]
                self.dW[iLayer] += np.outer(layer_input, self.delta[iLayer])
                self.dB[iLayer] += self.delta[iLayer]

        # update the parameters
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        """Train for ``steps`` iterations, printing the cost after each one."""
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self, output_path="jaingwei.txt"):
        """Encode every training sample and save the first-layer activations.

        Prints each sample followed by the activation of every layer. The
        written CSV has a ``weidu`` column numbering the first-layer units
        from 1, plus one column per sample (named "1", "2", ...) holding that
        sample's first-layer activation.

        Args:
            output_path: CSV file to write.

        Returns:
            The DataFrame that was written.
        """
        # Size the table from the actual first hidden layer instead of
        # assuming five units.
        hidden_size = len(self.B[0]) if self.B else 0
        columns = {'weidu': np.arange(1, hidden_size + 1)}
        for i in range(self.M):
            print(self.X[i])
            self._forward(self.X[i], verbose=True)
            if self.A:
                columns[str(i + 1)] = self.A[0].copy()
        # Build the frame once; adding one column per sample to an existing
        # frame fragments it when there are many samples.
        df = pd.DataFrame(columns)
        df.to_csv(output_path, index=False)
        return df
# Load the comma-separated matrix from data_matrix.txt: the first field of
# every line is the row label, the remaining fields are numeric features.
data = []
index = []
with open('./data_matrix.txt', 'r') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line.strip():
            # a blank (e.g. trailing) line would otherwise crash float('')
            continue
        ss = line.split(',')
        index.append(ss[0])
        data.append([float(value) for value in ss[1:]])
x = np.array(data)
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the network's sigmoid output lies in (0, 1), so standardised
# targets outside that range cannot be reconstructed exactly -- confirm this
# scaling is intended.
xx = preprocessing.scale(x)
# Take the input/output width from the data instead of assuming 10 columns;
# the hidden layer keeps 5 units.
n_features = xx.shape[1]
nNodes = np.array([n_features, 5, n_features])
ae3 = AutoEncoder(xx, xx, nNodes)
ae3.PlainAutoEncoder()
ae3.ValidateAutoEncoder()
# A[0] holds the hidden-layer activation of the last sample processed.
print("ae结果:",ae3.A[0])
# # 这是个例子,输出的结果也是这个
# xx = np.array([[0,0,0,0,0,0,0,1], [0,0,0,0,0,0,1,0], [0,0,0,0,0,1,0,0], [0,0,0,0,1,0,0,0],[0,0,0,1,0,0,0,0], [0,0,1,0,0,0,0,0]])
# nNodes = np.array([ 8, 3, 8 ])
# ae2 = AutoEncoder(xx,xx,nNodes)
# ae2.PlainAutoEncoder()
# ae2.ValidateAutoEncoder()
最后使用sklearn的KMeans对降维后的结果进行聚类:
# !/usr/bin/python
# coding:utf-8
# Author :wuyy
from matplotlib import pyplot
import scipy as sp
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy import sparse
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import pickle
try:
    from sklearn.externals import joblib
except ImportError:
    # sklearn.externals.joblib was removed in scikit-learn 0.23; the
    # standalone joblib package provides the same module.
    import joblib

# Load the CSV and transpose it so that every row of the file's columns
# becomes one row of the table.
data = pd.read_table('jaingwei.txt', sep=",")
data = data.T
# DataFrame.ix was removed in pandas 1.0, so select by position: skip the
# first row (the file's first column) and keep the first five columns.
x = data.iloc[1:, 0:5]
print(x)
card = data.iloc[:, 0]
x1 = np.array(x)
print("x1:",x1)
# Standardise every column before clustering.
xx = preprocessing.scale(x1)
print("preprocessing.scale xx:",xx)
num_clusters = 3
# The n_jobs argument was removed from KMeans in scikit-learn 1.0; omitting
# it matches the previous n_jobs=1 behaviour.
clf = KMeans(n_clusters=num_clusters, n_init=1, verbose=1)
clf.fit(xx)
print("label:",clf.labels_)
labels = clf.labels_
# score is the silhouette coefficient of the clustering
score = metrics.silhouette_score(xx, labels)
# clf.inertia_ helps judge whether the number of clusters is suitable:
# the smaller the distance, the tighter the clusters.
print ("clf.inertia_",clf.inertia_)
print (score)
github地址:https://github.com/wu-yy/Kmeans
转载自:http://www.cnblogs.com/charlotte77/p/5366578.html