最近在做文本挖掘中若干问题的研究。由于个人起点不高,实验初期主要是复现别人的系统,以及用简单的算法做一些小实验。此文主要针对文本聚类叙述一二。在神经网络基础知识不牢固的情况下,只能从简单的机器学习算法做起。在机器学习算法中,KNN 和 k-means 比较明显的差别是:KNN 属于有监督学习,k-means 属于无监督学习。本文就是对 k-means 算法的一个简单应用。
在下面的代码中,首先是k-means算法的模型,其次是对文本的处理。
#coding=utf-8
import numpy
import matplotlib.pyplot as plt
# Distance measure used by kMeans.  (The commented-out Euclidean version,
# sqrt(sum((vecA - vecB) ** 2)), was replaced by cosine distance and has
# been removed.)
def distEclud(vecA, vecB):
    """Return the cosine distance between vecA and vecB.

    1 - cos(angle): 0 for parallel vectors, 1 for orthogonal, 2 for opposite.
    NOTE(review): divides by the product of the norms, so an all-zero vector
    causes a division by zero -- callers must not pass zero rows.
    """
    similarity = numpy.dot(vecA, vecB) / (numpy.linalg.norm(vecA) * numpy.linalg.norm(vecB))
    return 1 - similarity
# Build a set of k random centroids for the given data set.  Each centroid
# must lie inside the data's bounding box, which is guaranteed by scaling a
# uniform random number in [0, 1) by each dimension's (max - min) range and
# adding the minimum.
def randCent(dataMatArray, k):
    """Return a (k, n) array of random centroids within the data's bounds.

    dataMatArray: (m, n) array of samples.
    k: number of centroids to generate.
    """
    n = numpy.shape(dataMatArray)[1]
    # k rows x n columns, filled in one dimension at a time
    centroids = numpy.zeros((k, n))
    for j in range(n):
        # per-dimension minimum and range (numpy.min/max instead of the
        # Python builtins; the redundant numpy.array(...) wrapper is gone)
        minJ = numpy.min(dataMatArray[:, j])
        rangeJ = float(numpy.max(dataMatArray[:, j]) - minJ)
        centroids[:, j] = (minJ + rangeJ * numpy.random.rand(k, 1)).flatten()
    return centroids
#print(randCent(dataMatArray, 2))
# kMeans first determines the number of data points, then creates a matrix to
# store each point's cluster assignment.  clusterAssment has two columns: the
# cluster index, and the error (the squared distance from the point to its
# assigned centroid).
# NOTE(review): the original text of this region was corrupted -- the '<'/'>'
# comparison operators and everything between two of them were stripped
# (apparently by an HTML filter), fusing kMeans with the plotting function.
# The missing statements below are reconstructed from the standard
# "Machine Learning in Action" k-means that this file follows; confirm
# against the original source.
def kMeans(dataMatArray, k, distMeas=distEclud, createCent=randCent):
    """Cluster dataMatArray into k groups.

    Returns (centroids, clusterAssment) where clusterAssment[i] is
    [assigned cluster index, squared distance to that centroid].
    """
    m = numpy.shape(dataMatArray)[0]
    # m rows x 2 columns: [cluster index, squared error]
    clusterAssment = numpy.zeros((m, 2))
    # start from k random centroids inside the data's bounding box
    centroids = createCent(dataMatArray, k)
    # clusterChanged flags whether any point moved; iterate until stable
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = numpy.inf
            minIndex = -1
            # find the nearest centroid for point i
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataMatArray[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # recompute each centroid as the mean of its assigned points
        for cent in range(k):
            ptsInClust = dataMatArray[numpy.nonzero(clusterAssment[:, 0] == cent)[0]]
            if len(ptsInClust) > 0:  # an empty cluster keeps its old centroid
                centroids[cent, :] = numpy.mean(ptsInClust, axis=0)
    return centroids, clusterAssment

def showCluster(dataMatArray, k, centroids, clusterAssment):
    """Scatter-plot a 2-D clustering result; centroids are drawn larger."""
    numSamples = numpy.shape(dataMatArray)[0]
    # one matplotlib format string per cluster; limits k to len(mark)
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1
    # draw all samples, colored by assigned cluster
    # (fix: the original used Python 2 xrange in a Python 3 file)
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataMatArray[i, 0], dataMatArray[i, 1], mark[markIndex])
    # draw the centroids
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    plt.show()
#coding=utf-8
import jieba
import jieba.posseg
import numpy as np
from numpy.ma.core import zeros
from numpy.distutils.fcompiler import none
from numpy.core.fromnumeric import mean
def cutParagraph(paragraph):
    """Split a paragraph into sentences at 。 and ? (delimiters kept).

    Fix: text after the last delimiter was silently dropped by the original;
    a trailing fragment without a terminator is now returned as a final
    sentence.
    """
    sentenses = []
    start = 0
    for pos, char in enumerate(paragraph):
        if char == "。" or char == "?":
            # include the delimiter itself in the sentence
            sentenses.append(paragraph[start:pos + 1])
            start = pos + 1
    if start < len(paragraph):
        sentenses.append(paragraph[start:])
    return sentenses
def caculateFrequecyOfAllWords(wordNum, wordsLen):
    """Return the relative frequency: occurrence count over total count."""
    ratio = wordNum / wordsLen
    return ratio
def censusFrequency(sentense_word):  # count word frequencies
    """Count how often each word occurs across all tokenized sentences.

    sentense_word: list of token lists (one list per sentence).
    Returns {word: occurrence count}.  The table is also printed, as the
    original did.
    """
    wordCensusDictionary = {}
    # count directly with dict.get instead of first flattening every token
    # into an intermediate list and membership-testing the dict (idiom +
    # avoids the extra O(total tokens) list)
    for eachSentence in sentense_word:
        for eachWord in eachSentence:
            wordCensusDictionary[eachWord] = wordCensusDictionary.get(eachWord, 0) + 1
    print(wordCensusDictionary)
    return wordCensusDictionary
def cutSentenceAndGetVector(sentences):  # build the word-frequency vectors
    """Tokenize each sentence with jieba and build a frequency matrix.

    Returns a (num_sentences, avg_len) numpy array, where avg_len is the
    mean (int-truncated) filtered-sentence length.  Sentences longer than
    avg_len keep their avg_len highest frequencies; shorter sentences are
    zero-padded on the right.

    Fixes vs the original:
    - stop words were removed from a list WHILE iterating it, which skips
      the element right after each removal; filtered copies are built instead.
    - `zeros` was accidentally imported from numpy.ma.core (a masked-array
      constructor); a plain np.zeros ndarray is used now.
    - the local named `long` (shadowing a legacy builtin name) is renamed.
    """
    # punctuation dropped outright, and POS flags treated as stop words:
    # 助词, 介词, 的, 连词, 标点, 了
    PUNCT = ("【", "】", ",", "。")
    STOP_FLAGS = ("u", "p", "uj", "c", "w", "ul")
    natureDictionary = {}
    sentense_word = []
    for eachSentence in sentences:
        tokens = list(jieba.cut(eachSentence))
        # record each word's part-of-speech flag
        for tagged in jieba.posseg.cut(eachSentence):
            natureDictionary[tagged.word] = tagged.flag
        sentense_word.append(tokens)
    # global word counts BEFORE filtering, as in the original
    wordCensusDictionary = censusFrequency(sentense_word)
    wordsLen = len(sentense_word)
    # filter punctuation and stop-POS words (bugfix: no in-place removal
    # during iteration)
    for i in range(len(sentense_word)):
        sentense_word[i] = [
            w for w in sentense_word[i]
            if not (w in natureDictionary
                    and (w in PUNCT or natureDictionary[w] in STOP_FLAGS))
        ]
    width = len(sentense_word)
    print(width, sentense_word, "sentense_word")
    longs = [len(s) for s in sentense_word]
    print(longs, "longs")
    # vector length = truncated mean sentence length
    vec_len = int(np.mean(longs))
    vector = np.zeros((width, vec_len))
    print(vec_len, width, "向量的长宽")
    for i, eachSentence in enumerate(sentense_word):
        if len(eachSentence) > vec_len:
            # too long: keep only the vec_len largest frequencies
            freqs = [caculateFrequecyOfAllWords(wordCensusDictionary[w], wordsLen)
                     for w in eachSentence]
            freqs.sort(reverse=True)
            vector[i, :] = freqs[:vec_len]
        else:
            # short enough: fill left-to-right, rest stays zero
            for j, w in enumerate(eachSentence):
                vector[i][j] = caculateFrequecyOfAllWords(wordCensusDictionary[w], wordsLen)
    return vector
#得到最终聚类的结果
def getResult(k,clusterAssment,sentences):
print("in")
dictionary1={}
dictionary2={}
result=[]
for i in range(len(clusterAssment)):
dictionary1.setdefault(int(clusterAssment[i][0]),[]).append([i,clusterAssment[i][1]])
print(dictionary1)
for i in range(k):
print(i)
if i in dictionary1:
Collection=dictionary1[i]
else:
continue
print(Collection)
min=np.inf
index=-1
for eachGroup in Collection:
if(eachGroup[1]