NLP: Simple Text Clustering with k-means

I have recently been looking into several problems in text mining. Since my starting point is modest, the early experiments amount to reproducing other people's systems and running small experiments with simple algorithms; this post covers a bit of text clustering. With my neural-network fundamentals still shaky, I can only start from simple machine-learning algorithms. Among these, the most obvious difference between KNN and k-means is that KNN is supervised learning while k-means is unsupervised. This post is a simple application of the k-means algorithm.
The code below has two parts: first the k-means model itself, then the text preprocessing.

#coding=utf-8 
import numpy
import matplotlib.pyplot as plt

# Euclidean distance between two vectors (kept for reference)
# def distEclud(vecA, vecB):
#     return numpy.sqrt(sum(numpy.power(vecA - vecB, 2)))

# switched to cosine distance: 1 - cos(vecA, vecB)
def distEclud(vecA, vecB):
    dist = 1 - numpy.dot(vecA, vecB) / (numpy.linalg.norm(vecA) * numpy.linalg.norm(vecB))
    return dist
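# A quick sanity check with made-up vectors (not part of the original script):
# parallel vectors give a distance of ~0, orthogonal vectors give exactly 1.
#   distEclud(numpy.array([1.0, 2.0]), numpy.array([2.0, 4.0]))   # -> 0.0
#   distEclud(numpy.array([1.0, 0.0]), numpy.array([0.0, 1.0]))   # -> 1.0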
# randCent builds a set of k random centroids for the given data set.
# Each centroid must lie within the bounds of the data, which is ensured by
# finding the minimum and maximum of every dimension, then scaling a random
# number in [0, 1) by the range and adding the minimum.

def randCent(dataMatArray, k):
    # number of features (columns)
    n = numpy.shape(dataMatArray)[1]
    # k x n zero matrix that will hold the centroids
    centroids = numpy.array(numpy.zeros((k, n)))
    # build the centroids column by column
    for j in range(n):
        # minimum and range of the j-th dimension
        minJ = min(dataMatArray[:, j])
        rangeJ = float(max(dataMatArray[:, j]) - minJ)
        # fill the column with random values inside [minJ, minJ + rangeJ)
        centroids[:, j] = numpy.array(minJ + rangeJ * numpy.random.rand(k, 1)).flatten()
    return centroids
#print(randCent(dataMatArray, 2))
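# Example with a hypothetical 3 x 2 array: every generated centroid stays
# inside the per-column [min, max] box of the data.
#   demo = numpy.array([[0.0, 0.0], [1.0, 2.0], [2.0, 4.0]])
#   randCent(demo, 2)   # 2 x 2 array, column 0 in [0, 2], column 1 in [0, 4]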

# kMeans first counts the number of data points, then creates a matrix to hold
# each point's cluster assignment. clusterAssment has two columns: the first
# stores the cluster index, the second the error, i.e. the distance from the
# point to its cluster centroid.
def kMeans(dataMatArray, k, distMeas=distEclud, createCent=randCent):
    m = numpy.shape(dataMatArray)[0]
    # m x 2 assignment "matrix": cluster index and error per point
    clusterAssment = numpy.zeros((m, 2))
    # initial set of k random centroids
    centroids = createCent(dataMatArray, k)
    # clusterChanged is the loop flag: keep iterating while any assignment changes.
    # Each pass of the while loop visits every point, computes its distance to
    # every centroid, and assigns the point to the nearest one.
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):
            minDist = numpy.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataMatArray[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            # reassign the point and flag another pass if its cluster changed
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # recompute every centroid as the mean of the points assigned to it
        for cent in range(k):
            ptsInClust = dataMatArray[numpy.nonzero(clusterAssment[:, 0] == cent)[0]]
            if len(ptsInClust) > 0:
                centroids[cent, :] = numpy.mean(ptsInClust, axis=0)
    return centroids, clusterAssment

# plot the clustered points and their centroids (2-D data only)
def showCluster(dataMatArray, k, centroids, clusterAssment):
    numSamples = numpy.shape(dataMatArray)[0]
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1
    # draw all samples
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataMatArray[i, 0], dataMatArray[i, 1], mark[markIndex])
    # draw the centroids with a different marker set
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    plt.show()
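Before wiring in real text, the model can be smoke-tested on synthetic 2-D data. The snippet below is my own test scaffold, not part of the original pipeline: the random seed, blob offsets, and k = 2 are all made up. Because distEclud is a cosine distance, the two blobs are placed along different directions from the origin, so that angle rather than position separates them.

if __name__ == "__main__":
    numpy.random.seed(0)
    # two clouds pointing along different axes, so their angles differ
    blobA = numpy.random.randn(50, 2) * 0.5 + numpy.array([5.0, 0.0])
    blobB = numpy.random.randn(50, 2) * 0.5 + numpy.array([0.0, 5.0])
    data = numpy.vstack((blobA, blobB))
    centroids, clusterAssment = kMeans(data, 2)
    showCluster(data, 2, centroids, clusterAssment)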




#coding=utf-8
import jieba
import jieba.posseg
import numpy as np
from numpy import zeros, mean
# split a paragraph into sentences on sentence-final punctuation
def cutParagraph(paragraph):
    i = 0
    j = 1
    sentenses = []
    for char in paragraph:
        if char == "。" or char == "?":
            sentenses.append(paragraph[i:j])
            i = j
        j = j + 1
    return sentenses
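# Example (hypothetical paragraph): each delimiter stays at the end of its sentence.
#   cutParagraph("今天天气很好。我们去跑步。")
#   -> ['今天天气很好。', '我们去跑步。']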

# a word's frequency: its raw count divided by the number of sentences
def caculateFrequecyOfAllWords(wordNum, wordsLen):
    return wordNum / wordsLen

def censusFrequency(sentense_word):  # count how often each word occurs
    wordsCollection = []
    wordCensusDictionary = {}
    # flatten the per-sentence word lists into one list
    for eachSentence in sentense_word:
        for eachWord in eachSentence:
            wordsCollection.append(eachWord)
    for item in wordsCollection:
        if item not in wordCensusDictionary:
            wordCensusDictionary[item] = 1
        else:
            wordCensusDictionary[item] += 1
    print(wordCensusDictionary)
    return wordCensusDictionary
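# Example (hypothetical input): nested word lists are flattened, then counted.
#   censusFrequency([["我", "喜欢", "跑步"], ["我", "喜欢", "读书"]])
#   -> {'我': 2, '喜欢': 2, '跑步': 1, '读书': 1}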

def cutSentenceAndGetVector(sentences):  # turn each sentence into a word-frequency vector
    sentense_word = []
    natureDictionary = {}
    longs = []
    for eachSentence in sentences:
        sentence = list(jieba.cut(eachSentence))
        cutConclusion = list(jieba.posseg.cut(eachSentence))
        # remember the part-of-speech tag of every word
        for each_word in cutConclusion:
            natureDictionary[each_word.word] = each_word.flag
        sentense_word.append(sentence)
    wordCensusDictionary = censusFrequency(sentense_word)
    wordsLen = len(sentense_word)
    # drop punctuation and function words; build a filtered copy, since calling
    # remove() on a list while iterating over it skips elements
    for i in range(len(sentense_word)):
        kept = []
        for each_word in sentense_word[i]:
            if each_word in ("【", "】", ",", "。"):
                continue
            # u/p/uj/c/w/ul: auxiliary words, prepositions, 的, conjunctions, punctuation, 了
            if natureDictionary.get(each_word) in ("u", "p", "uj", "c", "w", "ul"):
                continue
            kept.append(each_word)
        sentense_word[i] = kept
    width = len(sentense_word)
    print(width, sentense_word, "sentense_word")
    for eachSentence in sentense_word:
        longs.append(len(eachSentence))
    print(longs, "longs")
    # every vector gets the mean sentence length; longer sentences are truncated
    long = int(mean(longs))
    vector = zeros((width, long))
    print(long, width, "vector columns and rows")
    i = 0
    for eachSentence in sentense_word:
        if len(eachSentence) > long:
            # keep only the `long` largest frequencies of this sentence
            frequencyConclution1 = []
            for each_word in eachSentence:
                frequencyConclution1.append(caculateFrequecyOfAllWords(wordCensusDictionary[each_word], wordsLen))
            frequencyConclution1.sort(reverse=True)
            vector[i, :] = frequencyConclution1[0:long]
        else:
            # shorter sentences fill from the left; remaining slots stay zero
            for j, each_word in enumerate(eachSentence):
                vector[i][j] = caculateFrequecyOfAllWords(wordCensusDictionary[each_word], wordsLen)
        i = i + 1
    return vector
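# The returned matrix is (number of sentences) x (mean kept-sentence length);
# e.g. 10 sentences averaging 6 kept words yield a 10 x 6 frequency matrix.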

# pick, for every cluster, the sentence closest to its centroid
def getResult(k, clusterAssment, sentences):
    dictionary1 = {}
    dictionary2 = {}
    result = []
    # group [sentence index, distance] pairs by cluster index
    for i in range(len(clusterAssment)):
        dictionary1.setdefault(int(clusterAssment[i][0]), []).append([i, clusterAssment[i][1]])
    print(dictionary1)

    for i in range(k):
        if i in dictionary1:
            Collection = dictionary1[i]
        else:
            continue
        # track the member with the smallest distance to the centroid
        minDist = np.inf
        index = -1
        for eachGroup in Collection:
            if eachGroup[1] < minDist:
                minDist = eachGroup[1]
                index = eachGroup[0]
        dictionary2[i] = index
        result.append(sentences[index])
    print(dictionary2)
    return result
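Putting the two listings together, a minimal driver might look like the sketch below. Everything in it is an assumption for illustration: the module name kmeans_model (the first listing saved as its own file), the sample paragraph, and the choice of k = 2.

from kmeans_model import kMeans  # hypothetical module name for the first listing

if __name__ == "__main__":
    paragraph = "今天天气很好。我们去公园跑步。他在家里看书。她喜欢安静地读书。"
    sentences = cutParagraph(paragraph)
    vector = cutSentenceAndGetVector(sentences)
    centroids, clusterAssment = kMeans(vector, 2)
    # one representative (closest-to-centroid) sentence per cluster
    print(getResult(2, clusterAssment, sentences))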


