R文本分析(三)

主题模型训练

############################################

library(lda)

## Build the corpus and fit a supervised LDA (sLDA) model on it.
## NOTE(review): `sample.words` is not defined in this file; it is presumably a
## character vector with one element per document -- confirm against the
## earlier part of this series.
corpus <- lexicalize(sample.words, lower = TRUE)

num.topics <- 4 # number of topics

## Initialize the params: one random -1 / +1 starting value per topic.
params <- sample(c(-1, 1), num.topics, replace = TRUE)

## Random placeholder response, one rating (-100 or 100) per document.
## The count is taken from the corpus instead of being hard-coded, so the
## annotations always have the same length as the document list.
poliblog.ratings <- sample(
  c(-100, 100),
  length(corpus$documents),
  replace = TRUE
)

## Every argument is passed by name so the call does not rely on positional
## matching of `annotations` and `params` in between named arguments.
result <- slda.em(
  documents = corpus$documents,
  K = num.topics,
  vocab = corpus$vocab,
  num.e.iterations = 30,
  num.m.iterations = 12,
  alpha = 1.0,
  eta = 0.1,
  annotations = poliblog.ratings / 100, # rescaled to -1 / +1
  params = params,
  variance = 0.25,
  lambda = 1.0,
  logistic = FALSE,
  method = "sLDA"
)

## Make a pretty picture: rank the topics by their summed document_sums row
## and draw the ranking as a horizontal bar chart.

# One label per topic: the 8 top-scoring words of the topic, space-separated.
Topics <- apply(
  top.topic.words(result$topics, 8, by.score = TRUE),
  2, paste,
  collapse = " "
)
aa <- length(Topics)

# Prefix every label with its topic index so labels are unique.
topic.labels <- paste0(seq_len(aa), Topics)

# Score of a topic = sum over its row of result$document_sums.
# NOTE(review): this assumes document_sums has one row per topic, in the same
# order as the columns of result$topics -- confirm against the lda docs.
topic.scores <- apply(result$document_sums, 1, sum)
names(topic.scores) <- topic.labels

p <- data.frame(a = topic.labels, b = topic.scores)
p <- p[order(p[, 2], decreasing = TRUE), ]

# Fix the bar order through factor levels (highest score first) instead of
# prepending sorting letters to the labels: this keeps the labels clean and
# works for any number of topics.
p1 <- data.frame(
  a = factor(p$a, levels = p$a),
  主题得分 = p$b
)

library(ggplot2)

ggplot(data = p1, aes(x = a, y = 主题得分, fill = 主题得分)) +
  geom_bar(colour = "black", stat = "identity") +
  labs(x = "主题", y = "得分") +
  ggtitle("文档主题排名顺序") +
  coord_flip()

## Comparison word cloud of the top 20 words of every topic.

# Character matrix with one column per topic and one row per top word.
Topics <- top.topic.words(result$topics, 20, by.score = TRUE)

# Flatten the matrix column by column and build the matching topic label for
# every word. The number of topics and of words per topic is read from the
# matrix itself, so this cannot index a topic column that does not exist.
a <- as.vector(Topics)
b <- rep(paste0("主题", seq_len(ncol(Topics))), each = nrow(Topics))

# Word-by-topic contingency table: the cell counts how often the word appears
# among that topic's top words.
a <- table(a, b)
a <- as.matrix(a)

library(wordcloud)

# NOTE(review): brewer.pal comes from RColorBrewer, which is expected to be
# attached together with wordcloud -- confirm in the target environment.
comparison.cloud(
  a,
  scale = c(1, 1.5),
  rot.per = 0.5,
  colors = brewer.pal(ncol(a), "Dark2")
)

主题1:金融;主题2:禅道;主题3:军事;主题4:科技

你可能感兴趣的:(R文本分析(三))