因为,最近论文,需要基于图模型的过程,仿真数据。因此,找了一些已有的代码和论文。以下,是利用LDA的生成过程仿真数据的过程,这个代码是使用R语言编写的,代码来源于:https://www.r-bloggers.com/topic-modeling-1-simulated-lda-corpus/(https://gist.github.com/robbymeals/3985469),并做了一些修改。
### Basic LDA Topic Model Simulation ###
### Generate Simulated Corpus ###
library(ggplot2)
library(tm)
library(MCMCpack)
simulateCorpus <- function(
M, # number of documents
nTerms,
docLengths,
K, # Number of Topics
alphA, # parameter for symmetric
# Document/Topic dirichlet distribution
betA, # parameter for Topic/Term dirichlet distribution
Alpha=rep(alphA,K), # number-of-topics length vector
# set to symmetric alpha parameter
# across all topics
Beta=rep(betA,nTerms)) # number-of-terms length vector
# set to symmetric beta parameter
# across all terms
{
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))
## Generate latent topic and term distributions
# "True" Document/Topic distribution matrix
Theta <- rdirichlet(M, Alpha)
colnames(Theta) <- Topics
rownames(Theta) <- Documents
# "True" Topic/Term Distribution Matrix
Phi <- rdirichlet(K, Beta)
colnames(Phi) <- Terms
rownames(Phi) <- Topics
## Function to generate individual document
generateDoc <- function(docLength, topic_dist, terms_topics_dist){
# docLength is specific document length
# topic_dist is specific topic distribution for this document
# terms_topics_dist is terms distribution matrix over all topics
document <- c()
for (i in seq(docLength)){
# For each word in a document,
# choose a topic from that
# document's topic distribution
topic <- rmultinom(1, 1, topic_dist)
# Then choose a term from that topic's term distribution
term <- rmultinom(1, 1, terms_topics_dist[topic,])
# and append term to document vector
document <- c(document,
colnames(terms_topics_dist)[which.max(term)])
}
return(document)
}
## generate "observed" corpus as list of terms
corpus <- list()
for (i in seq(M)){
corpus[[i]] <- generateDoc(docLengths[i], Theta[i,], Phi)
}
## convert document term vectors to frequency vectors
freqsLists <- llply(corpus, table)
## write values to termFreqMatrix
termFreqMatrix <- matrix(nrow=M, ncol=nTerms, 0)
colnames(termFreqMatrix) <- Terms
rownames(termFreqMatrix) <- Documents
for (i in seq(M)){
termFreqMatrix[i,names(freqsLists[[i]])] <- freqsLists[[i]]
}
stopifnot(rowSums(termFreqMatrix) == docLengths)
return(list("docs"=corpus,
'termFreqMatrix'=termFreqMatrix,
"Theta"=Theta,
"Phi"=Phi))
}
使用方式:
## Perform Inference on Simulated LDA Corpus
library(MCMCpack)
library(ggplot2)
library(tm)
library(topicmodels)
library(plyr)
require(reshape2)
require(scales)
setwd('D:/')
##################################
### 1. Simulate Data
# Default Values
M <- 1000 # number of documents
nTerms <- 100 # number of terms
## document lengths all identical at 100
docLengths <- rep(100,M)
## document lengths (word counts) distributed according to poisson(100)
#docLengths <- rpois(M,100)
## Set additional hyperparameters to some customary values used in LDA priors
#K <- round(nTerms/M) # Number of Topics
K <- 10 # Number of Topics
alphA <- 1/K # parameter for symmetric Document/Topic dirichlet distribution
betA <- 1/K # parameter for Topic/Term dirichlet distribution
AlphA <- rep(alphA, K) # number-of-topics length vector set to symmetric alpha paramater across all topics
BetA <- rep(betA, nTerms) # number-of-terms length vector set to symmetric beta paramater across all terms
## generate simulated corpus (See script SimulateCorpus.R)
# Returns corpus list object, default 100 documents, 1000 terms, 1000/100=10 topics
# with documents and "true" values for doc/topic matrix and topic/term matrix
source('SimLDACorpusSource.R')
corpus <- simulateCorpus(M, nTerms, docLengths, K, alphA, betA)
##################################
##################################
### 2. Inference
# A. Using R Package topicmodels:
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))
LDA(corpus[['termFreqMatrix']], K, control = list(seed = 2015,burnin = 1000,thin = 100,iter = 1000), method='Gibbs') -> lda1
# "Estimated" Document/Topic distribution matrix
Theta_est <- posterior(lda1,corpus[['termFreqMatrix']])$topics
colnames(Theta_est) <- Topics
rownames(Theta_est) <- Documents
# "Estimated" Topic/Term Distribution Matrix
Phi_est <- posterior(lda1,corpus[['termFreqMatrix']])$terms
colnames(Phi_est) <- Terms
rownames(Phi_est) <- Topics
##################################
##################################
### 3. Compare "True" to Estimate
Theta_true <- corpus[['Theta']]
Phi_true <- corpus[['Phi']]
Theta_true
Phi_true
# 绘制热力图
Theta_true.m <- melt(Theta_true)
head(Theta_true.m)
Theta_true.m <- ddply(Theta_true.m, .(Var2), transform,rescale = rescale(value))
head(Theta_true.m)
p <- ggplot(Theta_true.m, aes(x=Var1,y=Var2)) + geom_tile(aes(fill = rescale),colour = "white") + scale_fill_gradient(low = "white",high = "steelblue")
p
##################################
Taddy M. On estimation and selection for topic models[C]//Artificial Intelligence and Statistics. 2012: 1184-1193.
Roberts M E, Stewart B M, Airoldi E M. A model of text for experimentation in the social sciences[J]. Journal of the American Statistical Association, 2016, 111(515): 988-1003.
Roberts M E, Stewart B M, Tingley D. stm: R package for structural topic models[J]. Journal of Statistical Software, 2014, 10(2): 1-40.
Gerlach M, Peixoto T P, Altmann E G. A network approach to topic models[J]. Science advances, 2018, 4(7): eaaq1360.