Simulating Data with the Generative Process of the LDA Topic Model

Simulation Code

For a recent paper I needed to simulate data from the generative process of a graphical model, so I looked for existing code and papers. Below is a procedure for simulating data using the generative process of LDA. The code is written in R and comes from https://www.r-bloggers.com/topic-modeling-1-simulated-lda-corpus/ (https://gist.github.com/robbymeals/3985469), with some modifications.
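For reference, the code simulates the standard LDA generative process, written here with the same names as the variables below:

    Theta[d, ] ~ Dirichlet(Alpha)    for each document d = 1, ..., M
    Phi[k, ]   ~ Dirichlet(Beta)     for each topic k = 1, ..., K
    topic z ~ Multinomial(1, Theta[d, ]),  term w ~ Multinomial(1, Phi[z, ])   for each word of document d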

### Basic LDA Topic Model Simulation ###
### Generate Simulated Corpus ###
library(ggplot2)
library(tm)
library(MCMCpack) # provides rdirichlet
library(plyr)     # provides llply (used below to tabulate term frequencies)
 
simulateCorpus <- function(
	M,           # number of documents
	nTerms,      # number of terms in the vocabulary
	docLengths,  # vector of document lengths (word counts)
	K,           # number of topics
	alphA,       # parameter for symmetric Document/Topic dirichlet distribution
	betA,        # parameter for symmetric Topic/Term dirichlet distribution
	Alpha=rep(alphA,K),     # length-K vector set to the symmetric alpha parameter across all topics
	Beta=rep(betA,nTerms))  # length-nTerms vector set to the symmetric beta parameter across all terms
	{
	# Labels
	Terms <- paste("Term",seq(nTerms))
	Topics <- paste("Topic", seq(K))
	Documents <- paste("Document", seq(M))
 
	## Generate latent topic and term distributions
	# "True" Document/Topic distribution matrix
	Theta <- rdirichlet(M, Alpha) 
	colnames(Theta) <- Topics
	rownames(Theta) <- Documents
 
	# "True" Topic/Term Distribution Matrix
	Phi <- rdirichlet(K, Beta) 
	colnames(Phi) <- Terms
	rownames(Phi) <- Topics
 
	## Function to generate individual document
	generateDoc <- function(docLength, topic_dist, terms_topics_dist){
		# docLength is specific document length
		# topic_dist is specific topic distribution for this document
		# terms_topics_dist is terms distribution matrix over all topics
		document <- c()
		for (i in seq(docLength)){
			# For each word in a document, 
			# choose a topic from that 
			# document's topic distribution
			# For each word in the document, choose a topic
			# from this document's topic distribution
			topic <- rmultinom(1, 1, topic_dist)

			# Then choose a term from that topic's term distribution
			# (index by the position of the drawn topic, not the raw 0/1 indicator vector)
			term <- rmultinom(1, 1, terms_topics_dist[which.max(topic),])

			# and append the term to the document vector
			document <- c(document,
				colnames(terms_topics_dist)[which.max(term)])
			}
		return(document)
		}
 
	## generate "observed" corpus as list of terms
	corpus <- list()
	for (i in seq(M)){
		corpus[[i]] <- generateDoc(docLengths[i], Theta[i,], Phi)
		}
 
	## convert document term vectors to frequency vectors
	freqsLists <- llply(corpus, table)
 
	## write values to termFreqMatrix
	termFreqMatrix <- matrix(0, nrow=M, ncol=nTerms)
	colnames(termFreqMatrix) <- Terms
	rownames(termFreqMatrix) <- Documents
	for (i in seq(M)){
		termFreqMatrix[i,names(freqsLists[[i]])] <- freqsLists[[i]]
		}
 
	stopifnot(rowSums(termFreqMatrix) == docLengths)
 
	return(list("docs"=corpus,
		"termFreqMatrix"=termFreqMatrix,
		"Theta"=Theta,
		"Phi"=Phi))
}

Usage:

## Perform Inference on Simulated LDA Corpus

library(MCMCpack)
library(ggplot2)
library(tm)
library(topicmodels)
library(plyr)
require(reshape2)
require(scales)
setwd('D:/') # working directory that contains SimLDACorpusSource.R

##################################
### 1. Simulate Data

# Default Values
M <- 1000 # number of documents
nTerms <- 100 # number of terms

## document lengths all identical at 100
docLengths <- rep(100,M)

## document lengths (word counts) distributed according to poisson(100)
#docLengths <- rpois(M,100)

## Set additional hyperparameters to some customary values used in LDA priors
#K <- round(nTerms/M) # Number of Topics
K <- 10 # Number of Topics
alphA <- 1/K # parameter for symmetric Document/Topic dirichlet distribution
betA <- 1/K # parameter for Topic/Term dirichlet distribution
AlphA <- rep(alphA, K) # length-K vector set to the symmetric alpha parameter across all topics (same as the default built inside simulateCorpus)
BetA <- rep(betA, nTerms) # length-nTerms vector set to the symmetric beta parameter across all terms (same as the default built inside simulateCorpus)

## generate simulated corpus (simulateCorpus is defined in the script above)
# Returns a corpus list object with M documents over nTerms terms and K topics,
# containing the documents, the document/term frequency matrix, and the "true"
# values of the doc/topic matrix (Theta) and topic/term matrix (Phi)
source('SimLDACorpusSource.R') 
corpus <- simulateCorpus(M, nTerms, docLengths, K, alphA, betA)
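## Optional sanity checks on the simulated corpus (illustrative, not in the original post):
## each row of Theta should sum to 1, and the term-frequency matrix should have
## one row per document and one column per term.
stopifnot(all(abs(rowSums(corpus[['Theta']]) - 1) < 1e-8))
stopifnot(dim(corpus[['termFreqMatrix']]) == c(M, nTerms))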

##################################


##################################
### 2. Inference

# A. Using R Package topicmodels:
# Labels
Terms <- paste("Term",seq(nTerms))
Topics <- paste("Topic", seq(K))
Documents <- paste("Document", seq(M))

lda1 <- LDA(corpus[['termFreqMatrix']], k = K, method = 'Gibbs',
            control = list(seed = 2015, burnin = 1000, thin = 100, iter = 1000))

# "Estimated" Document/Topic distribution matrix
Theta_est <- posterior(lda1,corpus[['termFreqMatrix']])$topics
colnames(Theta_est) <- Topics
rownames(Theta_est) <- Documents

# "Estimated" Topic/Term Distribution Matrix
Phi_est <- posterior(lda1,corpus[['termFreqMatrix']])$terms
colnames(Phi_est) <- Terms
rownames(Phi_est) <- Topics
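# A quick, optional look at the fit (not in the original script):
# most probable terms for each estimated topic
terms(lda1, 5)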

##################################

##################################
### 3. Compare "True" to Estimate

Theta_true <- corpus[['Theta']]
Phi_true <- corpus[['Phi']]
Theta_true
Phi_true
# Heatmap of the "true" Document/Topic distribution matrix
Theta_true.m <- melt(Theta_true)
head(Theta_true.m)
Theta_true.m <- ddply(Theta_true.m, .(Var2), transform,rescale = rescale(value))
head(Theta_true.m)
p <- ggplot(Theta_true.m, aes(x = Var1, y = Var2)) +
	geom_tile(aes(fill = rescale), colour = "white") +
	scale_fill_gradient(low = "white", high = "steelblue")
p
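
## The estimated topics are not guaranteed to come back in the same order as the
## "true" ones (label switching). One simple illustrative check (not part of the
## original post): match each true topic to the estimated topic whose term
## distribution is most highly correlated with it, then inspect the matched correlations.
topic_cor <- cor(t(Phi_true), t(Phi_est))          # K x K topic-to-topic correlations
matching  <- apply(topic_cor, 1, which.max)        # closest estimated topic for each true topic
round(topic_cor[cbind(seq_len(K), matching)], 3)   # correlation of each matched pair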


##################################

Visualization Results

[Figure: heatmap of the "true" Document/Topic distribution matrix (Theta_true)]

Papers on Probabilistic Topic Models That Involve Simulation

Taddy M. On estimation and selection for topic models[C]//Artificial Intelligence and Statistics. 2012: 1184-1193.
Roberts M E, Stewart B M, Airoldi E M. A model of text for experimentation in the social sciences[J]. Journal of the American Statistical Association, 2016, 111(515): 988-1003.
Roberts M E, Stewart B M, Tingley D. stm: An R package for structural topic models[J]. Journal of Statistical Software, 2019, 91(2): 1-40.
Gerlach M, Peixoto T P, Altmann E G. A network approach to topic models[J]. Science Advances, 2018, 4(7): eaaq1360.
