本文对LDA原始论文的作者所提供的C代码中LDA的主要逻辑部分做注释,原代码可在这里下载到:https://github.com/Blei-Lab/lda-c
这份代码实现论文《Latent Dirichlet Allocation》中介绍的LDA模型,用变分EM算法求解参数。
为了使代码在vs2013中运行,做了一些微小改动,但不影响原代码的逻辑。
vs2013工程可在我的资源中下载:
http://download.csdn.net/detail/happyer88/8861773
----------------------------------------------------------------------2.1,LDA原理及推导
《Latent Dirichlet Allocation》论文
我的LDA学习笔记1-4系列
2.2,充分统计量
https://en.wikipedia.org/wiki/Sufficient_statistic
----------------------------------------------------------------------
原代码中main函数在lda-estimate.c中,创建vs工程时把它挪到了main.c中。
#include
#include
#include
#include
#include "cokus.h"
#include "lda-alpha.h"
#include"lda-data.h"
#include"lda-estimate.h"
#include"lda-inference.h"
#include"lda.h"
#include"utils.h"
char * datasetName = "scene8"; //数据集名字,必须与文件夹名字相同
int expec = 1; // expec==1,expect , inf
int vocabularySize_global = 512; // 字典大小
int k = 100; //topic的数目
char* params ="../settings.txt"; //估计:估计过程需要的参数
char* params1 ="../inf-settings.txt"; //推断:推断过程需要的参数
char dataset_train[500]; //估计:估计参数的数据文件
char dataset_test[500]; //推断:推断的数据文件
char dir_trainData[500]; //估计:估计的中间数据和最终数据文件夹路径
char dir_testData[500]; //推断:推断的中间数据和最终数据文件夹路径
char model_pre[500];
void assignParameter();
int main()
{
corpus* corpus;
clock_t start,finish;
double totaltime;
long double totaltime_EMiteration;
assignParameter();
//myCreateDirectory();
start=clock();
if(expec)
{
INITIAL_ALPHA = 1; //狄利克雷分布的参数alpha
NTOPICS =k; //主题个数
read_settings(params); //读取参数。。。最大迭代次数,收敛条件阈值;EM的最大迭代次数、收敛条件阈值;
corpus = read_data(dataset_train); //读取数据。。。数据格式:(每一行)在一个文档中出现的word总数目(去掉次数=0的)index_word1:counts index_word2:counts ...........
totaltime_EMiteration = run_em("seeded", dir_trainData, corpus); //求解参数。。。EM过程求解参数--输入:中间数据和最终数据存放目录、语料库
printf("inferencing test images!\n");
read_settings(params1);
corpus = read_data(dataset_test);
infer(model_pre, dir_testData, corpus);
//用完开始释放
}
else
{
read_settings(params1);
corpus = read_data(dataset_test);
infer(model_pre, dir_testData, corpus);
}
finish=clock();
totaltime=(double)(finish-start)/CLOCKS_PER_SEC;
printf("nTopic = %d, nTerm = %d estimation time: \n", k, vocabularySize_global);
printf(" EM iteration takes %f seconds(this is %f miniutes)\n", totaltime_EMiteration*60, totaltime_EMiteration);
printf("Running Time(--estimate trainData and inference trainData and testData--):%f\n",totaltime);
printf("\ntrain--- final data are saved to: %s\n", dir_trainData);
printf("test---- final data are saved to: %s\n", dir_testData);
getch();
return(0);
}
void assignParameter()
{
sprintf(dataset_train,"../train.txt");
sprintf(dataset_test,"../test.txt");
sprintf(dir_trainData, "../ResultData");
sprintf(dir_testData, "../ResultData");
sprintf(model_pre, "../ResultData/final");
}
自定义数据结构
#ifndef LDA_H
#define LDA_H
typedef struct
{
int* words; //文档中的单词,这里存的是该单词在文档集字典中的ID
int* counts; //每个单词文档中出现次数
int length; //文档中出现的单词个数,去重的,也就是重复出现的单词不计
int total; //文档中总单词数,不去重
} document;
typedef struct
{
document* docs;
int num_terms; //文档集中出现的单词个数,去重的,也就是文档集字典大小
int num_docs; //文档集中文档个数
} corpus;
typedef struct
{
double alpha; //论文中的模型参数alpha,本来应该是k维,程序中实现的是对称分布的Dirichlet,k维的值是相同的
double** log_prob_w; //论文中的模型参数beta,每一行存一个主题的词分布,维度k*V
int num_topics; //主题个数
int num_terms;
} lda_model;
typedef struct
{
double** class_word;//模型参数beta的充分统计量,维度:主题个数*文档集字典大小(K*V)
double* class_total;//存主题分布z的 充分统计量,维度:主题个数K
double alpha_suffstats; //模型参数alpha的充分统计量
int num_docs;
} lda_suffstats;
#endif
主要是初始化lda模型(有三种方法),一种是所有值都为0,'random'是用随机数,'seeded'是随机挑选一些文档来初始化模型
还有计算模型参数alpha , beta (lda_mle)
#include "lda-model.h"
/*
* compute MLE lda model from sufficient statistics
*
*/
void lda_mle(lda_model* model, lda_suffstats* ss, int estimate_alpha)
{
int k; int w;
for (k = 0; k < model->num_topics; k++)
{
for (w = 0; w < model->num_terms; w++)
{
if (ss->class_word[k][w] > 0)
{
//log_prob_w是模型参数beta,主题-词分布
//class_word和class_total都是充分统计量(sufficient statistic)
//所以log相减是在做归一化,beta中的值是概率,要在0-1之间
model->log_prob_w[k][w] =
log(ss->class_word[k][w]) -
log(ss->class_total[k]);
}
else
model->log_prob_w[k][w] = -100;
}
}
if (estimate_alpha == 1)
{
//用牛顿方法优化得到alpha
//注意这里alpha_suffstats的值
model->alpha = opt_alpha(ss->alpha_suffstats,
ss->num_docs,
model->num_topics);
printf("new alpha = %5.5f\n", model->alpha);
}
}
/*
* allocate sufficient statistics
*
*/
lda_suffstats* new_lda_suffstats(lda_model* model)
{
int num_topics = model->num_topics;
int num_terms = model->num_terms;
int i,j;
lda_suffstats* ss = malloc(sizeof(lda_suffstats));
ss->class_total = malloc(sizeof(double)*num_topics);
ss->class_word = malloc(sizeof(double*)*num_topics);
for (i = 0; i < num_topics; i++)
{
ss->class_total[i] = 0;
ss->class_word[i] = malloc(sizeof(double)*num_terms);
for (j = 0; j < num_terms; j++)
{
ss->class_word[i][j] = 0;
}
}
return(ss);
}
void free_lda_ss(lda_suffstats* ss, lda_model* model)
{
int i=0;
for (i=0; i < model->num_topics; i++)
free(ss->class_word[i]);
free(ss->class_word);
free(ss->class_total);
free(ss);
}
/*
* various intializations for the sufficient statistics
*
*/
void zero_initialize_ss(lda_suffstats* ss, lda_model* model)
{
int k, w;
for (k = 0; k < model->num_topics; k++)
{
ss->class_total[k] = 0;
for (w = 0; w < model->num_terms; w++)
{
ss->class_word[k][w] = 0;
}
}
ss->num_docs = 0;
ss->alpha_suffstats = 0;
}
void random_initialize_ss(lda_suffstats* ss, lda_model* model)
{
int num_topics = model->num_topics;
int num_terms = model->num_terms;
int k, n;
for (k = 0; k < num_topics; k++)
{
for (n = 0; n < num_terms; n++)
{
ss->class_word[k][n] += 1.0/num_terms + myrand();
ss->class_total[k] += ss->class_word[k][n];
}
}
}
void corpus_initialize_ss(lda_suffstats* ss, lda_model* model, corpus* c)
{
int num_topics = model->num_topics;
int i, k, d, n;
document* doc;
for (k = 0; k < num_topics; k++)//每个主题用一些文档的来初始化其主题-词 分布 的充分统计量
{
for (i = 0; i < NUM_INIT; i++)//在文档集中随机挑选NUM_INIT=1个文档
{
d = floor(myrand() * c->num_docs); //随机挑选
printf("initialized with document %d\n", d);
doc = &(c->docs[d]);
for (n = 0; n < doc->length; n++)
{
//将NUM_INIT个文档的词频统计,作为第k个主题的词分布的统计量
ss->class_word[k][doc->words[n]] += doc->counts[n];
}
}
for (n = 0; n < model->num_terms; n++)
{
ss->class_word[k][n] += 1.0;//因为后面要对它求log,所以值必须大于0
//是对class_word按行求和的结果,是主题k被选中的次数,也就是该主题下的词出现次数的和
ss->class_total[k] = ss->class_total[k] + ss->class_word[k][n];
}
//这样用文档的词频信息初始化,total必然不为0
//if (ss->class_total[k] == 0)
// ss->class_total[k] = 1;
}
}
/*
* allocate new lda model
*
*/
lda_model* new_lda_model(int num_terms, int num_topics)
{
int i,j;
lda_model* model;
model = malloc(sizeof(lda_model));
model->num_topics = num_topics;
model->num_terms = num_terms;
model->alpha = 1.0;
model->log_prob_w = malloc(sizeof(double*)*num_topics);
for (i = 0; i < num_topics; i++)
{
model->log_prob_w[i] = malloc(sizeof(double)*num_terms);
for (j = 0; j < num_terms; j++)
model->log_prob_w[i][j] = 0;
}
return(model);
}
/*
* deallocate new lda model
*
*/
void free_lda_model(lda_model* model)
{
int i;
for (i = 0; i < model->num_topics; i++)
{
free(model->log_prob_w[i]);
}
free(model->log_prob_w);
free(model);
}
/*
* save an lda model
*
*/
void save_lda_model(lda_model* model, char* model_root)
{
char filename[100];
FILE* fileptr;
int i, j;
sprintf(filename, "%s.beta", model_root);
fileptr = fopen(filename, "w");
for (i = 0; i < model->num_topics; i++)
{
for (j = 0; j < model->num_terms; j++)
{
fprintf(fileptr, " %5.10f", model->log_prob_w[i][j]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr);
sprintf(filename, "%s.other", model_root);
fileptr = fopen(filename, "w");
fprintf(fileptr, "num_topics %d\n", model->num_topics);
fprintf(fileptr, "num_terms %d\n", model->num_terms);
fprintf(fileptr, "alpha %5.10f\n", model->alpha);
fclose(fileptr);
}
lda_model* load_lda_model(char* model_root)
{
char filename[100];
FILE* fileptr;
int i, j, num_terms, num_topics;
float x, alpha;
lda_model* model;
sprintf(filename, "%s.other", model_root);
printf("loading %s\n", filename);
fileptr = fopen(filename, "r");
fscanf(fileptr, "num_topics %d\n", &num_topics);
fscanf(fileptr, "num_terms %d\n", &num_terms);
fscanf(fileptr, "alpha %f\n", &alpha);
fclose(fileptr);
model = new_lda_model(num_terms, num_topics);
model->alpha = alpha;
sprintf(filename, "%s.beta", model_root);
printf("loading %s\n", filename);
fileptr = fopen(filename, "r");
for (i = 0; i < num_topics; i++)
{
for (j = 0; j < num_terms; j++)
{
fscanf(fileptr, "%f", &x);
model->log_prob_w[i][j] = x;
}
}
fclose(fileptr);
return(model);
}
其中包含和模型求解相关的函数,em算法(run_em)和e-step(doc_e_step)
#include "lda-estimate.h"
/*
* perform inference on a document and update sufficient statistics
*
*/
int LAG=5;
double doc_e_step(document* doc, double* gamma, double** phi,
lda_model* model, lda_suffstats* ss)
{
double likelihood;
int n, k;
double gamma_sum;
// posterior inference
likelihood = lda_inference(doc, model, gamma, phi);
// update sufficient statistics
//这里更新alpha的 充分统计量
//alpha_suffstats = sum(digamma(gamma)) - K*digamma(gamm_sum)
gamma_sum = 0;
for (k = 0; k < model->num_topics; k++)
{
gamma_sum += gamma[k];
ss->alpha_suffstats += digamma(gamma[k]); //log gamma函数的一阶导数
}
ss->alpha_suffstats -= model->num_topics * digamma(gamma_sum);
for (n = 0; n < doc->length; n++)
{
for (k = 0; k < model->num_topics; k++)
{
//phi[n][k]是第n个word由第k个主题生成的概率,在log space
ss->class_word[k][doc->words[n]] += doc->counts[n]*phi[n][k];
ss->class_total[k] += doc->counts[n]*phi[n][k];
}
}
//加入充分统计量的文档数
ss->num_docs = ss->num_docs + 1;
return(likelihood);
}
/*
* writes the word assignments line for a document to a file
*
*/
int write_word_assignment(FILE* result, FILE* f, document* doc, double** phi, lda_model* model)
{
int n;
//f中保存phi, result中保存结果:[wordID:概率最大的topicID]
fprintf(result, "%03d", doc->length);
for (n = 0; n < doc->length; n++)
{
int k;
for (k=0;k< model->num_topics;k++)
fprintf(f, "%f\t",phi[n][k]); //一行对应一个word由每个topic生成的概率
fprintf(f, "\n");
fprintf(result, " %04d:%02d",
doc->words[n], argmax(phi[n], model->num_topics));//argmax 找出phi[n]中最大的元素对应的索引位置,也就是topicID
}
fprintf(result, "\n"); //一行对应一个文档的 每个word对应的概率最大的topic
fflush(f);
fflush(result);
return 0;
}
/*
* saves the gamma parameters of the current dataset
*
*/
void save_gamma(char* filename, double** gamma, int num_docs, int num_topics)
{
FILE* fileptr;
int d, k;
fileptr = fopen(filename, "w");
for (d = 0; d < num_docs; d++)
{
fprintf(fileptr, "%5.10f", gamma[d][0]);
for (k = 1; k < num_topics; k++)
{
fprintf(fileptr, " %5.10f", gamma[d][k]);
}
fprintf(fileptr, "\n");
}
fclose(fileptr);
}
/*
* run_em
*
*/
long double run_em(char* start, char* directory, corpus* corpus)
{
clock_t start_EM, finish_EM;
double * theta;
FILE * thetaFile;
int d, n;
lda_model *model = NULL;
double **var_gamma, **phi;
FILE* likelihood_file;
int max_length;
char filename[500];
char filename1[500];
int i;
double likelihood, likelihood_old, converged;
lda_suffstats* ss;
FILE* w_asgn_file;
FILE* result;
// allocate variational parameters
//为变分参数gamma分配空间,维度:文档数*主题数
var_gamma = malloc(sizeof(double*)*(corpus->num_docs));
for (d = 0; d < corpus->num_docs; d++)
var_gamma[d] = malloc(sizeof(double) * NTOPICS);
//为变分参数phi分配空间,维度:文档集中文档的最大单词数(去重) * 主题数
max_length = max_corpus_length(corpus);
phi = malloc(sizeof(double*)*max_length);
for (n = 0; n < max_length; n++)
phi[n] = malloc(sizeof(double) * NTOPICS);
// initialize model
ss = NULL;
if (strcmp(start, "seeded")==0)
{
model = new_lda_model(corpus->num_terms, NTOPICS);
ss = new_lda_suffstats(model);
corpus_initialize_ss(ss, model, corpus); //初始化tw分布
lda_mle(model, ss, 0); //compute MLE lda model from sufficient statistics
model->alpha = INITIAL_ALPHA;
}
else if (strcmp(start, "random")==0)
{
model = new_lda_model(corpus->num_terms, NTOPICS);
ss = new_lda_suffstats(model);
random_initialize_ss(ss, model);
lda_mle(model, ss, 0);
model->alpha = INITIAL_ALPHA;
}
else
{
model = load_lda_model(start);
ss = new_lda_suffstats(model);
}
sprintf(filename,"%s/000",directory);
save_lda_model(model, filename);
// run expectation maximization
i = 0;
likelihood_old = 0;
converged = 1;
sprintf(filename, "%s/likelihood.dat", directory);
likelihood_file = fopen(filename, "w");
start_EM = clock();
//em迭代继续执行条件:以下1和2同时满足
//1,
//converaged<0 也就是新值比旧值好
//或converaged>EM_CONVERGED 新值和旧值还不够近似
//或迭代步骤执行太少(<=2)
//2,
//当前迭代step数在规定的最大迭代步数以内
//或者没有指定最大迭代步数(-1)
while (((converged < 0) || (converged > EM_CONVERGED) || (i <= 2)) && ((i <= EM_MAX_ITER) || (EM_MAX_ITER == -1)) )
{
i++; printf("**** em iteration %d ****\n", i);
likelihood = 0;
zero_initialize_ss(ss, model); //把统计量的值都赋为0
// e-step
//固定alpha和beta,对每一篇文档找到优化的gamma和phi,更新充分统计量,计算似然
for (d = 0; d < corpus->num_docs; d++)
{
if ((d % 10) == 0) printf("document %d in %d EM iteration\n",d, i);
likelihood += doc_e_step(&(corpus->docs[d]),
var_gamma[d],
phi,
model,
ss);
}
// m-step
//根据当前的充分统计量,更新模型参数alpha,beta
lda_mle(model, ss, ESTIMATE_ALPHA);
// check for convergence
converged = (likelihood_old - likelihood) / (likelihood_old);
if (converged < 0) VAR_MAX_ITER = VAR_MAX_ITER * 2;
likelihood_old = likelihood;
// output model and likelihood
fprintf(likelihood_file, "%10.10f\t%5.5e\n", likelihood, converged);
fflush(likelihood_file);
if ((i % LAG) == 0)
{
sprintf(filename,"%s/%03d",directory, i);
save_lda_model(model, filename);
sprintf(filename,"%s/%03d.gamma",directory, i);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
}
}
//EM迭代结束
finish_EM = clock();
printf("nTopic = %d, nTerm = %d estimation time: \n", model->num_topics, model->num_terms);
printf(" EM iteration takes %f seconds(this is %d miniutes)\n", (double)(finish_EM-start_EM)/CLOCKS_PER_SEC, (finish_EM-start_EM)/CLOCKS_PER_SEC/60);
// output the final model
sprintf(filename,"%s/final",directory);
save_lda_model(model, filename); //此函数中保存了beta到final.beta文件 , 还有.other文件
sprintf(filename,"%s/final.gamma",directory);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
// output theta
theta = (double*)malloc(sizeof(double)*model->num_topics);
sprintf(filename1, "%s/final.theta", directory);
thetaFile = fopen(filename1, "w");
// output the word assignments (for visualization)
sprintf(filename1, "%s/result-doc-assgn.dat", directory);
result = fopen(filename1, "w");
for (d = 0; d < corpus->num_docs; d++)
{
sprintf(filename, "%s/result_%d_phi.dat", directory,d); //调试这一部分有越界的错误,完毕,filename数组空间太小。
w_asgn_file = fopen(filename, "w");
printf("final e step document %d\n",d);
likelihood += lda_inference(&(corpus->docs[d]), model, var_gamma[d], phi);
write_word_assignment(result, w_asgn_file, &(corpus->docs[d]), phi, model);
computeTheta( thetaFile, &(corpus->docs[d]), phi, model, theta);
fclose(w_asgn_file);
}
fclose(result);
fclose(thetaFile);
fclose(likelihood_file);
//释放空间
free(theta);
for (d = 0; d < corpus->num_docs; d++)
free(var_gamma[d]);
free(var_gamma);
for (n = 0; n < max_length; n++)
free(phi[n]);
free(phi);
free_lda_ss( ss, model);
free_lda_model(model);
return (long double)(finish_EM-start_EM)/CLOCKS_PER_SEC/60;
}
void computeTheta( FILE* thetaFile, document* doc, double** phi, lda_model* model, double * theta)
{
int n;
for (n=0; n< model->num_topics; n++)
theta[n] = 0;
for (n = 0; nlength; n++)
{
int topicIndex = argmax(phi[n], model->num_topics);
theta[ topicIndex ] = theta[ topicIndex ] + doc->counts[ n ];
}
for (n=0; nnum_topics; n++)
{
theta[n] = theta[n]/doc->total;
fprintf(thetaFile, "%f\t", theta[n]);
}
fprintf(thetaFile, "\n");
fflush(thetaFile);
}
/*
* read settings.
*
*/
void read_settings(char* filename)
{
FILE* fileptr;
char alpha_action[100];
fileptr = fopen(filename, "r");
fscanf(fileptr, "var max iter %d\n", &VAR_MAX_ITER);
fscanf(fileptr, "var convergence %f\n", &VAR_CONVERGED);
fscanf(fileptr, "em max iter %d\n", &EM_MAX_ITER);
fscanf(fileptr, "em convergence %f\n", &EM_CONVERGED);
fscanf(fileptr, "alpha %s", alpha_action);
if (strcmp(alpha_action, "fixed")==0)
{
ESTIMATE_ALPHA = 0;
}
else
{
ESTIMATE_ALPHA = 1;
}
fclose(fileptr);
}
/*
* inference only
*
*/
void infer(char* model_root, char* save, corpus* corpus)
{
FILE* fileptr;
FILE* result;
FILE* w_asgn_file;
char filename[100];
char filename1[200];
int i, d, n;
lda_model *model;
double **var_gamma, likelihood, **phi;
document* doc;
/*double ***corpusPhi;
corpusPhi = (double***)malloc(sizeof(double**)*(corpus->num_docs));
for (i=0;inum_docs));
for (i = 0; i < corpus->num_docs; i++)
var_gamma[i] = (double*)malloc(sizeof(double)*model->num_topics);
//int max_length = max_corpus_length(corpus);
sprintf(filename, "%s-lda-lhood.dat", save);
fileptr = fopen(filename, "w");
for (d = 0; d < corpus->num_docs; d++)
{
if (((d % 100) == 0) && (d>0)) printf("document %d\n",d);
doc = &(corpus->docs[d]);
phi = (double**) malloc(sizeof(double*) * doc->length);
//phi = (double**) malloc(sizeof(double*) * max_length);
for (n = 0; n < doc->length; n++)
//for (n = 0; n < max_length; n++)
phi[n] = (double*) malloc(sizeof(double) * model->num_topics);
likelihood = lda_inference(doc, model, var_gamma[d], phi);
fprintf(fileptr, "%5.5f\n", likelihood);
//输出每一个文档的phi到文件result_%d_phi.dat中 另外每一个word对应的概率最大的topic保存在文件result-doc-assgn.dat中 一行对应一个文档
sprintf(filename, "%s/result_%d_phi.dat", save,d);
w_asgn_file = fopen(filename, "w");
printf("final e step document %d\n",d);
write_word_assignment(result,w_asgn_file, &(corpus->docs[d]), phi, model);
fclose(w_asgn_file);
for (n = 0; n < doc->length; n++)
free(phi[n]);
free(phi);
}
fclose(fileptr);
sprintf(filename, "%s-gamma.dat", save);
save_gamma(filename, var_gamma, corpus->num_docs, model->num_topics);
fclose(result);
for (d = 0; d < corpus->num_docs; d++)
free(var_gamma[d]);
free(var_gamma);
free_lda_model(model);
}
其中包含变分参数求解相关的函数
#include "lda-inference.h"
/*
* variational inference
*
*/
int lisnan(double x) {
return x != x;
}
double lda_inference(document* doc, lda_model* model, double* var_gamma, double** phi)
{
double converged = 1;
double phisum = 0, likelihood = 0;
double likelihood_old = 0, *oldphi=(double *)malloc(sizeof(double)*(model->num_topics));
int k, n, var_iter;
double *digamma_gam=(double *)malloc(sizeof(double)*(model->num_topics));
// compute posterior dirichlet
for (k = 0; k < model->num_topics; k++)
{
//初始化变分参数gamma=alpha + 当前文档中单词个数(不去重) N / 主题个数 k
var_gamma[k] = model->alpha + (doc->total/((double) model->num_topics));
//log gamma函数的一阶导数
digamma_gam[k] = digamma(var_gamma[k]);
//初始化变分参数phi=1/k
for (n = 0; n < doc->length; n++)
phi[n][k] = 1.0/model->num_topics;
}
var_iter = 0;
//开始迭代
while ((converged > VAR_CONVERGED) &&
((var_iter < VAR_MAX_ITER) || (VAR_MAX_ITER == -1)))
{
var_iter++;
for (n = 0; n < doc->length; n++)
{
phisum = 0;
for (k = 0; k < model->num_topics; k++)
{
oldphi[k] = phi[n][k];
//更新变分参数 phi
//就是论文中变分推断算法的式子 phi(n,i) = b(i,wn) * exp(digamma(gamma(i)))
//这里因为有exp所以在log空间计算,算得的phi也是log space的
phi[n][k] =
digamma_gam[k] +
model->log_prob_w[k][doc->words[n]];
if (k > 0)
phisum = log_sum(phisum, phi[n][k]);//在log space对phi求和
else
phisum = phi[n][k]; // note, phi is in log space
}
for (k = 0; k < model->num_topics; k++)
{
//归一化,使phi(n)和为1
phi[n][k] = exp(phi[n][k] - phisum);
//更新变分参数 gamma
var_gamma[k] =
var_gamma[k] + doc->counts[n]*(phi[n][k] - oldphi[k]);
// !!! a lot of extra digamma's here because of how we're computing it
// !!! but its more automatically updated too.
digamma_gam[k] = digamma(var_gamma[k]);
}
}
likelihood = compute_likelihood(doc, model, phi, var_gamma);
assert(!isnan(likelihood));
converged = (likelihood_old - likelihood) / likelihood_old;
likelihood_old = likelihood;
// printf("[LDA INF] %8.5f %1.3e\n", likelihood, converged);
}//迭代结束
return(likelihood);
}
/*
* compute likelihood bound
*
*/
//按照论文附录(15)式计算L(gamma,phi;alpha,beta)
double
compute_likelihood(document* doc, lda_model* model, double** phi, double* var_gamma)
{
double likelihood = 0, digsum = 0, var_gamma_sum = 0, *dig=(double *)malloc(sizeof(double)*(model->num_topics));
int k, n;
for (k = 0; k < model->num_topics; k++)
{
dig[k] = digamma(var_gamma[k]);
var_gamma_sum += var_gamma[k];
}
digsum = digamma(var_gamma_sum);
//论文(14)式中的Eq,第1个和第4个是合在一起再拆分算的,第2,3,5个是合在一起算的
//Eq[logp(theta|alpha)]中的前两个部分 和 Eq[logq(theta)]中第一部分
likelihood =
log_gamma(model->alpha * model -> num_topics)
- model -> num_topics * log_gamma(model->alpha)
- (log_gamma(var_gamma_sum));
for (k = 0; k < model->num_topics; k++)
{
//Eq[logp(theta|alpha)]中的第三个部分 和 Eq[logq(theta)]中剩余的
likelihood +=
(model->alpha - 1)*(dig[k] - digsum) + log_gamma(var_gamma[k])
- (var_gamma[k] - 1)*(dig[k] - digsum);
//Eq[logp(z|theta)] + Eq[logp(w|z,beta)] - Eq[logq(z)]
for (n = 0; n < doc->length; n++)
{
if (phi[n][k] > 0)
{
likelihood += doc->counts[n]*
(phi[n][k]*((dig[k] - digsum) - log(phi[n][k])
+ model->log_prob_w[k][doc->words[n]]));
}
}
}
return(likelihood);
}
数据集读入
#include "lda-data.h"
corpus* read_data(char* data_filename)
{
FILE *fileptr;
int length, count, word, n, nd, nw;
corpus* c;
printf("reading data from %s\n", data_filename);
c = malloc(sizeof(corpus));
c->docs = 0;
c->num_terms = 0;
c->num_docs = 0;
fileptr = fopen(data_filename, "r");
nd = 0; nw = 0;
while ((fscanf_s(fileptr, "%10d", &length) != EOF)) //读入每行数据的第一个数字,是文档的字典大小(文档中单词去重的个数)
{
//对于第nd个文档
c->docs = (document*) realloc(c->docs, sizeof(document)*(nd+1)); //(数据类型*)realloc(要改变内存大小的指针名,新的大小) 新的大小一定要大于原来的大小,不然的话会导致数据丢失!
c->docs[nd].length = length; //文档中出现过的单词的个数,也就是文档字典大小,是去重的
c->docs[nd].total = 0; //文档中总单词个数,不去重,是对counts的求和。
c->docs[nd].words = malloc(sizeof(int)*length); //文档中的word在文档集字典中的ID
c->docs[nd].counts = malloc(sizeof(int)*length); //文档中word出现次数
for (n = 0; n < length; n++)//读入每行数据剩下的数据,词频统计
{
fscanf_s(fileptr, "%10d:%10d", &word, &count); //读入每个 [wordID:word出现次数]
word = word - OFFSET;
c->docs[nd].words[n] = word;
c->docs[nd].counts[n] = count;
c->docs[nd].total += count;
if (word >= nw) { nw = word + 1; } //nw记录文档集最大的那个word ID,也就是文档集字典中的单词个数
//if (word >= nw) { nw = word; }
}
nd++;
}
fclose(fileptr);
c->num_docs = nd;
c->num_terms = nw;
printf("number of docs : %d\n", nd);
printf("number of terms : %d\n", nw);
return(c);
}
int max_corpus_length(corpus* c)//输出数据集中单词数(去重后)最多的文档的单词数,这个length是去重后的长度
{
int n, max = 0;
for (n = 0; n < c->num_docs; n++)
if (c->docs[n].length > max) max = c->docs[n].length;
return(max);
}
牛顿法计算模型参数alpha
#include "lda-alpha.h"
#include "lda-inference.h"
/*
* objective function and its derivatives
*
*/
double alhood(double a, double ss, int D, int K)
{ return(D * (log_gamma(K * a) - K * log_gamma(a)) + (a - 1) * ss); }
double d_alhood(double a, double ss, int D, int K)
{ return(D * (K * digamma(K * a) - K * digamma(a)) + ss); }
double d2_alhood(double a, int D, int K)
{ return(D * (K * K * trigamma(K * a) - K * trigamma(a))); }
/*
* newtons method
*
*/
double opt_alpha(double ss, int D, int K)
{
double a, log_a, init_a = 100;
double f, df, d2f;
int iter = 0;
log_a = log(init_a);
do
{
iter++;
a = exp(log_a);
if (isnan(a))
{
init_a = init_a * 10;
printf("warning : alpha is nan; new init = %5.5f\n", init_a);
a = init_a;
log_a = log(a);
}
f = alhood(a, ss, D, K); //附录A4.2中的L(a)
df = d_alhood(a, ss, D, K); //L对a的一阶偏导
d2f = d2_alhood(a, D, K); //二阶偏导
log_a = log_a - df/(d2f * a + df);//迭代公式
printf("alpha maximization : %5.5f %5.5f\n", f, df);
}
while ((fabs(df) > NEWTON_THRESH) && (iter < MAX_ALPHA_ITER));
return(exp(log_a));
}
还有cokus.c 和 utils.c 中是一些数学计算的函数。