http://www.aboutyun.com/thread-22359-1-1.html
package com.blogchong.spark.mllib.base

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel}
import org.apache.spark.mllib.linalg.Vectors

/**
 * Describe: basic example of the LDA topic model
 */
object LdaArithmetic {
  def main(args: Array[String]) {
    // Suppress unnecessary log output in the terminal
    Logger.getLogger("org.apache.spark").setLevel(Level.WARN)
    Logger.getLogger("org.eclipse.jetty.server").setLevel(Level.OFF)

    // Set up the runtime environment
    val conf = new SparkConf().setAppName("LDA").setMaster("local")
    val sc = new SparkContext(conf)

    val modelPath = "file:///export/software/github/spark-2.1.0-bin-hadoop2.6/data/mllib/result/lda/model"
    // doc-topic output path
    val modelPath2 = "file:///export/software/github/spark-2.1.0-bin-hadoop2.6/data/mllib/result/lda/model2"

    // 1 Load the data. The required format is documents: RDD[(Long, Vector)],
    //   where Long is the document ID and Vector is the document's term-count vector after tokenization.
    //   Data from any directory can be read and, after tokenization and format conversion,
    //   turned into RDD[(Long, Vector)] in the same way.
    val data = sc.textFile("file:///export/software/github/spark-2.1.0-bin-hadoop2.6/data/mllib/sample_lda_data.txt", 1)
    val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble)))
    // Index each document with a unique id
    val corpus = parsedData.zipWithIndex.map(_.swap).cache()

    // 2 Build the model, set the training parameters, and train
    /**
     * k: number of topics (cluster centers)
     * DocConcentration: hyperparameter of the document-topic distribution (Dirichlet prior), must be > 1.0
     * TopicConcentration: hyperparameter of the topic-word distribution (Dirichlet prior), must be > 1.0
     * MaxIterations: number of iterations
     * setSeed: random seed
     * CheckpointInterval: checkpoint interval during iterative computation
     * Optimizer: optimization method; currently "em" and "online" are supported
     */
    val ldaModel = new LDA().
      setK(3).
      setDocConcentration(5).
      setTopicConcentration(5).
      setMaxIterations(20).
      setSeed(0L).
      setCheckpointInterval(10).
      setOptimizer("em").
      run(corpus)

    // 3 Output the model parameters and results: for each topic, the weight of every term
    // Output topics. Each is a distribution over words (matching word count vectors)
    println("Learned topics (as distributions over vocab of " + ldaModel.vocabSize + " words):")
    val topics = ldaModel.topicsMatrix
    for (topic <- Range(0, 3)) {
      print(topic + ":")
      for (word <- Range(0, ldaModel.vocabSize)) { print(" " + topics(word, topic)) }
      println()
    }

    // Top topics per document, written out as "docId topic:weight topic:weight ..."
    val dldaModel = ldaModel.asInstanceOf[DistributedLDAModel]
    val tmpLda = dldaModel.topTopicsPerDocument(3).map { f =>
      (f._1, f._2 zip f._3)
    }.map(f =>
      s"${f._1} ${f._2.map(k => k._1 + ":" + k._2).mkString(" ")}"
    )
    tmpLda.repartition(1).saveAsTextFile(modelPath2)

    // Save the model to disk
    ldaModel.save(sc, modelPath)
    // To reuse it later:
    // val sameModel = DistributedLDAModel.load(sc, modelPath)
    sc.stop()
  }
}
10 0:0.4314975441651938 1:0.23556758034173494 2:0.3329348754930712
4 0:0.4102948931589844 1:0.24776090803928308 2:0.34194419880173255
11 0:0.2097946758876284 1:0.45373753641180287 2:0.3364677877005687
0 0:0.2979553770395886 1:0.3739169154377782 2:0.3281277075226332
1 0:0.27280146347774675 1:0.3908486412393842 2:0.336349895282869
6 0:0.5316139195059199 1:0.20597059190339642 2:0.2624154885906837
7 0:0.424646102395855 1:0.23807706795712158 2:0.3372768296470235
8 0:0.23953838371693498 1:0.4115439191094815 2:0.3489176971735836
9 0:0.2748266604374283 1:0.41148754032514906 2:0.31368579923742274
3 0:0.5277762550221995 1:0.20882605277709107 2:0.2633976922007094
5 0:0.24464389209216816 1:0.4074778880433907 2:0.34787821986444123
2 0:0.2973287069168621 1:0.3780115877202354 2:0.3246597053629025
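Each line in the output above is a document id followed by topic:weight pairs, i.e. the doc-topic distribution written to modelPath2. The commented-out load call in the code already hints at reuse: a saved DistributedLDAModel can be loaded back and inspected without retraining. Below is a minimal sketch of that step; the cut-off of 10 terms per topic is an assumption chosen for illustration.

import org.apache.spark.mllib.clustering.DistributedLDAModel

// Reload the model persisted with ldaModel.save(sc, modelPath).
val sameModel = DistributedLDAModel.load(sc, modelPath)

// describeTopics returns, for every topic, the term indices and their weights
// sorted by weight; 10 is an arbitrary cut-off for this sketch.
sameModel.describeTopics(10).zipWithIndex.foreach { case ((termIndices, termWeights), topicId) =>
  println(s"topic $topicId: " +
    termIndices.zip(termWeights).map { case (t, w) => s"$t:$w" }.mkString(" "))
}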
// Loop over the number of iterations
for (i <- Array(5, 10, 20, 40, 60, 120, 200, 500)) {
  // Note: this uses the DataFrame-based API (org.apache.spark.ml.clustering.LDA)
  val lda = new LDA()
    .setK(3)
    .setTopicConcentration(3)
    .setDocConcentration(3)
    .setOptimizer("online")
    .setCheckpointInterval(10)
    .setMaxIter(i)
  val model = lda.fit(dataset_lpa)

  val ll = model.logLikelihood(dataset_lpa)
  val lp = model.logPerplexity(dataset_lpa)
  println(s"$i $ll")
  println(s"$i $lp")
}
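The tuning loops above and below call lda.fit(dataset_lpa), i.e. they use the DataFrame-based API (org.apache.spark.ml.clustering.LDA) rather than the RDD-based API of the first example, and dataset_lpa itself is not defined in the post. One possible way to prepare such a DataFrame from raw text is with Tokenizer and CountVectorizer; the file name, column names and vocabulary size below are assumptions for illustration only.

import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.feature.{CountVectorizer, Tokenizer}
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("LDA-tuning").master("local").getOrCreate()

// Assumed input: one document per line in a plain-text file.
val raw = spark.read.textFile("docs.txt").toDF("text")

// Split each document into tokens.
val tokenized = new Tokenizer().setInputCol("text").setOutputCol("words").transform(raw)

// Turn the token lists into term-count vectors in a "features" column,
// which is the default input column of ml.clustering.LDA.
val cvModel = new CountVectorizer()
  .setInputCol("words")
  .setOutputCol("features")
  .setVocabSize(10000) // assumed vocabulary cap
  .fit(tokenized)

val dataset_lpa = cvModel.transform(tokenized).cache()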
// EM method: analyze the effect of setDocConcentration; as a reference point, (50/k)+1 = 50/5+1 = 11
for (i <- Array(1.2, 3, 5, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)) {
  val lda = new LDA()
    .setK(5)
    .setTopicConcentration(1.1)
    .setDocConcentration(i)
    .setOptimizer("em")
    .setMaxIter(30)
  val model = lda.fit(dataset_lpa)

  val lp = model.logPerplexity(dataset_lpa)
  println(s"$i $lp")
}
1.1 2.602768469
1.2 2.551084142
1.5 2.523405179
2.0 2.524881353
5 2.575868552
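Since perplexity is the selection criterion in both sweeps, the scan can also be turned directly into picking the best value instead of reading the printed numbers by hand. A small sketch of that idea, rerunning the same EM sweep as above and keeping the docConcentration with the lowest perplexity (dataset_lpa as before):

// Collect (docConcentration, perplexity) pairs instead of only printing them,
// then keep the value that minimizes perplexity.
val results = for (i <- Array(1.2, 3, 5, 7, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20)) yield {
  val model = new LDA()
    .setK(5)
    .setTopicConcentration(1.1)
    .setDocConcentration(i)
    .setOptimizer("em")
    .setMaxIter(30)
    .fit(dataset_lpa)
  (i, model.logPerplexity(dataset_lpa))
}

val (bestAlpha, bestPerplexity) = results.minBy(_._2)
println(s"best docConcentration = $bestAlpha, perplexity = $bestPerplexity")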