首先介绍一下这个开源分词软件
以下摘自原作者的话:
Ansj中文分词是一款纯Java的、主要应用于自然语言处理的、高精度的中文分词工具,目标是“准确、高效、自由地进行中文分词”,可用于人名识别、地名识别、组织机构名识别、多级词性标注、关键词提取、指纹提取等领域,支持行业词典、用户自定义词典。
中文分词实例
(1)导入maven
1
2
3
4
5
6
|
<dependency>
    <groupId>org.ansj</groupId>
    <artifactId>ansj_seg</artifactId>
    <version>0.9</version>
    <classifier>sources</classifier>
</dependency>
|
(2)读取停用词表
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
package
com.blogchong.mite.util;
import
java.io.BufferedReader;
import
java.io.File;
import
java.io.FileInputStream;
import
java.io.InputStreamReader;
import
java.util.ArrayList;
import
java.util.HashMap;
import
java.util.List;
import
java.util.Map;
import
com.blogchong.mite.util.MacroDef;
/**
 * @author:blogchong
 * @E-mail: [email protected]
 * @Version:1.0
 * @blog www.blogchong.com
 * @CreateTime:2014年12月16日 下午3:12:26
 * @Description: Loads the system stop-word lists (Chinese and English).
 */
public class GetStopWordList {

    /**
     * Loads the default stop-word lists from {@code chinese_stopword.txt}
     * and {@code english_stopword.txt} in the working directory.
     *
     * @return map keyed by {@code MacroDef.STOP_CHINESE} / {@code MacroDef.STOP_ENGLISH},
     *         each value being the list of stop words read from the file
     * @throws Exception if either file cannot be read
     */
    @SuppressWarnings("rawtypes")
    public Map<String, List> getStopWordList() throws Exception {
        // Delegate to the parameterized overload instead of duplicating its body.
        return getStopWordList("chinese_stopword.txt", "english_stopword.txt");
    }

    /**
     * Loads stop-word lists from the given file paths.
     *
     * @param path_chinese path of the Chinese stop-word file
     * @param path_english path of the English stop-word file
     * @return map keyed by {@code MacroDef.STOP_CHINESE} / {@code MacroDef.STOP_ENGLISH}
     * @throws Exception if either file cannot be read
     */
    @SuppressWarnings("rawtypes")
    public Map<String, List> getStopWordList(String path_chinese,
            String path_english) throws Exception {
        Map<String, List> map = new HashMap<String, List>();
        // Call readStopWord on this instance; the original allocated a fresh
        // GetStopWordList for no reason.
        map.put(MacroDef.STOP_CHINESE, readStopWord(path_chinese));
        map.put(MacroDef.STOP_ENGLISH, readStopWord(path_english));
        return map;
    }

    /**
     * Reads one stop word per line from the given file, trimming whitespace.
     * The file is decoded with {@code MacroDef.ENCODING}.
     *
     * @param path stop-word file path
     * @return list of trimmed lines (blank lines are kept as empty strings,
     *         matching the original behavior)
     * @throws Exception if the file cannot be opened or read
     */
    public List<String> readStopWord(String path) throws Exception {
        List<String> list = new ArrayList<String>();
        BufferedReader bf = null;
        try {
            bf = new BufferedReader(new InputStreamReader(
                    new FileInputStream(new File(path)), MacroDef.ENCODING));
            String stopword;
            while ((stopword = bf.readLine()) != null) {
                list.add(stopword.trim());
            }
        } finally {
            // Close the reader on every path; the original leaked it and
            // suppressed the "resource" warning instead of fixing the leak.
            if (bf != null) {
                bf.close();
            }
        }
        return list;
    }

    /** Demo entry point: prints the default Chinese stop-word list. */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public static void main(String[] args) throws Exception {
        GetStopWordList getStopWordList = new GetStopWordList();
        Map<String, List> map = getStopWordList.getStopWordList();
        List<String> list = map.get(MacroDef.STOP_CHINESE);
        for (String str : list) {
            System.out.println(str);
        }
    }
}
|
简单的读取两个停用词表,一个是中文停用词字典,另一个是英文停用词字典。
(3)中文分词
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
package
com.blogchong.mite.participle;
import
java.util.ArrayList;
import
java.util.List;
import
java.util.Map;
import
org.ansj.domain.Term;
import
org.ansj.splitWord.analysis.ToAnalysis;
import
com.blogchong.mite.util.GetStopWordList;
import
com.blogchong.mite.util.MacroDef;
/**
 * @author:blogchong
 * @E-mail: [email protected]
 * @Version:1.0
 * @blog www.blogchong.com
 * @CreateTime:2014年12月16日 下午3:44:19
 * @Description: Tokenizes text with ansj and filters out stop words.
 */
public class ExtractWord {

    /**
     * Segments {@code article} with ansj's {@code ToAnalysis} and drops any
     * token that is empty or present in either stop-word list.
     *
     * @param article text to segment
     * @param map stop-word lists keyed by {@code MacroDef.STOP_CHINESE} and
     *            {@code MacroDef.STOP_ENGLISH} (as produced by GetStopWordList)
     * @return segmented tokens with stop words removed, in original order
     * @throws Exception propagated from the segmenter
     */
    @SuppressWarnings({ "unchecked", "rawtypes" })
    public List<String> extracWord(String article, Map<String, List> map)
            throws Exception {
        List<String> list = new ArrayList<String>();
        List<String> list_c = map.get(MacroDef.STOP_CHINESE);
        List<String> list_e = map.get(MacroDef.STOP_ENGLISH);
        List<Term> parse = ToAnalysis.parse(article);
        for (Term term : parse) {
            String str = term.getName().trim();
            // BUG FIX: the original tested `str == ""`, a reference comparison
            // that is almost never true for a freshly trimmed String; use
            // isEmpty() to compare content. List.contains uses equals(), the
            // same test as the original hand-written scan, but short-circuits
            // on the first match.
            if (!str.isEmpty() && !list_c.contains(str) && !list_e.contains(str)) {
                list.add(str);
            }
        }
        return list;
    }

    /** Demo entry point: segments a sample sentence using the default stop lists. */
    @SuppressWarnings("rawtypes")
    public static void main(String[] args) throws Exception {
        ExtractWord extractWord = new ExtractWord();
        GetStopWordList getStopWordList = new GetStopWordList();
        // Use the default stop-word files.
        Map<String, List> map = getStopWordList.getStopWordList();
        List<String> list = extractWord
                .extracWord(
                        "Maven是基于项目对象模型(POM),可以通过一小段描述信息来管理项目的构建,报告和文档的软件项目管理工具。Maven 除了以程序构建能力为特色之外,还提供高级项目管理工具。",
                        map);
        for (String str : list) {
            System.out.println(str);
        }
    }
}
|
其实用起来很简单,ansj定义了不同级别的分词,包括过滤程度,分词粒度,是否识别人名等,基本够用了。
在分词的同时,通过停用词表进行停用词过滤。