这里提供我开发的中文分词器源代码。代码分为三个部分:
(一)状态矩阵元素对象GNode。 动态规划中,状态矩阵的元素需要记录当前最大概率和前一个最优匹配状态,这个矩阵是一个二维数组,每一个元素就是一个GNode对象。
(二)统计学习使用的嵌套哈希表THash。 用于统计学习时记录结果的HashMap。这个哈希表有三层嵌套,存入数据时需要做一些必要的操作;为了简单起见,我将这个哈希表进行了封装,并添加了几个方法以方便程序的编写。
(三)分词器对象CGSegmenter。 其中封装了学习方法,标注器行为等,是分词器的主要功能对象。
注:本文只分享代码,不对代码进行分析解释,需要了解原理和代码分析,请参考博文《【中文分词】亲手开发一款中文分词器——原理》
源代码如下:
GNode
/**
 * One cell of the dynamic-programming state matrix used by the segmenter.
 *
 * <p>Each cell stores the best probability found so far for reaching it,
 * the tag the cell itself stands for, and the tag of the predecessor cell
 * that produced that best probability.
 */
public class GNode {
    /** Best probability found so far for this cell; a new cell starts at 0.0. */
    public Double MaxPos = 0.0;

    /** Tag represented by this cell. */
    public char CurTag;

    /** Tag of the predecessor cell on the best path into this cell. */
    public char PreTag;

    /** Creates a cell whose probability is 0.0 and whose tags are unset. */
    public GNode() {
    }
}
THash
/**
 * Nested counting table used while learning transition statistics.
 *
 * <p>The structure is: character -> tag -> next character -> next tag -> count.
 * An entry records how often a character carrying a given tag was followed by
 * another character carrying a given tag. The counts are turned into relative
 * frequencies by {@link #calculatePossibilityForAllCombinations(String, String)}.
 */
public class THash {
    // character -> tag -> next character -> next tag -> observation count
    private HashMap<Character, HashMap<Character, HashMap<Character, HashMap<Character, Double>>>> _myHashMap;

    /** Count stored the first time a combination is seen. */
    private static final double INITIAL_VALUE = 1.0;

    /** Factor applied to each per-character total before dividing, so ratios stay below 1. */
    private static final double POSSIBILITY_INTERPOLATE_VALUE = 1.02;

    /** Creates an empty table. */
    public THash() {
        this._myHashMap = new HashMap<>();
    }

    /**
     * Records one observation of (pri_key, pri_tag) followed by (sec_key, sec_tag).
     *
     * <p>The first observation stores the initial count; every later one adds 1.
     * The new count is written back into the map explicitly, because
     * incrementing a boxed {@code Double} local does not change the stored value.
     *
     * @param pri_key the preceding character
     * @param pri_tag the tag of the preceding character
     * @param sec_key the following character
     * @param sec_tag the tag of the following character
     */
    public void PutValue(char pri_key, char pri_tag, char sec_key, char sec_tag) {
        HashMap<Character, HashMap<Character, HashMap<Character, Double>>> prihash =
                this._myHashMap.get(pri_key);
        if (prihash == null) {
            prihash = new HashMap<>();
            this._myHashMap.put(pri_key, prihash);
        }

        HashMap<Character, HashMap<Character, Double>> seccharhash = prihash.get(pri_tag);
        if (seccharhash == null) {
            seccharhash = new HashMap<>();
            prihash.put(pri_tag, seccharhash);
        }

        HashMap<Character, Double> sectaghash = seccharhash.get(sec_key);
        if (sectaghash == null) {
            sectaghash = new HashMap<>();
            seccharhash.put(sec_key, sectaghash);
        }

        Double count = sectaghash.get(sec_tag);
        sectaghash.put(sec_tag, count == null ? INITIAL_VALUE : count + 1.0);
    }

    /**
     * Writes one line per recorded combination to {@code path}.
     *
     * <p>Each line consists of the four key characters (character, tag, next
     * character, next tag) immediately followed by the combination's count
     * divided by the interpolated total of all tags recorded for the same
     * (character, tag, next character) triple. The number is written in plain
     * decimal notation with 16 fractional digits (never scientific notation),
     * so consumers that read digits at fixed offsets keep working.
     *
     * <p>The file is created or truncated. I/O failures are reported on
     * {@code System.err} and otherwise ignored.
     *
     * @param path   output file path
     * @param format character set name used for the output file
     */
    public void calculatePossibilityForAllCombinations(String path, String format) {
        // PrintWriter creates the file itself; try-with-resources closes it on every path.
        try (PrintWriter writer = new PrintWriter(path, format)) {
            for (Character pri_key : this._myHashMap.keySet()) {
                HashMap<Character, HashMap<Character, HashMap<Character, Double>>> _pritaghash =
                        this._myHashMap.get(pri_key);
                for (Character pri_tag : _pritaghash.keySet()) {
                    HashMap<Character, HashMap<Character, Double>> _sechash = _pritaghash.get(pri_tag);
                    for (Character sec_key : _sechash.keySet()) {
                        HashMap<Character, Double> _sectaghash = _sechash.get(sec_key);

                        double total = 0.0;
                        for (Double count : _sectaghash.values()) {
                            total += count;
                        }
                        total *= POSSIBILITY_INTERPOLATE_VALUE;

                        for (Character sec_tag : _sectaghash.keySet()) {
                            double ratio = _sectaghash.get(sec_tag) / total;
                            StringBuilder sb = new StringBuilder();
                            sb.append(pri_key);
                            sb.append(pri_tag);
                            sb.append(sec_key);
                            sb.append(sec_tag);
                            sb.append(String.format(java.util.Locale.ROOT, "%.16f", ratio));
                            writer.println(sb.toString());
                        }
                    }
                }
            }
        } catch (IOException e) {
            System.err.println("Error in method calculatePossibilityForAllCombinations() : "
                    + e.getMessage());
        }
    }
}
CGSegmenter
public class CGSegmenter {
private String TrainingMaterialPath;
private String TaggedTrainingMaterialPath;
private String FinalTagFilePathForRelation;
private String FinalTagFilePathForSingle;
private String DefaultFileFormat = "UTF-8";
private THash _thash;
private HashMap _charHash;
private HashMap> _tagHashForRelation;
private HashMap _tagHashForSingle;
private Double StrangeCombinationDefaultPossibility = 0.00035;
private Double StrangeSingleDefaultPossibility = 0.0005;
private List resultlist;
public void setDefaultStrangeCombinationPossibility(Double possibility){
this.StrangeCombinationDefaultPossibility = possibility;
}
public Double getStrangeCombinationDefaultPossibility() {
return StrangeCombinationDefaultPossibility;
}
public void setStrangeSingleDefaultPossibility(Double strangeSingleDefaultPossibility) {
StrangeSingleDefaultPossibility = strangeSingleDefaultPossibility;
}
public Double getStrangeSingleDefaultPossibility() {
return StrangeSingleDefaultPossibility;
}
public CGSegmenter(boolean NeedInitialiseFinalTagFile) {
this.FinalTagFilePathForRelation = "tf/tag-re.txt";
this.FinalTagFilePathForSingle = "tf/tag-si.txt";
this.resultlist = new LinkedList();
if (NeedInitialiseFinalTagFile) {
this.TrainingMaterialPath = "tm/msr-in.txt";
this.TaggedTrainingMaterialPath = "tm/msr-out.txt";
this.processTrainingMaterial();
this.statisticTaggedTrainingMaterial();
}
this.initialiseTagHashMap();
}
// 预处理中文语料库
protected void processTrainingMaterial() {
File f = new File(this.TrainingMaterialPath);
if (!f.exists()) {
System.err.println("未找到中文语料库文件: "
+ this.TrainingMaterialPath);
} else {
try {
if (!f.exists()) {
f.createNewFile();
}
FileInputStream fis = new FileInputStream(f);
InputStreamReader re = new InputStreamReader(fis,
this.DefaultFileFormat);
BufferedReader reader = new BufferedReader(re);
String temp;
PrintWriter writer = new PrintWriter(
this.TaggedTrainingMaterialPath, this.DefaultFileFormat);
System.out.println(new Date().toString() + " 开始预处理中文语料库。");
while ((temp = reader.readLine()) != null) {
char[] chararr = temp.toCharArray();
StringBuilder sb = new StringBuilder();
int i = 0;
if (chararr[i] == 65279)
i = 1;
int j = i;
while (j <= chararr.length - 1) {
while (j < chararr.length - 1 && chararr[j] == ' ')
j++;
i = j;
while (j < chararr.length - 1 && chararr[j] != 32)
j++;
if (j - i == 1) {
sb.append(chararr[i] + "S");
} else if (j - i == 2) {
sb.append(chararr[i] + "B");
sb.append(chararr[j - 1] + "E");
} else if (j - i > 2) {
sb.append(chararr[i++] + "B");
while (i != j - 1) {
sb.append(chararr[i++] + "M");
}
sb.append(chararr[i] + "E");
}
if (j >= chararr.length - 1)
break;
}
writer.println(sb.toString());
}
System.out.println(new Date().toString() + " 完成中文语料库预处理。");
writer.close();
fis.close();
re.close();
reader.close();
} catch (IOException e) {
System.err
.println("Error in method processTrainingMaterial() : "
+ e.getMessage());
}
}
}
// 统计学习
protected void statisticTaggedTrainingMaterial() {
this.learningSingleTag(); //状态独立概率P(x)
this.learningRelationTag(); //状态转移概率P(X | Y)
}
// 学习状态独立概率
protected void learningSingleTag() {
File f = new File(this.TaggedTrainingMaterialPath);
if (!f.exists()) {
System.err.println("未找到训练语料文件"
+ this.TaggedTrainingMaterialPath);
} else {
try {
this._charHash = new HashMap<>();
Double total = 0.0;
FileInputStream fis = new FileInputStream(f);
InputStreamReader re = new InputStreamReader(fis,
this.DefaultFileFormat);
BufferedReader reader = new BufferedReader(re);
String temp = null;
System.out.println(new Date().toString() + " 开始学习独立概率。");
while ((temp = reader.readLine()) != null) {
char[] chararr = temp.toCharArray();
if (chararr.length == 0)
continue;
int i = 0;
if ((int) chararr[i] == 65279)
i++;
while (i < chararr.length) {
StringBuilder charsb = new StringBuilder();
charsb.append(chararr[i]);
charsb.append(chararr[i + 1]);
if (this._charHash.containsKey(charsb.toString())) {
Double _t = this._charHash.get(charsb.toString());
_t = _t + 1.0;
this._charHash.put(charsb.toString(), _t);
} else
this._charHash.put(charsb.toString(), 1.0);
total += 1.0;
i += 2;
}
}
File _f = new File(this.FinalTagFilePathForSingle);
if (!_f.exists())
_f.createNewFile();
PrintWriter writer_char = new PrintWriter(
this.FinalTagFilePathForSingle, this.DefaultFileFormat);
for (String key : this._charHash.keySet()) {
writer_char.print(key);
writer_char.println(this._charHash.get(key) / total);
}
System.out
.println(new Date().toString() + " 完成独立概率学习。");
writer_char.close();
fis.close();
re.close();
reader.close();
} catch (IOException e) {
System.err.println("Error in method learningSingleTag()");
}
}
}
// 学习状态转移概率
protected void learningRelationTag() {
File f = new File(this.TaggedTrainingMaterialPath);
if (!f.exists()) {
System.err.println("未找到训练语料库"
+ this.TaggedTrainingMaterialPath);
} else {
try {
this._thash = new THash();
FileInputStream fis = new FileInputStream(f);
InputStreamReader re = new InputStreamReader(fis,
this.DefaultFileFormat);
BufferedReader reader = new BufferedReader(re);
String temp = null;
System.out.println(new Date().toString()
+ " 开始学习状态转移概率。");
while ((temp = reader.readLine()) != null) {
char[] chararr = temp.toCharArray();
int i = 0;
while (i < chararr.length) {
if (i == 0)
this._thash.PutValue('~', '~', chararr[i],
chararr[i + 1]);
else
this._thash.PutValue(chararr[i - 2],
chararr[i - 1], chararr[i], chararr[i + 1]);
i += 2;
}
}
fis.close();
re.close();
reader.close();
this._thash.calculatePossibilityForAllCombinations(
this.FinalTagFilePathForRelation,
this.DefaultFileFormat);
System.out.println(new Date().toString()
+ " 完成状态转移概率学习。");
System.out.println(new Date().toString() + " 训练语料库学习完毕。");
} catch (IOException e) {
System.err.println("Error in method learningRelationTag()");
}
}
}
// 初始化标注器表
protected void initialiseTagHashMap() {
File f = new File(this.FinalTagFilePathForRelation);
if (!f.exists()) {
System.out.println("未找到标注器初始化文件"
+ this.FinalTagFilePathForRelation);
} else {
try {
this._tagHashForRelation = new HashMap<>();
FileInputStream fis = new FileInputStream(f);
InputStreamReader re = new InputStreamReader(fis,
this.DefaultFileFormat);
BufferedReader reader = new BufferedReader(re);
String temp = null;
while ((temp = reader.readLine()) != null) {
char[] chararr = temp.toCharArray();
StringBuilder pri_key_sb = new StringBuilder();
StringBuilder sec_key_sb = new StringBuilder();
pri_key_sb.append(chararr[0]);
pri_key_sb.append(chararr[1]);
sec_key_sb.append(chararr[2]);
sec_key_sb.append(chararr[3]);
int j = 6;
char[] pos_chararr = new char[7];
for (int n = 0; n < 7; n++, j++) {
pos_chararr[n] = chararr[j];
}
Double pos = 0.1 * this.convertStringtoDouble(pos_chararr,
0);
HashMap _hash;
if (this._tagHashForRelation.containsKey(pri_key_sb
.toString())) {
_hash = this._tagHashForRelation.get(pri_key_sb
.toString());
_hash.put(sec_key_sb.toString(), pos);
} else {
_hash = new HashMap<>();
_hash.put(sec_key_sb.toString(), pos);
this._tagHashForRelation.put(pri_key_sb.toString(),
_hash);
}
}
fis.close();
re.close();
reader.close();
} catch (IOException e) {
System.out.println("Error in method initialise -> relation");
}
}
f = new File(this.FinalTagFilePathForSingle);
if (!f.exists()) {
System.out.println("为找到标注器初始化文件"
+ this.FinalTagFilePathForSingle);
} else {
try {
this._tagHashForSingle = new HashMap<>();
FileInputStream fis = new FileInputStream(f);
InputStreamReader re = new InputStreamReader(fis,
this.DefaultFileFormat);
BufferedReader reader = new BufferedReader(re);
String temp = null;
while ((temp = reader.readLine()) != null) {
char[] chararr = temp.toCharArray();
StringBuilder sb = new StringBuilder();
sb.append(chararr[0]);
sb.append(chararr[1]);
char[] pos_chararr = new char[5];
for (int j = 0, i = 4; j < 5; j++, i++) {
pos_chararr[j] = chararr[i];
}
Double pos = 0.1 * this.convertStringtoDouble(pos_chararr,
0);
pos = pos + chararr[2] - 48;
if (chararr[chararr.length - 2] == '-') {
int n = chararr[chararr.length - 1] - 48;
for (int i = 0; i < n; i++) {
pos *= 0.1;
}
}
this._tagHashForSingle.put(sb.toString(), pos);
}
fis.close();
re.close();
reader.close();
} catch (IOException e) {
System.err.println("Error in method initialise -> single");
}
}
}
protected double convertStringtoDouble(char[] chararr, int position) {
if (position == chararr.length - 1)
return chararr[0] - 48;
else
return chararr[position] - 48 + 0.1
* convertStringtoDouble(chararr, position + 1);
}
// 拆分句子,分离标点,符号,数字和英文字符等
protected List segmentSentence(char[] sentence) {
int i = 0;
int j = i;
try {
while (j < sentence.length) {
if (sentence[j] == '。' || sentence[j] == ','
|| sentence[j] == '?' || sentence[j] == ':'
|| sentence[j] == '!' || sentence[j] == ' '
|| sentence[j] == '(' || sentence[j] == ')') {
if (i < j)
this.segmentWords(sentence, i, j - 1);
i = ++j;
continue;
}
if ((sentence[j] >= 65 && sentence[j] <= 90)
|| (sentence[j] >= 97 && sentence[j] <= 122)) {
if (i != j)
this.segmentWords(sentence, i, j - 1);
i = j;
while (j < sentence.length
&& ((sentence[j] >= 65 && sentence[j] <= 90) || (sentence[j] >= 97 && sentence[j] <= 122))) {
j++;
}
StringBuilder sb = new StringBuilder();
while (i < j) {
sb.append(sentence[i]);
i++;
}
this.resultlist.add(sb.toString());
i = j;
continue;
}
if (sentence[j] < 127) {
if (i < j) {
segmentWords(sentence, i, j - 1);
i = j;
}
while (j < sentence.length && sentence[j] < 127) {
j++;
}
StringBuilder sb = new StringBuilder();
while (i < j) {
sb.append(sentence[i]);
i++;
}
this.resultlist.add(sb.toString());
i = j;
continue;
}
j++;
}
if (i < j)
this.segmentWords(sentence, i, j - 1);
return resultlist;
} catch (OutOfMemoryError | ArrayIndexOutOfBoundsException e) {
System.out.println(sentence.length);
System.out.println(sentence[i]);
System.out.println(sentence[j]);
return this.resultlist;
}
}
// 分词操作
protected void segmentWords(char[] sentence, int start, int end) {
int length = end - start + 1;
GNode[][] graph = new GNode[length][4];
for (int i = 0; i < length; i++) {
for (int j = 0; j < 4; j++) {
graph[i][j] = new GNode();
}
}
// 初始化状态矩阵
for (int j = 0; j < 4; j++) {
graph[0][j].CurTag = getTag(j);
if (j == 0 || j == 3) {
StringBuilder sb = new StringBuilder();
sb.append(sentence[start]);
sb.append(graph[0][j].CurTag);
graph[0][j].MaxPos = this.getPossiblity("~~", sb.toString());
} else
graph[0][j].MaxPos = 0.0;
}
// 动态规划过程
for (int i = 1; i < length; i++) {
for (int j = 0; j < 4; j++) {
graph[i][j].CurTag = this.getTag(j);
StringBuilder sec_key_sb = new StringBuilder();
sec_key_sb.append(sentence[i + start]);
sec_key_sb.append(graph[i][j].CurTag);
for (int n = 0; n < 4; n++) {
if (!this.checkLogicalCombination(graph[i - 1][n].CurTag,
graph[i][j].CurTag))
continue;
StringBuilder pri_key_sb = new StringBuilder();
pri_key_sb.append(sentence[i + start - 1]);
pri_key_sb.append(graph[i - 1][n].CurTag);
Double _pos = this.getPossiblity(pri_key_sb.toString(),
sec_key_sb.toString());
if (this._tagHashForSingle.containsKey(pri_key_sb
.toString()))
_pos *= this._tagHashForSingle.get(pri_key_sb
.toString());
else
_pos *= this.StrangeSingleDefaultPossibility;
_pos *= graph[i - 1][n].MaxPos;
if (_pos >= graph[i][j].MaxPos) {
graph[i][j].MaxPos = _pos;
graph[i][j].PreTag = graph[i - 1][n].CurTag;
}
}
}
}
// 筛选最优解
int m = 0;
Double _maxpos = 0.0;
for (int j = 0; j < 4; j++) {
if (graph[length - 1][j].MaxPos >= _maxpos) {
_maxpos = graph[length - 1][j].MaxPos;
m = j;
}
}
char[] chararr = new char[length * 2];
for (int i = end - start, j = chararr.length - 1, n = end; i >= 0
&& j > 0; i--, j -= 2, n--) {
chararr[j] = graph[i][m].CurTag;
chararr[j - 1] = sentence[n];
m = this.getInt(graph[i][m].PreTag);
}
StringBuilder sb = new StringBuilder();
for (int i = 0; i < chararr.length; i += 2) {
sb.append(chararr[i]);
if (chararr[i + 1] == 'E' || chararr[i + 1] == 'S') {
this.resultlist.add(sb.toString());
sb = new StringBuilder();
} else if (i == chararr.length - 2)
this.resultlist.add(sb.toString());
}
}
protected char getTag(int value) {
switch (value) {
case 0:
return 'B';
case 1:
return 'M';
case 2:
return 'E';
case 3:
return 'S';
default:
return 'S';
}
}
protected int getInt(char tag) {
switch (tag) {
case 'B':
return 0;
case 'M':
return 1;
case 'E':
return 2;
case 'S':
return 3;
default:
return 3;
}
}
//筛选可能搭配
protected boolean checkLogicalCombination(char i, char j) {
if ((i == 'B' || i == 'M') && (j == 'M' || j == 'E'))
return true;
if ((i == 'E' || i == 'S') && (j == 'B' || j == 'S'))
return true;
else
return false;
}
protected Double getPossiblity(String pri_key, String sec_key) {
if (this._tagHashForRelation.containsKey(pri_key)) {
HashMap _hash = this._tagHashForRelation
.get(pri_key);
if (_hash.containsKey(sec_key))
return _hash.get(sec_key);
}
return this.StrangeCombinationDefaultPossibility;
}
}