可以自己看看是不是很高效。为了加快速度,尽量精简了算法。测试表明,精确度还可以。
由于没有实现完整的一套字典机制,而是普通的文本字典,所以就不提供完整源码下载了,贴出核心的源码。从版本完整度上来说只能算是0.6版。
另外,本分词系统使用的词库是ShootAnalyzer的词库。
使用方法:
参考以下代码
1
/// <summary>
/// Usage demo: segments a sample sentence with <c>Participle</c> directly (timing both
/// <c>TextSpliter</c> and <c>TextArray</c>), then runs the same text through
/// <c>YurowAnalyzer</c> and prints every token with its start/end offsets.
/// </summary>
[TestMethod]
public void TestMethod1()
{
    //
    // TODO: add test logic here
    //

    Participle p = new Participle();
    p.Init(@"D:\labs\xxxx");
    string txt = @"天下真的有神吗?我不是呀";

    Stopwatch st = Stopwatch.StartNew();
    string outstr = p.TextSpliter(txt);
    st.Stop();

    Stopwatch st2 = Stopwatch.StartNew();
    List<string> hs = p.TextArray(txt);
    st2.Stop();

    Console.WriteLine(outstr);
    // ElapsedMilliseconds is a whole number (long), so formatting it with "f2" always
    // printed ".00". Elapsed.TotalMilliseconds keeps the fractional part.
    Console.WriteLine(st.Elapsed.TotalMilliseconds.ToString("f2"));
    Console.WriteLine(st2.Elapsed.TotalMilliseconds.ToString("f2"));

    YurowAnalyzer.YurowAnalyzer y = new YurowAnalyzer.YurowAnalyzer(@"D:\labs\xxxx");
    TokenStream t = y.TokenStream(null, new StringReader(txt));
    try
    {
        // Next() returns null once the stream is exhausted.
        for (Token token = t.Next(); token != null; token = t.Next())
        {
            Console.WriteLine(token.TermText() + "\t" + token.StartOffset() + "\t" + token.EndOffset());
        }
    }
    finally
    {
        // Close the stream even if tokenization throws.
        t.Close();
    }
}
在Lucene.Net 索引或者搜索中直接使用YurowAnalyzer.YurowAnalyzer 分析器。
下载地址:
http://files.cnblogs.com/birdshover/YurowAnalyzer.rar
下面贴上些关键源码:
Participle类(分词类)
1
2
/// <summary>
/// Start index (into the input text) of each token produced by the most recent
/// <see cref="TextArray"/> call; element k belongs to token k of the returned list.
/// A new list is assigned on every call.
/// </summary>
public List<int> StartArr;

/// <summary>
/// Splits <paramref name="text"/> into tokens and records each token's start index in
/// <see cref="StartArr"/>.
/// </summary>
/// <remarks>
/// Rules, applied per position in this order:
/// a run of characters contained in <c>DataCatch.EnglishChar</c> becomes one token;
/// a run of characters contained in <c>DataCatch.Num</c> becomes one token;
/// a space is skipped;
/// a character followed by a space or by the end of the text becomes a one-character token;
/// if <c>DataCatch.GetDict()</c> has an entry for the current and next character, the token is
/// those two characters plus the longest stored suffix that matches the following text;
/// otherwise the current character becomes a one-character token.
/// </remarks>
/// <param name="text">Text to segment. Null or empty yields an empty result.</param>
/// <returns>The tokens in order of appearance.</returns>
public List<string> TextArray(string text)
{
    List<string> hs = new List<string>();
    StartArr = new List<int>();
    if (string.IsNullOrEmpty(text))
    {
        return hs;
    }

    // Index at which the currently open letter/digit run began; -1 means no run is open.
    // (0 cannot be the sentinel: a run may legitimately start at index 0.)
    int start = -1;
    for (int i = 0; i < text.Length; i++)
    {
        char nowchar = text[i];
        // '\0' stands in for "no next character" at the end of the text.
        char nextchar = (i == text.Length - 1) ? '\0' : text[i + 1];

        if (DataCatch.EnglishChar.Contains(nowchar))
        {
            if (start < 0)
            {
                start = i;
            }
            // The run ends here when the next character is not part of it; the loop's own
            // i++ is the only advance, so no character of the run is skipped.
            if (!DataCatch.EnglishChar.Contains(nextchar))
            {
                hs.Add(text.Substring(start, i - start + 1));
                StartArr.Add(start);
                start = -1;
            }
            continue;
        }

        if (DataCatch.Num.Contains(nowchar))
        {
            if (start < 0)
            {
                start = i;
            }
            if (!DataCatch.Num.Contains(nextchar))
            {
                hs.Add(text.Substring(start, i - start + 1));
                StartArr.Add(start);
                start = -1;
            }
            continue;
        }

        if (nowchar == ' ')
        {
            continue;
        }

        if (nextchar == ' ' || nextchar == '\0')
        {
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
            // Skip the following space (or step past the end of the text).
            i++;
            continue;
        }

        var dict = DataCatch.GetDict();
        if (dict.ContainsKey(nowchar) && dict[nowchar].ContainsKey(nextchar))
        {
            // Suffixes of the dictionary words that start with nowchar + nextchar.
            HashSet<string> list = dict[nowchar][nextchar];

            // Find the longest suffix length for which the text after the two-character
            // head is itself a stored suffix.
            int maxnum = 0;
            string outstr = string.Empty;
            foreach (string item in list)
            {
                bool notLonger = item.Length <= maxnum;
                bool doesNotFit = text.Length - i <= item.Length + 1;
                if (notLonger || doesNotFit)
                {
                    continue;
                }

                string temp = text.Substring(i + 2, item.Length);
                if (list.Contains(temp))
                {
                    maxnum = item.Length;
                    outstr = temp;
                }
            }

            // With no matching suffix, outstr is empty and maxnum is 0, so this emits just
            // the two-character head and advances past it.
            hs.Add(nowchar.ToString() + nextchar.ToString() + outstr);
            StartArr.Add(i);
            i = i + maxnum + 1;
        }
        else
        {
            hs.Add(nowchar.ToString());
            StartArr.Add(i);
        }
    }
    return hs;
}
DefaultDict类(加载分词具体实现)
// In-memory dictionary: first character -> second character -> set of the remaining
// characters ("suffixes") of every word that starts with those two characters.
private Dictionary<char, Dictionary<char, HashSet<string>>> dictMemory = new Dictionary<char, Dictionary<char, HashSet<string>>>(DataCatch.InitPage);

/// <summary>
/// Reads the text dictionary at <c>dictSourcePath</c> line by line and loads it into
/// <c>dictMemory</c>.
/// </summary>
/// <remarks>
/// Lines shorter than two characters are ignored. For every other line the first two
/// characters select (creating it if needed) the suffix set, and the rest of the line, when
/// there is any, is added to that set. A line of exactly two characters therefore creates an
/// entry with no suffix added.
/// </remarks>
protected virtual void DoFormat()
{
    // Dispose the stream and reader even when reading fails, so the file handle is released.
    using (Stream stream = new FileStream(dictSourcePath, FileMode.Open, FileAccess.Read, FileShare.Read))
    using (StreamReader sr = new StreamReader(stream, Encoding.Default))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            if (line.Length < 2)
            {
                continue;
            }

            char charfirst = line[0];
            char charseconde = line[1];

            Dictionary<char, HashSet<string>> bySecond;
            if (!dictMemory.TryGetValue(charfirst, out bySecond))
            {
                bySecond = new Dictionary<char, HashSet<string>>();
                dictMemory.Add(charfirst, bySecond);
            }

            HashSet<string> suffixes;
            if (!bySecond.TryGetValue(charseconde, out suffixes))
            {
                suffixes = new HashSet<string>();
                bySecond.Add(charseconde, suffixes);
            }

            if (line.Length > 2)
            {
                // HashSet.Add ignores duplicates, so no Contains check is needed first.
                suffixes.Add(line.Substring(2));
            }
        }
    }
}
转换到Lucene接口
1
/// <summary>
/// Adapts <c>Participle</c> to the Lucene.Net <c>Tokenizer</c> interface: the whole input is
/// read up front, segmented on the first <see cref="Next"/> call, and then handed out one
/// token per call.
/// </summary>
public class YurowTokenizer : Tokenizer
{
    // Guards creation of, and calls into, the shared segmenter. Participle.TextArray
    // publishes its offsets through the instance field StartArr, so a call and the read of
    // its offsets must not interleave with another tokenizer's call.
    private static readonly object segmenterLock = new object();

    // One segmenter shared by all tokenizers; it is initialised from the path given to the
    // first tokenizer constructed. Later paths are stored but not used to re-initialise it.
    private static Participle p;

    private string text;
    private List<string> list;
    // Start offsets for this tokenizer's tokens (snapshot of p.StartArr; see Next()).
    private List<int> starts;
    private int current = 0;
    private string path;
    private bool isfirstrun = true;

    /// <summary>
    /// Reads all of <paramref name="textreader"/> and makes sure the shared segmenter exists.
    /// </summary>
    /// <param name="textreader">Source of the text to tokenize; read to the end here.</param>
    /// <param name="path">Dictionary location passed to <c>Participle.Init</c> when the shared
    /// segmenter has not been created yet.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="textreader"/> is null.</exception>
    public YurowTokenizer(TextReader textreader, string path)
    {
        if (textreader == null)
        {
            throw new System.ArgumentNullException("textreader");
        }

        // NOTE(review): the reader is consumed here but not handed to the base Tokenizer;
        // confirm whether callers rely on Close() to close it.
        text = textreader.ReadToEnd();
        this.path = path;

        lock (segmenterLock)
        {
            if (p == null)
            {
                // Publish the instance only after Init succeeded, so a failed Init is
                // retried by the next tokenizer instead of leaving a half-built segmenter.
                Participle created = new Participle();
                created.Init(path);
                p = created;
            }
        }
    }

    /// <summary>
    /// Returns the next token, or null when the input was empty or all tokens were returned.
    /// </summary>
    /// <returns>A token whose start offset is the token's index in the input and whose end
    /// offset is start plus the token length; null at the end.</returns>
    public override Token Next()
    {
        if (string.IsNullOrEmpty(text))
        {
            return null;
        }

        if (isfirstrun)
        {
            lock (segmenterLock)
            {
                list = p.TextArray(text);
                // TextArray assigns a fresh list to StartArr on every call; keep our own
                // reference so another tokenizer's later call cannot replace the offsets
                // this instance is still iterating over.
                starts = p.StartArr;
            }
            isfirstrun = false;
        }

        if (current >= list.Count)
        {
            return null;
        }

        int start = starts[current];
        string currentstr = list[current];
        current++;
        return new Token(currentstr, start, start + currentstr.Length);
    }
}
有兴趣的朋友可以自己反编译查看源码。暂时不提供完整源码。
http://www.cnblogs.com/birdshover/ by yurow