参考文章 https://github.com/NLP-LOVE/Introduction-NLP/blob/master/chapter/2.词典分词.md
废话不多说了,Go 代码如下:
package level
import (
"bufio"
"fmt"
"io"
"os"
"strings"
)
// PLN holds the dictionary consulted by the segmentation methods
// (FullSegment, ForwardSegment, BackwardSegment, BidirectionalSegment).
type PLN struct {
// Dict is keyed by the text before the first tab of each dictionary line;
// the value is the dictionary line that key was read from.
Dict map[string]string
}
// NewPLN builds a segmenter whose dictionary is loaded from dictFile.
//
// The result is never nil and its Dict is always a usable (non-nil) map.
// When the file cannot be loaded, loadDict prints the failure and the
// segmenter is returned with an empty dictionary, so callers can detect the
// problem with len(p.Dict) == 0.
func NewPLN(dictFile string) *PLN {
	p := &PLN{Dict: make(map[string]string)}
	// The signature has no error result to hand the failure back through;
	// loadDict prints its own diagnostics, so the error is dropped on purpose.
	_ = p.loadDict(dictFile)
	return p
}
// loadDict reads the tab-separated dictionary at file and stores it in p.Dict.
//
// Each non-empty line must contain at least one tab. The text before the
// first tab becomes the key and the whole line, without its trailing line
// terminator, becomes the value. Blank lines are skipped; a non-empty line
// without a tab is reported and skipped.
//
// On an open or read failure the error is printed and returned, and p.Dict is
// left untouched. The message is printed here as well as returned because the
// constructor in this file has no way to pass the error on to its caller.
func (p *PLN) loadDict(file string) error {
	f, err := os.Open(file)
	if err != nil {
		fmt.Println("load dict error: ", err)
		return err
	}
	defer f.Close()

	dic := make(map[string]string)
	buf := bufio.NewReader(f)
	for {
		line, err := buf.ReadString('\n')
		if err != nil && err != io.EOF {
			fmt.Println("read dict file fail", err)
			return err
		}

		// ReadString keeps the delimiter; drop it (and a CR from CRLF files)
		// so stored values do not carry a line terminator.
		entry := strings.TrimRight(line, "\r\n")
		if entry != "" {
			if tab := strings.Index(entry, "\t"); tab >= 0 {
				dic[entry[:tab]] = entry
			} else {
				fmt.Println("dict line split fail", entry)
			}
		}

		// At io.EOF the final (possibly unterminated) line has just been
		// handled above, so the loop can stop.
		if err == io.EOF {
			break
		}
	}
	p.Dict = dic
	return nil
}
// FullSegment returns every substring of text that is a key of p.Dict.
//
// Substrings are enumerated rune-wise, ordered by start position and then by
// increasing length. The result is nil when nothing matches.
func (p *PLN) FullSegment(text string) []string {
	runes := []rune(text)
	var words []string
	for start := range runes {
		for end := start + 1; end <= len(runes); end++ {
			candidate := string(runes[start:end])
			if _, found := p.Dict[candidate]; !found {
				continue
			}
			words = append(words, candidate)
		}
	}
	return words
}
// ForwardSegment splits text by forward longest matching: starting at the
// left, it repeatedly takes the longest prefix of the remaining text that is a
// key of p.Dict.
//
// When no dictionary word starts at the current position, the single rune at
// that position is emitted as its own word, so the scan always advances and
// the concatenation of the result equals text. The result is nil for empty
// text.
func (p *PLN) ForwardSegment(text string) []string {
	var ret []string
	s := []rune(text)
	for i := 0; i < len(s); {
		// Default to one rune: without this a position with no dictionary
		// match would never advance and the loop would not terminate.
		count := 1
		// Try the longest candidate first so the first hit is the answer.
		for j := len(s); j > i+1; j-- {
			if _, ok := p.Dict[string(s[i:j])]; ok {
				count = j - i
				break
			}
		}
		ret = append(ret, string(s[i:i+count]))
		i += count
	}
	return ret
}
// BackwardSegment splits text by backward longest matching: starting at the
// right, it repeatedly takes the longest suffix of the remaining text that is
// a key of p.Dict. Words are returned in their original left-to-right order.
//
// When no dictionary word ends at the current position, the single rune at
// that position is emitted as its own word, so the scan always advances and
// the concatenation of the result equals text. The result is nil for empty
// text.
func (p *PLN) BackwardSegment(text string) []string {
	var ret []string
	s := []rune(text)
	for j := len(s); j > 0; {
		// Default to one rune: without this a position with no dictionary
		// match would never move left and the loop would not terminate.
		count := 1
		// The smallest start index gives the longest candidate, so the first
		// hit is the answer.
		for i := 0; i < j-1; i++ {
			if _, ok := p.Dict[string(s[i:j])]; ok {
				count = j - i
				break
			}
		}
		ret = append(ret, string(s[j-count:j]))
		j -= count
	}

	// Words were collected right-to-left; restore reading order in place.
	for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
		ret[l], ret[r] = ret[r], ret[l]
	}
	return ret
}
// countSingleChar reports how many of the given words consist of exactly one
// rune.
func countSingleChar(chars []string) int {
	singles := 0
	for _, word := range chars {
		// Count runes by ranging over the string instead of converting it.
		runeCount := 0
		for range word {
			runeCount++
		}
		if runeCount == 1 {
			singles++
		}
	}
	return singles
}
// BidirectionalSegment runs both forward and backward longest matching on
// text and returns the preferred result:
//
//  1. the segmentation with fewer words wins;
//  2. on equal word counts, the one with fewer single-rune words wins;
//  3. if still tied, the backward result is returned.
func (p *PLN) BidirectionalSegment(text string) []string {
	f := p.ForwardSegment(text)
	b := p.BackwardSegment(text)
	switch {
	case len(f) < len(b):
		return f
	case len(f) > len(b):
		return b
	case countSingleChar(f) < countSingleChar(b):
		// Same word count: prefer the result with fewer one-rune fragments.
		return f
	default:
		return b
	}
}
测试代码:
package level
import "testing"
// TestLoadDict loads the mini dictionary, fails if nothing was loaded, and
// logs up to ten entries, the entry for "项目", and the total entry count.
func TestLoadDict(t *testing.T) {
	p := NewPLN("CoreNatureDictionary.mini.txt")
	// NewPLN has no error result, so an empty dictionary is the only visible
	// sign that the file was missing or unreadable.
	if len(p.Dict) == 0 {
		t.Fatal("dictionary is empty: CoreNatureDictionary.mini.txt missing or unreadable")
	}

	const sample = 10
	shown := 0
	for k, v := range p.Dict {
		t.Logf("k:%v v:%v\n", k, v)
		shown++
		if shown >= sample {
			break
		}
	}
	t.Log("项目: ", p.Dict["项目"])
	t.Log("dict len: ", len(p.Dict))
}
// testCase holds the sample sentences that the segmentation tests in this
// file feed to the segmenter.
var testCase = []string{
"就读北京大学",
"研究生命起源",
"项目的研究",
"当下雨天地面积水",
"结婚的和未结婚的",
"欢迎新老师生前来就餐",
}
// TestFullSegment logs the FullSegment result for every sample sentence.
func TestFullSegment(t *testing.T) {
	segmenter := NewPLN("CoreNatureDictionary.mini.txt")
	for i := range testCase {
		words := segmenter.FullSegment(testCase[i])
		t.Logf("full seg: %v", words)
	}
}
// TestForwardSegment logs the ForwardSegment result for every sample sentence.
func TestForwardSegment(t *testing.T) {
	segmenter := NewPLN("CoreNatureDictionary.mini.txt")
	for i := range testCase {
		words := segmenter.ForwardSegment(testCase[i])
		t.Logf("forward seg: %v", words)
	}
}
// TestBackwardSegment logs the BackwardSegment result for every sample
// sentence.
func TestBackwardSegment(t *testing.T) {
	segmenter := NewPLN("CoreNatureDictionary.mini.txt")
	for i := range testCase {
		words := segmenter.BackwardSegment(testCase[i])
		t.Logf("back seg: %v", words)
	}
}
// TestBidirectionalSegment logs the BidirectionalSegment result for every
// sample sentence.
func TestBidirectionalSegment(t *testing.T) {
	segmenter := NewPLN("CoreNatureDictionary.mini.txt")
	for i := range testCase {
		words := segmenter.BidirectionalSegment(testCase[i])
		t.Logf("bi seg: %v", words)
	}
}