笔者并不了解各大搜索网站是如何实现智能搜索的。以下只是笔者一时的想法:笔者认为这个方法可以实现对搜索内容的智能匹配。
一、首先我们获取细胞词库内容
①建表语句:
-- Recreate the table that holds the words imported from Sogou cell
-- dictionary (.scel) files. Any existing table and its data are dropped.
DROP TABLE IF EXISTS `sougou_ciku`;
CREATE TABLE `sougou_ciku` (
  `id` varchar(50) NOT NULL,          -- primary key
  `text` varchar(100) NOT NULL,       -- word content
  `below` varchar(50) DEFAULT NULL,   -- what the word belongs to
  `remark` varchar(100) DEFAULT NULL, -- free-form remark
  -- The mapped entity treats `id` as the primary key, so enforce it here.
  PRIMARY KEY (`id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8;
②创建映射实体类:
package com.css.java.learning.model;
/**
 * Entity mapped to one row of the {@code sougou_ciku} table.
 *
 * <p>A plain mutable bean: every property starts out {@code null} and is
 * exposed through a getter/setter pair.
 */
public class SouGouCiKu {

    /** Primary key. */
    private String id;

    /** Word content. */
    private String text;

    /** What the word belongs to. */
    private String below;

    /** Remark. */
    private String remark;

    /** @return the primary key, or {@code null} if not set */
    public String getId() {
        return this.id;
    }

    /** @param id the primary key to store */
    public void setId(String id) {
        this.id = id;
    }

    /** @return the word content, or {@code null} if not set */
    public String getText() {
        return this.text;
    }

    /** @param text the word content to store */
    public void setText(String text) {
        this.text = text;
    }

    /** @return what the word belongs to, or {@code null} if not set */
    public String getBelow() {
        return this.below;
    }

    /** @param below what the word belongs to */
    public void setBelow(String below) {
        this.below = below;
    }

    /** @return the remark, or {@code null} if not set */
    public String getRemark() {
        return this.remark;
    }

    /** @param remark the remark to store */
    public void setRemark(String remark) {
        this.remark = remark;
    }
}
③创建搜狗scel文件阅读器:
package com.css.java.learning.massbag;
import java.util.List;
import java.util.Map;
/**
 * In-memory result of parsing one Sogou cell dictionary ({@code .scel}) file:
 * four descriptive header strings plus the word table.
 *
 * <p>All properties are {@code null} until set.
 */
public class SougouScelMdel {

    /** Words grouped by their pinyin string (pinyin -> words). */
    private Map<String, List<String>> wordMap;

    /** Dictionary name. */
    private String name;

    /** Dictionary type. */
    private String type;

    /** Dictionary description. */
    private String description;

    /** Sample text stored in the dictionary header. */
    private String sample;

    /** @return the pinyin -> words table, or {@code null} if not set */
    public Map<String, List<String>> getWordMap() {
        return wordMap;
    }

    /**
     * Package-private on purpose: only code in this package installs the word table.
     *
     * @param wordMap the pinyin -> words table
     */
    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

    /** @return the dictionary type, or {@code null} if not set */
    public String getType() {
        return type;
    }

    /** @param type the dictionary type */
    public void setType(String type) {
        this.type = type;
    }

    /** @return the dictionary description, or {@code null} if not set */
    public String getDescription() {
        return description;
    }

    /** @param description the dictionary description */
    public void setDescription(String description) {
        this.description = description;
    }

    /** @return the sample text, or {@code null} if not set */
    public String getSample() {
        return sample;
    }

    /** @param sample the sample text */
    public void setSample(String sample) {
        this.sample = sample;
    }

    /** @return the dictionary name, or {@code null} if not set */
    public String getName() {
        return name;
    }

    /** @param name the dictionary name */
    public void setName(String name) {
        this.name = name;
    }
}
package com.css.java.learning.massbag;
import java.io.*;
import java.net.URL;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
public class SougouScelReader {
public SougouScelMdel read(File file) throws IOException {
return read(new FileInputStream(file));
}
public SougouScelMdel read(URL url) throws IOException {
return read(url.openStream());
}
protected ByteArrayOutputStream output=new ByteArrayOutputStream();
protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {
int read=reads[0];
input.skip(pos-read);
read=pos;
output.reset();
while(true) {
int c1 = input.read();
int c2 = input.read();
read+=2;
if(c1==0 && c2==0) {
break;
} else {
output.write(c1);
output.write(c2);
}
}
reads[0]=read;
return new String(output.toByteArray(),encoding);
}
protected static String encoding = "UTF-16LE";
public SougouScelMdel read(InputStream in) throws IOException {
SougouScelMdel model = new SougouScelMdel();
DataInputStream input = new DataInputStream(in);
int read;
try {
byte[] bytes = new byte[4];
input.readFully(bytes);
assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
input.readFully(bytes);
int flag1 = bytes[0];
assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);
int[] reads=new int[]{8};
model.setName(readString(input,0x130,reads));
model.setType(readString(input,0x338,reads));
model.setDescription(readString(input,0x540,reads));
model.setSample(readString(input,0xd40,reads));
read = reads[0];
input.skip(0x1540 - read);
read=0x1540;
input.readFully(bytes);
read += 4;
assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);
bytes = new byte[128];
Map pyMap = new LinkedHashMap();
while (true) {
int mark = readUnsignedShort(input);
int size = input.readUnsignedByte();
input.skip(1);
read += 4;
assert (size > 0 && (size % 2) == 0);
input.readFully(bytes, 0, size);
read += size;
String py = new String(bytes, 0, size, encoding);
pyMap.put(mark, py);
if ("zuo".equals(py)) {
break;
}
}
if (flag1 == 0x44) {
input.skip(0x2628 - read);
} else if (flag1 == 0x45) {
input.skip(0x26C4 - read);
}
StringBuffer buffer = new StringBuffer();
Map> wordMap = new LinkedHashMap>();
while (true) {
int size = readUnsignedShort(input);
if (size < 0) {
break;
}
int count = readUnsignedShort(input);
int len = count / 2;
assert (len * 2 == count);
buffer.setLength(0);
for (int i = 0; i < len; i++) {
int key = readUnsignedShort(input);
buffer.append(pyMap.get(key)).append("'");
}
buffer.setLength(buffer.length() - 1);
String py = buffer.toString();
List list = wordMap.get(py);
if (list == null) {
list = new ArrayList();
wordMap.put(py, list);
}
for (int i = 0; i < size; i++) {
count = readUnsignedShort(input);
if (count > bytes.length) {
bytes = new byte[count];
}
input.readFully(bytes, 0, count);
String word = new String(bytes, 0, count, encoding);
input.skip(12);
list.add(word);
}
}
model.setWordMap(wordMap);
return model;
} finally {
in.close();
}
}
protected final int readUnsignedShort(InputStream in) throws IOException {
int ch1 = in.read();
int ch2 = in.read();
if ((ch1 | ch2) < 0) {
return Integer.MIN_VALUE;
}
return (ch2 << 8) + (ch1 << 0);
}
}
④从搜狗官网下载细胞词库 .scel 文件
略!
⑤读取细胞词库 .scel 文件并插入数据库
/**
 * Reads a Sogou cell dictionary (.scel) file, passes every word to
 * {@code jugeWord}/{@code insert}, and prints each word to standard output.
 *
 * @param path file-system path of the .scel file
 * @throws IOException if the file cannot be read or parsed
 */
private static void sogou(String path) throws IOException {
    File file = new File(path);
    SougouScelMdel model = new SougouScelReader().read(file);
    // Word table: pinyin -> words with that pinyin. Only the words are needed here.
    Map<String, List<String>> words = model.getWordMap();
    for (List<String> list : words.values()) {
        for (String word : list) {
            /* Check whether the word already appears in the database: add it if
             * absent, do nothing if present. (jugeWord is not shown in the article.)
             */
            // NOTE(review): the name suggests "true = already exists", which would make
            // this condition inverted relative to the comment above - confirm what
            // jugeWord actually returns before relying on it.
            boolean isExit = jugeWord(word);
            if (isExit) {
                /* Insert the word into the database for later use.
                 * (insert is not shown in the article either.)
                 */
                insert(word);
            }
            System.out.println(word);
        }
    }
}
⑥执行搜狗细胞词库插入数据库
笔者以下面的文件为例:
得到如下数据:
下篇讲解,笔者自创的简单算法,拆分输入语句匹配词库完成搜索过程。