IndexerMmseg4j.java代码如下:
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.lang.SystemUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;
/**
 * Demo that builds (and prunes) a Lucene 3.6 file-system index from a single
 * text file, tokenizing with the mmseg4j {@code SimpleAnalyzer}.
 *
 * <p>NOTE(review): the {@code test*} methods carry {@code @org.junit.Test} but
 * are static, which the JUnit 4 runner rejects. They are kept static because
 * {@code SearcherMmseg4j.main} invokes {@code testCreateIndex()} statically.
 */
public class IndexerMmseg4j {
    private final static Logger logger = LoggerFactory.getLogger(IndexerMmseg4j.class);

    /** Charset used to decode the sample text file before indexing. */
    private static final String FILE_ENCODING = "GB18030";

    static Version matchVersion = Version.LUCENE_36;
    static String indexPath = "D:" + SystemUtils.FILE_SEPARATOR + "contentWindow" + SystemUtils.FILE_SEPARATOR + "index";
    static String filePath = "D:" + SystemUtils.FILE_SEPARATOR + "contentWindow" + SystemUtils.FILE_SEPARATOR + "files" + SystemUtils.FILE_SEPARATOR + "mytestfile.txt";
    static Analyzer analyzer = new StandardAnalyzer(matchVersion);
    static Analyzer a3 = new CJKAnalyzer(matchVersion); // bigram (two-character) segmentation
    static Analyzer a4 = new SimpleAnalyzer(); // one of the analyzers shipped with the mmseg4j Chinese segmenter

    /**
     * Reads the whole file into memory.
     *
     * @param file the file to read; may be {@code null}
     * @return the complete file content, or {@code null} when {@code file} is
     *         {@code null} or the file cannot be opened or read (the failure
     *         is logged)
     */
    public static byte[] getBytesFromFile(File file) {
        if (file == null) {
            return null;
        }
        InputStream in = null;
        try {
            in = new FileInputStream(file);
            // Loop until EOF: available() is only an estimate and a single
            // read() call may return fewer bytes than requested.
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        } catch (IOException e) { // also covers FileNotFoundException
            logger.error(e.getMessage(), e);
            return null;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Concatenates two byte arrays.
     *
     * @param array1 the leading bytes; must not be {@code null}
     * @param array2 the trailing bytes; must not be {@code null}
     * @return a new array holding {@code array1} followed by {@code array2},
     *         or {@code null} when both inputs are empty
     */
    public static byte[] addByte(byte[] array1, byte[] array2) {
        if (array1.length == 0 && array2.length == 0) {
            return null;
        }
        byte[] message = new byte[array1.length + array2.length];
        System.arraycopy(array1, 0, message, 0, array1.length);
        System.arraycopy(array2, 0, message, array1.length, array2.length);
        return message;
    }

    /**
     * Recreates the index at {@code indexPath} with a single document built
     * from the file at {@code filePath} (fields {@code title} and
     * {@code content}). Failures are logged, not propagated.
     */
    @org.junit.Test
    public static void testCreateIndex(){
        System.out.println(indexPath);
        System.out.println(filePath);
        Directory dir = null;
        IndexWriter iw = null;
        try {
            File source = new File(filePath);
            byte[] raw = getBytesFromFile(source);
            if (raw == null) {
                // getBytesFromFile already logged the cause; avoid an NPE below.
                logger.error("Cannot index {}: file could not be read", filePath);
                return;
            }
            String content = new String(raw, FILE_ENCODING);

            /*
             * Store.YES          : stores the field value; for fields shown in search results, e.g. file path or URL
             * Store.NO           : the field value is not stored, e.g. an e-mail body
             * Index.ANALYZED     : the field is indexed and analyzed, e.g. an e-mail body or subject
             * Index.NO           : for fields that are never searched, e.g. stored-only fields
             * Index.NOT_ANALYZED : indexed but not analyzed, the original value is kept as a whole, e.g. dates or personal names
             */
            Document doc = new Document();
            doc.add(new Field("title", source.getName(), Store.YES, Index.ANALYZED));
            doc.add(new Field("content", content, Store.YES, Index.ANALYZED));

            dir = FSDirectory.open(new File(indexPath));
            IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, a4);
            // The default is CREATE_OR_APPEND; CREATE always rebuilds the index.
            iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            iw = new IndexWriter(dir, iwc);
            iw.addDocument(doc);
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            closeQuietly(iw); // closing the writer commits the pending document
            closeQuietly(dir);
        }
    }

    /**
     * Deletes every document whose {@code content} field contains the term
     * {@code 上海}. Failures are logged, not propagated.
     */
    @org.junit.Test
    public static void testDeleteIndex(){
        Directory dir = null;
        IndexWriter iw = null;
        try {
            dir = FSDirectory.open(new File(indexPath));
            // IndexReader.open(dir) yields a read-only reader that refuses
            // deletions, so delete through a writer instead. APPEND keeps the
            // "fail if the index does not exist" behaviour of opening a reader.
            IndexWriterConfig iwc = new IndexWriterConfig(matchVersion, a4);
            iwc.setOpenMode(IndexWriterConfig.OpenMode.APPEND);
            iw = new IndexWriter(dir, iwc);
            iw.deleteDocuments(new Term("content", "上海"));
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            closeQuietly(iw); // closing the writer commits the deletion
            closeQuietly(dir);
        }
    }

    /** Closes {@code resource} if it is non-null, logging instead of throwing on failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }
}
SearcherMmseg4j.java代码如下:
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import org.apache.commons.lang.SystemUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.FuzzyQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PrefixQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.WildcardQuery;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.chenlb.mmseg4j.analysis.SimpleAnalyzer;
/**
 * Demo that searches the Lucene 3.6 index at {@code indexPath} with the
 * mmseg4j {@code SimpleAnalyzer}, highlights the hits, and shows how to stage
 * index updates in a {@code RAMDirectory} before writing them back to disk.
 *
 * <p>NOTE(review): the {@code test*} methods carry {@code @org.junit.Test} but
 * are static, which the JUnit 4 runner rejects. They are kept static because
 * {@link #main(String[])} invokes them statically.
 */
public class SearcherMmseg4j {
    private final static Logger logger = LoggerFactory.getLogger(SearcherMmseg4j.class);

    /** Charset of the sample text file; must match the one used when the index is first built. */
    private static final String FILE_ENCODING = "GB18030";

    static Version matchVersion = Version.LUCENE_36;
    static String indexPath = "D:" + SystemUtils.FILE_SEPARATOR + "contentWindow" + SystemUtils.FILE_SEPARATOR + "index";
    static String filePath = "D:" + SystemUtils.FILE_SEPARATOR + "contentWindow" + SystemUtils.FILE_SEPARATOR + "files" + SystemUtils.FILE_SEPARATOR + "mytestfile.txt";
    static Analyzer analyzer = new StandardAnalyzer(matchVersion);
    static Analyzer a3 = new CJKAnalyzer(matchVersion); // bigram (two-character) segmentation
    static Analyzer a4 = new SimpleAnalyzer(); // one of the analyzers shipped with the mmseg4j Chinese segmenter

    /**
     * Reads the whole file into memory.
     *
     * @param file the file to read; may be {@code null}
     * @return the complete file content, or {@code null} when {@code file} is
     *         {@code null} or the file cannot be opened or read (the failure
     *         is logged)
     */
    public static byte[] getBytesFromFile(File file) {
        if (file == null) {
            return null;
        }
        FileInputStream in = null;
        try {
            in = new FileInputStream(file);
            // Loop until EOF: available() is only an estimate and a single
            // read() call may return fewer bytes than requested.
            java.io.ByteArrayOutputStream out = new java.io.ByteArrayOutputStream();
            byte[] buffer = new byte[8192];
            int n;
            while ((n = in.read(buffer)) != -1) {
                out.write(buffer, 0, n);
            }
            return out.toByteArray();
        } catch (IOException e) { // also covers FileNotFoundException
            logger.error(e.getMessage(), e);
            return null;
        } finally {
            closeQuietly(in);
        }
    }

    /**
     * Searches the {@code title} and {@code content} fields for {@code 上海},
     * prints the hit counts and, for every hit, the best highlighted fragment
     * of its content. Failures are logged, not propagated.
     */
    @org.junit.Test
    public static void testSearch(){
        Directory dir = null;
        IndexReader reader = null;
        IndexSearcher is = null;
        try {
            dir = FSDirectory.open(new File(indexPath));
            reader = IndexReader.open(dir);
            is = new IndexSearcher(reader);
            System.out.println(is.maxDoc());

            String[] fields = {"title", "content"};
            QueryParser qp = new MultiFieldQueryParser(matchVersion, fields, a4);
            Query query = qp.parse("上海");

            // TopDocs wraps the search result: the total hit count plus the
            // ScoreDoc entries, each a lightweight pointer to a matching document.
            TopDocs tDocs = is.search(query, 10000); // maximum number of hits to fetch
            ScoreDoc[] scoreDoc = tDocs.scoreDocs;

            // Set up the highlighter.
            Formatter formatter = new SimpleHTMLFormatter("<span class=\"highlighter\">", "</span>");
            Scorer fragmentScorer = new QueryScorer(query);
            Highlighter highlighter = new Highlighter(formatter, fragmentScorer);
            Fragmenter fragmenter = new SimpleFragmenter(100); // fragment size
            highlighter.setTextFragmenter(fragmenter);

            System.out.println("总共有【" + tDocs.totalHits + "】条结果");
            System.out.println(scoreDoc.length); // number of ScoreDoc entries returned

            // Load the documents that actually matched (by internal doc id)
            // rather than always document 0.
            for (ScoreDoc hit : scoreDoc) {
                Document doc = is.doc(hit.doc);
                String content = doc.get("content");
                if (content == null) {
                    continue; // nothing to highlight on this hit
                }
                // getBestFragment returns null when the keyword does not occur in this field value.
                String hc = highlighter.getBestFragment(a4, "content", content);
                System.out.println("hc:" + hc);
                if (hc == null) { // no fragment: fall back to the first 50 characters
                    hc = content.substring(0, Math.min(50, content.length()));
                }
                // Only the loaded Document object is changed here; no writer is involved.
                Field contentField = (Field) doc.getFieldable("content");
                contentField.setValue(hc);

                logTokens(content);
            }
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } catch (ParseException e) {
            logger.error(e.getMessage(), e);
        } catch (InvalidTokenOffsetsException e) {
            logger.error(e.getMessage(), e);
        } finally {
            closeQuietly(is);
            closeQuietly(reader);
            closeQuietly(dir);
        }
    }

    /**
     * Loads the on-disk index into memory, adds one document built from the
     * file at {@code filePath} to the in-memory copy, then rewrites the
     * on-disk index from the in-memory one and merges it down to a single
     * segment. Failures are logged, not propagated.
     */
    @org.junit.Test
    public static void testCreateRAMandFS(){
        Directory fsDir = null;
        Directory ramDir = null;
        IndexWriter ramiw = null;
        IndexWriter fsiw = null;
        try {
            File source = new File(filePath);
            byte[] raw = getBytesFromFile(source);
            if (raw == null) {
                // getBytesFromFile already logged the cause; avoid an NPE below.
                logger.error("Cannot index {}: file could not be read", filePath);
                return;
            }
            // Decode explicitly; the platform default charset would garble the Chinese text.
            String content = new String(raw, FILE_ENCODING);

            fsDir = FSDirectory.open(new File(indexPath));
            // 1. Load the existing index into memory.
            ramDir = new RAMDirectory(fsDir);

            // 2. Add the document to the in-memory index.
            Document doc = new Document();
            doc.add(new Field("title", source.getName(), Store.YES, Index.ANALYZED));
            doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
            // Same analyzer as IndexerMmseg4j.testCreateIndex and testSearch,
            // so indexed terms match the terms produced at query time.
            IndexWriterConfig ramiwc = new IndexWriterConfig(matchVersion, a4);
            // Open mode left at the default (CREATE_OR_APPEND) to keep the loaded documents.
            ramiw = new IndexWriter(ramDir, ramiwc);
            ramiw.addDocument(doc);
            ramiw.close();
            ramiw = null;

            // 3. Write the in-memory index back to disk.
            IndexWriterConfig fsiwc = new IndexWriterConfig(matchVersion, a4);
            // The default is CREATE_OR_APPEND; CREATE always rebuilds the index.
            fsiwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            fsiw = new IndexWriter(fsDir, fsiwc);
            fsiw.addIndexes(ramDir);
            fsiw.commit();
            // Merge the index files into one segment (replaces the deprecated optimize()).
            fsiw.forceMerge(1);
            fsiw.close();
            fsiw = null;
            System.out.println("===执行完毕");
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        } finally {
            // Only non-null when an exception interrupted the normal close above.
            closeQuietly(fsiw);
            closeQuietly(ramiw);
            closeQuietly(ramDir);
            closeQuietly(fsDir);
        }
    }

    /**
     * Entry point: rebuilds the index, then runs the RAM/FS round trip.
     *
     * http://www.cnblogs.com/java_cSharp/archive/2011/07/17/lucene.html
     */
    public static void main(String[] args) {
        IndexerMmseg4j.testCreateIndex();
        // SearcherMmseg4j.testSearch();
        SearcherMmseg4j.testCreateRAMandFS();
    }

    /** Tokenizes {@code text} with the mmseg4j analyzer and logs each term with its offsets at debug level. */
    private static void logTokens(String text) throws IOException {
        TokenStream ts = a4.tokenStream("content", new StringReader(text));
        try {
            // addAttribute returns the existing attribute or registers one;
            // getAttribute would throw if the stream did not declare it.
            OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
            TermAttribute termAttribute = ts.addAttribute(TermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                if (logger.isDebugEnabled()) {
                    logger.debug("token {} [{}, {})", new Object[] {
                            termAttribute.term(),
                            offsetAttribute.startOffset(),
                            offsetAttribute.endOffset()});
                }
            }
            ts.end();
        } finally {
            ts.close();
        }
    }

    /** Closes {@code resource} if it is non-null, logging instead of throwing on failure. */
    private static void closeQuietly(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.error(e.getMessage(), e);
        }
    }
}
mytestfile.txt内容如下:
Why is luanma?
于士博
于时欢
上海
上海滩
北京
北京是首都
北京房价贵
How are you?
yes, I am!
1+1=2
2*3=6
@#$%^&