使用Apache Lucene可以对文本文件作全文关键字检索,加入其它库的支持可以对pdf、word、excel等作全文内容检索,建立检索索引。
以下记录对word的两种格式(.doc和.docx)建立索引、全文检索以及高亮显示的做法,但未与直接读取文件的方式作效率对比。
版本:
Lucene:7.2.1
POI:3.17
/**
 * Small demo that builds a Lucene full-text index over Word documents
 * ({@code .doc} / {@code .docx}, text extracted with Apache POI) and runs a
 * keyword search against it, printing a best-matching fragment for every hit.
 *
 * <p>Index layout written by {@link #indexDoc}:
 * <ul>
 *   <li>{@code path} - untokenized, stored; used as the unique key for updates</li>
 *   <li>{@code title} - tokenized file name, stored</li>
 *   <li>{@code lastmodify} - point value (for range queries) plus a stored copy</li>
 *   <li>{@code contents} - tokenized document text, stored (needed for highlighting)</li>
 * </ul>
 */
public class LuceneTest1 {

    /** File suffixes (lower case, including the dot) that get indexed. Alternative: {".txt"}. */
    public static String[] suf = new String[]{".doc", ".docx"};

    /** List view over {@link #suf}, used for the suffix membership test. */
    public static List<String> lst = Arrays.asList(suf);

    public static void main(String[] args) {
        // Source folder used in the examples: F:\worklog
        //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_txt" );
        //search("字符串","D:\\ex_java\\lucene_test\\index_worklog_txt");
        //createIndex(true, "F:\\worklog", "D:\\ex_java\\lucene_test\\index_worklog_msword" );
        // doc/docx indexing requires the POI library on the classpath.
        search("等值连接", "D:\\ex_java\\lucene_test\\index_worklog_msword");
    }

    /**
     * Searches the {@code contents} field of the index and prints, for each of
     * the top 50 hits, its score, path, last-modified value and the
     * best-matching text fragment.
     *
     * <p>Failures (unreadable index, unparsable query, highlighting errors) are
     * reported on {@code System.err}; nothing is thrown to the caller.
     *
     * @param qwords   query string in Lucene query-parser syntax
     * @param indexdir directory holding the index
     */
    public static void search(String qwords, String indexdir) {
        // try-with-resources: directory, reader and analyzer are released even
        // when parsing or highlighting fails half-way through.
        try (Directory dir = FSDirectory.open(Paths.get(indexdir));
             IndexReader reader = DirectoryReader.open(dir);
             SmartChineseAnalyzer anal = new SmartChineseAnalyzer()) {
            System.out.println("[搜索词]:【"+qwords+"】");
            IndexSearcher searcher = new IndexSearcher(reader);
            // Other indexed fields that could be queried: lastmodify, path, title.
            QueryParser parser = new QueryParser("contents", anal);
            Query qr = parser.parse(qwords);
            TopDocs tps = searcher.search(qr, 50);

            // Highlighter setup. The pre/post tags are empty strings, so matched
            // terms are not visibly marked; only the fragment selection is used.
            SimpleHTMLFormatter shf = new SimpleHTMLFormatter("","");
            QueryScorer scorer = new QueryScorer(qr);
            // Fragment boundaries are derived from the scored query spans.
            Fragmenter frgm = new SimpleSpanFragmenter(scorer);
            Highlighter hlt = new Highlighter(shf, scorer);
            hlt.setTextFragmenter(frgm);

            for (ScoreDoc sdoc : tps.scoreDocs) {
                Document doc = searcher.doc(sdoc.doc);
                System.out.println("["+sdoc.score+"]: "+ doc.get("path")+", "+ doc.get("lastmodify"));

                // Build the summary from the stored field. A hit without stored
                // contents yields a null summary instead of aborting the loop.
                String contents = doc.get("contents");
                String summary = null;
                if (contents != null) {
                    try (TokenStream tsm = anal.tokenStream("contents", new StringReader(contents))) {
                        summary = hlt.getBestFragment(tsm, contents);
                    }
                }

                System.out.println();
                System.out.println("[摘要开始]------------------------------------------------------");
                System.out.println(summary);
                System.out.println("[摘要结束]------------------------------------------------------");
                System.out.println();
            }
        } catch (Exception e) {
            System.err.println("Directory|Parse wrong. "+ e.toString());
        }
    }

    /**
     * Creates or updates the index at {@code indexpath} from the file or
     * directory tree at {@code docspath}.
     *
     * <p>Terminates the JVM with status 1 when {@code docspath} is not
     * readable. Indexing errors are printed to {@code System.err}.
     *
     * @param create    {@code true} to rebuild the index from scratch,
     *                  {@code false} to add to / update an existing one
     * @param docspath  file or directory to index
     * @param indexpath directory in which the index is stored
     */
    public static void createIndex(boolean create, String docspath, String indexpath) {
        Path docspt = Paths.get(docspath);
        if (!Files.isReadable(docspt)) {
            System.err.println("Docs Path not readable: "+ docspt);
            System.exit(1);
        }
        long stime = System.currentTimeMillis();
        System.out.println("Begin Index ......");
        try (Directory dir = FSDirectory.open(Paths.get(indexpath));
             Analyzer anal = new SmartChineseAnalyzer()) {
            IndexWriterConfig iwc = new IndexWriterConfig(anal);
            iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
            // The writer must be closed even on failure, otherwise the index
            // write lock stays held and the next run cannot open the index.
            try (IndexWriter iwrt = new IndexWriter(dir, iwc)) {
                indexDocs(iwrt, docspt);
            }
        } catch (Exception e) {
            System.err.println(e.toString());
        }
        long etime = System.currentTimeMillis();
        System.out.println("End Index, total time spend: " + (etime-stime)/1000 + " seconds.");
    }

    /**
     * Indexes a single file, or every file below a directory.
     *
     * <p>While walking a directory, a file that cannot be indexed is reported
     * on {@code System.err} and skipped so one broken document does not stop
     * the run. For a single file the failure is propagated.
     *
     * @param writer open index writer
     * @param pth    file or directory to index
     * @throws Exception if the directory walk fails or the single file cannot be indexed
     */
    public static void indexDocs(final IndexWriter writer, Path pth) throws Exception {
        if (Files.isDirectory(pth)) {
            Files.walkFileTree(pth, new SimpleFileVisitor<Path>() {
                @Override
                public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                    try {
                        indexDoc(writer, file, attrs.lastModifiedTime().toMillis());
                    } catch (Exception e) {
                        // Best effort: keep walking, but do not hide the failure.
                        System.err.println("Skipping " + file + ": " + e);
                    }
                    return FileVisitResult.CONTINUE;
                }
            });
        } else {
            indexDoc(writer, pth, Files.getLastModifiedTime(pth).toMillis());
        }
    }

    /**
     * Adds (or, when the writer is not in CREATE mode, replaces) the index
     * entry for one file. Files whose suffix is not listed in {@link #lst},
     * or that have no suffix at all, are ignored.
     *
     * @param writer     open index writer
     * @param path       file to index
     * @param lastmodify last-modified time of the file in epoch milliseconds
     * @throws IOException         if the file cannot be read or the index cannot be written
     * @throws OpenXML4JException  if POI cannot open the document package
     * @throws XmlException        if POI cannot parse the document XML
     */
    public static void indexDoc(IndexWriter writer, Path path, long lastmodify) throws IOException, OpenXML4JException, XmlException {
        // Look for the suffix in the file name only; a name without a dot has
        // no suffix (lastIndexOf returns -1) and is skipped.
        String filename = path.getFileName().toString();
        int dot = filename.lastIndexOf('.');
        if (dot < 0) {
            return;
        }
        String suffix = filename.substring(dot).toLowerCase(java.util.Locale.ROOT);
        if (!lst.contains(suffix)) {
            return;
        }

        String text = extractText(path, suffix);

        Document doc = new Document();
        // Untokenized so that updateDocument(new Term("path", ...)) below can
        // match the exact path and replace the previous version of the file.
        doc.add(new org.apache.lucene.document.StringField("path", path.toString(), Store.YES));
        doc.add(new TextField("title", filename, Store.YES));
        // LongPoint is index-only; the StoredField makes doc.get("lastmodify")
        // return the value at search time.
        doc.add(new LongPoint("lastmodify", lastmodify));
        doc.add(new org.apache.lucene.document.StoredField("lastmodify", lastmodify));
        doc.add(new TextField("contents", text, Store.YES));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            System.out.println("adding doc: " + path);
            writer.addDocument(doc);
        } else {
            System.out.println("updating doc: " + path);
            writer.updateDocument(new Term("path", path.toString()), doc);
        }
    }

    /** Returns the plain text of {@code path}, closing the stream and POI extractor it opens. */
    private static String extractText(Path path, String suffix) throws IOException, OpenXML4JException, XmlException {
        if (".doc".equals(suffix)) {
            try (InputStream in = Files.newInputStream(path);
                 WordExtractor wd = (WordExtractor) ExtractorFactory.createExtractor(in)) {
                return wd.getText();
            }
        }
        if (".docx".equals(suffix)) {
            try (InputStream in = Files.newInputStream(path);
                 XWPFWordExtractor wdx = (XWPFWordExtractor) ExtractorFactory.createExtractor(in)) {
                return wdx.getText();
            }
        }
        // Any other configured suffix is treated as GB2312-encoded plain text.
        // readAllBytes reads the whole file; a single read() call may stop early.
        return new String(Files.readAllBytes(path), "gb2312");
    }
}
检索结果(部分):
[搜索词]:【等值连接】
[8.205898]: F:\worklog\leftjoin_innerjoin_rightjoin.doc, null
[摘要开始]------------------------------------------------------
join(等值连接) 只返回两个表中联结字段相等的行
举例如下:
--------------------------------------------
表A记录如下:
aID aNum
[摘要结束]------------------------------------------------------