Examples of using Directory, IndexWriter, Document, Field, NumericField, TokenStream, IndexSearcher, and the Highlighter in Lucene — a walkthrough of basic Lucene usage.
TestLucene.java (main class)
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.NumericField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class TestLucene {

    public Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30); // the analyzer (tokenizer)
    public static String pathIndx = "E://tempworkspace//lucene//index"; // where the index is stored
    public static String pathfile = "E://tempworkspace//lucene//source//File2DocumentUtils.txt"; // the file to index

    /**
     * Create the index.
     */
    public void createIndex() throws Exception {
        // FSDirectory is used here; other Directory implementations work the same
        // way, e.g. RAMDirectory, which keeps the index in memory.
        Directory dir = FSDirectory.open(new File(pathIndx));
        // An IndexWriter is required to create or modify an index
        IndexWriter iw = new IndexWriter(dir, analyzer, true, MaxFieldLength.LIMITED);
        iw.addDocument(doc(pathfile));
        // This helper prints the details of every term the analyzer produces
        // (useful for checking the tokenization)
        this.analyze(analyzer, TestLucene.getContent(pathfile));
        iw.close();
    }

    /**
     * Build a Document from several Fields. A Document is composed of multiple
     * Fields, much as a database record is composed of columns.
     *
     * @param filePath path of the file to index
     * @return the resulting Document
     */
    public Document doc(String filePath) throws Exception {
        Document doc = new Document();
        File file = new File(filePath);
        Field fileName = new Field("fileName", file.getName(), Store.YES, Index.ANALYZED);
        // In Lucene 2.0, numeric fields had to be stored via NumberTools.longToString()
        // and read back with NumberTools.stringToLong(). NumericField solves this
        // cleanly, and the instance can be reused across documents.
        // Second argument: whether to store the value; third: whether to index it.
        NumericField size = new NumericField("size", Store.YES, false);
        Field content = new Field("content", TestLucene.getContent(filePath), Store.YES, Index.ANALYZED);
        Field path = new Field("path", file.getAbsolutePath(), Store.YES, Index.NOT_ANALYZED);
        doc.add(fileName);
        doc.add(size.setLongValue(file.length()));
        doc.add(content);
        doc.add(path);
        System.out.println(file.getName());
        System.out.println(file.length());
        System.out.println(TestLucene.getContent(filePath));
        System.out.println(file.getAbsolutePath());
        return doc;
    }

    /**
     * @param filePath path of the file
     * @return the file's content
     */
    public static String getContent(String filePath) throws Exception {
        StringBuffer sb = new StringBuffer();
        BufferedReader br = new BufferedReader(new InputStreamReader(
                new FileInputStream(new File(filePath))));
        for (String temp = null; (temp = br.readLine()) != null;) {
            sb.append(temp).append("\n");
        }
        br.close();
        return sb.toString();
    }

    public void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("-------------> analyzer: " + analyzer.getClass());
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        // In Lucene 3.0, AttributeSource.toString() dumps every token attribute
        // (term text, offsets, type), which is what we want to inspect here
        while (tokenStream.incrementToken()) {
            System.out.println(tokenStream.toString());
            System.out.println("~~~~");
        }
    }
}
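Note that the "size" field above is stored but not indexed (the third NumericField argument is false), so it cannot be searched. If you flip that argument to true, the field becomes searchable with NumericRangeQuery (part of lucene-core in 3.0). Below is a minimal sketch under that assumption; the class name SizeRangeDemo is made up for illustration.

import java.io.File;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SizeRangeDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(new File(TestLucene.pathIndx));
        IndexSearcher searcher = new IndexSearcher(dir);
        // Match documents whose "size" field lies between 0 bytes and 10 KB,
        // inclusive. This only finds anything if the document was built with
        // new NumericField("size", Store.YES, true), i.e. the field is indexed.
        Query q = NumericRangeQuery.newLongRange("size", 0L, 10L * 1024L, true, true);
        TopDocs hits = searcher.search(q, 10);
        System.out.println("matching documents: " + hits.totalHits);
        searcher.close();
    }
}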
LuceneTest.java (test class)
import java.io.File;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.highlight.Formatter;
import org.apache.lucene.search.highlight.Fragmenter;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.Scorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

public class LuceneTest {

    /**
     * Create the index.
     */
    @Test
    public void testIndexWriter() throws Exception {
        TestLucene tl = new TestLucene();
        tl.createIndex();
    }

    /**
     * Search the index.
     */
    @Test
    public void testIndexSearcher() throws Exception {
        Directory dir = FSDirectory.open(new File(TestLucene.pathIndx));
        IndexSearcher is = new IndexSearcher(dir);
        Query query = new TermQuery(new Term("content", "java.io.file"));
        TopDocs td = is.search(query, 1000);
        System.out.println(td.totalHits);
        System.out.println("------------");
        for (int i = 0; i < td.scoreDocs.length; i++) {
            ScoreDoc sd = td.scoreDocs[i];
            Document document = is.doc(sd.doc);
            System.out.println(document.get("fileName"));
            System.out.println(document.get("size"));
            System.out.println(document.get("content"));
            System.out.println(document.get("path"));
            System.out.println("------------------------------");
        }
        is.close();
    }

    /**
     * Standalone tokenization test: split the text into terms and print each
     * term's details (no index is created).
     */
    @Test
    public void test() throws Exception {
        String enText = "IndexWriter addDocument's a javadoc.txt";
        Analyzer en1 = new StandardAnalyzer(Version.LUCENE_30); // segments CJK text one character at a time
        analyze(en1, enText);
    }

    public void analyze(Analyzer analyzer, String text) throws Exception {
        System.out.println("-------------> analyzer: " + analyzer.getClass());
        TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text));
        while (tokenStream.incrementToken()) {
            System.out.println(tokenStream.toString());
            System.out.println("~~~~");
        }
    }

    /**
     * Search, then highlight the hits.
     */
    @Test
    public void testHightLigth() throws Exception {
        Directory dir = FSDirectory.open(new File(TestLucene.pathIndx));
        IndexSearcher is = new IndexSearcher(dir);
        Query query = new TermQuery(new Term("content", "dport"));
        TopDocs td = is.search(query, 1000);
        System.out.println(td.totalHits);
        System.out.println("------------");
        // Set up the highlighter (this needs the highlighter contrib jar on the
        // classpath); the formatter wraps markup around each matched keyword
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        Scorer scorer = new QueryScorer(query); // scores fragments against the query's terms
        Highlighter highlighter = new Highlighter(formatter, scorer);
        // The fragmenter controls the summary length; the best fragment is
        // usually the stretch of text where the keywords appear most densely
        Fragmenter fragmenter = new SimpleFragmenter(100);
        highlighter.setTextFragmenter(fragmenter);
        for (int i = 0; i < td.scoreDocs.length; i++) {
            ScoreDoc sd = td.scoreDocs[i];
            Document document = is.doc(sd.doc);
            // Highlight the stored content; returns null if no keyword occurs in it
            String hc = highlighter.getBestFragment(new StandardAnalyzer(Version.LUCENE_30),
                    "content", document.get("content"));
            if (hc != null) {
                System.out.println(hc);
            }
        }
        is.close();
    }
}
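Both searches above build a TermQuery by hand, which bypasses analysis entirely (that is why the term must already be lowercased). For user-typed queries, Lucene 3.x ships QueryParser (package org.apache.lucene.queryParser in lucene-core), which analyzes the query string with the same analyzer used at index time and supports operators such as AND and OR plus field prefixes. A minimal sketch; the class name QueryParserDemo is illustrative only:

import java.io.File;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class QueryParserDemo {
    public static void main(String[] args) throws Exception {
        IndexSearcher is = new IndexSearcher(FSDirectory.open(new File(TestLucene.pathIndx)));
        // "content" is the default field searched when the query has no field prefix;
        // the analyzer must match the one used when the index was built
        QueryParser parser = new QueryParser(Version.LUCENE_30, "content",
                new StandardAnalyzer(Version.LUCENE_30));
        // Analyzed to the same term the hand-built TermQuery above searches for
        Query query = parser.parse("java.io.file");
        TopDocs td = is.search(query, 1000);
        System.out.println(td.totalHits);
        is.close();
    }
}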
Give it a try: once you point the file paths at real files and add the required jars (lucene-core plus the highlighter contrib, lucene-highlighter), it should work.