新建一个maven项目:
在eclipse插件里new一个也可以,不是maven项目也行,最下面是我新建项目的命令。
然后把项目导入到eclipse里
打开pom.xml 添加如下的依赖:
<properties> <lucene.version>4.7.2</lucene.version> <mmseg4j.version>1.9.1</mmseg4j.version> </properties> <dependencies> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> <version>${lucene.version}</version> </dependency> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-queryparser</artifactId> <version>${lucene.version}</version> </dependency> <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-analyzers-common</artifactId> <version>${lucene.version}</version> </dependency> </dependencies>
然后刷新项目,或者mvn clean install. 然后就有lucene的jar包了。项目如下:
如果不是maven项目,可以去官网上下载lucene,解压后,手动的拷贝相应的jar包,添加到项目里,jar都在文件里不太好找。
然后我去找了最新版本的庖丁解牛分词器,IK Analyzer,MMseg4j这几个分词器,几经尝试,都不支持4.7的lucene.实在是很无奈啊。本来是来学习如何分词的,结果非要用过了时的lucene版本。
然后照着官网写了一些lucene的入门例子,了解了一下lucene如何使用
public class Test { private static final String docLocation = "D:\\lucene\\docs"; private static final String indexLocation = "D:\\lucene\\index"; private static final String docLocation2 = "D:\\lucene\\otherDocs"; private static final String indexLocation2 = "D:\\lucene\\otherIndex"; public static void main(String[] args) { // 测试中文分词 // testSeg(); // 建立索引 //testIndex(); // 测试搜索 // testSearch(); //testSearchWithParser(); //测试合并 //testMergeIndex(); //测试删除文档 //testDeleteDoc(); } /** * 测试中文分词 */ private static void testSeg() { Reader reader = null; try { Analyzer analyzer = new ChineseAnalyzer(); reader = new FileReader("D:\\lucene\\docs\\海贼王.txt"); TokenStream ts = analyzer.tokenStream("", reader); ts.reset(); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); // 遍历分词数据 while (ts.incrementToken()) { System.out.print(term.toString() + "|"); // 我|是|要|成|为|海|贼|王|的|男|人|烧|烧|果|实|是|假|的|一|看|就|知|道|了| 这词分的真是一绝 } } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } } } } /** * 测试建立索引 */ private static void testIndex() { final File docDir = new File(docLocation2); if (!docDir.exists() || !docDir.canRead()) { System.out.println("please check the path"); } try { Directory dir = FSDirectory.open(new File(indexLocation2)); Analyzer analyzer = new ChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);// 如果已经有索引,就追加 // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(docDir, writer); // writer.forceMerge(1);//如果文档基本是静态的不怎么变,可以优化一下索引,这个过程比较耗时 writer.close(); } catch (IOException e) { e.printStackTrace(); } } /** * 建立索引 * * @param docDir * @param writer * @throws IOException */ private static 
void indexDocs(File docDir, IndexWriter writer) throws IOException { if (docDir.isDirectory()) { File[] files = docDir.listFiles(); for (File file : files) { indexDocs(file, writer); } } else { FileInputStream fis = null; try { fis = new FileInputStream(docDir); } catch (FileNotFoundException e) { return; } try { Document doc = new Document(); // 索引但不存,不分词 Field fileName = new StringField("fileName", docDir.getName(), Field.Store.YES); doc.add(fileName); // 索引但不存 doc.add(new LongField("lastModified", docDir.lastModified(), Field.Store.NO)); doc.add(new TextField("content", new BufferedReader( new InputStreamReader(fis, "UTF-8")))); if (OpenMode.CREATE.equals(writer.getConfig().getOpenMode())) { writer.addDocument(doc); } else { writer.updateDocument( new Term("fileName", docDir.getName()), doc); } } finally { fis.close(); } } } /** * term查询 */ private static void testSearch() { try { IndexReader reader = DirectoryReader.open(FSDirectory .open(new File(indexLocation))); IndexSearcher searcher = new IndexSearcher(reader); // Term term = new Term("fileName","海贼王.txt"); // Term term = new Term("content","海贼王");//查不出来 Term term = new Term("content", "文");// 有 Query query = new TermQuery(term); Query query2 = new TermQuery(new Term("content", "火")); BooleanQuery query3 = new BooleanQuery(); query3.add(query, Occur.SHOULD); query3.add(query2, Occur.SHOULD);// 0.34144828 混淆.txt0.08536207 // 海贼王.txt0.08536207 火影忍者.txt TopDocs top5 = searcher.search(query3, 5); ScoreDoc[] hits = top5.scoreDocs; for (ScoreDoc doc : hits) { System.out.println(doc.score + " " + searcher.doc(doc.doc).get("fileName")); } } catch (IOException e) { e.printStackTrace(); } } /** * 用解析器查询 */ private static void testSearchWithParser() { try { IndexReader reader = DirectoryReader.open(FSDirectory .open(new File(indexLocation))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new ChineseAnalyzer(); QueryParser parser = new QueryParser(Version.LUCENE_47, "content", analyzer); 
Query query = parser.parse("海贼王 火影忍者"); TopDocs top5 = searcher.search(query, 5); ScoreDoc[] hits = top5.scoreDocs; for (ScoreDoc doc : hits) { System.out.println(doc.score + " " + searcher.doc(doc.doc).get("fileName")); } } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { // TODO Auto-generated catch block e.printStackTrace(); } } /** * 测试合并索引 */ private static void testMergeIndex() { try { Directory dir = FSDirectory.open(new File(indexLocation)); Analyzer analyzer = new ChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47,analyzer); IndexWriter writer = new IndexWriter(dir, iwc); Directory dir2 = FSDirectory.open(new File(indexLocation2)); writer.addIndexes(dir2); writer.close(); } catch (IOException e) { e.printStackTrace(); } } /** * 删除一个索引 */ private static void testDeleteDoc(){ try { Directory dir = FSDirectory.open(new File(indexLocation)); Analyzer analyzer = new ChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47,analyzer); IndexWriter writer = new IndexWriter(dir, iwc); Term term = new Term("fileName","文章.txt"); writer.deleteDocuments(term); writer.close(); } catch (IOException e) { e.printStackTrace(); } } }
。首先在d盘建立了一个专用目录,存了一些简单的txt文件,以文件名为id,建立了索引。还测试了一下如何合并和删除索引。
成功的话,会生成如下的索引目录:
使用旧版本的lucene和庖丁分词
public static void main(String[] args) { try { Analyzer analyzer = new PaodingAnalyzer(); Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File("D:\\lucene\\docs\\海贼王.txt")),"UTF-8")); TokenStream stream = analyzer.tokenStream("", reader); Token token = null; while((token=stream.next())!=null){ System.out.println(token.termText()); } } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }