lucene4.7 入门例子。中文分词例子。

新建一个maven项目:
lucene4.7 入门例子。中文分词例子。_第1张图片

在eclipse插件里new一个也可以,不是maven项目也行,最下面是我新建项目的命令。
然后把项目导入到eclipse里
打开pom.xml 添加如下的依赖:

<properties>
        <lucene.version>4.7.2</lucene.version>
        <mmseg4j.version>1.9.1</mmseg4j.version>
  </properties>
  
  
  <dependencies>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-core</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-queryparser</artifactId>
      <version>${lucene.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.lucene</groupId>
      <artifactId>lucene-analyzers-common</artifactId>
      <version>${lucene.version}</version>
    </dependency>
  </dependencies>

然后刷新项目,或者mvn clean install. 然后就有lucene的jar包了。项目如下:

lucene4.7 入门例子。中文分词例子。_第2张图片

如果不是maven项目,可以去官网上下载lucene,解压后,手动的拷贝相应的jar包,添加到项目里,jar都在文件里不太好找。

然后我去找了最新版本的庖丁解牛分词器、IK Analyzer、MMseg4j 这几个分词器,几经尝试,都不支持 4.7 的 lucene,实在是很无奈啊。本来是来学习如何分词的,结果只好用过时的 lucene 版本。
然后照着官网写了一些lucene的入门例子,了解了一下lucene如何使用

public class Test {

    private static final String docLocation = "D:\\lucene\\docs";
    private static final String indexLocation = "D:\\lucene\\index";
    private static final String docLocation2 = "D:\\lucene\\otherDocs";
    private static final String indexLocation2 = "D:\\lucene\\otherIndex";

    public static void main(String[] args) {
        // 测试中文分词
        // testSeg();
        // 建立索引
         //testIndex();
        // 测试搜索
        // testSearch();
        //testSearchWithParser();
        //测试合并
        //testMergeIndex();
        //测试删除文档
        //testDeleteDoc();
    }

    /**
     * 测试中文分词
     */
    private static void testSeg() {
        Reader reader = null;
        try {
            Analyzer analyzer = new ChineseAnalyzer();
            reader = new FileReader("D:\\lucene\\docs\\海贼王.txt");
            TokenStream ts = analyzer.tokenStream("", reader);
            ts.reset();
            CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
            // 遍历分词数据
            while (ts.incrementToken()) {
                System.out.print(term.toString() + "|");
                // 我|是|要|成|为|海|贼|王|的|男|人|烧|烧|果|实|是|假|的|一|看|就|知|道|了| 这词分的真是一绝
            }
        } catch (FileNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    // TODO Auto-generated catch block
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * 测试建立索引
     */
    private static void testIndex() {
        final File docDir = new File(docLocation2);
        if (!docDir.exists() || !docDir.canRead()) {
            System.out.println("please check the path");
        }
        try {
            Directory dir = FSDirectory.open(new File(indexLocation2));
            Analyzer analyzer = new ChineseAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47,
                    analyzer);
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);// 如果已经有索引,就追加
            // iwc.setRAMBufferSizeMB(256.0);
            IndexWriter writer = new IndexWriter(dir, iwc);
            indexDocs(docDir, writer);
            // writer.forceMerge(1);//如果文档基本是静态的不怎么变,可以优化一下索引,这个过程比较耗时
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * 建立索引
     * 
     * @param docDir
     * @param writer
     * @throws IOException
     */
    private static void indexDocs(File docDir, IndexWriter writer)
            throws IOException {
        if (docDir.isDirectory()) {
            File[] files = docDir.listFiles();
            for (File file : files) {
                indexDocs(file, writer);
            }
        } else {
            FileInputStream fis = null;
            try {
                fis = new FileInputStream(docDir);
            } catch (FileNotFoundException e) {
                return;
            }

            try {
                Document doc = new Document();
                // 索引但不存,不分词
                Field fileName = new StringField("fileName", docDir.getName(),
                        Field.Store.YES);
                doc.add(fileName);
                // 索引但不存
                doc.add(new LongField("lastModified", docDir.lastModified(),
                        Field.Store.NO));
                doc.add(new TextField("content", new BufferedReader(
                        new InputStreamReader(fis, "UTF-8"))));
                if (OpenMode.CREATE.equals(writer.getConfig().getOpenMode())) {
                    writer.addDocument(doc);
                } else {
                    writer.updateDocument(
                            new Term("fileName", docDir.getName()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }

    /**
     * term查询
     */
    private static void testSearch() {
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory
                    .open(new File(indexLocation)));
            IndexSearcher searcher = new IndexSearcher(reader);
            // Term term = new Term("fileName","海贼王.txt");
            // Term term = new Term("content","海贼王");//查不出来
            Term term = new Term("content", "文");// 有
            Query query = new TermQuery(term);
            Query query2 = new TermQuery(new Term("content", "火"));
            BooleanQuery query3 = new BooleanQuery();
            query3.add(query, Occur.SHOULD);
            query3.add(query2, Occur.SHOULD);// 0.34144828 混淆.txt0.08536207
                                                // 海贼王.txt0.08536207 火影忍者.txt
            TopDocs top5 = searcher.search(query3, 5);
            ScoreDoc[] hits = top5.scoreDocs;
            for (ScoreDoc doc : hits) {
                System.out.println(doc.score + "  "
                        + searcher.doc(doc.doc).get("fileName"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * 用解析器查询
     */
    private static void testSearchWithParser() {
        try {
            IndexReader reader = DirectoryReader.open(FSDirectory
                    .open(new File(indexLocation)));
            IndexSearcher searcher = new IndexSearcher(reader);

            Analyzer analyzer = new ChineseAnalyzer();
            QueryParser parser = new QueryParser(Version.LUCENE_47, "content",
                    analyzer);

            Query query = parser.parse("海贼王 火影忍者");
            TopDocs top5 = searcher.search(query, 5);
            ScoreDoc[] hits = top5.scoreDocs;
            for (ScoreDoc doc : hits) {
                System.out.println(doc.score + "  "
                        + searcher.doc(doc.doc).get("fileName"));
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParseException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }

    /**
     * 测试合并索引
     */
    private static void testMergeIndex() {
        try {
            Directory dir = FSDirectory.open(new File(indexLocation));
            Analyzer analyzer = new ChineseAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47,analyzer);
            IndexWriter writer = new IndexWriter(dir, iwc);
            
            Directory dir2 = FSDirectory.open(new File(indexLocation2));
            writer.addIndexes(dir2);
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    /**
     * 删除一个索引
     */
    private static void testDeleteDoc(){
        try {
            Directory dir = FSDirectory.open(new File(indexLocation));
            Analyzer analyzer = new ChineseAnalyzer();
            IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47,analyzer);
            IndexWriter writer = new IndexWriter(dir, iwc);
            
            Term term = new Term("fileName","文章.txt");
            writer.deleteDocuments(term);
            writer.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

首先在 D 盘建立了一个专用目录,存了一些简单的 txt 文件,以文件名为 id,建立了索引。还测试了一下如何合并索引和按词条删除文档。
成功的话,会生成如下的索引目录:

lucene4.7 入门例子。中文分词例子。_第3张图片




使用旧版本的 lucene 和庖丁分词

新建一个项目
lucene4.7 入门例子。中文分词例子。_第4张图片

public static void main(String[] args) {
        try {
            Analyzer analyzer = new PaodingAnalyzer();
            Reader reader = new BufferedReader(new InputStreamReader(new FileInputStream(new File("D:\\lucene\\docs\\海贼王.txt")),"UTF-8"));
            TokenStream stream = analyzer.tokenStream("", reader);
            Token token = null;
            while((token=stream.next())!=null){
                System.out.println(token.termText());
            }
        } catch (UnsupportedEncodingException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (FileNotFoundException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }


你可能感兴趣的:(lucene4.7 入门例子。中文分词例子。)