idea+lucene

Lucene是一套用于全文检索和搜索的开放源代码程序库,由Apache软件基金会支持和提供。Lucene提供了一个简单却强大的应用程序接口,能够做全文索引和搜索,在Java开发环境里Lucene是一个成熟的免费开放源代码工具;就其本身而论,Lucene是目前以及近几年最受欢迎的免费Java信息检索程序库。

1.pom.xml

        
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>4.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>4.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-queryparser</artifactId>
            <version>4.6.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-codecs</artifactId>
            <version>4.6.1</version>
        </dependency>
        
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.12</version>
        </dependency>

2、testLucene类


    // 1. Location of the directory that stores the index. An index can be kept in memory
    //    or on disk; for an in-memory index use: Directory directory = new RAMDirectory();
    // The path is taken from the class loader's "" resource — presumably the classpath
    // root (e.g. the compiled test-classes folder); confirm for your build layout.
    // NOTE(review): URL.getFile() may return a percent-encoded path (e.g. "%20" for a
    // space), so a project path containing spaces could resolve incorrectly — confirm.
    // Alternative with a fixed location: new File("F:\\lucene\\index")
    File indexDir = new File(this.getClass().getClassLoader().getResource("").getFile());

    /**
     * Builds the on-disk index under {@code indexDir} with four sample book documents.
     *
     * <p>The index is opened in {@code CREATE} mode, so each run replaces any existing
     * index instead of appending duplicate documents to it. The writer, analyzer and
     * directory are released even when adding a document fails.
     *
     * @throws IOException if the index directory cannot be opened or written
     */
    @Test
    public void createIndex() throws IOException {
        // For an in-memory index use instead: Directory index = new RAMDirectory();
        Directory index = FSDirectory.open(indexDir);
        // 0. Specify the analyzer for tokenizing text.
        //    The same analyzer should be used for indexing and searching.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        try {
            // 2.1 Create the IndexWriterConfig, specifying the analyzer version.
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
            // The default CREATE_OR_APPEND mode would add the same four documents again
            // on every run; CREATE keeps the index contents reproducible.
            config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);

            // 1. Create the index.
            IndexWriter w = new IndexWriter(index, config);
            try {
                addDoc(w, "Lucene in Action", "193398817");
                addDoc(w, "Lucene for Dummies", "55320055Z");
                addDoc(w, "Managing Gigabytes", "55063554A");
                addDoc(w, "The Art of Computer Science", "9900333X");
            } finally {
                // Always close the writer so the index write lock is released.
                w.close();
            }
        } finally {
            analyzer.close();
            index.close();
        }
    }

    /**
     * Adds one book document, made of a title and an ISBN, to the given index writer.
     *
     * @param w     writer that receives the document
     * @param title book title; stored and indexed as a tokenized text field
     * @param isbn  book ISBN; stored and indexed as a single untokenized string field
     * @throws IOException if the writer fails to add the document
     */
    private void addDoc(IndexWriter w, String title, String isbn) throws IOException {
        TextField titleField = new TextField("title", title, Field.Store.YES);
        // A StringField is used for the ISBN because it must not be tokenized.
        StringField isbnField = new StringField("isbn", isbn, Field.Store.YES);

        Document book = new Document();
        book.add(titleField);
        book.add(isbnField);
        w.addDocument(book);

        /* Alternative (not used here): index every file of a folder instead.
        File docDirectory = new File("F:\\lucene\\example");
        for (File file : docDirectory.listFiles()) {
            doc = new Document();

            // create the searchable fields and state whether each one is tokenized
            doc.add(new TextField("content", new FileReader(file)));
            doc.add(new StringField("filename", file.getName(), Store.YES));
            doc.add(new StringField("path", file.getAbsolutePath(), Store.YES));
            // write the document
            writer.addDocument(doc);
        }*/
    }

    /**
     * Searches the on-disk index under {@code indexDir} for the term "lucene" in the
     * "title" field and prints the ISBN and title of every hit to standard output.
     *
     * <p>A query that cannot be parsed fails the test immediately instead of being
     * logged and then surfacing later as a {@code NullPointerException}. The analyzer,
     * reader and directory are released even when the search fails.
     *
     * @throws IOException if the index cannot be opened or read, or if the query string
     *                     cannot be parsed (the parse failure is attached as the cause)
     */
    @Test
    public void search() throws IOException {
        // 2. query
        String querystr = "lucene"; // the text to search for

        // The "title" argument specifies the default field to use
        // when no field is explicitly specified in the query.
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        Query q;
        try {
            // Create the query used for searching and name the field to search in.
            q = new QueryParser(Version.LUCENE_46, "title", analyzer).parse(querystr);
        } catch (org.apache.lucene.queryparser.classic.ParseException e) {
            // Keep the declared signature while preserving the original cause.
            throw new IOException("Cannot parse query: " + querystr, e);
        } finally {
            analyzer.close();
        }

        // 3. search
        int hitsPerPage = 10;
        // Open the folder that holds the index,
        // e.g. FSDirectory.open(new File("F:\\lucene\\index")).
        Directory index = FSDirectory.open(indexDir);
        try {
            // Create the IndexReader, then the IndexSearcher on top of it.
            IndexReader reader = DirectoryReader.open(index);
            try {
                IndexSearcher searcher = new IndexSearcher(reader);
                TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true);
                searcher.search(q, collector);
                ScoreDoc[] hits = collector.topDocs().scoreDocs;

                // 4. display results
                System.out.println("Found " + hits.length + " hits.");
                for (int i = 0; i < hits.length; ++i) {
                    int docId = hits[i].doc;
                    Document d = searcher.doc(docId);
                    System.out.println((i + 1) + ". " + d.get("isbn") + "\t" + d.get("title"));
                }
            } finally {
                // The reader can only be closed when there is no need
                // to access the documents any more.
                reader.close();
            }
        } finally {
            index.close();
        }
        // Expected hits according to the original notes:
        // 1. 193398817  Lucene in Action
        // 2. 55320055Z  Lucene for Dummies
    }
    /**
     * Tokenization demo: runs a sample sentence through a {@code SimpleAnalyzer} and
     * prints each produced term on its own line.
     *
     * @throws IOException if the token stream cannot be created or read
     */
    @Test
    public void cutWords() throws IOException {
        // Other analyzers to try here: StandardAnalyzer or CJKAnalyzer,
        // both constructed with Version.LUCENE_46.
        SimpleAnalyzer analyzer = new SimpleAnalyzer(Version.LUCENE_46);
        String text = "Spark是当前最流行的开源大数据内存计算框架,采用Scala语言实现,由UC伯克利大学AMPLab实验室开发并于2010年开源。";
        StringReader input = new StringReader(text);
        TokenStream tokens = analyzer.tokenStream("content", input);
        CharTermAttribute term = tokens.addAttribute(CharTermAttribute.class);
        try {
            tokens.reset();
            // Pull terms one by one until the stream reports that it is exhausted.
            for (boolean more = tokens.incrementToken(); more; more = tokens.incrementToken()) {
                String word = term.toString();
                System.out.println(word);
            }
            tokens.end();
        } finally {
            tokens.close();
            analyzer.close();
        }
    }
    //答案(SimpleAnalyzer 只按非字母字符切分并转为小写,因此标点和数字"2010"不会出现在结果中)
    spark是当前最流行的开源大数据内存计算框架
    采用scala语言实现
    由uc伯克利大学amplab实验室开发并于
    年开源

你可能感兴趣的:(idea+lucene)