LUCENE

/** * 使用IndexWriter对数据创建索引 * @throws IOException */ @Test public void testCreateIndex() throws IOException { // 索引存放的位置... Directory d = FSDirectory.open(new File("indexDir/")); // 索引写入的配置 Version matchVersion = Version.LUCENE_CURRENT;// lucene当前匹配的版本 Analyzer analyzer = new StandardAnalyzer(matchVersion);// 分词器 IndexWriterConfig conf = new IndexWriterConfig(matchVersion, analyzer); // 构建用于操作索引的类 IndexWriter indexWriter = new IndexWriter(d, conf); // 通过IndexWriter来创建索引 // 索引库里面的数据要遵守一定的结构（索引结构，document） Document doc = new Document(); /** * 1.字段的名称 2.该字段的值 3.字段在数据库中是否存储 * StringField是一体的 * TextField是可分的 */ IndexableField field = new IntField("id", 1, Store.YES); IndexableField title = new StringField("title", "java培训零基础开始从入门到精通", Store.YES); IndexableField content = new TextField( "content", "java培训，中软国际独创实训模式，三免一终身，学java多项保障让您无后顾之忧。中软国际java培训，全日制教学，真实项目实战，名企定制培训，四个月速成java工程师!", Store.YES); doc.add(field); doc.add(title); doc.add(content); // document里面也有很多字段 indexWriter.addDocument(doc); indexWriter.close(); } /** * 使用IndexSearcher对数据创建索引 * * @throws IOException */ @Test public void testSearcher() throws IOException { // 索引存放的位置... 
Directory d = FSDirectory.open(new File("indexDir/")); // 通过indexSearcher去检索索引目录 IndexReader indexReader = DirectoryReader.open(d); IndexSearcher indexSearcher = new IndexSearcher(indexReader); // 这是一个搜索条件，根据这个搜索条件我们来进行查找 // term是根据哪个字段进行检索，以及字段对应值 //================================================ //注意：这样是查询不出，只有单字才能查询出来 Query query = new TermQuery(new Term("content", "培训")); // 搜索先搜索索引目录 // 找到符合条件的前100条数据 TopDocs topDocs = indexSearcher.search(query, 100); System.out.println("总记录数：" + topDocs.totalHits); ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (ScoreDoc scoreDoc : scoreDocs) { //得分采用的是VSM算法 System.out.println("相关度得分：" + scoreDoc.score); //获取查询结果的文档的惟一编号，只有获取惟一编号，才能获取该编号对应的数据 int doc = scoreDoc.doc; //使用编号，获取真正的数据 Document document = indexSearcher.doc(doc); System.out.println(document.get("id")); System.out.println(document.get("title")); System.out.println(document.get("content")); } }

/** Directory holding the index files; opened once when the class is loaded. */
public static Directory d = null;
/**
 * Configuration built at class-load time. Kept for backward compatibility only:
 * {@link #getIndexWriter()} no longer passes it to a writer.
 */
public static IndexWriterConfig conf = null;
/** Lucene compatibility version shared by the analyzer and the writer configuration. */
public static Version matchVersion = null;
/** Analyzer shared by indexing and searching. */
public static Analyzer analyzer = null;

static {
    try {
        d = FSDirectory.open(new File(Constant.FILEURL));
        matchVersion = Version.LUCENE_44;
        // Note: this analyzer splits Chinese text into single characters.
        analyzer = new StandardAnalyzer(matchVersion);
        conf = new IndexWriterConfig(matchVersion, analyzer);
    } catch (IOException e) {
        // Fail fast: with the directory unopened every later call would only
        // fail with an unexplained NullPointerException.
        throw new ExceptionInInitializerError(e);
    }
}

/**
 * @return the Lucene compatibility version in use
 */
public static Version getMatchVersion() {
    return matchVersion;
}

/**
 * @return the shared analyzer
 */
public static Analyzer getAnalyzer() {
    return analyzer;
}

/**
 * Opens a new writer on the shared directory.
 *
 * <p>A fresh {@code IndexWriterConfig} is created for every writer: a
 * configuration instance can be attached to only one {@code IndexWriter},
 * so handing the shared {@link #conf} to a second writer fails.
 * The caller must close the returned writer.
 *
 * @return a writer for modifying the index
 * @throws IOException if the writer cannot be opened
 */
public static IndexWriter getIndexWriter() throws IOException {
    return new IndexWriter(d, new IndexWriterConfig(matchVersion, analyzer));
}

/**
 * Opens a new reader on the shared directory and wraps it in a searcher.
 *
 * <p>Every call opens a new {@code IndexReader}; the caller should close it
 * via {@code searcher.getIndexReader().close()} when done.
 *
 * @return a searcher for reading the index
 * @throws IOException if the index cannot be opened
 */
public static IndexSearcher getIndexSearcher() throws IOException {
    IndexReader r = DirectoryReader.open(d);
    return new IndexSearcher(r);
}

/** @return the article id */
public int getId() {
    return this.id;
}

/** @param id the article id */
public void setId(int id) {
    this.id = id;
}

/** @return the article title */
public String getTitle() {
    return this.title;
}

/** @param title the article title */
public void setTitle(String title) {
    this.title = title;
}

/** @return the article author */
public String getAuthor() {
    return this.author;
}

/** @param author the article author */
public void setAuthor(String author) {
    this.author = author;
}

/** @return the article body text */
public String getContent() {
    return this.content;
}

/** @param content the article body text */
public void setContent(String content) {
    this.content = content;
}

/** @return the article link */
public String getLink() {
    return this.link;
}

/** @param link the article link */
public void setLink(String link) {
    this.link = link;
}

document.add(idField);
document.add(authoField);
document.add(linkField);
document.add(titleField);
document.add(contentField);
return document;
}

/**
 * Converts a Lucene document back into an {@code Article}.
 *
 * <p>Reads the stored fields "id", "author", "link", "title" and "content";
 * the "id" value is parsed as an integer.
 *
 * @param document the document to read the stored fields from
 * @return a new article populated from the document
 */
public static Article documentToArticle(Document document) {
    Article result = new Article();

    String storedId = document.get("id");
    result.setId(Integer.parseInt(storedId));

    String storedAuthor = document.get("author");
    result.setAuthor(storedAuthor);

    String storedLink = document.get("link");
    result.setLink(storedLink);

    String storedTitle = document.get("title");
    result.setTitle(storedTitle);

    String storedContent = document.get("content");
    result.setContent(storedContent);

    return result;
}

}

/**
 * Deletes every indexed document that contains the given term.
 *
 * @param fieldName  name of the field to match
 * @param fieldValue exact term the field must contain
 * @throws IOException if the index cannot be modified
 */
public void deleteIndex(String fieldName, String fieldValue) throws IOException {
    IndexWriter indexWriter = LuceneUtils.getIndexWriter();
    try {
        // Delete by term.
        Term term = new Term(fieldName, fieldValue);
        indexWriter.deleteDocuments(term);
    } finally {
        // Always release the writer, even when the delete fails.
        indexWriter.close();
    }
}

/**
 * Replaces the documents matching a term with a new document: the matching
 * records are deleted first, then the new record is added.
 *
 * @param fieldName  name of the field to match
 * @param fieldValue exact term the field must contain
 * @param article    the article whose document replaces the matches
 * @throws IOException if the index cannot be modified
 */
public void updateIndex(String fieldName, String fieldValue, Article article) throws IOException {
    IndexWriter indexWriter = LuceneUtils.getIndexWriter();
    try {
        Term term = new Term(fieldName, fieldValue);
        Document doc = ArticleUtils.articleToDocument(article);
        // First argument: the update condition; second: the new content.
        indexWriter.updateDocument(term, doc);
    } finally {
        // Always release the writer, even when the update fails.
        indexWriter.close();
    }
}

/**
 * Searches the "title" and "content" fields and returns one page of results.
 *
 * @param keywords query string, parsed with the query parser syntax
 * @param start    zero-based index of the first hit to return
 * @param count    maximum number of hits to return
 * @return the articles on the requested page; an empty list when nothing
 *         matches or when the search fails (the failure is printed)
 */
public List<Article> findIndex(String keywords, int start, int count) {
    IndexSearcher indexSearcher = null;
    try {
        indexSearcher = LuceneUtils.getIndexSearcher();

        // Second kind of query: a parsed query string instead of a TermQuery.
        QueryParser queryParser = new MultiFieldQueryParser(
                LuceneUtils.getMatchVersion(),
                new String[] { "title", "content" },
                LuceneUtils.getAnalyzer());
        Query query = queryParser.parse(keywords);

        // Collect at least 100 hits as before, and more when the requested
        // page lies beyond the first 100.
        TopDocs topDocs = indexSearcher.search(query, Math.max(100, start + count));
        System.out.println("总记录数：" + topDocs.totalHits);

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        List<Article> list = new ArrayList<Article>();
        int min = Math.min(scoreDocs.length, start + count);
        for (int i = start; i < min; i++) {
            System.out.println("相关度得分：" + scoreDocs[i].score);
            // The hit only carries the document number; load the stored fields with it.
            Document document = indexSearcher.doc(scoreDocs[i].doc);
            list.add(ArticleUtils.documentToArticle(document));
        }
        return list;
    } catch (Exception e) {
        e.printStackTrace();
        return new ArrayList<Article>();
    } finally {
        // Each searcher owns a freshly opened reader; release it.
        if (indexSearcher != null) {
            try {
                indexSearcher.getIndexReader().close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}

/** Adds 26 identical sample articles (ids 0 to 25) to the index. */
@Test
public void addIndex() {
    for (int i = 0; i <= 25; i++) {
        Article article = new Article();
        article.setId(i);
        article.setTitle("腾讯qq");
        article.setAuthor("马化腾");
        article.setContent("腾讯网(www.QQ.com)是中国浏览量最大的中文门户网站,是腾讯公司推出的集新闻信息、互动社区、娱乐产品和基础服务为一体的大型综合门户网站。腾讯网服务于全球华人...");
        article.setLink("http://www.qq.com/");
        dao.addIndex(article);
    }
}

/** Searches for a keyword and prints the first page (up to 30 hits). */
@Test
public void findIndex() {
    String keywords = "第一";
    List<Article> list = dao.findIndex(keywords, 0, 30);
    for (Article article : list) {
        System.out.println(article.getId());
        System.out.println(article.getTitle());
        System.out.println(article.getContent());
        System.out.println(article.getAuthor());
        System.out.println(article.getLink());
    }
}

/**
 * Deletes the documents whose author term is "陈驰".
 *
 * @throws IOException propagated so that a failing delete fails the test
 *         instead of being printed and ignored
 */
@Test
public void deleteIndex() throws IOException {
    dao.deleteIndex("author", "陈驰");
}

/**
 * Replaces the documents whose title contains the term "qq" with a new article.
 *
 * @throws IOException propagated so that a failing update fails the test
 *         instead of being printed and ignored
 */
@Test
public void updateIndex() throws IOException {
    String fieldName = "title";
    String fieldValue = "qq";

    Article article = new Article();
    article.setId(1);
    article.setAuthor("陈驰");
    article.setLink("http://www.baidu.com");
    article.setTitle("天下第一");
    article.setContent("天下第一一一一一一");

    dao.updateIndex(fieldName, fieldValue, article);
}

Analyzer（分词器）的作用是按照一定的规则，把一段文本切分成它所包含的所有词。对应的是Analyzer类，这是一个抽象类，具体的切分规则由子类实现，所以对于不同的语言（规则），要使用不同的分词器。如下图：

public static void main(String[] args) { //单字分词器 //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44); //二分法分词器 //Analyzer analyzer = new CJKAnalyzer(Version.LUCENE_44); //第三方的中文分词器，庖丁分词，中文分词，特点：扩展新的词，自定义停用词 Analyzer analyzer = new IKAnalyzer(); String text = "腾讯网(www.QQ.com)是中国浏览量最大的中文门户网站,是腾讯公司推出的集新闻信息、互动社区、娱乐产品和基础服务为一体的大型综合门户网站。腾讯网服务于全球华人..."; try { testAnalyzer(analyzer, text); } catch (IOException e) { e.printStackTrace(); } } /** * 分词器的作用 * * @throws IOException */ public static void testAnalyzer(Analyzer analyzer, String text) throws IOException { System.out.println("当前使用的分词器：" + analyzer.getClass().getSimpleName()); TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(text)); tokenStream.addAttribute(CharTermAttribute.class); //这里不写这一句，会报空指针异常 tokenStream.reset(); while (tokenStream.incrementToken()) { CharTermAttribute charTermAttribute = tokenStream .getAttribute(CharTermAttribute.class); System.out.println(new String(charTermAttribute.toString())); } }

// testQuery(); /** * 第一种查询，TermQuery * 这是关键字查询 * 如果按照author查，因为author没有分词，所以查"马化腾"可以查询出来 * 如果按照content查，因为content分词了，如果是单字分词器，只能通过某一个字查出来，比如"中" */ //Query query=new TermQuery(new Term("content","中")); /** * 第二种查询：字符串搜索.. * 使用查询字符串：QueryParser+ MultiFieldQueryParser的查询方式 * 1、QueryParser：只在一个字段中查询 * 2、MultiFieldQueryParser：可以在多个字段查询 * 用来查询可以分词的字段，只要你输入的一段文本中包含分词，就会检索出来 */

/** * 第三种查询：查询所有.. */ // Query query=new MatchAllDocsQuery(); /** * 第四种查询：范围查询，可以使用此查询来替代过滤器... */ // 我们完成一种需求有两种方式，我们推荐用这种...性能比filter要高 // Query query=NumericRangeQuery.newIntRange("id", 1, 10, true, true); /** * 第五种查询：通配符。。。 */ // ?代表单个任意字符，* 代表多个任意字符... // Query query=new WildcardQuery(new Term("title", "luce*")); /** * 第六种查询：模糊查询..。。。 */ // author String /* * 1:需要根据查询的条件 * * 2:最大可编辑数取值范围0,1,2 允许我的查询条件的值，可以错误（或缺少）几个字符... * */ // Query query = new FuzzyQuery(new Term("author", "爱新觉罗杜小"), 1); /** * * 第七种查询:短语查询 * */ // PhraseQuery query=new PhraseQuery(); // //(1)直接指定角标... // // query.add(new Term("title","solr"),0); // // query.add(new Term("title","全"),8); // (2)设置两个短语之间的最大间隔数... // //设置间隔数范围越大，它被匹配的结果就越多，性能也就越慢.. // query.add(new Term("title","solr")); // query.add(new Term("title","全")); // query.setSlop(18); // 第八种查询:布尔查询 BooleanQuery query = new BooleanQuery(); // id 1~10 Query query1 = NumericRangeQuery.newIntRange("id", 1, 10, true, true); Query query2 = NumericRangeQuery.newIntRange("id", 5, 15, true, true); // select * from table where title=? or content=? // 必须满足第一个条件... query.add(query1, Occur.MUST); // 可以满足第二个条件 query.add(query2, Occur.SHOULD); testQuery(query); } public static void testQuery(Query query) throws Exception { IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher(); TopDocs topDocs = indexSearcher.search(query, 100); System.out.println("总记录数：" + topDocs.totalHits); for (ScoreDoc scoreDoc : topDocs.scoreDocs) { Document document = indexSearcher.doc(scoreDoc.doc); System.out.println(document.get("id")); System.out.println(document.get("title")); System.out.println(document.get("content")); System.out.println(document.get("author")); System.out.println(document.get("link")); } }

//条件三（文件名称和文件描述） if(StringUtils.isNotBlank(queryString)){ //多个字段进行检索的时候，查询使用QueryPaser //要是直接new QueryParser()，也可以，但是只能查询一个字段 QueryParser queryParser = new MultiFieldQueryParser(Version.LUCENE_CURRENT,new String[]{"fileName","comment"},Configuration.getAnalyzer()); Query query3 = queryParser.parse(queryString); query.add(query3, Occur.MUST);//相当与sql语句的and }

public static void main(String[] args) { String keywords = "中华人民共和国"; List

list = findIndex(keywords, 0, 10); for (Article article : list) { System.out.println(article.getId()); System.out.println(article.getTitle()); System.out.println(article.getContent()); System.out.println(article.getAuthor()); System.out.println(article.getLink()); } } /** * 查询是通过IndexSearch提供的(分页) */ public static List

findIndex(String keywords, int start, int count) { Analyzer analyzer = new IKAnalyzer(); try { IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher(); // =========================================================== // 这里是第二种query方式，不是termQuery QueryParser queryParser = new MultiFieldQueryParser( LuceneUtils.getMatchVersion(), new String[] { "title", "content" }, analyzer); Query query = queryParser.parse(keywords); TopDocs topDocs = indexSearcher.search(query, 100); System.out.println("总记录数：" + topDocs.totalHits); /** * 添加设置文字高亮begin 使用lucene自带的高亮器进行高亮显示 */ // html页面高亮显示的格式化，默认是 Formatter formatter = new SimpleHTMLFormatter( "", ""); // 执行查询条件，因为高亮的值就是查询条件 Scorer scorer = new QueryScorer(query); Highlighter highlighter = new Highlighter(formatter, scorer); // 设置文字摘要，此时摘要大小 int fragmentSize = 100; Fragmenter fragmenter = new SimpleFragmenter(fragmentSize); highlighter.setTextFragmenter(fragmenter); /** 添加设置文字高亮end */ // 表示返回的结果集 ScoreDoc[] scoreDocs = topDocs.scoreDocs; List

list = new ArrayList

(); int min = Math.min(scoreDocs.length, start + count); for (int i = start; i < min; i++) { //System.out.println("相关度得分：" + scoreDocs[i].score); // 获取查询结果的文档的惟一编号，只有获取惟一编号，才能获取该编号对应的数据 int doc = scoreDocs[i].doc; // 使用编号，获取真正的数据 Document document = indexSearcher.doc(doc); /** 获取文字高亮的信息 begin */ // 获取文字的高亮，一次只能获取一个字段高亮的结果，如果获取不到，返回null值 // 高亮之后的title // 注意：如果这个字段当中没有包含搜索关键字，你对这个字段的值进行高亮，返回的是null String title = highlighter.getBestFragment( analyzer, "title", document.get("title")); // 如果null表示没有高亮的结果，如果高亮的结果，应该将原值返回 if (title == null) { title = document.get("title"); if (title != null && title.length() > fragmentSize) { // 截串，从0开始 title = title.substring(0, fragmentSize); } } System.out.println("-------title:" + title); // 高亮之后的content // 注意：如果这个字段当中没有包含搜索关键字，你对这个字段的值进行高亮，返回的是null String content = highlighter.getBestFragment( analyzer, "content", document.get("content")); // 如果null表示没有高亮的结果，如果高亮的结果，应该将原值返回 if (content == null) { content = document.get("content"); if (content != null && content.length() > fragmentSize) { // 截串，从0开始 content = content.substring(0, fragmentSize); } } System.out.println("--------content:" + content); /** 获取文字高亮的信息 end */ Article article = new Article(); article.setId(Integer.parseInt(document.get("id"))); article.setAuthor(document.get("author")); article.setLink(document.get("link")); article.setTitle(title);//高亮之后的 article.setContent(content);//高亮之后的 list.add(article); } return list; } catch (Exception e) { e.printStackTrace(); } return null; }

Directory d = FSDirectory.open(new File(Constant.FILEURL)); IndexWriterConfig conf = new IndexWriterConfig( LuceneUtils.getMatchVersion(), LuceneUtils.getAnalyzer()); // 在lucene里面是0配置的 // 通过设置对象的参数来进行配置 LogDocMergePolicy mergePolicy = new LogDocMergePolicy(); /** * * 1:mergeFactor * * 当这个值越小，更少的内存被运用在创建索引的时候，搜索的时候越快，创建索引的时候越慢.. * * 当这个值越大，更多的内存被运用在创建索引的时候，搜索的时候越慢，创建的时候越快... * * * smaller value 2 < smaller value <10 * */ // 设置索引的合并因子... mergePolicy.setMergeFactor(6); conf.setMergePolicy(mergePolicy); IndexWriter indexWriter = new IndexWriter(d, conf); }

数据库的表==>记录==>字段。所以很多传统应用中的文件、数据库等数据，都可以比较方便地映射到Lucene的存储结构/接口中。总体上看：可以先把Lucene当成一个支持全文索引的数据库系统。 Lucene的索引存储位置由一个抽象类（Directory）表示，因此可以有各种各样的实际存储方式（由不同的子类实现），比如存到文件系统中、存在内存中、存在数据库中等等。

// 索引在硬盘里面... Directory directory1 = FSDirectory.open(new File(Constant.FILEURL)); IOContext ioContext = new IOContext(); // 索引放在内存当中... Directory directory = new RAMDirectory(directory1, ioContext); IndexReader indexReader = DirectoryReader.open(directory); IndexSearcher indexSearcher = new IndexSearcher(indexReader); String fields[] = { "title" }; QueryParser queryParser = new MultiFieldQueryParser( LuceneUtils.getMatchVersion(), fields, LuceneUtils.getAnalyzer()); // 不同的规则构造不同的子类.. // title:keywords ，content:keywords Query query = queryParser.parse("抑郁症"); TopDocs topDocs = indexSearcher.search(query, 100); System.out.println(topDocs.totalHits); }

LUCENE

你可能感兴趣的:(JAVAWEB,框架)