import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.html.dom.HTMLDocumentImpl; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.search.*; import org.apache.lucene.search.highlight.Highlighter; import org.apache.lucene.search.highlight.QueryScorer; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.cyberneko.html.parsers.DOMFragmentParser; import org.springframework.stereotype.Service; import org.w3c.dom.DocumentFragment; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.wltea.analyzer.lucene.IKAnalyzer; import org.wltea.analyzer.lucene.IKQueryParser; import org.wltea.analyzer.lucene.IKSimilarity; import org.xml.sax.InputSource; import javax.annotation.Resource; import java.io.*; import java.util.*; /** * Created with IntelliJ IDEA. * User: R * Date: 12-12-20 * Time: 上午10:05 * To change this template use File | Settings | File Templates. */ @Service public class SiteSearchService implements ISiteSearchService { private static final Log logger = LogFactory.getLog(SiteSearchService.class); @Resource(name = "resourceDAO") private ResourceDAO resourceDAO; private static String index_FILE_PATH = "d:/indexDir"; private static int MAXNUM = 100; private static Version VERSION = Version.LUCENE_35; private Analyzer analyzer = new IKAnalyzer(); private DOMFragmentParser parser = new DOMFragmentParser(); public void createSiteIndex(){ logger.info("--------------lucene-----------createSiteIndex[start:"+(new Date()).toString()+"]"); File file = new File(index_FILE_PATH); Directory directory = null; IndexWriter iwriter = null; try{ if (!file.exists()) { logger.info("---------lucene--------filecreate------filepath:"+index_FILE_PATH); List<Resource> resourceList = resourceDAO.getResourceList(); if(resourceList!=null){ // 建立内存索引对象 directory = FSDirectory.open(file); IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer); config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); //使用IK中文分词器建立索引 iwriter = new IndexWriter(directory, config); for(Resource tempResource : resourceList){ iwriter.addDocument(addtoDoc(tempResource)); } } } }catch(Exception e){ logger.error(e); }finally { if (iwriter != null) { try { iwriter.close(); } catch (IOException e) { logger.error(e); } } if (directory != null) { try { directory.close(); } catch (IOException e) { logger.error(e); } } } logger.info("--------------lucene-----------createSiteIndex[end:"+(new Date()).toString()+"]"); } private Document addtoDoc(ResouceIndexData tempsiteResource){ Document doc = new Document(); //Field.Index.NO 表示不索引 //Field.Index.ANALYZED 表示分词且索引 //Field.Index.NOT_ANALYZED 表示不分词且索引 doc.add(new Field("id", String.valueOf(tempsiteResource.getId()), Field.Store.YES, Field.Index.NOT_ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("siteid", tempsiteResource.getSiteid()==null?"":tempsiteResource.getSiteid(), Field.Store.YES, Field.Index.NOT_ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("typeid", tempsiteResource.getTypeid()==null?"":tempsiteResource.getTypeid(), Field.Store.YES, Field.Index.NOT_ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("price", tempsiteResource.getPrice()==null?"":tempsiteResource.getPrice(), Field.Store.YES, Field.Index.NOT_ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("pictureUrl", tempsiteResource.getPictureUrl()==null?"":tempsiteResource.getPictureUrl(), Field.Store.YES, Field.Index.NOT_ANALYZED,Field.TermVector.WITH_POSITIONS_OFFSETS)); doc.add(new Field("status", tempsiteResource.getStatus()==null?"":tempsiteResource.getStatus(), Field.Store.YES, Field.Index.NOT_ANALYZED)); doc.add(new Field("title", tempsiteResource.getTitle()==null?"":tempsiteResource.getTitle(), Field.Store.YES, Field.Index.ANALYZED)); String content = ""; if(tempsiteResource.getContent()!=null){ try{ content = "<div>"+tempsiteResource.getContent()+"</div>"; //创建文件片段对象 DocumentFragment node = new HTMLDocumentImpl().createDocumentFragment(); InputStream inputStream = new ByteArrayInputStream(content.getBytes("UTF-8")); parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "utf-8"); //解析HTML内容 parser.parse(new InputSource(inputStream),node); StringBuffer sb = new StringBuffer(); getText(sb,node); content = sb.toString(); }catch(Exception e){ e.printStackTrace(); } } doc.add(new Field("content", content, Field.Store.YES, Field.Index.ANALYZED)); doc.add(new Field("description", tempsiteResource.getDescription()==null?"":tempsiteResource.getDescription(), Field.Store.YES, Field.Index.ANALYZED)); return doc; } private void getText(StringBuffer sb, Node node) { if (node.getNodeType() == Node.TEXT_NODE) { sb.append(node.getNodeValue());//取得结点值,即开始与结束标签之间的信息 } NodeList children = node.getChildNodes(); if ( children != null ) { int len = children.getLength(); for ( int i = 0; i < len; i++ ) { getText(sb, children.item(i));//递归遍历DOM树 } } } public void addDoc(ResouceIndexData tempsiteResource){ try{ File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_35, analyzer); IndexWriter writer = new IndexWriter(directory, writerConfig); Document doc = addtoDoc(tempsiteResource); writer.addDocument(doc); writer.close(); }catch (Exception e){ logger.error(e); } } public void updateDoc(ResouceIndexData tempsiteResource) { try { File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_35, analyzer); IndexWriter writer = new IndexWriter(directory, writerConfig); Document doc = addtoDoc(tempsiteResource); Term term = new Term("id", String.valueOf(tempsiteResource.getId())); writer.updateDocument(term, doc); writer.close(); } catch (Exception e) { logger.error(e); } } public void deleteDoc(int id) { try { File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_35, analyzer); IndexWriter writer = new IndexWriter(directory, writerConfig); Term term = new Term("id", String.valueOf(id)); writer.deleteDocuments(term); writer.close(); } catch (Exception e) { logger.error(e); } } @Override public Resouce getDoc(int id) { Resouce resource = null; IndexSearcher isearcher = null; IndexReader ireader = null; try{ File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); ireader = IndexReader.open(directory); isearcher = new IndexSearcher(ireader); Term term = new Term("id", String.valueOf(id)); Query query = new TermQuery(term); TopDocs docs = isearcher.search(query,1); if(docs.totalHits>0){ Document targetDoc = isearcher.doc(docs.scoreDocs[0].doc); siteResource.setId(targetDoc.get("id")); siteResource.setSiteid(targetDoc.get("siteid")); siteResource.setTypeid(targetDoc.get("typeid")); siteResource.setPrice(targetDoc.get("price")); siteResource.setPictureUrl(targetDoc.get("pictureUrl")); siteResource.setDescription(targetDoc.get("description")); siteResource.setTitle(targetDoc.get("title")); siteResource.setContent(targetDoc.get("content")); } }catch (Exception e) { logger.error(e); } return siteResource; //To change body of implemented methods use File | Settings | File Templates. } public List<Resouce> search(String queryWord,String siteid,String typeid,int startPage,int pageSize){ List<Resouce> siteResourceList = new ArrayList<Resouce>(); IndexSearcher isearcher = null; IndexReader ireader = null; try{ File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); ireader = IndexReader.open(directory); isearcher = new IndexSearcher(ireader); isearcher.setSimilarity(new IKSimilarity());//在索引器中使用IKSimilarity相似度评估器 BooleanQuery query = new BooleanQuery(); String[] keys = {queryWord,queryWord,queryWord}; String[] fields = {"title","content","description"}; Query ikquery = IKQueryParser.parseMultiField(fields, keys); Query siteidQuery = new TermQuery(new Term("siteid",siteid)); Query typeidQuery = new TermQuery(new Term("typeid",typeid)); query.add(siteidQuery, BooleanClause.Occur.MUST); query.add(typeidQuery, BooleanClause.Occur.MUST); query.add(ikquery, BooleanClause.Occur.MUST); logger.info("---------lucene------queryword:"+query.toString()); Sort sort = new Sort(new SortField("id",SortField.STRING,true));//根据资源ID倒排序 logger.info("---------lucene------sort:"+sort.toString()); //关键字高亮显示的html标签 SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter("<font color='red'>","</font>"); Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); TopDocs topDocs = isearcher.search(query, null,MAXNUM,sort); int allcount = topDocs.totalHits; logger.info("---------lucene------search:"+allcount); //分页取出指定的doc(开始条数, 取几条) int startRow = startPage-1<0?0:(startPage-1)*pageSize; int endRow = startPage*pageSize>allcount?allcount:startPage*pageSize; ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (int i = startRow; i < endRow; i++) { Resouce resource = new Resouce(); Document targetDoc = isearcher.doc(scoreDocs[i].doc); //标题增加高亮显示 TokenStream tokenStream1 = analyzer.tokenStream("title", new StringReader(targetDoc.get("title"))); String titledata = highlighter.getBestFragment(tokenStream1, targetDoc.get("title")); if(titledata==null){ titledata = targetDoc.get("title"); } //内容增加高亮显示 TokenStream tokenStream2 = analyzer.tokenStream("content", new StringReader(targetDoc.get("content"))); String contentdata = highlighter.getBestFragment(tokenStream2, targetDoc.get("content")); if(contentdata==null){ contentdata = targetDoc.get("content"); } //简述增加高亮显示 TokenStream tokenStream3 = analyzer.tokenStream("description", new StringReader(targetDoc.get("description"))); String descriptiondata = highlighter.getBestFragment(tokenStream3, targetDoc.get("description")); if(descriptiondata==null){ descriptiondata = targetDoc.get("description"); } String iddata = targetDoc.get("id"); //将资源重装 resource.setId(iddata); resource.setSiteid(targetDoc.get("siteid")); resource.setTypeid(targetDoc.get("typeid")); resource.setPrice(targetDoc.get("price")); resource.setPictureUrl(targetDoc.get("pictureUrl")); resource.setTitle(titledata); resource.setContent(contentdata); resource.setDescription(descriptiondata); resourceList.add(resource); } }catch (Exception e){ logger.error(e); }finally { if (isearcher != null) { try { isearcher.close(); } catch (IOException e) { logger.error(e); } } if (ireader != null) { try { ireader.close(); } catch (IOException e) { logger.error(e); } } } return siteResourceList; } @Override public String searchCount(String queryWord, String siteid, String typeid) { String allcount = "0"; IndexSearcher isearcher = null; IndexReader ireader = null; try{ File file = new File(index_FILE_PATH); if(!file.exists()){ createSiteIndex(); } Directory directory = FSDirectory.open(file); ireader = IndexReader.open(directory); isearcher = new IndexSearcher(ireader); isearcher.setSimilarity(new IKSimilarity());//在索引器中使用IKSimilarity相似度评估器 BooleanQuery query = new BooleanQuery(); String[] keys = {queryWord,queryWord,queryWord}; String[] fields = {"title","content","description"}; Query ikquery = IKQueryParser.parseMultiField(fields, keys); Query siteidQuery = new TermQuery(new Term("siteid",siteid)); Query typeidQuery = new TermQuery(new Term("typeid",typeid)); query.add(siteidQuery, BooleanClause.Occur.MUST); query.add(typeidQuery, BooleanClause.Occur.MUST); query.add(ikquery, BooleanClause.Occur.MUST); logger.info("---------lucene--count----queryword:"+query.toString()); Sort sort = new Sort(new SortField("id",SortField.STRING,true));//根据资源ID倒排序 logger.info("---------lucene------sort:"+sort.toString()); TopDocs topDocs = isearcher.search(query, null,MAXNUM,sort); allcount = String.valueOf(topDocs.totalHits); logger.info("---------lucene--count----search:"+allcount); }catch (Exception e) { logger.error(e); }finally { if (isearcher != null) { try { isearcher.close(); } catch (IOException e) { logger.error(e); } } if (ireader != null) { try { ireader.close(); } catch (IOException e) { logger.error(e); } } } return allcount; //To change body of implemented methods use File | Settings | File Templates. } }