HTML 解析器
package com.rain.util;
import Java.io.FileInputStream;
import Java.io.FileNotFoundException;
import Java.io.IOException;
import Java.io.InputStream;
import Java.io.InputStreamReader;
import Java.io.Reader;
import Java.io.UnsupportedEncodingException;
import org.apache.lucene.demo.html.HTMLParser;
/**
 * Wraps the Lucene demo {@code HTMLParser} for one HTML file on disk and
 * exposes the file's path, title and content reader.
 *
 * <p>Failures are reported with {@code printStackTrace()} and surface to the
 * caller as an empty title or a {@code null} content reader rather than as
 * exceptions.
 */
public class HTMLDocParser {
    private String htmlPath;
    private HTMLParser htmlParser;

    /**
     * Remembers the path and immediately tries to open the file for parsing.
     *
     * @param htmlPath path of the HTML file to parse
     */
    public HTMLDocParser(String htmlPath) {
        this.htmlPath = htmlPath;
        initHtmlParser();
    }

    /**
     * Opens the file at {@code htmlPath} as UTF-8 and hands it to a new
     * {@code HTMLParser}. If the path is {@code null} or the file cannot be
     * opened, the parser stays unset and the getters return their fallbacks.
     */
    public void initHtmlParser() {
        if (null == htmlPath) {
            // new FileInputStream((String) null) would throw a NullPointerException
            // out of the constructor; treat it like a missing file instead.
            return;
        }
        InputStream inputStream = null;
        try {
            inputStream = new FileInputStream(htmlPath);
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        }
        if (null == inputStream) {
            return;
        }
        // NOTE(review): the stream is not closed on the success path; presumably
        // HTMLParser reads from it when the title/content are requested - confirm
        // who is responsible for closing it.
        try {
            htmlParser = new HTMLParser(new InputStreamReader(inputStream, "utf-8"));
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            // No parser took ownership of the stream, so release it here.
            try {
                inputStream.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }

    /**
     * Returns the title reported by the parser.
     *
     * @return the parsed title, or an empty string when no parser is available
     *         or the parser failed
     */
    public String getTitle() {
        if (null != htmlParser) {
            try {
                return htmlParser.getTitle();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (InterruptedException e) {
                e.printStackTrace();
                // Keep the interrupt visible to the caller's thread instead of
                // silently clearing it.
                Thread.currentThread().interrupt();
            }
        }
        return "";
    }

    /**
     * Returns a reader over the parsed content.
     *
     * @return the parser's reader, or {@code null} when no parser is available
     *         or the parser failed
     */
    public Reader getContent() {
        if (null != htmlParser) {
            try {
                return htmlParser.getReader();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * @return the path this parser was created with
     */
    public String getPath() {
        return this.htmlPath;
    }
}
描述搜索结果的结构实体Bean
package com.rain.search;
/**
 * Value holder for a single search hit: the path of the matching HTML file
 * and the title to display for it. Both properties start out {@code null}.
 */
public class SearchResultBean {

    private String htmlTitle;
    private String htmlPath;

    /** @return the title of the matching document, or {@code null} if unset */
    public String getHtmlTitle() {
        return this.htmlTitle;
    }

    /** @param htmlTitle the title of the matching document */
    public void setHtmlTitle(final String htmlTitle) {
        this.htmlTitle = htmlTitle;
    }

    /** @return the path of the matching document, or {@code null} if unset */
    public String getHtmlPath() {
        return this.htmlPath;
    }

    /** @param htmlPath the path of the matching document */
    public void setHtmlPath(final String htmlPath) {
        this.htmlPath = htmlPath;
    }
}
索引子系统的实现
package com.rain.index;
import Java.io.File;
import Java.io.IOException;
import Java.io.Reader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.document.Field;
import com.rain.util.HTMLDocParser;
/**
 * Builds the Lucene index for the HTML files found directly inside a fixed
 * data directory and reports whether an index is already present.
 */
public class IndexManager {
    //the directory that stores HTML files
    private final String dataDir = "E:\\dataDir";
    //the directory that is used to store a Lucene index
    private final String indexDir = "E:\\indexDir";

    /**
     * Creates the index unless one already exists.
     *
     * @return {@code true} if an index already existed or was built;
     *         {@code false} if the data directory cannot be listed
     * @throws IOException if the index cannot be created or written
     */
    public boolean creatIndex() throws IOException {
        if (inIndexExist()) {
            return true;
        }
        // listFiles() yields null when the path is missing or is not a
        // directory, so this single check covers both cases.
        File[] htmls = new File(dataDir).listFiles();
        if (null == htmls) {
            return false;
        }
        Directory fsDirectory = FSDirectory.getDirectory(indexDir, true);
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriter indexWriter = new IndexWriter(fsDirectory, analyzer, true);
        try {
            for (int i = 0; i < htmls.length; i++) {
                String htmlPath = htmls[i].getAbsolutePath();
                // Match on the full extension including the dot so that names
                // which merely end in the letters "htm" are not indexed.
                if (htmlPath.endsWith(".html") || htmlPath.endsWith(".htm")) {
                    addDocument(htmlPath, indexWriter);
                }
            }
            indexWriter.optimize();
        } finally {
            // Release the writer even when indexing fails part-way through.
            indexWriter.close();
        }
        return true;
    }

    /**
     * Parses one HTML file and adds it to the index with three fields:
     * {@code path} (stored, not indexed), {@code title} (stored, tokenized)
     * and {@code content} (supplied as a reader).
     *
     * <p>A file whose content reader is unavailable is skipped; a write
     * failure is printed and otherwise ignored.
     *
     * @param htmlPath    path of the HTML file to index
     * @param indexWriter open writer that receives the document
     */
    public void addDocument(String htmlPath, IndexWriter indexWriter) {
        HTMLDocParser htmlParser = new HTMLDocParser(htmlPath);
        Reader content = htmlParser.getContent();
        if (null == content) {
            // getContent() returns null when the file could not be opened or
            // parsed; there is nothing searchable to add for such a file.
            return;
        }
        String path = htmlParser.getPath();
        String title = htmlParser.getTitle();
        if (null == title) {
            title = "";
        }
        Document document = new Document();
        document.add(new Field("path", path, Field.Store.YES, Field.Index.NO));
        document.add(new Field("title", title, Field.Store.YES, Field.Index.TOKENIZED));
        document.add(new Field("content", content));
        try {
            indexWriter.addDocument(document);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * @return the directory that stores the HTML files
     */
    public String getDataDir() {
        return this.dataDir;
    }

    /**
     * @return the directory that stores the Lucene index
     */
    public String getIndexDir() {
        return this.indexDir;
    }

    /**
     * Reports whether the index directory contains at least one entry.
     *
     * @return {@code true} if the index directory exists and is non-empty;
     *         {@code false} if it is empty, missing, or not a directory
     */
    public boolean inIndexExist() {
        File[] indexFiles = new File(indexDir).listFiles();
        // null means the directory is missing or unreadable: no index yet.
        return null != indexFiles && 0 < indexFiles.length;
    }
}
搜索功能的实现
package com.rain.search;
import Java.io.IOException;
import Java.util.ArrayList;
import Java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import com.rain.index.IndexManager;
/**
 * Runs a single query against the index managed by {@link IndexManager},
 * building the index first when none exists.
 */
public class SearchManager {
    private String searchWord;
    private IndexManager indexManager;
    private Analyzer analyzer;

    /**
     * @param searchWord the raw query text; may be {@code null}, in which case
     *                   {@link #search()} returns no results
     */
    public SearchManager(String searchWord) {
        this.searchWord = searchWord;
        this.indexManager = new IndexManager();
        this.analyzer = new StandardAnalyzer();
    }

    /**
     * Searches the {@code content} field for the search word.
     *
     * <p>Index, parse and search failures are printed and result in an empty
     * (or partially filled) list rather than an exception.
     *
     * @return a list of {@link SearchResultBean}, one per hit, carrying the
     *         stored {@code path} and {@code title} fields; never {@code null}
     */
    public List search() {
        List<SearchResultBean> searchResult = new ArrayList<SearchResultBean>();
        if (!indexManager.inIndexExist()) {
            try {
                if (!indexManager.creatIndex()) {
                    return searchResult;
                }
            } catch (IOException e) {
                e.printStackTrace();
                return searchResult;
            }
        }
        if (null == searchWord) {
            // A request without the parameter yields null; there is nothing
            // to parse, so report "no hits" instead of failing in the parser.
            return searchResult;
        }
        // Parse before opening the searcher so that a malformed query never
        // leaves an open searcher behind.
        Query query = null;
        try {
            query = new QueryParser("content", analyzer).parse(searchWord);
        } catch (ParseException e) {
            e.printStackTrace();
        }
        if (null == query) {
            return searchResult;
        }
        IndexSearcher indexSearcher = null;
        try {
            indexSearcher = new IndexSearcher(indexManager.getIndexDir());
            Hits hits = indexSearcher.search(query);
            // Copy every hit into a bean while the searcher is still open.
            for (int i = 0; i < hits.length(); i++) {
                SearchResultBean resultBean = new SearchResultBean();
                resultBean.setHtmlPath(hits.doc(i).get("path"));
                resultBean.setHtmlTitle(hits.doc(i).get("title"));
                searchResult.add(resultBean);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (null != indexSearcher) {
                try {
                    indexSearcher.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return searchResult;
    }
}
请求管理器的实现
package com.rain.servlet;
import Java.io.IOException;
import Java.util.List;
import javax.servlet.RequestDispatcher;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import com.rain.search.SearchManager;
/**
 * Servlet that takes the submitted search word, runs it through
 * {@link SearchManager} and forwards the result list to {@code search.jsp}.
 *
 * @author zhourui
 * 2007-1-28
 */
public class SearchController extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /**
     * Reads the {@code searchWord} request parameter, performs the search,
     * stores the result list in the {@code searchResult} request attribute
     * and forwards to {@code search.jsp}.
     *
     * @see javax.servlet.http.HttpServlet#doPost(javax.servlet.http.HttpServletRequest, javax.servlet.http.HttpServletResponse)
     */
    @Override
    protected void doPost(HttpServletRequest arg0, HttpServletResponse arg1) throws ServletException, IOException {
        List foundDocuments = new SearchManager(arg0.getParameter("searchWord")).search();
        arg0.setAttribute("searchResult", foundDocuments);
        arg0.getRequestDispatcher("search.jsp").forward(arg0, arg1);
    }
}
向Web服务器提交搜索请求
<!-- Search form: posts the "searchWord" field to the SearchController servlet. -->
<form action="SearchController" method="post">
    <table>
        <tr>
            <td colspan="3">
                SearchWord:<input type="text" name="searchWord" id="searchWord" size="40">
                <input id="doSearch" type="submit" value="search">
            </td>
        </tr>
    </table>
</form>
显示搜索结果
<%-- Renders one table row per SearchResultBean in the "searchResult" request attribute. --%>
<%!
    // Replaces the characters that are significant in HTML text and quoted
    // attribute values with entity references; null becomes an empty string.
    private static String escapeHtml(String text){
        if(null==text){
            return "";
        }
        StringBuilder escaped=new StringBuilder(text.length()+16);
        for(int i=0;i<text.length();i++){
            char c=text.charAt(i);
            switch(c){
                case '&': escaped.append("&amp;"); break;
                case '<': escaped.append("&lt;"); break;
                case '>': escaped.append("&gt;"); break;
                case '"': escaped.append("&quot;"); break;
                case '\'': escaped.append("&#39;"); break;
                default: escaped.append(c);
            }
        }
        return escaped.toString();
    }
%>
<table class="result">
<%
    List searchResult=(List)request.getAttribute("searchResult");
    int resultCount=0;
    if(null!=searchResult){
        resultCount=searchResult.size();
    }
    for(int i=0;i<resultCount;i++){
        SearchResultBean resultBean=(SearchResultBean)searchResult.get(i);
        // Title and path originate from the indexed files, so they are escaped
        // before being written into the link text and the href attribute.
        String title=escapeHtml(resultBean.getHtmlTitle());
        String path=escapeHtml(resultBean.getHtmlPath());
        if(0==title.length()){
            // An untitled document would otherwise produce a link with no visible text.
            title=path;
        }
%>
<tr>
<td class="title"><h3><a href="<%=path%>"><%=title%></a></h3></td>
</tr>
<%
    }
%>
</table>