1. Object-Oriented Program Design
The design is divided into two parts: the search side (the news package, which crawls pages, parses their links, and builds the index) and the service side (the newsserver package, a servlet that queries the index and renders the results).
package news;
/**
 * <p>Title: News Search Engine</p>
 * <p>Copyright: Copyright (c) 2003</p>
 * @author 计算机99630 沈晨
 * @version 1.0
 */
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;

public class Index {
  IndexWriter _writer = null;

  Index() throws Exception {
    // Create (or overwrite) the index directory, analyzing text with the Chinese analyzer
    _writer = new IndexWriter("c:\\News\\index",
                              new ChineseAnalyzer(), true);
  }

  /**
   * Adds one news item to the index.
   * @param url the URL of the news item
   * @param title the title of the news item
   * @throws java.lang.Exception
   */
  void AddNews(String url, String title) throws Exception {
    Document _doc = new Document();
    _doc.add(Field.Text("title", title));   // tokenized, indexed and stored: searchable
    _doc.add(Field.UnIndexed("url", url));  // stored only: returned with each hit
    _writer.addDocument(_doc);
  }

  /**
   * Optimizes the index and releases resources.
   * @throws java.lang.Exception
   */
  void close() throws Exception {
    _writer.optimize();
    _writer.close();
  }
}
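For a quick standalone check of the Index class, a minimal sketch such as the following can be used; the class name, URLs, and titles are made-up placeholders, and the index is written to the path hard-coded in the constructor.

package news;

// Hypothetical driver for Index; the URLs and titles are placeholders only.
public class IndexDemo {
  public static void main(String[] args) throws Exception {
    Index index = new Index();                                       // writes to c:\News\index
    index.AddNews("http://example.com/a.html", "Sample news item one");
    index.AddNews("http://example.com/b.html", "Sample news item two");
    index.close();                                                   // optimize and release the writer
  }
}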
package news;
/**
 * <p>Title: News Search Engine</p>
 * <p>Copyright: Copyright (c) 2003</p>
 * @author 计算机99630 沈晨
 * @version 1.0
 */
import java.util.Iterator;
import java.util.Vector;
import com.heaton.bot.HTMLPage;
import com.heaton.bot.HTTP;
import com.heaton.bot.Link;

public class HTMLParse {
  HTTP _http = null;

  public HTMLParse(HTTP http) {
    _http = http;
  }

  /**
   * Parses the web page and indexes every link found on it.
   */
  public void start() {
    try {
      HTMLPage _page = new HTMLPage(_http);
      _page.open(_http.getURL(), null);
      Vector _links = _page.getLinks();
      Index _index = new Index();
      Iterator _it = _links.iterator();
      int n = 0;
      while (_it.hasNext()) {
        Link _link = (Link) _it.next();
        String _href = input(_link.getHREF().trim());
        String _title = input(_link.getPrompt().trim());
        _index.AddNews(_href, _title);
        n++;
      }
      System.out.println("Scanned " + n + " news items in total");
      _index.close();
    }
    catch (Exception ex) {
      System.out.println(ex);
    }
  }

  /**
   * Works around the Chinese-encoding problem in Java: re-decodes a string
   * that was read as ISO8859_1 using the platform default charset.
   * @param str the raw Chinese string
   * @return the re-decoded Chinese string
   */
  public static String input(String str) {
    String temp = null;
    if (str != null) {
      try {
        temp = new String(str.getBytes("ISO8859_1"));
      }
      catch (Exception e) {
        // ignore and return null
      }
    }
    return temp;
  }
}
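The input() helper depends on the JVM default charset being GBK: getBytes("ISO8859_1") recovers the original bytes, and the single-argument String constructor re-decodes them with the platform default. If that default cannot be relied on, an explicit two-charset variant (a sketch, not part of the original code) would look like this:

  // Hypothetical variant of input() that names both charsets explicitly
  // instead of depending on the platform default being GBK.
  public static String input(String str) {
    if (str == null) {
      return null;
    }
    try {
      return new String(str.getBytes("ISO8859_1"), "GBK");
    }
    catch (java.io.UnsupportedEncodingException e) {
      return str;  // fall back to the original string
    }
  }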
package news;
/**
 * <p>Title: News Search Engine</p>
 * <p>Copyright: Copyright (c) 2003</p>
 * @version 1.0
 */
import com.heaton.bot.HTTP;
import com.heaton.bot.HTTPSocket;
import com.heaton.bot.ISpiderReportable;
import com.heaton.bot.IWorkloadStorable;
import com.heaton.bot.Spider;
import com.heaton.bot.SpiderInternalWorkload;

public class Searcher implements ISpiderReportable {
  public static void main(String[] args) throws Exception {
    IWorkloadStorable wl = new SpiderInternalWorkload();
    Searcher _searcher = new Searcher();
    Spider _spider
        = new Spider(_searcher, "http://www.chenshen.com/index.html",
                     new HTTPSocket(), 100, wl);
    _spider.setMaxBody(100);
    _spider.start();
  }

  // Returning false means discovered links are not added to the workload,
  // so only the start page is processed.
  public boolean foundInternalLink(String url) {
    return false;
  }

  public boolean foundExternalLink(String url) {
    return false;
  }

  public boolean foundOtherLink(String url) {
    return false;
  }

  // Called for each downloaded page: parse it and index its links.
  public void processPage(HTTP http) {
    System.out.println("Scanning page: " + http.getURL());
    new HTMLParse(http).start();
  }

  public void completePage(HTTP http, boolean error) {
  }

  // Strip query strings from URLs before they enter the workload.
  public boolean getRemoveQuery() {
    return true;
  }

  public void spiderComplete() {
  }
}
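As written, every found*Link() callback returns false, so the spider only processes the start page and indexes the links that appear on it. If the whole site should be crawled instead, foundInternalLink() would return true so that internal links are queued; this is a sketch of that change, not part of the original design:

  // Hypothetical change: ask the Spider to follow links inside the start site
  // by adding each internal URL to its workload.
  public boolean foundInternalLink(String url) {
    return true;
  }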
package newsserver;

import java.io.IOException;
import java.io.PrintWriter;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

/**
 * <p>Title: News Search Engine</p>
 * <p>Copyright: Copyright (c) 2003</p>
 * @version 1.0
 */
public class Results extends HttpServlet {
  private static final String CONTENT_TYPE = "text/html; charset=GBK";

  // Initialize global variables
  public void init() throws ServletException {
  }

  // Process the HTTP GET request
  public void doGet(HttpServletRequest request, HttpServletResponse response)
      throws ServletException, IOException {
    String QC = request.getParameter("QueryContent");
    if (QC == null) {
      QC = "";
    }
    else {
      QC = input(QC);
    }
    response.setContentType(CONTENT_TYPE);
    PrintWriter out = response.getWriter();
    try {
      Search(QC, out);
    }
    catch (Exception ex) {
      System.out.println(ex.getMessage());
    }
  }

  public void Search(String qc, PrintWriter out) throws Exception {
    // Open a searcher over the index directory
    IndexSearcher _searcher = new IndexSearcher("c:\\news\\index");
    // Create the Chinese analyzer (must match the analyzer used at indexing time)
    Analyzer analyzer = new ChineseAnalyzer();
    // The query string entered by the user
    String line = qc;
    // Query is an abstract class; QueryParser builds a concrete query against the "title" field
    Query query = QueryParser.parse(line, "title", analyzer);
    out.println("<html>");
    out.println("<head><title>Search Results</title></head>");
    out.println("<body bgcolor=#ffffff>");
    out.println("<center>" +
                "<form action='/NewsServer/results' method='get'>" +
                "<font face='华文中宋' color='#3399FF'>News Search Engine</font>:" +
                "<input type='text' name='QueryContent' size='20'>" +
                "<input type='submit' name='submit' value='Search'>" +
                "</form></center>"
    );
    out.println("<p>Search keywords: <font color=red>" + query.toString("title") +
                "</font></p>");
    Hits hits = _searcher.search(query);
    out.println("Found <font color=red>" + hits.length() + "</font> news items in total<br>");
    final int HITS_PER_PAGE = 10;
    for (int start = 0; start < hits.length(); start += HITS_PER_PAGE) {
      int end = Math.min(hits.length(), start + HITS_PER_PAGE);
      for (int i = start; i < end; i++) {
        Document doc = hits.doc(i);
        String url = doc.get("url");
        if (url != null) {
          out.println((i + 1) + " <a href='" + url + "'>" +
                      replace(doc.get("title"), qc) +
                      "</a><br>");
        }
        else {
          System.out.println("Not found!");
        }
      }
    }
    out.println("</body></html>");
    _searcher.close();
  }

  // Re-decode the request parameter: the container decoded it as ISO8859_1,
  // so rebuild the string using the platform default (GBK) charset.
  public String input(String str) {
    String temp = null;
    if (str != null) {
      try {
        temp = new String(str.getBytes("ISO8859_1"));
      }
      catch (Exception e) {
        // ignore and return null
      }
    }
    return temp;
  }

  // Highlight the keyword inside a title by wrapping it in a red font tag
  public String replace(String title, String keyword) {
    return title.replaceAll(keyword, "<font color='red'>" + keyword + "</font>");
  }

  // Clean up resources
  public void destroy() {
  }
}
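Before wiring the servlet into a web application, the index can be checked from the command line with the same Lucene calls the servlet itself uses; the class name below is hypothetical and the query string is taken from the first command-line argument.

package newsserver;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.ChineseAnalyzer;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;

// Hypothetical command-line checker: prints every hit for the query in args[0].
public class IndexCheck {
  public static void main(String[] args) throws Exception {
    IndexSearcher searcher = new IndexSearcher("c:\\news\\index");
    Analyzer analyzer = new ChineseAnalyzer();
    Query query = QueryParser.parse(args[0], "title", analyzer);
    Hits hits = searcher.search(query);
    System.out.println(hits.length() + " hits for: " + query.toString("title"));
    for (int i = 0; i < hits.length(); i++) {
      System.out.println(hits.doc(i).get("url") + "  " + hits.doc(i).get("title"));
    }
    searcher.close();
  }
}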