接上篇
首先配置 Global.asax:在 Application_Start 中添加如下代码,用来初始化日志,并创建定时建立索引的任务。
/// <summary>
/// Application start-up hook: configures logging, initialises the PanGu
/// segmenter and schedules <c>Index_Job</c> to run repeatedly.
/// </summary>
/// <param name="sender">Event source (unused).</param>
/// <param name="e">Event data (unused).</param>
protected void Application_Start(object sender, EventArgs e)
{
    // Initial log4net configuration (author's note: adjust the settings in web.config).
    log4net.Config.XmlConfigurator.Configure();

    // Initialise the segmenter from its configuration file.
    string segmenterConfigPath = HttpContext.Current.Server.MapPath(" ~/PanGu.xml ");
    Segment.Init(segmenterConfigPath);

    // Scheduler factory, then the scheduler that actually executes jobs.
    ISchedulerFactory schedulerFactory = new StdSchedulerFactory();
    IScheduler scheduler = schedulerFactory.GetScheduler();

    // Index_Job is the class implementing IJob.
    JobDetail indexJob = new JobDetail(" job1 ", " group1 ", typeof(Index_Job));

    // Author's note: first run starts 1 second from now.
    DateTime firstRun = TriggerUtils.GetNextGivenSecondDate(null, 1);

    // Repeat once per hour (author's note: any other TimeSpan factory can be used here).
    TimeSpan repeatEvery = TimeSpan.FromHours(1);

    // Trigger bound to job1/group1, no end time, repeating indefinitely.
    Trigger hourlyTrigger = new SimpleTrigger(
        " trigger1 ", " group1 ", " job1 ", " group1 ",
        firstRun, null, SimpleTrigger.RepeatIndefinitely, repeatEvery);

    scheduler.AddJob(indexJob, true);
    scheduler.ScheduleJob(hourlyTrigger);

    // Once started, the scheduler begins firing the scheduled job.
    scheduler.Start();
}
然后我们进入 Index_Job 类,编写定时任务的代码:注意该类要实现 IJob 接口,并把计划任务的执行代码写到 Execute 方法里面。
索引完成后,Lucene.Net 会把索引库所需的文件全部写到 Index 文件夹下。
/// <summary>
/// Scheduled job that downloads forum topic pages and (re)builds the
/// Lucene.Net index stored under the application's Index folder.
/// </summary>
public class Index_Job : IJob
{
    private static readonly ILog log = LogManager.GetLogger(typeof(Index_Job));

    // Inclusive range of topic ids crawled on each run.
    // GetMaxId can supply the upper bound instead of the fixed value.
    private const int FirstTopicId = 900;
    private const int LastTopicId = 1000;

    #region IJob members

    /// <summary>
    /// Opens (or creates) the index, crawls every topic in the configured id
    /// range and stores one document per topic. The writer and the directory
    /// are always closed, even when indexing fails part-way through.
    /// </summary>
    /// <param name="context">Scheduler execution context (not used by this job).</param>
    public void Execute(JobExecutionContext context)
    {
        // HostingEnvironment is used instead of HttpContext.Current: the job is
        // started by the scheduler, not by a web request.
        string indexPath = HostingEnvironment.MapPath("~/Index");
        log.Debug(" 开始创建索引,索引目录: " + indexPath);

        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        try
        {
            bool isUpdate = IndexReader.IndexExists(directory);
            log.Debug(" 索引目录存在状态: " + isUpdate);
            if (isUpdate && IndexWriter.IsLocked(directory))
            {
                // NOTE(review): this forcibly clears any existing write lock;
                // confirm no other writer can be active on this index.
                log.Debug(" 解锁索引库 ");
                IndexWriter.Unlock(directory);
            }

            log.Debug(" 开始爬文章 ");
            // Create a new index only when none exists yet; otherwise append to it.
            IndexWriter writer = new IndexWriter(directory, new PanGuAnalyzer(), !isUpdate, Lucene.Net.Index.IndexWriter.MaxFieldLength.UNLIMITED);
            try
            {
                // NOTE(review): the key is looked up exactly as written, padding
                // included; confirm it against the key declared in web.config.
                string siteURL = ConfigurationManager.AppSettings[" SiteURL "];
                for (int topicId = FirstTopicId; topicId <= LastTopicId; topicId++)
                {
                    IndexTopic(writer, siteURL, topicId);
                }
            }
            finally
            {
                log.Debug(" 结束索引,开始关闭Writer和Directory ");
                writer.Close();
            }
        }
        finally
        {
            directory.Close();
            log.Debug(" 关闭Writer和Directiory完成 ");
        }
    }

    #endregion

    /// <summary>
    /// Downloads one topic page, extracts its title and body text and stores
    /// them in the index keyed by URL. Any failure is logged and swallowed so
    /// that one bad topic does not stop the whole run.
    /// </summary>
    /// <param name="writer">Open index writer to add the document to.</param>
    /// <param name="siteURL">Base URL of the forum; the page name is appended directly.</param>
    /// <param name="topicId">Id of the topic to crawl.</param>
    private static void IndexTopic(IndexWriter writer, string siteURL, int topicId)
    {
        log.Debug(" 开始爬编号为 " + topicId.ToString() + " 的帖子 ");
        try
        {
            string url = siteURL + "showtopic-" + topicId + ".aspx";

            string html;
            using (WebClient wc = new WebClient())
            {
                wc.Encoding = Encoding.UTF8;
                html = wc.DownloadString(url);
            }

            HTMLDocumentClass htmldoc = new HTMLDocumentClass();
            htmldoc.designMode = "on"; // author's note: prevents the page's JavaScript from being run
            htmldoc.IHTMLDocument2_write(html);
            string title = htmldoc.title;
            string bodyText = htmldoc.body.innerText;

            // The field names are written exactly as the search page reads them;
            // keep both sides identical when changing them.
            Document document = new Document();
            document.Add(new Field(" url ", url, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field(" title ", title, Field.Store.YES, Field.Index.NOT_ANALYZED));
            document.Add(new Field(" body ", bodyText, Field.Store.YES, Field.Index.ANALYZED, Lucene.Net.Documents.Field.TermVector.WITH_POSITIONS_OFFSETS));

            // The URL acts as the primary key: replace any previously indexed copy.
            // The document is built first, so a failed crawl leaves the old copy intact.
            writer.UpdateDocument(new Term(" url ", url), document);
            log.Debug(" 爬编号为 " + topicId.ToString() + " 的帖子结束 ");
        }
        catch (Exception ex)
        {
            log.Error(" 爬编号为 " + topicId.ToString() + " 的帖子发生异常 ", ex);
        }
    }

    /// <summary>
    /// Reads the site's RSS feed and returns the topic id found in the link
    /// of the first item.
    /// </summary>
    /// <param name="siteURL">Base URL of the forum; the feed path is appended directly.</param>
    /// <returns>The numeric id captured from the first item's link.</returns>
    /// <exception cref="FormatException">The link contains no "showtopic-&lt;digits&gt;" part.</exception>
    private static int GetMaxId(string siteURL)
    {
        string rss;
        using (WebClient wc = new WebClient())
        {
            wc.Encoding = Encoding.UTF8;
            rss = wc.DownloadString(siteURL + "tools/rss.aspx");
        }

        XDocument doc = XDocument.Parse(rss);
        string link = doc.Descendants("item").First().Element("link").Value;

        Match match = new Regex(@"showtopic-(\d+)").Match(link);
        if (!match.Success)
        {
            throw new FormatException("No topic id found in RSS link: " + link);
        }

        return Convert.ToInt32(match.Groups[1].Value);
    }
}
Default.aspx页面前台html代码(表单form中):
<%-- Search form: submitted with GET so the keyword travels in the query string as "kw". --%>
<form id="form1" action="Default.aspx" method="get">
    <div>
        <%-- Echo the submitted keyword back into the box; HTML-encoded because it is user input. --%>
        <input type="text" id="txtKw" value="<%= Server.HtmlEncode(Request["kw"]) %>" name="kw" />
        <script type="text/javascript">
            // Suggestions are fetched from SearchSuggestion.ashx via Ajax;
            // picking one fills the box and submits the form.
            $("#txtKw").autocomplete({
                source: "SearchSuggestion.ashx",
                select: function (e, ui) {
                    $("#txtKw").val(ui.item.value);
                    $("#sb").click();
                }
            });
        </script>

        <input type="submit" id="sb" value="搜索" />
    </div>
    <%-- One entry per search result; bound in the code-behind to objects exposing URL, TITLE and BODY. --%>
    <asp:Repeater ID="RepeaterResult" runat="server">
        <ItemTemplate>
            <a href='<%# Eval("URL") %>'><%# Eval("TITLE") %></a>
            <br />
            <p>
                <%# Eval("BODY") %>
            </p>
        </ItemTemplate>
    </asp:Repeater>
</form>
注意这里把 ViewState 禁用了(EnableViewState="false"),防止最后生成的客户端 HTML 代码中出现一大堆 ViewState 的内容,显得我们不专业,呵呵~
我们知道,一旦禁用 ViewState,运行在服务端的基本控件基本上都不能用了(数据绑定控件和链接控件除外),所以我们回归原始的 HTML,用 GET 方法提交表单。
因此后台代码就写在 Page_Load 方法中:表单提交后,根据用户输入的关键字 Request["kw"] 是否为空来决定是否执行搜索。后台处理代码如下(代码中有注释,这里就不详细解释了):
/// <summary>
/// Code-behind of the search page: reads the "kw" request value, logs it,
/// searches the Lucene.Net index and binds the highlighted hits to the repeater.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>
    /// Runs the search when a keyword was submitted; does nothing otherwise.
    /// </summary>
    /// <param name="sender">Event source (unused).</param>
    /// <param name="e">Event data (unused).</param>
    protected void Page_Load(object sender, EventArgs e)
    {
        // The key must equal the name attribute of the search box in the form ("kw").
        string kw = Request["kw"];
        if (string.IsNullOrEmpty(kw))
        {
            return;
        }

        // Record the keyword in the database.
        new SearchLogTableAdapter().Insert(Guid.NewGuid(), DateTime.Now, Request.UserHostAddress, kw);

        RepeaterResult.DataSource = Search(kw);
        RepeaterResult.DataBind();
    }

    /// <summary>
    /// Searches the body field of the index for the segmented keyword and
    /// returns up to 1000 hits with highlighted title and body.
    /// </summary>
    /// <param name="kw">Raw keyword text entered by the user.</param>
    /// <returns>The hits; empty when the index has not been built yet.</returns>
    private List<SearchResult> Search(string kw)
    {
        List<SearchResult> list = new List<SearchResult>();

        // Folder holding the index files that the indexing job produced.
        string indexPath = Server.MapPath("~/Index");
        FSDirectory directory = FSDirectory.Open(new DirectoryInfo(indexPath), new NativeFSLockFactory());
        try
        {
            // Before the first indexing run there is nothing to open.
            if (!IndexReader.IndexExists(directory))
            {
                return list;
            }

            IndexReader indexReader = IndexReader.Open(directory, true);
            IndexSearcher searcher = new IndexSearcher(indexReader);
            try
            {
                // Every segmented word becomes one term of the phrase query.
                // The field names are written exactly as the indexing job writes them;
                // keep both sides identical when changing them.
                PhraseQuery query = new PhraseQuery();
                foreach (string word in segString(kw))
                {
                    query.Add(new Term(" body ", word));
                }
                query.SetSlop(1000); // author's note: terms within 1000 positions of each other still match

                // Collect at most 1000 hits.
                TopScoreDocCollector collector = TopScoreDocCollector.create(1000, true);
                searcher.Search(query, null, collector);
                TopDocs topDocs = collector.TopDocs();

                foreach (ScoreDoc scoreDoc in topDocs.scoreDocs)
                {
                    // Load the stored document for this hit.
                    Document document = searcher.Doc(scoreDoc.doc);
                    string url = document.Get(" url ");
                    string title = document.Get(" title ");
                    string body = document.Get(" body ");

                    // Body: keyword occurrences wrapped in a red font tag.
                    body = Highlight(kw, body, " <font color='red'> ");

                    // Title: the title itself is passed as the keyword text
                    // (author's note: gives the title a red background).
                    title = Highlight(title, title, " <font style='background-color:red'> ");

                    list.Add(new SearchResult
                    {
                        URL = url,
                        TITLE = title,
                        BODY = body
                    });
                }
            }
            finally
            {
                searcher.Close();
                indexReader.Close();
            }
        }
        finally
        {
            directory.Close();
        }

        return list;
    }

    /// <summary>
    /// Returns the best fragment of <paramref name="content"/> (fragment size 200)
    /// with <paramref name="keywords"/> wrapped in the given opening tag and a
    /// closing font tag.
    /// </summary>
    private static string Highlight(string keywords, string content, string openTag)
    {
        SimpleHTMLFormatter formatter = new SimpleHTMLFormatter(openTag, " </font> ");
        Highlighter highlighter = new Highlighter(formatter, new Segment());
        highlighter.FragmentSize = 200;
        return highlighter.GetBestFragment(keywords, content);
    }

    /// <summary>
    /// Splits <paramref name="s"/> into words with the PanGu segmenter.
    /// </summary>
    private static string[] segString(string s)
    {
        Segment segment = new Segment();
        return segment.DoSegment(s).Select(wordInfo => wordInfo.Word).ToArray();
    }
}
/// <summary>
/// One search hit as shown on the result page; the repeater binds to these
/// three properties by name.
/// </summary>
public class SearchResult
{
    /// <summary>Address of the matching page.</summary>
    public string URL { get; set; }

    /// <summary>Title text (may contain highlight markup).</summary>
    public string TITLE { get; set; }

    /// <summary>Body fragment (may contain highlight markup).</summary>
    public string BODY { get; set; }
}
接下来我们实现输入关键词时的自动补全:用 jQuery UI 通过 AJAX 向后台请求关键词建议。
未完,待续。。。