using System;
using System.Collections.Generic;
using System.Text;
using System.IO;
using System.Text.RegularExpressions;
using Lucene.Net.Analysis.Standard;
using Lucene.Net.Documents;
using Lucene.Net.Index;
namespace Indexer
{
/// <summary>
/// Builds a Lucene index from the HTML files found under a directory tree.
/// Each file becomes one document with three fields: "text" (the file
/// content with markup removed), "path" (the file path relative to the
/// root directory being indexed) and "title" (the content of the HTML
/// title element). Call <see cref="Close"/> when indexing is finished.
/// </summary>
public class IntranetIndexer
{
    // Title stored for files in which no <title> element can be found.
    // This text ends up in the index, so it must not be altered.
    private const string UnknownTitle = "此文挡标题未知";

    // Matches any markup tag, i.e. everything from '<' up to the next '>'.
    private static readonly Regex TagRegex =
        new Regex("<[^>]*>", RegexOptions.Compiled);

    // Matches the first <title>...</title> element. Case-insensitive, lazy
    // (stops at the first closing tag) and Singleline so that a title that
    // is wrapped over several lines is still found.
    private static readonly Regex TitleRegex =
        new Regex("<title\\b[^>]*>(.*?)</title>",
                  RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);

    // The index writer all documents are added to.
    private readonly IndexWriter writer;
    // Full path of the root directory whose files are being indexed.
    private string docRootDirectory;
    // File-name search pattern (as understood by DirectoryInfo.GetFiles).
    private string pattern;

    /// <summary>
    /// Creates the index writer for the given index location. The writer is
    /// opened with create = true: a new index is created, and an index that
    /// already exists at that location is overwritten (false would instead
    /// open the existing index). Compound-file mode is switched on.
    /// </summary>
    /// <param name="directory">Path of the directory in which the index is
    /// created. Per the original author's note, the directory is created
    /// automatically when it does not exist.</param>
    /// <exception cref="ArgumentNullException"><paramref name="directory"/> is null.</exception>
    public IntranetIndexer(string directory)
    {
        if (directory == null)
        {
            throw new ArgumentNullException("directory");
        }

        writer = new IndexWriter(directory, new StandardAnalyzer(), true);
        writer.SetUseCompoundFile(true);
    }

    /// <summary>
    /// Adds every file below <paramref name="directory"/> (including all
    /// sub-directories) whose name matches <paramref name="pattern"/> to the index.
    /// </summary>
    /// <param name="directory">Root directory to index; stored paths are relative to it.</param>
    /// <param name="pattern">File-name search pattern, e.g. "*.htm".</param>
    /// <exception cref="ArgumentNullException">An argument is null.</exception>
    public void AddDirection(DirectoryInfo directory, string pattern)
    {
        // Validate before touching any state so a bad call leaves the indexer unchanged.
        if (directory == null)
        {
            throw new ArgumentNullException("directory");
        }
        if (pattern == null)
        {
            throw new ArgumentNullException("pattern");
        }

        this.pattern = pattern;
        this.docRootDirectory = directory.FullName;
        AddSubDirectory(directory);
    }

    /// <summary>
    /// Indexes the matching files directly inside <paramref name="directory"/>
    /// and then recurses into each of its sub-directories.
    /// </summary>
    /// <param name="directory">Directory to process.</param>
    private void AddSubDirectory(DirectoryInfo directory)
    {
        // An index is made of Document objects, so every matching file is
        // first turned into a Document and then added to the index.
        foreach (FileInfo fi in directory.GetFiles(pattern))
        {
            AddHtmlToDocument(fi.FullName);
        }

        // Walk the whole tree, one level at a time.
        foreach (DirectoryInfo di in directory.GetDirectories())
        {
            AddSubDirectory(di);
        }
    }

    /// <summary>
    /// Reads one HTML file and adds it to the index as a document with the
    /// fields "text", "path" and "title".
    /// </summary>
    /// <param name="path">Full path of the file; expected to lie below the
    /// root directory passed to <see cref="AddDirection"/>.</param>
    private void AddHtmlToDocument(string path)
    {
        string html;
        // The file is decoded with the platform default encoding.
        using (StreamReader sr = new StreamReader(path, System.Text.Encoding.Default))
        {
            html = sr.ReadToEnd();
        }

        // Strip the root directory (and the separator that follows it) from
        // the front of the path. The root may or may not already end with a
        // separator; test against the platform's separator characters rather
        // than a hard-coded backslash.
        string root = this.docRootDirectory;
        bool rootEndsWithSeparator = false;
        if (root.Length > 0)
        {
            char last = root[root.Length - 1];
            rootEndsWithSeparator = last == Path.DirectorySeparatorChar
                                 || last == Path.AltDirectorySeparatorChar;
        }
        int relativePathStartsAt = rootEndsWithSeparator ? root.Length : root.Length + 1;
        string relativePath = path.Substring(relativePathStartsAt);

        Document doc = new Document();
        doc.Add(Field.UnStored("text", ParseHtml(html)));
        doc.Add(Field.Keyword("path", relativePath));
        doc.Add(Field.Text("title", GetTitle(html)));
        writer.AddDocument(doc);
    }

    /// <summary>
    /// Removes all markup tags from the given HTML and turns non-breaking
    /// spaces (the &amp;nbsp; entity as well as the U+00A0 character) into
    /// ordinary spaces.
    /// </summary>
    /// <param name="html">Raw HTML text.</param>
    /// <returns>The text with tags removed and non-breaking spaces normalised.</returns>
    private string ParseHtml(string html)
    {
        string text = TagRegex.Replace(html, "");
        return text.Replace("&nbsp;", " ").Replace('\u00A0', ' ');
    }

    /// <summary>
    /// Extracts the title of an HTML document.
    /// </summary>
    /// <param name="html">Raw HTML text.</param>
    /// <returns>The trimmed content of the first title element, or a fixed
    /// placeholder when the document has no title element.</returns>
    private string GetTitle(string html)
    {
        Match m = TitleRegex.Match(html);
        // Groups.Count reflects the pattern, not the outcome of the match,
        // so Success is the only reliable way to detect a missing title.
        if (m.Success)
        {
            return m.Groups[1].Value.Trim();
        }
        return UnknownTitle;
    }

    /// <summary>
    /// Optimizes the index and closes the writer. The writer is closed even
    /// when optimizing fails, so that it is not left open.
    /// </summary>
    public void Close()
    {
        try
        {
            writer.Optimize();
        }
        finally
        {
            writer.Close();
        }
    }
}
}