代码如下,代码没有优化,仅实现功能
该代码复制到程序中不能直接使用,需要先下载文章最后的例子,取得其中的dll后才可以使用。
using
System;
using
System.Configuration;
using
System.Data;
using
System.Linq;
using
System.Web;
using
System.Web.Security;
using
System.Web.UI;
using
System.Web.UI.HtmlControls;
using
System.Web.UI.WebControls;
using
System.Web.UI.WebControls.WebParts;
using
System.Xml.Linq;
using
System.Text;
using
System.IO;
using
Lucene.Net.Documents;
using
Lucene.Net.Index;
using
Lucene.Net.Search;
using
Lucene.Net.QueryParsers;
using
Lucene.Net.Analysis.Standard;
using
Lucene.Net.Analysis.Cn;
using
org.pdfbox.pdmodel;
using
org.pdfbox.util;
using
System.Text.RegularExpressions;
/// <summary>
/// Demo page that builds a Lucene.Net full-text index over the files below a directory
/// (.txt, .htm, .html, .pdf, .doc, .rtf, .ppt, .xls) and runs keyword searches against it.
/// TextBox1 shows results, TextBox2 holds the search keyword, TextBox3 holds the directory
/// to index and TextBox4 collects error messages.
/// </summary>
public partial class _Default : System.Web.UI.Page
{
    /// <summary>Time at which the most recent indexing run started.</summary>
    public DateTime start = new DateTime();

    /// <summary>
    /// Delegate with the same signature as <see cref="IndexDirectory"/>. It is not used
    /// inside this class; presumably it exists for asynchronous invocation - TODO confirm.
    /// </summary>
    delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);

    /// <summary>Searcher used by the current search request; null when none is open.</summary>
    IndexSearcher searcher = null;

    /// <summary>
    /// On the first (non-postback) request, pre-fills TextBox3 with the physical path of
    /// the application's "doc" folder.
    /// </summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath("doc");
        }
    }

    #region Index building

    /// <summary>
    /// Rebuilds the index stored in the application's "index" folder from the directory
    /// entered in TextBox3 and reports the elapsed time in TextBox1. Errors are written
    /// to TextBox4.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        // Directory in which the index is stored.
        string indexStorePath = Server.MapPath("index");
        // Directory whose files are indexed.
        // NOTE(review): this path comes straight from user input; confirm that only
        // trusted users can reach this page.
        string sourcePath = TextBox3.Text;

        IndexWriter writer = null;
        try
        {
            // 'true' creates a fresh index, replacing any existing one.
            writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            start = DateTime.Now;
            IndexDirectory(writer, new FileInfo(sourcePath));
            writer.Optimize();
            writer.Close();
            writer = null;

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = "提示:索引完成,共用时" + elapsed.TotalSeconds + "秒\n";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
        finally
        {
            // Reached with a live writer only when indexing failed part-way. Closing it
            // here releases the index write lock so the next run is not blocked.
            if (writer != null)
            {
                try
                {
                    writer.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message + "\n";
                }
            }
        }
    }

    /// <summary>
    /// Recursively walks <paramref name="file"/>. Directories are descended into; files
    /// with a supported extension are passed to <see cref="IndexFile"/>; everything else
    /// is ignored.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">File or directory to index.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                // Recurse into sub-directories and files alike.
                IndexDirectory(writer, new FileInfo(entry));
            }
            return;
        }

        switch (file.Extension.ToLowerInvariant())
        {
            case ".txt":
            case ".htm":
            case ".html":
            case ".pdf":
            case ".doc":
            case ".rtf":
            case ".ppt":
            case ".xls":
                IndexFile(file, writer);
                break;
        }
    }

    /// <summary>
    /// Extracts the text of one file according to its extension and adds it to the index
    /// as a document with a stored "filename" field and a tokenized "contents" field.
    /// A <see cref="FileNotFoundException"/> is reported in TextBox4 and the file is
    /// skipped; any other exception propagates to the caller.
    /// </summary>
    /// <param name="file">File to index.</param>
    /// <param name="writer">Open index writer that receives the document.</param>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            switch (file.Extension.ToLowerInvariant())
            {
                case ".pdf":
                    AddToIndex(writer, file, ExtractPdfText(file.FullName));
                    break;

                case ".doc":
                case ".rtf":
                    // Word can open RTF as well, so both formats share one extractor.
                    AddToIndex(writer, file, ExtractWordText(file.FullName));
                    break;

                case ".ppt":
                    AddToIndex(writer, file, ExtractPowerPointText(file.FullName));
                    break;

                case ".xls":
                    AddToIndex(writer, file, ExtractExcelText(file.FullName));
                    break;

                case ".htm":
                case ".html":
                    // Index the tag-stripped text rather than the raw markup. The file is
                    // decoded with the same encoding used for plain-text files below.
                    AddToIndex(writer, file, NoHTML(File.ReadAllText(file.FullName, Encoding.Default)));
                    break;

                default:
                    // Anything else is treated as plain text and streamed into the index.
                    // AddDocument consumes the reader before returning, so it can be
                    // disposed as soon as the call completes.
                    using (StreamReader reader = new StreamReader(file.FullName, Encoding.Default))
                    {
                        Document doc = new Document();
                        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
                        doc.Add(new Field("contents", reader));
                        writer.AddDocument(doc);
                    }
                    break;
            }
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + "\n";
        }
    }

    /// <summary>Adds one document (stored file name plus tokenized, unstored contents) to the index.</summary>
    private static void AddToIndex(IndexWriter writer, FileInfo file, string contents)
    {
        Document doc = new Document();
        doc.Add(new Field("filename", file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field("contents", contents ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }

    /// <summary>Returns the text of a PDF file, closing the PDF document afterwards.</summary>
    private static string ExtractPdfText(string path)
    {
        PDDocument pdf = PDDocument.load(path);
        try
        {
            return new PDFTextStripper().getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    /// <summary>
    /// Returns the text of a Word or RTF file by opening it through Word automation.
    /// The document is closed and Word is quit even when extraction fails.
    /// </summary>
    private static string ExtractWordText(string path)
    {
        object filePath = path;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp =
            new Microsoft.Office.Interop.Word.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Word.Document wordDoc = wordApp.Documents.Open(
                ref filePath,
                ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing);
            try
            {
                // Select the whole document and read the selection's text.
                wordDoc.ActiveWindow.Selection.WholeStory();
                return wordDoc.ActiveWindow.Selection.Text.ToString();
            }
            finally
            {
                wordDoc.Close(ref missing, ref missing, ref missing);
            }
        }
        finally
        {
            wordApp.Quit(ref missing, ref missing, ref missing);
        }
    }

    /// <summary>
    /// Returns the text of every shape on every slide of a PowerPoint file. The
    /// presentation is closed and PowerPoint is quit even when extraction fails.
    /// </summary>
    private static string ExtractPowerPointText(string path)
    {
        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        try
        {
            PowerPoint.Presentation presentation = pptApp.Presentations.Open(
                path,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);
            try
            {
                StringBuilder text = new StringBuilder();
                foreach (PowerPoint.Slide slide in presentation.Slides)
                {
                    foreach (PowerPoint.Shape shape in slide.Shapes)
                    {
                        try
                        {
                            if (shape.HasTextFrame != Microsoft.Office.Core.MsoTriState.msoTrue)
                            {
                                continue; // pictures, lines etc. carry no text
                            }
                            text.Append(shape.TextFrame.TextRange.Text);
                            // Separator so text from adjacent shapes is not fused into one token.
                            text.Append(' ');
                        }
                        catch (Exception)
                        {
                            // Best effort: a shape whose text cannot be read is skipped so
                            // that the rest of the presentation is still indexed.
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                presentation.Close();
            }
        }
        finally
        {
            pptApp.Quit();
        }
    }

    /// <summary>
    /// Returns the values of all used cells of every worksheet of an Excel file. The
    /// workbook is closed and Excel is quit even when extraction fails.
    /// </summary>
    private static string ExtractExcelText(string path)
    {
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application excelApp =
            new Microsoft.Office.Interop.Excel.ApplicationClass();
        try
        {
            Microsoft.Office.Interop.Excel.Workbook workbook = excelApp.Workbooks._Open(
                path,
                missing, missing, missing, missing,
                missing, missing, missing, missing,
                missing, missing, missing, missing);
            try
            {
                StringBuilder text = new StringBuilder();
                int sheetCount = workbook.Sheets.Count;

                // Office collections are 1-based.
                for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++)
                {
                    Microsoft.Office.Interop.Excel.Worksheet sheet =
                        workbook.Sheets[sheetIndex] as Microsoft.Office.Interop.Excel.Worksheet;
                    if (sheet == null)
                    {
                        continue; // e.g. a chart sheet, which has no cells
                    }

                    // Index cells relative to the used range itself: it does not
                    // necessarily start at A1, so sheet-relative coordinates would miss
                    // data whenever the range is offset.
                    Microsoft.Office.Interop.Excel.Range used = sheet.UsedRange;
                    int rowCount = used.Rows.Count;
                    int columnCount = used.Columns.Count;

                    for (int row = 1; row <= rowCount; row++)
                    {
                        for (int column = 1; column <= columnCount; column++)
                        {
                            Microsoft.Office.Interop.Excel.Range cell =
                                (Microsoft.Office.Interop.Excel.Range)used.Cells[row, column];
                            text.Append(cell.Value2);
                            // Separator so neighbouring cell values are not fused into one token.
                            text.Append(' ');
                        }
                    }
                }
                return text.ToString();
            }
            finally
            {
                workbook.Close(missing, missing, missing);
            }
        }
        finally
        {
            excelApp.Quit();
        }
    }

    /// <summary>
    /// Strips HTML markup from a string: removes script blocks, tags and comments,
    /// decodes a handful of common entities, removes any remaining angle brackets and
    /// CR/LF pairs, then HTML-encodes and trims the result.
    /// </summary>
    /// <param name="Htmlstring">HTML text; null or empty yields an empty string.</param>
    /// <returns>The HTML-encoded plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks. Singleline lets '.' cross the line breaks that script
        // bodies normally contain.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "", RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove tags, line-leading whitespace and comment remnants.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode common entities; any other numeric entity is dropped.
        Htmlstring = Regex.Replace(Htmlstring, @"&(quot|#34);", "\"", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(lt|#60);", "<", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(gt|#62);", ">", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(nbsp|#160);", " ", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(iexcl|#161);", "\xa1", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(cent|#162);", "\xa2", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(pound|#163);", "\xa3", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&(copy|#169);", "\xa9", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // Strings are immutable: the results must be assigned back or the calls do nothing.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // HttpUtility performs the same encoding as Server.HtmlEncode but does not need
        // a current HTTP context, so this also works off the request thread.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }

    #endregion

    #region Search

    /// <summary>
    /// Searches the "contents" field of the index in the application's "index" folder for
    /// the keyword entered in TextBox2 and prints the hits. Errors are appended to TextBox4.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        // Directory in which the index is stored.
        string indexStorePath = Server.MapPath("index");
        string keyword = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(indexStorePath);
            // The query is analyzed with the same analyzer that built the index.
            QueryParser parser = new QueryParser("contents", new ChineseAnalyzer());
            Query query = parser.Parse(keyword);
            Hits hits = searcher.Search(query);
            printResult(hits);
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
        finally
        {
            // Always release the searcher, including when the keyword fails to parse.
            if (searcher != null)
            {
                try
                {
                    searcher.Close();
                }
                catch (Exception closeEx)
                {
                    TextBox4.Text = TextBox4.Text + closeEx.Message;
                }
                searcher = null;
            }
        }
    }

    /// <summary>
    /// Writes one line per hit (rank and stored file path) to TextBox1, or a "nothing
    /// found" message when there are no hits, followed by a separator line. A hit that
    /// cannot be read has its error appended to TextBox4 and is skipped.
    /// </summary>
    /// <param name="h">Hits returned by the searcher.</param>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length();

        if (hitCount == 0)
        {
            output.Append("对不起,没有搜索到你要的结果。\n");
        }

        for (int i = 0; i < hitCount; i++)
        {
            try
            {
                Document doc = h.Doc(i);
                output.Append("这是第" + (i + 1) + "个搜索结果,文件路径为:" + doc.Get("filename") + "\n");
            }
            catch (Exception ex)
            {
                TextBox4.Text = TextBox4.Text + ex.Message;
            }
        }

        output.Append("---------------------------\n");
        TextBox1.Text = output.ToString();
    }

    #endregion
}
完整demo下载,点击下载
【reprinted from http://www.cnblogs.com/weekzero/archive/2008/06/11/1217521.html】