Lucene.Net实现pdf,doc,xls,ppt,htm,html等格式文件的检索

代码如下,代码没有优化,仅实现功能
该代码复制到程序中不能直接使用,需要下载文章最后的例子,取得其中的dll后才可以

using  System;
using  System.Configuration;
using  System.Data;
using  System.Linq;
using  System.Web;
using  System.Web.Security;
using  System.Web.UI;
using  System.Web.UI.HtmlControls;
using  System.Web.UI.WebControls;
using  System.Web.UI.WebControls.WebParts;
using  System.Xml.Linq;
using  System.Text;
using  System.IO;

using  Lucene.Net.Documents;
using  Lucene.Net.Index;
using  Lucene.Net.Search;
using  Lucene.Net.QueryParsers;
using  Lucene.Net.Analysis.Standard;

using  Lucene.Net.Analysis.Cn;


using  org.pdfbox.pdmodel;
using  org.pdfbox.util;

using  System.Text.RegularExpressions;

/// <summary>
/// Code-behind for a demo page that builds a Lucene.Net full-text index over a
/// directory of documents (txt, htm/html, pdf, doc, rtf, ppt, xls) and searches it.
/// </summary>
/// <remarks>
/// NOTE(review): the path literals, index field names and user-facing messages below
/// carry leading/trailing spaces exactly as found in the source. They are kept
/// unchanged because they are self-consistent at runtime; confirm whether the
/// padding is intentional before normalizing them.
/// </remarks>
public partial class _Default : System.Web.UI.Page
{
    // Index field names. Indexing and searching must use the very same values,
    // otherwise stored paths cannot be read back and queries hit an empty field.
    private const string FilenameField = " filename ";
    private const string ContentsField = " contents ";

    // File extensions (lower case, with leading dot) that IndexDirectory hands to IndexFile.
    private static readonly string[] IndexedExtensions =
    {
        ".txt", ".htm", ".html", ".pdf", ".doc", ".rtf", ".ppt", ".xls"
    };

    // Named HTML entities decoded by NoHTML: { regex pattern, replacement }.
    // "&amp;" is deliberately absent here; NoHTML decodes it last.
    private static readonly string[][] HtmlEntities =
    {
        new string[] { @"&(quot|#34);", "\"" },
        new string[] { @"&(lt|#60);", "<" },
        new string[] { @"&(gt|#62);", ">" },
        new string[] { @"&(nbsp|#160);", " " },
        new string[] { @"&(iexcl|#161);", "\u00A1" },
        new string[] { @"&(cent|#162);", "\u00A2" },
        new string[] { @"&(pound|#163);", "\u00A3" },
        new string[] { @"&(copy|#169);", "\u00A9" }
    };

    /// <summary>Time at which the most recent indexing run started.</summary>
    public DateTime start = new DateTime();

    delegate void AsyncIndexDirectoryCaller(IndexWriter writer, FileInfo file);

    // Searcher used by the current search request; closed again before the handler returns.
    IndexSearcher searcher = null;

    /// <summary>Pre-fills the "directory to index" box on the first (non-postback) request.</summary>
    protected void Page_Load(object sender, EventArgs e)
    {
        if (!IsPostBack)
        {
            TextBox3.Text = Server.MapPath(" doc ");
        }
    }

    #region Build index

    /// <summary>
    /// Rebuilds the index from scratch over the directory named in TextBox3 and
    /// reports the elapsed time in TextBox1, or the error message in TextBox4.
    /// </summary>
    protected void Button2_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath(" index ");   // where the index is stored
        string sourcePath = TextBox3.Text;                   // directory whose files are indexed

        try
        {
            // 'true' => create a new index, replacing any existing one.
            IndexWriter writer = new IndexWriter(indexStorePath, new ChineseAnalyzer(), true);
            try
            {
                start = DateTime.Now;
                IndexDirectory(writer, new FileInfo(sourcePath));
                writer.Optimize();
            }
            finally
            {
                // Always close the writer, even when a file fails to index, so the
                // index directory is not left with an open writer.
                writer.Close();
            }

            TimeSpan elapsed = DateTime.Now - start;
            TextBox1.Text = " 提示:索引完成,共用时  " + elapsed.TotalSeconds + "  秒\n ";
        }
        catch (Exception ex)
        {
            TextBox4.Text = ex.Message;
        }
    }

    /// <summary>
    /// Indexes <paramref name="file"/>: directories are walked recursively, files are
    /// indexed when their extension is one of the supported ones, anything else is skipped.
    /// </summary>
    /// <param name="writer">Open index writer that receives the documents.</param>
    /// <param name="file">A file or directory path wrapped in a FileInfo.</param>
    public void IndexDirectory(IndexWriter writer, FileInfo file)
    {
        if (Directory.Exists(file.FullName))
        {
            // Entries include both files and sub-directories; recurse into each of them.
            foreach (string entry in Directory.GetFileSystemEntries(file.FullName))
            {
                IndexDirectory(writer, new FileInfo(entry));
            }
            return;
        }

        string extension = file.Extension.ToLowerInvariant();
        if (Array.IndexOf(IndexedExtensions, extension) >= 0)
        {
            IndexFile(file, writer);
        }
    }

    /// <summary>
    /// Extracts the text of a single file according to its extension and adds it to the
    /// index as one document (stored, untokenized path + tokenized, unstored contents).
    /// A missing file is reported in TextBox4 and skipped; other errors propagate.
    /// </summary>
    private void IndexFile(FileInfo file, IndexWriter writer)
    {
        try
        {
            switch (file.Extension.ToLowerInvariant())
            {
                case ".pdf":
                    AddTextDocument(writer, file, ExtractPdfText(file));
                    break;

                case ".doc":
                case ".rtf":   // Word opens RTF files as well
                    AddTextDocument(writer, file, ExtractWordText(file));
                    break;

                case ".ppt":
                    AddTextDocument(writer, file, ExtractPowerPointText(file));
                    break;

                case ".xls":
                    AddTextDocument(writer, file, ExtractExcelText(file));
                    break;

                case ".htm":
                case ".html":
                    // Index the tag-stripped text rather than the raw markup. The file is
                    // read with the system default encoding, like plain-text files below.
                    AddTextDocument(writer, file,
                        NoHTML(File.ReadAllText(file.FullName, System.Text.Encoding.Default)));
                    break;

                default:   // everything else is treated as plain text
                    using (StreamReader reader = new StreamReader(file.FullName, System.Text.Encoding.Default))
                    {
                        Document doc = new Document();
                        doc.Add(new Field(FilenameField, file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
                        doc.Add(new Field(ContentsField, reader));
                        // The reader is consumed while the document is added, so it must
                        // stay open until AddDocument returns.
                        writer.AddDocument(doc);
                    }
                    break;
            }
        }
        catch (FileNotFoundException fnfe)
        {
            TextBox4.Text = TextBox4.Text + fnfe.Message + " \n ";
        }
    }

    /// <summary>Adds one document holding the file path and the already extracted text.</summary>
    private static void AddTextDocument(IndexWriter writer, FileInfo file, string text)
    {
        Document doc = new Document();
        doc.Add(new Field(FilenameField, file.FullName, Field.Store.YES, Field.Index.UN_TOKENIZED));
        doc.Add(new Field(ContentsField, text ?? string.Empty, Field.Store.NO, Field.Index.TOKENIZED));
        writer.AddDocument(doc);
    }

    /// <summary>Extracts the text of a PDF file with PDFBox and closes the document afterwards.</summary>
    private static string ExtractPdfText(FileInfo file)
    {
        PDDocument pdf = PDDocument.load(file.FullName);
        try
        {
            return new PDFTextStripper().getText(pdf);
        }
        finally
        {
            pdf.close();
        }
    }

    /// <summary>
    /// Extracts the text of a Word-readable file (.doc/.rtf) through Word automation.
    /// The document is closed and Word is quit even when extraction fails.
    /// </summary>
    private static string ExtractWordText(FileInfo file)
    {
        object filePath = file.FullName;
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Word.ApplicationClass wordApp = new Microsoft.Office.Interop.Word.ApplicationClass();
        Microsoft.Office.Interop.Word.Document wordDoc = null;
        try
        {
            wordDoc = wordApp.Documents.Open(
                ref filePath, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing, ref missing, ref missing,
                ref missing, ref missing, ref missing, ref missing);

            // Select the whole document and take the selection's text.
            wordDoc.ActiveWindow.Selection.WholeStory();
            return wordDoc.ActiveWindow.Selection.Text;
        }
        finally
        {
            try
            {
                if (wordDoc != null)
                {
                    wordDoc.Close(ref missing, ref missing, ref missing);
                }
            }
            finally
            {
                wordApp.Quit(ref missing, ref missing, ref missing);
            }
        }
    }

    /// <summary>
    /// Extracts the text of every shape on every slide of a PowerPoint file.
    /// The presentation is closed and PowerPoint is quit even when extraction fails.
    /// </summary>
    private static string ExtractPowerPointText(FileInfo file)
    {
        StringBuilder text = new StringBuilder();

        PowerPoint.ApplicationClass pptApp = new PowerPoint.ApplicationClass();
        PowerPoint.Presentation presentation = null;
        try
        {
            presentation = pptApp.Presentations.Open(file.FullName,
                Microsoft.Office.Core.MsoTriState.msoTrue,
                Microsoft.Office.Core.MsoTriState.msoFalse,
                Microsoft.Office.Core.MsoTriState.msoFalse);

            foreach (PowerPoint.Slide slide in presentation.Slides)
            {
                foreach (PowerPoint.Shape shape in slide.Shapes)
                {
                    try
                    {
                        string shapeText = shape.TextFrame.TextRange.Text;
                        // Separate shapes with a space so their words stay distinct tokens.
                        text.Append(shapeText).Append(' ');
                    }
                    catch
                    {
                        // Deliberate best effort: a shape whose text cannot be read is skipped.
                    }
                }
            }
        }
        finally
        {
            try
            {
                if (presentation != null)
                {
                    presentation.Close();
                }
            }
            finally
            {
                pptApp.Quit();
            }
        }

        return text.ToString();
    }

    /// <summary>
    /// Extracts the cell values of the used range of every worksheet of an Excel file.
    /// The workbook is closed and Excel is quit even when extraction fails.
    /// </summary>
    private static string ExtractExcelText(FileInfo file)
    {
        StringBuilder text = new StringBuilder();
        object missing = System.Reflection.Missing.Value;

        Microsoft.Office.Interop.Excel.Application excelApp = new Microsoft.Office.Interop.Excel.ApplicationClass();
        Microsoft.Office.Interop.Excel.Workbook workbook = null;
        try
        {
            workbook = excelApp.Workbooks._Open(file.FullName,
                missing, missing, missing, missing, missing, missing,
                missing, missing, missing, missing, missing, missing);

            int sheetCount = workbook.Sheets.Count;
            for (int sheetIndex = 1; sheetIndex <= sheetCount; sheetIndex++)   // Sheets is 1-based
            {
                Microsoft.Office.Interop.Excel.Worksheet sheet =
                    workbook.Sheets[sheetIndex] as Microsoft.Office.Interop.Excel.Worksheet;
                if (sheet == null)
                {
                    continue;   // not a worksheet (e.g. a chart sheet): nothing to read
                }

                // Address cells relative to the used range: it does not necessarily start
                // at A1, so indexing the sheet itself with these counts could miss data.
                Microsoft.Office.Interop.Excel.Range used = sheet.UsedRange;
                int rowCount = used.Rows.Count;
                int columnCount = used.Columns.Count;

                for (int row = 1; row <= rowCount; row++)
                {
                    for (int column = 1; column <= columnCount; column++)
                    {
                        object value = ((Microsoft.Office.Interop.Excel.Range)used.Cells[row, column]).Value2;
                        if (value != null)
                        {
                            // Separate cells with a space so their words stay distinct tokens.
                            text.Append(value).Append(' ');
                        }
                    }
                }
            }
        }
        finally
        {
            try
            {
                if (workbook != null)
                {
                    workbook.Close(missing, missing, missing);
                }
            }
            finally
            {
                excelApp.Quit();
            }
        }

        return text.ToString();
    }

    /// <summary>
    /// Strips scripts, tags and comments from an HTML string, decodes a handful of
    /// entities, removes remaining angle brackets and line breaks, and returns the
    /// HTML-encoded, trimmed result.
    /// </summary>
    /// <param name="Htmlstring">HTML source; null or empty yields an empty string.</param>
    /// <returns>The HTML-encoded plain text.</returns>
    public static string NoHTML(string Htmlstring)
    {
        if (string.IsNullOrEmpty(Htmlstring))
        {
            return string.Empty;
        }

        // Remove script blocks; Singleline lets '.' span the line breaks inside a script.
        Htmlstring = Regex.Replace(Htmlstring, @"<script[^>]*?>.*?</script>", "",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Remove tags, whitespace that follows a line break, and comment leftovers.
        Htmlstring = Regex.Replace(Htmlstring, @"<(.[^>]*)>", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"([\r\n])[\s]+", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"-->", "", RegexOptions.IgnoreCase);
        Htmlstring = Regex.Replace(Htmlstring, @"<!--.*", "", RegexOptions.IgnoreCase);

        // Decode the known entities, then drop any other numeric entity.
        foreach (string[] entity in HtmlEntities)
        {
            Htmlstring = Regex.Replace(Htmlstring, entity[0], entity[1], RegexOptions.IgnoreCase);
        }
        Htmlstring = Regex.Replace(Htmlstring, @"&#(\d+);", "", RegexOptions.IgnoreCase);

        // "&amp;" is decoded last so that e.g. "&amp;lt;" becomes "&lt;" and is not decoded twice.
        Htmlstring = Regex.Replace(Htmlstring, @"&(amp|#38);", "&", RegexOptions.IgnoreCase);

        // Strings are immutable: the results of Replace must be assigned back.
        Htmlstring = Htmlstring.Replace("<", "");
        Htmlstring = Htmlstring.Replace(">", "");
        Htmlstring = Htmlstring.Replace("\r\n", "");

        // HttpUtility.HtmlEncode does not require a current HTTP request.
        return HttpUtility.HtmlEncode(Htmlstring).Trim();
    }

    #endregion

    #region Search

    /// <summary>
    /// Runs the query typed into TextBox2 against the contents field of the index and
    /// shows the hits in TextBox1; errors are appended to TextBox4.
    /// </summary>
    protected void Button1_Click(object sender, EventArgs e)
    {
        string indexStorePath = Server.MapPath(" index ");   // where the index is stored
        string keyword = TextBox2.Text;

        try
        {
            searcher = new IndexSearcher(indexStorePath);
            try
            {
                QueryParser parser = new QueryParser(ContentsField, new ChineseAnalyzer());
                Query query = parser.Parse(keyword);
                Hits hits = searcher.Search(query);

                // Hits are loaded through the searcher, so print before closing it.
                printResult(hits);
            }
            finally
            {
                // Close the searcher even when parsing or searching fails.
                searcher.Close();
                searcher = null;
            }
        }
        catch (Exception ex)
        {
            TextBox4.Text = TextBox4.Text + ex.Message;
        }
    }

    /// <summary>
    /// Writes one line per hit (rank and stored file path) to TextBox1, or a
    /// "nothing found" message; a hit that cannot be read is reported in TextBox4.
    /// </summary>
    void printResult(Hits h)
    {
        StringBuilder output = new StringBuilder();
        int hitCount = h.Length();

        if (hitCount == 0)
        {
            output.Append(" 对不起,没有搜索到你要的结果。\n ");
        }
        else
        {
            for (int i = 0; i < hitCount; i++)
            {
                try
                {
                    // Resolve the path first so a failing hit leaves no partial line behind.
                    string path = h.Doc(i).Get(FilenameField);
                    output.Append(" 这是第 ").Append(i + 1)
                          .Append(" 个搜索结果,文件路径为:  ").Append(path).Append(" \n ");
                }
                catch (Exception ex)
                {
                    TextBox4.Text = TextBox4.Text + ex.Message;
                }
            }
        }

        output.Append(" ---------------------------\n ");
        TextBox1.Text = output.ToString();
    }

    #endregion

}


完整demo下载,点击下载

你可能感兴趣的:(html)