最近在写一个使用lucene.net为基础架构的全文检索引擎,要求能处理OFFICE和PDF文件,说实话,头疼。
一直在网上找关于这几种文件读取的开源工具和COM组件。很辛苦,但是总算有点收获
先上读.doc的啊!
首先要添加COM引用:
Microsoft Word 12.0 Object Library 和
Microsoft Office 12.0 Object Library,其中12.0表示版本,如果版本低点也可以的。
然后
using Microsoft.Office.Core;
代码如下:
/// <summary>
/// Reads the text content of a Word document by automating Microsoft Word through COM interop.
/// </summary>
/// <param name="filename">Name of the document; it is appended to <c>DocPath</c> to build the path handed to Word.</param>
/// <returns>The document text with every "\a" removed and every "\r" replaced by "\n".</returns>
/// <remarks>
/// Exceptions raised while starting Word, opening the document or reading its text propagate to the
/// caller with their original stack trace. Failures while closing the document or quitting Word are
/// ignored on purpose (best-effort cleanup).
/// </remarks>
public string WordReader(string filename)
{
    filename = DocPath + filename;
    Word.ApplicationClass wordapp = null;
    Word.Document worddoc = null;
    object fileobj = filename;
    object nullobj = System.Reflection.Missing.Value; // placeholder for optional COM arguments
    object Readonly = true;
    object noSaveChange = false;
    string doc = "";
    try
    {
        wordapp = new Word.ApplicationClass();
        // Third argument opens the document read-only; all other optional arguments are left missing.
        worddoc = wordapp.Documents.Open(ref fileobj, ref nullobj, ref Readonly, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj,
            ref nullobj, ref nullobj, ref nullobj, ref nullobj, ref nullobj);
        doc = worddoc.Content.Text;
        // string.Replace returns a new string, so the result must be assigned back.
        // Per the original author's note, Word text contains "\a" (BEL in C#), which makes a
        // console beep when printed; strip it.
        doc = doc.Replace("\a", "");
        // Convert carriage returns to line feeds.
        doc = doc.Replace("\r", "\n");
    }
    finally
    {
        // Each COM object is released only if it was actually created; releasing a null
        // reference here would throw and hide the exception that caused the failure.
        if (worddoc != null)
        {
            try
            {
                worddoc.Close(ref nullobj, ref nullobj, ref nullobj);
            }
            catch
            {
                // Best-effort: a failed Close must not mask the primary result or exception.
            }
            finally
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(worddoc);
                worddoc = null;
            }
        }
        if (wordapp != null)
        {
            try
            {
                wordapp.Quit(ref noSaveChange, ref nullobj, ref nullobj);
            }
            catch
            {
                // Best-effort: a failed Quit must not mask the primary result or exception.
            }
            finally
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(wordapp);
                wordapp = null;
            }
        }
        // Kept from the original: presumably forces finalization of the intermediate COM
        // wrappers (Documents, Content) that are not released explicitly above, so the
        // Word process can exit promptly.
        GC.Collect();
        GC.WaitForPendingFinalizers();
    }
    return doc;
}