具体实现如下:
1. 首先在 SnowballAnalyzer.cs 里面建立类 myEwordEntity。这个类可以看作是 Snowball.cs 的接口:主程序调用 Snowball.cs 的最终目的,就是获得关于词的这样一个“实体”。
/// <summary>
/// Entity class for a word: its text, stem, part-of-speech tag and
/// start/end positions in the article.
/// </summary>
public class myEwordEntity
{
    /// <summary>Text of the word.</summary>
    public string txtWord = string.Empty;

    /// <summary>Root (stem) of the word after filtering.</summary>
    public string stemroot = string.Empty;

    /// <summary>Part-of-speech tag of the word.</summary>
    public string posWord = string.Empty;

    /// <summary>Start position of the word in the article.</summary>
    public int token_begin = 0;

    /// <summary>End position of the word in the article.</summary>
    public int token_end = 0;

    /// <summary>
    /// Creates an entity whose string fields are empty and whose positions are zero
    /// (the values come from the field initializers above).
    /// </summary>
    public myEwordEntity()
    {
    }
}
2. 在 SnowballAnalyzer.cs 下面建立类 Stemmer,完成词根还原功能(代码见第二部分中的链接)。
3. 对 SnowballAnalyzer.cs 中的 class SnowballAnalyzer : Analyzer 做如下修改:
// 1. Fields of SnowballAnalyzer
// Name given to the analyzer by its constructor.
private string name;
// Stop-word table; stays null when no stop words are supplied.
private System.Collections.Hashtable stopSet;
// Location of the model files used by the part-of-speech tagger.
private string mModelPath;
// 2. Constructors
/// <summary>
/// Builds the named analyzer with no stop words and works out where the
/// part-of-speech tagger's model files are located.
/// </summary>
/// <param name="name">Name stored on the analyzer.</param>
public SnowballAnalyzer(string name)
{
    this.name = name;

    // The model files normally sit in a "Models" folder under this project,
    // so the path is derived from the location of the executing assembly.
    string codeBase = System.Reflection.Assembly.GetExecutingAssembly().GetName().CodeBase;
    string assemblyFolder = System.IO.Path.GetDirectoryName(codeBase);
    System.Uri folderUri = new System.Uri(assemblyFolder);
    mModelPath = folderUri.LocalPath + @"\Models\";
}
/// <summary>
/// Builds the named analyzer and installs the given stop words.
/// </summary>
/// <param name="name">Name forwarded to the single-argument constructor.</param>
/// <param name="stopWords">Words converted into the stop set by StopFilter.MakeStopSet.</param>
public SnowballAnalyzer(string name, string[] stopWords)
    : this(name)
{
    this.stopSet = StopFilter.MakeStopSet(stopWords);
}
3. 重写 TokenStream 函数:
/// <summary>
/// Builds the token pipeline for the given reader: standard tokenizer,
/// standard filter, lower-casing and, when a stop set is present,
/// stop-word removal.
/// </summary>
/// <param name="fieldName">Field name; not used by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The assembled token stream.</returns>
public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
{
    TokenStream pipeline = new LowerCaseFilter(new StandardFilter(new StandardTokenizer(reader)));

    // Tokens are later pulled out of this stream so their part of speech can be
    // determined; the SnowballFilter stemming step is deliberately not applied here.
    if (stopSet == null)
    {
        return pipeline;
    }

    return new StopFilter(pipeline, stopSet);
}
4. 修改后该类的主要工作函数:从 TokenStream 中获得词及其位置,并标注词性。
/// <summary>
/// Main worker of the modified analyzer: reads every token from the stream built
/// by <c>TokenStream</c>, records its text, offsets and stem, then labels each
/// word with the tag returned by the part-of-speech tagger.
/// </summary>
/// <param name="fieldName">Field name forwarded to <c>TokenStream</c>.</param>
/// <param name="reader">Source of the text to analyze.</param>
/// <returns>
/// One entity per token, in stream order; an empty list when the stream yields no tokens.
/// </returns>
public List<myEwordEntity> TokenStreamToEntityList(System.String fieldName, System.IO.TextReader reader)
{
    TokenStream result = TokenStream(fieldName, reader);
    List<myEwordEntity> wordEnList = new List<myEwordEntity>();

    // Pull tokens until the stream signals its end with null. The entity is only
    // allocated once a real token is in hand.
    for (Token token = result.Next(); token != null; token = result.Next())
    {
        myEwordEntity entity = new myEwordEntity();
        entity.token_begin = token.StartOffset();
        entity.token_end = token.EndOffset();
        entity.txtWord = token.TermText(); // text of the word
        entity.stemroot = AfterStemed(entity.txtWord); // stem of the word
        wordEnList.Add(entity);
    }

    // Nothing to tag: skip constructing the tagger from its model files.
    if (wordEnList.Count == 0)
    {
        return wordEnList;
    }

    // The tagger works on a non-generic list of word strings.
    ArrayList words = new ArrayList(wordEnList.Count);
    foreach (myEwordEntity entity in wordEnList)
    {
        words.Add(entity.txtWord);
    }

    // mModelPath already ends with a separator, so Path.Combine is used to avoid
    // the doubled backslash that plain concatenation with @"\Parser\tagdict" produced.
    string modelFile = System.IO.Path.Combine(mModelPath, "EnglishPOS.nbin");
    string tagDictionary = System.IO.Path.Combine(mModelPath, @"Parser\tagdict");
    EnglishMaximumEntropyPosTagger tagger = new EnglishMaximumEntropyPosTagger(modelFile, tagDictionary);
    ArrayList tags = tagger.Tag(words);

    // Presumably the tagger returns one tag per input word; the bound guards
    // against an index error if it ever returns a different count.
    int taggedCount = System.Math.Min(tags.Count, wordEnList.Count);
    for (int i = 0; i < taggedCount; i++)
    {
        wordEnList[i].posWord = tags[i].ToString();
    }

    return wordEnList;
}
5. 词根还原:
/// <summary>
/// Reduces a word to its root by lower-casing it and running it through
/// <c>Stemmer</c>.
/// </summary>
/// <param name="input">Word to stem; may be null or empty.</param>
/// <returns>
/// The text produced by the stemmer, or an empty string when
/// <paramref name="input"/> is null or empty.
/// </returns>
public string AfterStemed(string input)
{
    if (string.IsNullOrEmpty(input))
    {
        return string.Empty;
    }

    // Invariant lower-casing keeps the result independent of the current culture
    // (for example, Turkish culture maps 'I' to a dotless 'ı' with ToLower()).
    char[] letters = input.ToLowerInvariant().ToCharArray();

    Stemmer stemmer = new Stemmer();
    stemmer.add(letters, letters.Length);
    stemmer.stem();
    return stemmer.stemerToString();
}