// Example call: no custom stop words, lower-case the tokens, stem with the "German" Snowball stemmer.
var words = StringStemHelper.Stemming(s, stopWords: null, lowerCase: true, lang: LanguageOp.DE);
/// <summary>
/// Named stemmer language option. Each predefined instance carries the
/// language name that is handed to the Snowball filter.
/// </summary>
public class LanguageOp
{
    // Backing store for Language; only assigned by the Named factory below.
    private string _languageName;

    /// <summary>Language name carried by this option (null on an instance created directly).</summary>
    public string Language
    {
        get { return _languageName; }
    }

    public static readonly LanguageOp En = Named("English");
    public static readonly LanguageOp DE = Named("German");
    public static readonly LanguageOp DE2 = Named("German2");
    public static readonly LanguageOp FR = Named("French");

    // Creates an option carrying the given language name.
    private static LanguageOp Named(string languageName)
    {
        var option = new LanguageOp();
        option._languageName = languageName;
        return option;
    }
}
/// <summary>
/// Runs <paramref name="input"/> through the token stream built by ToTokenStream
/// and collects every non-blank term it produces, in stream order.
/// </summary>
/// <param name="input">Text to tokenize and stem.</param>
/// <param name="stopWords">Stop-word set forwarded to ToTokenStream; may be null.</param>
/// <param name="lowerCase">Forwarded to ToTokenStream to request lower-casing.</param>
/// <param name="lang">Stemmer language forwarded to ToTokenStream; may be null.</param>
/// <returns>The list of terms emitted by the stream, excluding null/whitespace terms.</returns>
public static List<string> Stemming(string input, ISet<string> stopWords = null, bool lowerCase = false, LanguageOp lang = null)
{
    var stems = new List<string>();

    using (var stream = ToTokenStream(input, stopWords, lowerCase, lang))
    {
        while (stream.IncrementToken())
        {
            string current = stream.GetAttribute<ITermAttribute>().Term;

            // Skip terms that carry no visible text.
            if (string.IsNullOrWhiteSpace(current))
            {
                continue;
            }

            stems.Add(current);
        }
    }

    return stems;
}
/// <summary>
/// Builds the analysis chain used for stemming: a StandardTokenizer, optionally
/// followed by StandardFilter + LowerCaseFilter, optionally a StopFilter, and
/// finally a SnowballFilter for the selected language.
/// </summary>
/// <param name="text">Text to tokenize.</param>
/// <param name="stopSet">
/// Stop words to remove. When null, EnglishStopWordSet is used only if no language
/// was specified either; otherwise no stop-word filtering is performed.
/// </param>
/// <param name="lowerCase">When true, tokens pass through StandardFilter and LowerCaseFilter.</param>
/// <param name="lang">Stemmer language; LanguageOp.En is used when null.</param>
/// <returns>The composed token stream; the caller is responsible for disposing it.</returns>
private static TokenStream ToTokenStream(string text, ISet<string> stopSet, bool lowerCase = false, LanguageOp lang = null)
{
    // Decide on the default stop words BEFORE defaulting the language. Doing it
    // afterwards made the "lang == null" test always false, so stop words were
    // never filtered at all.
    if (stopSet == null && lang == null)
    {
        stopSet = EnglishStopWordSet;
    }
    if (lang == null)
    {
        lang = LanguageOp.En;
    }

    var version = Lucene.Net.Util.Version.LUCENE_30;
    var reader = new StringReader(text);

    var tokenizer = new StandardTokenizer(version, reader);
    tokenizer.MaxTokenLength = 255;
    TokenStream tokenStream = tokenizer;

    if (lowerCase)
    {
        tokenStream = new StandardFilter(tokenStream);
        tokenStream = new LowerCaseFilter(tokenStream);
    }

    // Apply whichever stop set is in effect: the caller's, or the English default chosen above.
    if (stopSet != null)
    {
        tokenStream = new StopFilter(true, tokenStream, stopSet);
    }

    tokenStream = new SnowballFilter(tokenStream, lang.Language);
    return tokenStream;
}
DLL REFERENCES (assemblies required by the code above):
Lucene.Net
Lucene.Net.Contrib.Analyzers
Lucene.Net.Contrib.Snowball