问题描述:主要检测识别文本中夹杂的色情、推广、辱骂、违禁违法等垃圾内容,并进行过滤或者屏蔽。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace FoundationHelper
{
#region 非法关键字过滤 beta 1.1
/// <summary>
/// Illegal-keyword filter. Words are loaded from a dictionary file (one word per
/// line) and matched against <see cref="SourctText"/>; characters that are not
/// Chinese ideographs, digits or Latin letters are skipped while matching the
/// characters that follow the first one, so "a-b" still matches the word "ab".
/// </summary>
public class FilterWord
{
    /// <summary>Creates a filter without a dictionary; set <see cref="DictionaryPath"/> before filtering.</summary>
    public FilterWord() { }

    /// <summary>Creates a filter that loads its words from the given file.</summary>
    /// <param name="dictionaryPath">Path of the dictionary file, one word per line.</param>
    public FilterWord(string dictionaryPath)
    {
        this.dictionaryPath = dictionaryPath;
    }

    private string dictionaryPath = string.Empty;

    // True once the file at dictionaryPath has been loaded into memoryLexicon.
    private bool dictionaryLoaded = false;

    /// <summary>
    /// Path of the dictionary file. Assigning this property (even with the same
    /// value) makes the next <see cref="Filter"/> call reload the file.
    /// </summary>
    public string DictionaryPath
    {
        get { return dictionaryPath; }
        set
        {
            dictionaryPath = value;
            dictionaryLoaded = false;
        }
    }

    /// <summary>
    /// In-memory dictionary indexed by the first character of each word.
    /// Sized char.MaxValue + 1 so that every char value, including '\uFFFF',
    /// is a valid index.
    /// </summary>
    private readonly WordGroup[] memoryLexicon = new WordGroup[char.MaxValue + 1];

    private string sourctText = string.Empty;

    /// <summary>Text to be checked.</summary>
    public string SourctText
    {
        get { return sourctText; }
        set { sourctText = value; }
    }

    private readonly List<string> illegalWords = new List<string>();

    /// <summary>
    /// Illegal words found by the most recent <see cref="Filter"/> call, as they
    /// appear in the source text (including any skipped separator characters).
    /// </summary>
    public List<string> IllegalWords
    {
        get { return illegalWords; }
    }

    /// <summary>Returns true for characters in the range U+4E00..U+9FA5.</summary>
    private static bool IsChs(char character)
    {
        return character >= 0x4e00 && character <= 0x9fa5;
    }

    /// <summary>Returns true for the ASCII digits '0'..'9'.</summary>
    private static bool IsNum(char character)
    {
        return character >= '0' && character <= '9';
    }

    /// <summary>Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.</summary>
    private static bool IsAlphabet(char character)
    {
        return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
    }

    /// <summary>
    /// Returns true when the character takes part in matching; every other
    /// character is treated as a separator and skipped.
    /// </summary>
    private static bool IsWordChar(char character)
    {
        return IsChs(character) || IsNum(character) || IsAlphabet(character);
    }

    /// <summary>
    /// Converts full-width characters to half-width and lower-cases the result.
    /// The full-width space (12288) becomes the half-width space (32); the other
    /// full-width characters (65281-65374) are shifted down by 65248.
    /// The result has the same length as the input.
    /// </summary>
    /// <param name="input">Any non-null string.</param>
    /// <returns>The half-width, lower-case form of <paramref name="input"/>.</returns>
    private static string ToDBC(string input)
    {
        char[] chars = input.ToCharArray();
        for (int i = 0; i < chars.Length; i++)
        {
            if (chars[i] == 12288)
            {
                chars[i] = (char)32;
            }
            else if (chars[i] > 65280 && chars[i] < 65375)
            {
                chars[i] = (char)(chars[i] - 65248);
            }
        }
        // Invariant culture: the mapping must not depend on the current locale.
        return new string(chars).ToLowerInvariant();
    }

    /// <summary>
    /// Loads the dictionary file into <see cref="memoryLexicon"/> unless it has
    /// already been loaded for the current path. Does nothing when no path is set.
    /// </summary>
    private void LoadDictionary()
    {
        if (dictionaryLoaded || string.IsNullOrEmpty(dictionaryPath))
        {
            return;
        }

        string[] lines = System.IO.File.ReadAllLines(dictionaryPath, System.Text.Encoding.Default);

        // A set removes duplicates (a word and its Traditional Chinese form are
        // frequently identical).
        HashSet<string> uniqueWords = new HashSet<string>(StringComparer.Ordinal);
        foreach (string line in lines)
        {
            string key = ToDBC(line).Trim();
            if (key.Length == 0)
            {
                // Blank lines would otherwise fail on key[0] below.
                continue;
            }
            uniqueWords.Add(key);

            // Requires a reference to the Microsoft.VisualBasic assembly.
            string traditional = Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);
            if (!string.IsNullOrEmpty(traditional))
            {
                uniqueWords.Add(traditional);
            }
        }

        Array.Clear(memoryLexicon, 0, memoryLexicon.Length);
        foreach (string word in uniqueWords)
        {
            WordGroup group = memoryLexicon[word[0]];
            if (group == null)
            {
                group = new WordGroup();
                memoryLexicon[word[0]] = group;
            }
            // Only the part after the first character is stored; the first
            // character is the index into memoryLexicon.
            group.Add(word.Substring(1));
        }
        dictionaryLoaded = true;
    }

    /// <summary>
    /// Tries to match <paramref name="remainder"/> (a dictionary word without its
    /// first character) against <paramref name="text"/> right after position
    /// <paramref name="start"/>, skipping separator characters in the text.
    /// </summary>
    /// <param name="text">Normalized text being scanned.</param>
    /// <param name="start">Index of the already-matched first character.</param>
    /// <param name="remainder">Characters of the word after the first one; may be empty.</param>
    /// <param name="extraLength">
    /// On success, the number of text characters after <paramref name="start"/>
    /// covered by the match, skipped separators included; otherwise 0.
    /// </param>
    /// <returns>True only when every character of <paramref name="remainder"/> was matched.</returns>
    private static bool TryMatchRemainder(string text, int start, string remainder, out int extraLength)
    {
        extraLength = 0;
        int position = start + 1;
        for (int i = 0; i < remainder.Length; i++)
        {
            while (position < text.Length && !IsWordChar(text[position]))
            {
                position++;
            }
            // Running out of text in the middle of a word is a failed match.
            if (position >= text.Length || text[position] != remainder[i])
            {
                return false;
            }
            position++;
        }
        extraLength = position - start - 1;
        return true;
    }

    /// <summary>
    /// Finds every dictionary word in <see cref="SourctText"/> and replaces each
    /// of its characters (and the separators inside it) with
    /// <paramref name="replaceChar"/>. When several words match at the same
    /// position the longest match is used. The words found are available
    /// through <see cref="IllegalWords"/> afterwards.
    /// </summary>
    /// <param name="replaceChar">Character written over every matched character.</param>
    /// <returns>
    /// The masked text, or an empty string when <see cref="SourctText"/> is null or empty.
    /// </returns>
    public string Filter(char replaceChar)
    {
        LoadDictionary();
        illegalWords.Clear();
        if (string.IsNullOrEmpty(sourctText))
        {
            return string.Empty;
        }

        // Normalize once; matching runs on the normalized copy while masking and
        // reporting use the original characters at the same indexes.
        string normalized = ToDBC(sourctText);
        char[] masked = sourctText.ToCharArray();

        int index = 0;
        while (index < normalized.Length)
        {
            int bestExtra = -1;
            WordGroup group = memoryLexicon[normalized[index]];
            if (group != null)
            {
                int wordCount = group.Count();
                for (int z = 0; z < wordCount; z++)
                {
                    int extra;
                    if (TryMatchRemainder(normalized, index, group.GetWord(z), out extra) && extra > bestExtra)
                    {
                        bestExtra = extra;
                    }
                }
            }

            if (bestExtra < 0)
            {
                index++;
                continue;
            }

            int matchLength = bestExtra + 1;
            illegalWords.Add(sourctText.Substring(index, matchLength));
            for (int pos = index; pos < index + matchLength; pos++)
            {
                masked[pos] = replaceChar;
            }
            index += matchLength;
        }
        return new string(masked);
    }
}
/// <summary>
/// A list of words that share the same first character.
/// </summary>
class WordGroup
{
    /// <summary>Backing list of the words in this group, in insertion order.</summary>
    private readonly List<string> groupList;

    /// <summary>Creates an empty group.</summary>
    public WordGroup()
    {
        groupList = new List<string>();
    }

    /// <summary>Appends a word to the group.</summary>
    /// <param name="word">The word to add.</param>
    public void Add(string word)
    {
        groupList.Add(word);
    }

    /// <summary>Returns the number of words in the group.</summary>
    /// <returns>The word count.</returns>
    public int Count()
    {
        return groupList.Count;
    }

    /// <summary>Returns the word stored at the given position.</summary>
    /// <param name="index">Zero-based position of the word.</param>
    /// <returns>The word at <paramref name="index"/>.</returns>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="index"/> is negative or not less than <see cref="Count"/>.
    /// </exception>
    public string GetWord(int index)
    {
        return groupList[index];
    }
}
#endregion
}
需要留意:词库加载时调用了 Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0) 把每个词转换成繁体后一并加入词库,因此使用该类的项目需要添加对 Microsoft.VisualBasic 程序集的引用。
string path = "bad.txt";//敏感词库----当然你也可以自定义
FoundationHelper.FilterWord filter = new FoundationHelper.FilterWord(path);
filter.SourctText = (需要比对的文本);
string msg = filter.Filter('*');//返回把敏感词逐字替换为 '*' 之后的文本;命中的敏感词可通过 filter.IllegalWords 获取