【C#】敏感词过滤

问题描述:主要检测识别文本中夹杂的色情、推广、辱骂、违禁违法等垃圾内容,并进行过滤或者屏蔽。

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;

namespace FoundationHelper
{
    #region 非法关键字过滤 bate 1.1
    /// <summary>
    /// Illegal keyword filter. While matching, characters that are not Chinese
    /// ideographs, ASCII digits or ASCII letters are skipped, so a dictionary
    /// word is still detected when other characters are inserted between its
    /// characters. Matching is done on a half-width, lower-case copy of the text.
    /// </summary>
    public class FilterWord
    {
        /// <summary>Creates a filter with no dictionary path set.</summary>
        public FilterWord() { }

        /// <summary>Creates a filter that reads its word list from the given file.</summary>
        /// <param name="dictionaryPath">Path of the word list file, one word per line.</param>
        public FilterWord(string dictionaryPath)
        {
            this.dictionaryPath = dictionaryPath;
        }

        private string dictionaryPath = string.Empty;

        /// <summary>Path of the word list file.</summary>
        public string DictionaryPath
        {
            get { return dictionaryPath; }
            set { dictionaryPath = value; }
        }

        /// <summary>
        /// In-memory dictionary indexed by the first character of each word.
        /// Sized to cover every possible <see cref="char"/> value, including U+FFFF.
        /// </summary>
        private readonly WordGroup[] lexicon = new WordGroup[char.MaxValue + 1];

        private string sourctText = string.Empty;

        /// <summary>Text to be checked.</summary>
        public string SourctText
        {
            get { return sourctText; }
            set { sourctText = value; }
        }

        private readonly List<string> illegalWords = new List<string>();

        /// <summary>
        /// Illegal words found by the most recent <see cref="Filter"/> call, as they
        /// appeared in the source text (including any skipped characters inside them).
        /// </summary>
        public List<string> IllegalWords
        {
            get { return illegalWords; }
        }

        /// <summary>Returns true for characters in the range U+4E00..U+9FA5.</summary>
        private static bool isCHS(char character)
        {
            return character >= 0x4e00 && character <= 0x9fa5;
        }

        /// <summary>Returns true for the ASCII digits '0'..'9'.</summary>
        private static bool isNum(char character)
        {
            return character >= '0' && character <= '9';
        }

        /// <summary>Returns true for the ASCII letters 'a'..'z' and 'A'..'Z'.</summary>
        private static bool isAlphabet(char character)
        {
            return (character >= 'a' && character <= 'z') || (character >= 'A' && character <= 'Z');
        }

        /// <summary>
        /// Returns true when the character takes part in matching; every other
        /// character is skipped while a word is being compared.
        /// </summary>
        private static bool IsSignificant(char character)
        {
            return isCHS(character) || isNum(character) || isAlphabet(character);
        }

        /// <summary>
        /// Converts full-width characters to half-width and lower-cases the result.
        /// The full-width space (12288) becomes the half-width space (32); the
        /// full-width range 65281..65374 is shifted down by 65248 to 33..126.
        /// </summary>
        /// <param name="input">Any non-null string.</param>
        /// <returns>A string of the same length as <paramref name="input"/>.</returns>
        private static string ToDBC(string input)
        {
            char[] chars = input.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                if (chars[i] == 12288)
                {
                    chars[i] = (char)32;
                }
                else if (chars[i] > 65280 && chars[i] < 65375)
                {
                    chars[i] = (char)(chars[i] - 65248);
                }
            }
            // Invariant casing: the match must not depend on the current culture.
            return new string(chars).ToLowerInvariant();
        }

        /// <summary>
        /// Rebuilds the in-memory dictionary from <see cref="DictionaryPath"/>.
        /// Does nothing when no path is set. Each line is normalised with
        /// <see cref="ToDBC"/> and is stored together with its Traditional Chinese form.
        /// </summary>
        private void LoadDictionary()
        {
            if (string.IsNullOrEmpty(dictionaryPath))
            {
                return;
            }

            Array.Clear(lexicon, 0, lexicon.Length);
            HashSet<string> seen = new HashSet<string>();
            string[] lines = System.IO.File.ReadAllLines(dictionaryPath, System.Text.Encoding.Default);
            foreach (string line in lines)
            {
                string key = ToDBC(line).Trim();
                if (key.Length == 0)
                {
                    // Blank lines carry no word; indexing key[0] would throw.
                    continue;
                }
                AddWord(key, seen);
                AddWord(Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0), seen);
            }
        }

        /// <summary>
        /// Stores a word under its first character unless it is empty or was already added.
        /// Only the part after the first character is kept in the group.
        /// </summary>
        private void AddWord(string word, HashSet<string> seen)
        {
            if (string.IsNullOrEmpty(word) || !seen.Add(word))
            {
                return;
            }

            WordGroup group = lexicon[word[0]];
            if (group == null)
            {
                group = new WordGroup();
                lexicon[word[0]] = group;
            }
            group.Add(word.Substring(1));
        }

        /// <summary>
        /// Tries to match the remainder of a dictionary word against the text that
        /// follows position <paramref name="start"/>.
        /// </summary>
        /// <param name="text">Normalised text being scanned.</param>
        /// <param name="start">Index of the already matched first character.</param>
        /// <param name="suffix">Dictionary word without its first character.</param>
        /// <returns>
        /// The index just past the last matched character, or -1 when the word does
        /// not match completely (including when the text ends before the word does).
        /// </returns>
        private static int MatchEnd(string text, int start, string suffix)
        {
            int pos = start + 1;
            for (int i = 0; i < suffix.Length; i++)
            {
                // Skip characters inserted between the characters of the word.
                while (pos < text.Length && !IsSignificant(text[pos]))
                {
                    pos++;
                }
                if (pos >= text.Length || text[pos] != suffix[i])
                {
                    return -1;
                }
                pos++;
            }
            return pos;
        }

        /// <summary>
        /// Checks every word of the group at the given position and returns the end
        /// index (exclusive) of the longest one that matches, or -1 when none does.
        /// </summary>
        private static int FindLongestMatchEnd(string text, int start, WordGroup group)
        {
            int best = -1;
            int count = group.Count();
            for (int n = 0; n < count; n++)
            {
                int end = MatchEnd(text, start, group.GetWord(n));
                if (end > best)
                {
                    best = end;
                }
            }
            return best;
        }

        /// <summary>
        /// Reloads the dictionary, scans <see cref="SourctText"/> and overwrites every
        /// detected word (and the skipped characters inside it) with
        /// <paramref name="replaceChar"/>. <see cref="IllegalWords"/> is cleared and
        /// refilled on every call.
        /// </summary>
        /// <param name="replaceChar">Character written over each character of a detected word.</param>
        /// <returns>The filtered text, or an empty string when the source text is null or empty.</returns>
        public string Filter(char replaceChar)
        {
            LoadDictionary();
            illegalWords.Clear();
            if (string.IsNullOrEmpty(sourctText))
            {
                return string.Empty;
            }

            // Normalise once; ToDBC preserves length, so indexes line up with output.
            string normalized = ToDBC(sourctText);
            char[] output = sourctText.ToCharArray();
            int index = 0;
            while (index < normalized.Length)
            {
                WordGroup group = lexicon[normalized[index]];
                int end = (group == null) ? -1 : FindLongestMatchEnd(normalized, index, group);
                if (end < 0)
                {
                    index++;
                    continue;
                }

                // Record the word as written in the source before masking it.
                illegalWords.Add(new string(output, index, end - index));
                for (int k = index; k < end; k++)
                {
                    output[k] = replaceChar;
                }
                index = end;
            }
            return new string(output);
        }
    }
    /// <summary>
    /// Collection of words that share the same first character.
    /// </summary>
    class WordGroup
    {
        /// <summary>Stored words, in insertion order.</summary>
        private readonly List<string> groupList;

        /// <summary>Creates an empty group.</summary>
        public WordGroup()
        {
            groupList = new List<string>();
        }

        /// <summary>Appends a word to the group.</summary>
        /// <param name="word">Word to store.</param>
        public void Add(string word)
        {
            groupList.Add(word);
        }

        /// <summary>Returns the number of words in the group.</summary>
        public int Count()
        {
            return groupList.Count;
        }

        /// <summary>Returns the word stored at the given position.</summary>
        /// <param name="index">Zero-based position of the word.</param>
        /// <returns>The word at <paramref name="index"/>.</returns>
        public string GetWord(int index)
        {
            return groupList[index];
        }
    }

    #endregion
}
需要留意:词库加载时调用了 Microsoft.VisualBasic.Strings.StrConv(key, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0) 来生成繁体词条,因此必须在使用该类的项目中添加对 Microsoft.VisualBasic 程序集的引用,否则无法编译。调用示例如下:
  string path = "bad.txt";// path of the sensitive-word dictionary file; any custom file can be used
FoundationHelper.FilterWord filter = new FoundationHelper.FilterWord(path);
filter.SourctText = (需要比对的文本);// placeholder: assign the text that should be checked
string msg = filter.Filter('*');// '*' is written over each detected word; msg receives the filtered text

你可能感兴趣的:(敏感词过滤)