中英文关键字生成器



 中英文关键字生成器:

中文部分会被切分成词典命中数最高的"2+3"格式(每个词长为 2 或 3 个字);英文部分保留原词(转为小写),且长度至少为 2 个字母。

详见我的这篇文章:http://www.cnblogs.com/dullwolf/archive/2011/04/14/2015539.html
(倒排索引:中文维持 2+3 长度的重要性)。

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Text.RegularExpressions;



namespace ConsoleApplication2

{

    class Program

    {

        // Demo lexicon: maps each known word to an integer value (always 0 here).
        // Only key membership is consulted when it is passed to KeyMaker.
        static Dictionary<string, int> WordIndex = new Dictionary<string, int>();

        /// <summary>
        /// Demo entry point: fills <c>WordIndex</c> with a small sample lexicon,
        /// prints the keywords generated for two sample sentences, prints a
        /// separator line, and then waits for console input before exiting.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            string[] lexicon =
            {
                "上海", "上海市", "质量", "技术", "监督", "监督局", "吊销", "染色",
                "馒头", "加工", "工厂", "加工厂", "生产", "许可", "许可证"
            };
            foreach (string entry in lexicon)
            {
                WordIndex.Add(entry, 0);
            }

            KeyMaker keyMaker = new KeyMaker();
            string[] samples =
            {
                @"上海市质量技术监督局吊销了Shanghai ABC染色馒头加工厂的生产许可证",
                @"上A海市c质量c技术监督局aa吊销了Shanghai ABC染色馒头厂的生产许可证"
            };
            foreach (string sample in samples)
            {
                Console.WriteLine(keyMaker.GetMaxHitKey(sample, WordIndex));
            }

            Console.WriteLine("----");

            // Keep the console window open until a key is available to read.
            Console.Read();
        }



        /// <summary>
        /// Generates a space-separated keyword string from mixed Chinese/English text.
        /// Each run of two or more Chinese characters is cut into words of length 2 or 3,
        /// choosing the cut with the most words found in a caller-supplied lexicon.
        /// Each run of two or more ASCII letters is kept whole, in lower case.
        /// </summary>
        public class KeyMaker
        {
            // Longest Chinese run that is segmented. Every candidate cut is enumerated and the
            // number of candidates follows f(n) = f(n-2) + f(n-3), so longer runs are skipped
            // and contribute no keywords (same limit as the original "Length < 30" check).
            private const int MaxSegmentableRunLength = 29;

            // Built once instead of on every call; the patterns themselves are unchanged.
            private static readonly Regex ChineseRunPattern = new Regex(@"[\u4e00-\u9fa5]{2,}");
            private static readonly Regex EnglishWordPattern = new Regex(@"[a-z]{2,}");

            /// <summary>
            /// Extracts keywords from <paramref name="text"/>.
            /// </summary>
            /// <param name="text">Input text; it is lower-cased (invariant culture) before matching.</param>
            /// <param name="dict">
            /// Lexicon of known words. Only key membership is used; the values are ignored.
            /// </param>
            /// <returns>
            /// The distinct keywords joined by single spaces: keywords from Chinese runs first
            /// (in order of first appearance), then English words (in order of first appearance).
            /// An empty string when nothing matches.
            /// </returns>
            /// <exception cref="ArgumentNullException">
            /// <paramref name="text"/> is null, or <paramref name="dict"/> is null while the text
            /// contains a Chinese run that has to be segmented.
            /// </exception>
            public string GetMaxHitKey(string text, Dictionary<string, int> dict)
            {
                if (text == null)
                {
                    throw new ArgumentNullException("text");
                }

                // Invariant lower-casing keeps ASCII letters inside [a-z] under every culture
                // (culture-sensitive ToLower() maps 'I' to a dotless i under Turkish cultures).
                string lowered = text.ToLowerInvariant();

                // Explicit first-seen ordering: Dictionary enumeration order is not guaranteed.
                List<string> orderedKeys = new List<string>();
                HashSet<string> seenKeys = new HashSet<string>();

                foreach (Match run in ChineseRunPattern.Matches(lowered))
                {
                    foreach (string word in PickBestSegmentation(run.Value, dict))
                    {
                        AddKey(orderedKeys, seenKeys, word);
                    }
                }

                foreach (Match word in EnglishWordPattern.Matches(lowered))
                {
                    AddKey(orderedKeys, seenKeys, word.Value);
                }

                return string.Join(" ", orderedKeys.ToArray());
            }

            /// <summary>Appends <paramref name="key"/> to the ordered list unless it was already added.</summary>
            private static void AddKey(List<string> orderedKeys, HashSet<string> seenKeys, string key)
            {
                if (seenKeys.Add(key))
                {
                    orderedKeys.Add(key);
                }
            }

            /// <summary>
            /// Returns the 2/3-character segmentation of <paramref name="run"/> with the most
            /// lexicon hits, or an empty array when the run is shorter than 2 characters or
            /// longer than <see cref="MaxSegmentableRunLength"/>.
            /// </summary>
            private static string[] PickBestSegmentation(string run, Dictionary<string, int> dict)
            {
                if (run.Length < 2 || run.Length > MaxSegmentableRunLength)
                {
                    return new string[0];
                }

                if (dict == null)
                {
                    throw new ArgumentNullException("dict");
                }

                List<string[]> candidates = new List<string[]>();
                CollectSegmentations(run, 0, new List<string>(), candidates);

                // Find the maximum hit count; a linear scan suffices, no sorting needed.
                // The strict '>' keeps the first candidate among ties.
                string[] best = new string[0];
                int bestHits = -1;
                foreach (string[] candidate in candidates)
                {
                    int hits = 0;
                    foreach (string word in candidate)
                    {
                        if (dict.ContainsKey(word))
                        {
                            hits++;
                        }
                    }

                    if (hits > bestHits)
                    {
                        bestHits = hits;
                        best = candidate;
                    }
                }

                return best;
            }

            /// <summary>
            /// Recursively enumerates every way to cut <paramref name="run"/> (from
            /// <paramref name="start"/> onward) into pieces of length 2 or 3. Cuts that would
            /// leave a single trailing character are discarded. The 3-character cut is tried
            /// before the 2-character cut, which fixes the order of <paramref name="results"/>.
            /// </summary>
            private static void CollectSegmentations(string run, int start, List<string> prefix, List<string[]> results)
            {
                int remaining = run.Length - start;
                if (remaining < 2)
                {
                    // A lone trailing character cannot form a word: dead end.
                    return;
                }

                if (remaining <= 3)
                {
                    // The last 2 or 3 characters always form the final piece. Pieces are kept
                    // as an array, so no separator handling (and no empty token) is involved.
                    prefix.Add(run.Substring(start));
                    results.Add(prefix.ToArray());
                    prefix.RemoveAt(prefix.Count - 1);
                    return;
                }

                for (int size = 3; size >= 2; size--)
                {
                    prefix.Add(run.Substring(start, size));
                    CollectSegmentations(run, start + size, prefix, results);
                    prefix.RemoveAt(prefix.Count - 1);
                }
            }
        }



    }

}



你可能感兴趣的:(关键字)