Daily Report 2012.11.07 刘宇翔

修改了模糊匹配的算法。

提高了低匹配程度下的精确度。

修正了一些因算法变动产生的bug。

至此match算法基本完成。

之后将其应用到search算法中,并一起参与search算法的改进。

using System;

using System.Collections.Generic;

using System.Linq;

using System.Text;

using System.Threading.Tasks;

using System.IO;



namespace match0

{

    class Program

    {

        /// <summary>
        /// Scores how well <paramref name="word"/> matches <paramref name="keyword"/>.
        /// </summary>
        /// <remarks>
        /// <paramref name="word"/> is tokenized with ChineseWordSegmentation.word_segmentation; when that
        /// returns no tokens, the text is split on spaces instead. Every token is scored against the
        /// keyword with <see cref="wordmatch"/> and the highest score is returned.
        /// </remarks>
        /// <param name="word">Text to be scored (for example one line of a document).</param>
        /// <param name="keyword">Search keyword the text is compared with.</param>
        /// <returns>
        /// -1 when either argument is null or empty; otherwise the best per-token score, which is 0
        /// when nothing matches or when <paramref name="word"/> contains no tokens at all.
        /// </returns>
        static public int match(string word,string keyword)
        {
            // Reject unusable input up front: previously the check ran only after segmentation,
            // so a null argument crashed and an empty one was segmented for nothing.
            if (string.IsNullOrEmpty(word) || string.IsNullOrEmpty(keyword))
            {
                return -1;
            }

            List<string> tokens = ChineseWordSegmentation.word_segmentation(word);
            if (tokens.Count == 0)
            {
                // Fallback when the segmenter produced nothing: plain space-separated words.
                tokens.AddRange(word.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            // wordmatch never returns a negative value, so starting from 0 gives the same result
            // for one token as for many.
            int best = 0;
            foreach (string token in tokens)
            {
                best = Math.Max(best, wordmatch(token, keyword));
            }

            return best;
        }



        /// <summary>
        /// Scores a single token <paramref name="w"/> against every token of <paramref name="keyword"/>
        /// and returns the best score.
        /// </summary>
        /// <remarks>
        /// <paramref name="keyword"/> is tokenized with ChineseWordSegmentation.word_segmentation; when
        /// that returns no tokens, the keyword is split on spaces instead. Each keyword token is
        /// compared with <paramref name="w"/> through <see cref="wkmatch"/>.
        /// </remarks>
        /// <param name="w">One token taken from the text being searched.</param>
        /// <param name="keyword">The full search keyword (may contain several tokens).</param>
        /// <returns>
        /// The highest <see cref="wkmatch"/> score over all keyword tokens, or 0 when the keyword
        /// yields no tokens.
        /// </returns>
        static public int wordmatch(string w, string keyword)
        {
            List<string> keywordTokens = ChineseWordSegmentation.word_segmentation(keyword);
            if (keywordTokens.Count == 0)
            {
                // Fallback when the segmenter produced nothing: plain space-separated words.
                keywordTokens.AddRange(keyword.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            // A keyword made only of separators yields no tokens even after the fallback.
            // The earlier code indexed element 0 unconditionally and threw in that case;
            // starting from 0 (wkmatch is never negative) makes the empty case a plain "no match".
            int best = 0;
            foreach (string keywordToken in keywordTokens)
            {
                best = Math.Max(best, wkmatch(w, keywordToken));
            }

            return best;
        }



        /// <summary>
        /// Computes a case-insensitive fuzzy-match score between two single tokens.
        /// </summary>
        /// <remarks>
        /// The checks run from the strongest level to the weakest and the first one that succeeds
        /// decides the score:
        /// level 4 - the tokens are equal: 4 * length;
        /// level 3 - the shorter token is a substring of the longer one (<see cref="m3"/>): 3 * shorter length;
        /// level 2 - <see cref="m2"/> reports a long enough common run: 2 * run, or 3 * run when the
        /// edit distance is below a quarter of the shorter length;
        /// level 1 - <see cref="m1"/> finds shared characters: 1 * count, or 2 * count when the edit
        /// distance is below a tenth of the shorter length;
        /// otherwise 0.
        /// When both tokens have the same length, <paramref name="k"/> is treated as the shorter one.
        /// </remarks>
        /// <param name="w">Token taken from the text being searched.</param>
        /// <param name="k">Token taken from the search keyword.</param>
        /// <returns>A non-negative score; larger means a closer match.</returns>
        static public int wkmatch(string w, string k)
        {
            int m = w.Length;
            int n = k.Length;

            // Lower-case with the invariant culture so the score does not change with the
            // machine's regional settings (ToLower() maps 'I' differently under e.g. Turkish).
            w = w.ToLowerInvariant();
            k = k.ToLowerInvariant();

            // Level 4: identical tokens.
            if (w == k)
            {
                return 4 * w.Length;
            }

            bool wIsShorter = m < n;
            string shorter = wIsShorter ? w : k;
            string longer = wIsShorter ? k : w;
            int minLength = wIsShorter ? m : n;

            // Level 3: the shorter token occurs verbatim inside the longer one.
            if (m3(shorter, longer) == 1)
            {
                return 3 * minLength;
            }

            // Level 2: a long common run. A small edit distance promotes the weight from 2 to 3.
            // Integer division means the promotion can only happen once minLength is at least 4.
            int commonRun = m2(shorter, longer);
            if (commonRun > 0)
            {
                int weight = editDistance(w, k) < minLength / 4 ? 3 : 2;
                return weight * commonRun;
            }

            // Level 1: some shared characters. A small edit distance promotes the weight from 1 to 2.
            // Integer division means the promotion can only happen once minLength is at least 10.
            int sharedChars = m1(shorter, longer);
            if (sharedChars > 0)
            {
                int weight = editDistance(w, k) < minLength / 10 ? 2 : 1;
                return weight * sharedChars;
            }

            // Level 0: nothing in common.
            return 0;
        }



        //模糊度3

        /// <summary>
        /// Level-3 test: reports whether <paramref name="x"/> occurs as a contiguous substring of
        /// <paramref name="y"/>, comparing characters by their exact code values.
        /// </summary>
        /// <param name="x">Pattern to look for (callers pass the shorter token).</param>
        /// <param name="y">Text to search in.</param>
        /// <returns>1 when <paramref name="x"/> is non-empty and found inside <paramref name="y"/>; otherwise 0.</returns>
        static public int m3(string x, string y)
        {
            // An empty pattern has never counted as a match here, whereas IndexOf would
            // report it at index 0, so it is excluded explicitly.
            if (x.Length == 0)
            {
                return 0;
            }

            // Ordinal search is exactly the char-by-char comparison the hand-written scan performed.
            return y.IndexOf(x, StringComparison.Ordinal) >= 0 ? 1 : 0;
        }



        //模糊度2

        /// <summary>
        /// Level-2 test: finds the longest run of consecutive characters of <paramref name="x"/>,
        /// starting at some offset in the first half of <paramref name="x"/>, that also appears
        /// consecutively in <paramref name="y"/>.
        /// </summary>
        /// <remarks>
        /// The scan is greedy: for each start offset it walks <paramref name="y"/> once, and after a
        /// mismatch it restarts the pattern at the current character of <paramref name="y"/> without
        /// going back further, so it can miss a run that overlaps a failed attempt.
        /// </remarks>
        /// <param name="x">Pattern (callers pass the shorter token).</param>
        /// <param name="y">Text to scan.</param>
        /// <returns>
        /// The longest run length when it is greater than x.Length / 2 + 1; otherwise 0.
        /// </returns>
        static public int m2(string x, string y)
        {
            int m = x.Length;
            int n = y.Length;
            int longestRun = 0;

            for (int start = 0; start < m / 2; start++)
            {
                // k is the next pattern position to match; the current run length is k - start.
                // Deriving the run from k (instead of a separate counter that outlived this loop)
                // keeps a run from one start offset from leaking into the next one.
                int k = start;

                for (int j = 0; j < n; j++)
                {
                    if (x[k] == y[j])
                    {
                        k++;
                        if (k == m)
                        {
                            // Pattern exhausted: this is the longest run possible for this
                            // start offset, so the rest of y cannot improve on it.
                            break;
                        }
                    }
                    else if (k > start)
                    {
                        // A run just ended: record it and retry the same y[j] from the start offset.
                        longestRun = Math.Max(longestRun, k - start);
                        k = start;
                        j--;
                    }
                }

                // Record a run that was still open when y ended (or that exhausted the pattern);
                // such runs used to be dropped.
                longestRun = Math.Max(longestRun, k - start);
            }

            // NOTE(review): an older comment described the threshold as m / 2, but the code has
            // always compared against m / 2 + 1; the coded threshold is kept.
            return longestRun > (m / 2) + 1 ? longestRun : 0;
        }



        //模糊度1

        /// <summary>
        /// Level-1 test: counts the characters of <paramref name="x"/> that occur at least once
        /// anywhere in <paramref name="y"/>.
        /// </summary>
        /// <remarks>
        /// Every position of <paramref name="x"/> is counted on its own, so a character repeated in
        /// <paramref name="x"/> is counted once per repetition.
        /// </remarks>
        /// <param name="x">String whose characters are looked up (callers pass the shorter token).</param>
        /// <param name="y">String that is searched.</param>
        /// <returns>Number of positions in <paramref name="x"/> whose character is present in <paramref name="y"/>.</returns>
        static public int m1(string x, string y)
        {
            int needleCount = x.Length;
            int haystackLength = y.Length;
            int shared = 0;

            for (int xi = 0; xi < needleCount; xi++)
            {
                char wanted = x[xi];

                // Advance to the first occurrence of the character, or to the end if there is none.
                int yi = 0;
                while (yi < haystackLength && y[yi] != wanted)
                {
                    yi++;
                }

                if (yi != haystackLength)
                {
                    shared++;
                }
            }

            return shared;
        }



        //编写一个求两个字符串编辑距离的方法,提高容错率

        /// <summary>
        /// Computes the edit distance between <paramref name="x"/> and <paramref name="y"/>: the
        /// minimum number of single-character insertions, deletions and substitutions (each costing 1)
        /// needed to turn one string into the other.
        /// </summary>
        /// <param name="x">First string.</param>
        /// <param name="y">Second string.</param>
        /// <returns>The edit distance; 0 when the strings are equal.</returns>
        static public int editDistance(string x , string y) {
            // Cost of each elementary edit.
            const int DeleteCost = 1;
            const int InsertCost = 1;
            const int SubstituteCost = 1;

            int xLength = x.Length;
            int yLength = y.Length;

            // Only the previous and the current row of the dynamic-programming table are needed.
            int[] previousRow = new int[yLength + 1];
            int[] currentRow = new int[yLength + 1];

            // Row 0: building the first j characters of y from nothing takes j insertions.
            for (int j = 1; j <= yLength; j++) {
                previousRow[j] = previousRow[j - 1] + InsertCost;
            }

            for (int i = 1; i <= xLength; i++) {
                // Column 0: reducing the first i characters of x to nothing takes i deletions.
                currentRow[0] = previousRow[0] + DeleteCost;

                for (int j = 1; j <= yLength; j++) {
                    int viaDelete = previousRow[j] + DeleteCost;
                    int viaInsert = currentRow[j - 1] + InsertCost;
                    int viaSubstitute = previousRow[j - 1];
                    if (x[i - 1] != y[j - 1]) {
                        viaSubstitute += SubstituteCost;
                    }

                    currentRow[j] = Math.Min(viaSubstitute, Math.Min(viaDelete, viaInsert));
                }

                // The row just filled becomes the "previous" row of the next iteration.
                int[] swap = previousRow;
                previousRow = currentRow;
                currentRow = swap;
            }

            // After the final swap the last completed row is in previousRow.
            return previousRow[yLength];
        }



        //将标点符号进行更改的从半角转化为全角的方法

        /// <summary>
        /// Converts half-width ASCII characters in <paramref name="s"/> to their full-width forms.
        /// </summary>
        /// <remarks>
        /// Printable ASCII characters U+0021 to U+007E (punctuation, digits and letters) are moved to
        /// the full-width range U+FF01 to U+FF5E, and the space becomes the ideographic space U+3000.
        /// Every other character - control characters such as tab and newline, Latin-1 letters,
        /// CJK text - is left unchanged. Previously every character below U+0100 was shifted,
        /// which turned those characters into unrelated code points.
        /// </remarks>
        /// <param name="s">Text to convert.</param>
        /// <returns>A new string of the same length with the half-width characters replaced.</returns>
        static string half_to_whole(string s) {
            const char FirstConvertible = '\u0021';   // '!'
            const char LastConvertible = '\u007E';    // '~'
            const int FullWidthOffset = 0xFEE0;       // U+FF01 minus U+0021
            const char IdeographicSpace = '\u3000';

            char[] chars = s.ToCharArray();
            for (int i = 0; i < chars.Length; i++) {
                char c = chars[i];
                if (c == ' ') {
                    // The full-width space lives outside the U+FFxx block.
                    chars[i] = IdeographicSpace;
                } else if (c >= FirstConvertible && c <= LastConvertible) {
                    chars[i] = (char)(c + FullWidthOffset);
                }
            }

            return new string(chars);
        }



        /// <summary>
        /// One line read from the input file together with its match score; Main builds a list of
        /// these and orders it by <see cref="matchpoint"/>.
        /// </summary>
        public class eachline

        {

            // Text of the line as read from the file.
            public string line;

            // Score returned by match(line, keyword) for this line.
            public int matchpoint;

            // Zero-based position of the line among the non-empty lines of the file, assigned before sorting.
            public int num;

        }



        /// <summary>
        /// Test driver: reads a keyword from standard input, scores every non-empty line of
        /// "test.txt" against it with <see cref="match"/>, and prints the keyword tokens followed by
        /// the best-scoring lines (at most 20) with their score and tokens.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Upper bound on the number of ranked lines that are printed.
            const int MaxResults = 20;

            // ReadLine returns null when standard input is closed; treat that as an empty keyword.
            string keyword = Console.ReadLine() ?? string.Empty;

            List<eachline> LineList = new List<eachline>();

            // The using block closes the file even when scoring a line throws.
            using (StreamReader objReader = new StreamReader("test.txt", System.Text.Encoding.Default))
            {
                string sLine;
                while ((sLine = objReader.ReadLine()) != null)
                {
                    if (sLine.Length == 0)
                    {
                        continue;
                    }

                    eachline l = new eachline();
                    l.line = sLine;
                    l.matchpoint = match(sLine, keyword);   // scored once per line
                    l.num = LineList.Count;
                    LineList.Add(l);
                }
            }

            // Highest score first. OrderByDescending is stable, so lines with equal scores keep
            // their file order, as they did with the earlier adjacent-swap sort.
            List<eachline> ranked = LineList.OrderByDescending(entry => entry.matchpoint).ToList();

            foreach (string keywordToken in ChineseWordSegmentation.word_segmentation(keyword))
            {
                Console.WriteLine(keywordToken);
            }

            // Never index past the end when the file has fewer lines than MaxResults.
            int shown = Math.Min(MaxResults, ranked.Count);
            for (int i = 0; i < shown; i++)
            {
                Console.WriteLine(ranked[i].line);
                Console.WriteLine(ranked[i].matchpoint);

                foreach (string lineToken in ChineseWordSegmentation.word_segmentation(ranked[i].line))
                {
                    Console.Write(lineToken + ' ');
                }
                Console.WriteLine(' ');
            }
        }

    }

}

 

你可能感兴趣的:(port)