今天对李忠修改过的match函数进行测试,修正bug,并进行优化。
将中文分词方法加入到算法中,提高了算法的精确度。
但将中文分词方法加入算法后,出现了一些新问题,因此对这些新出现的问题进行了修正和优化。
测试过程中运用了900条字符串的样例。
更新后代码如下:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace match0
{
    /// <summary>
    /// Fuzzy matcher that scores how well a line of text matches a keyword.
    /// Both inputs are split into tokens (via <c>ChineseWordSegmentation</c>, falling back
    /// to splitting on spaces) and every word token is compared with every keyword token.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Computes the match degree between <paramref name="word"/> and <paramref name="keyword"/>.
        /// </summary>
        /// <param name="word">Text to be scored (for example one line of the test file).</param>
        /// <param name="keyword">Search text entered by the user.</param>
        /// <returns>
        /// -1 when either input is null or empty; otherwise the highest score produced by
        /// <see cref="wkmatch"/> over all (word token, keyword token) pairs (0 when nothing matches).
        /// </returns>
        public static int match(string word, string keyword)
        {
            // Reject empty input before doing any segmentation work.
            if (string.IsNullOrEmpty(word) || string.IsNullOrEmpty(keyword))
            {
                return -1;
            }

            List<string> wordTokens = Tokenize(word);

            // The keyword is tokenized once here instead of once per word token.
            // NOTE(review): this assumes word_segmentation returns the same tokens for the
            // same input on every call - confirm against its implementation.
            List<string> keywordTokens = Tokenize(keyword);

            int best = 0;
            foreach (string token in wordTokens)
            {
                best = Math.Max(best, BestTokenDegree(token, keywordTokens));
            }

            return best;
        }

        /// <summary>
        /// Scores a single word token against every token of <paramref name="keyword"/>.
        /// </summary>
        /// <param name="w">A single word token.</param>
        /// <param name="keyword">The full, unsegmented keyword text.</param>
        /// <returns>
        /// The highest <see cref="wkmatch"/> score over the keyword tokens, or 0 when the
        /// keyword yields no tokens (for example when it is empty or contains only spaces).
        /// </returns>
        public static int wordmatch(string w, string keyword)
        {
            return BestTokenDegree(w, Tokenize(keyword));
        }

        /// <summary>
        /// Splits text into tokens with the Chinese word segmenter; when the segmenter yields
        /// nothing, falls back to splitting on spaces.
        /// </summary>
        private static List<string> Tokenize(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return new List<string>();
            }

            List<string> tokens = ChineseWordSegmentation.word_segmentation(text);
            if (tokens == null || tokens.Count == 0)
            {
                tokens = new List<string>(
                    text.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries));
            }

            return tokens;
        }

        /// <summary>
        /// Returns the highest <see cref="wkmatch"/> score of <paramref name="w"/> against the
        /// given keyword tokens, or 0 when the token list is empty.
        /// </summary>
        private static int BestTokenDegree(string w, List<string> keywordTokens)
        {
            int best = 0;
            foreach (string keywordToken in keywordTokens)
            {
                best = Math.Max(best, wkmatch(w, keywordToken));
            }

            return best;
        }

        /// <summary>
        /// Scores one word token against one keyword token, ignoring case.
        /// </summary>
        /// <param name="w">A single word token.</param>
        /// <param name="k">A single keyword token.</param>
        /// <returns>
        /// <c>grade * length</c>, where length is the shorter token's length and grade is:
        /// 4 - the tokens are equal;
        /// 3 - the shorter token is a substring of the longer one, or grade 2 holds and the
        ///     edit distance is below length / 4;
        /// 2 - a contiguous run longer than half of the shorter token is shared, or grade 1
        ///     holds and the edit distance is below length / 10;
        /// 1 - the tokens share at least one character;
        /// 0 - nothing in common, or either token is null or empty.
        /// </returns>
        public static int wkmatch(string w, string k)
        {
            // An empty token cannot match anything; returning early also keeps the
            // character-indexing helpers below away from empty strings.
            if (string.IsNullOrEmpty(w) || string.IsNullOrEmpty(k))
            {
                return 0;
            }

            // Culture-independent case folding so results do not vary with the machine locale.
            w = w.ToLowerInvariant();
            k = k.ToLowerInvariant();

            // Grade 4: exact match.
            if (w == k)
            {
                return 4 * w.Length;
            }

            // The helpers expect the shorter string first; on equal length k goes first.
            bool wIsShorter = w.Length < k.Length;
            string shorter = wIsShorter ? w : k;
            string longer = wIsShorter ? k : w;
            int minLength = shorter.Length;

            // Grade 3: the shorter token is contained in the longer one.
            if (m3(shorter, longer) == 1)
            {
                return 3 * minLength;
            }

            // Grade 2: a long common run. A pair whose edit distance is very small is
            // promoted to grade 3 to tolerate small typos.
            if (m2(shorter, longer) == 1)
            {
                int grade = editDistance(w, k) < minLength / 4 ? 3 : 2;
                return grade * minLength;
            }

            // Grade 1: at least one shared character, with the same promotion rule.
            if (m1(shorter, longer) == 1)
            {
                int grade = editDistance(w, k) < minLength / 10 ? 2 : 1;
                return grade * minLength;
            }

            // Grade 0: nothing in common.
            return 0;
        }

        /// <summary>
        /// Grade-3 test: is <paramref name="x"/> a non-empty substring of <paramref name="y"/>?
        /// </summary>
        /// <returns>1 when <paramref name="x"/> occurs in <paramref name="y"/>, otherwise 0.</returns>
        public static int m3(string x, string y)
        {
            if (string.IsNullOrEmpty(x) || string.IsNullOrEmpty(y))
            {
                return 0;
            }

            // Ordinal search compares char by char.
            return y.IndexOf(x, StringComparison.Ordinal) >= 0 ? 1 : 0;
        }

        /// <summary>
        /// Grade-2 test: do <paramref name="x"/> and <paramref name="y"/> share a contiguous
        /// run of characters longer than half the length of <paramref name="x"/>?
        /// </summary>
        /// <param name="x">The shorter string.</param>
        /// <param name="y">The longer string.</param>
        /// <returns>1 when such a run exists, otherwise 0.</returns>
        public static int m2(string x, string y)
        {
            if (string.IsNullOrEmpty(x) || string.IsNullOrEmpty(y))
            {
                return 0;
            }

            int m = x.Length;
            int n = y.Length;
            int half = m / 2;
            int longestRun = 0;

            // A run starting after index m / 2 cannot be longer than m / 2, so only the
            // first half of x needs to be tried as a starting point.
            for (int start = 0; start <= half; start++)
            {
                for (int offset = 0; offset < n; offset++)
                {
                    // The run length is counted afresh for every alignment; a counter that
                    // carries over between alignments would measure the total number of
                    // matched characters rather than the longest run.
                    int run = 0;
                    while (start + run < m
                        && offset + run < n
                        && x[start + run] == y[offset + run])
                    {
                        run++;
                    }

                    if (run > longestRun)
                    {
                        longestRun = run;
                    }
                }
            }

            return longestRun > half ? 1 : 0;
        }

        /// <summary>
        /// Grade-1 test: do <paramref name="x"/> and <paramref name="y"/> share at least one character?
        /// </summary>
        /// <returns>1 when a common character exists, otherwise 0.</returns>
        public static int m1(string x, string y)
        {
            foreach (char c in x)
            {
                if (y.IndexOf(c) >= 0)
                {
                    return 1; // one shared character is enough
                }
            }

            return 0;
        }

        /// <summary>
        /// Computes the edit distance between two strings with unit costs for insertion,
        /// deletion and substitution, using a dynamic-programming table.
        /// </summary>
        /// <returns>The minimum number of single-character edits turning <paramref name="x"/> into <paramref name="y"/>.</returns>
        public static int editDistance(string x, string y)
        {
            // Cost of each edit operation.
            const int DeleteCost = 1;
            const int InsertCost = 1;
            const int SubstituteCost = 1;

            int rows = x.Length + 1;
            int cols = y.Length + 1;

            // distance[i, j] = edit distance between the first i chars of x and the first j chars of y.
            int[,] distance = new int[rows, cols];

            // First column / first row: turning a prefix into the empty string and vice versa.
            for (int i = 1; i < rows; i++)
            {
                distance[i, 0] = distance[i - 1, 0] + DeleteCost;
            }

            for (int j = 1; j < cols; j++)
            {
                distance[0, j] = distance[0, j - 1] + InsertCost;
            }

            for (int i = 1; i < rows; i++)
            {
                for (int j = 1; j < cols; j++)
                {
                    int viaDelete = distance[i - 1, j] + DeleteCost;
                    int viaInsert = distance[i, j - 1] + InsertCost;
                    int viaSubstitute = distance[i - 1, j - 1]
                        + (x[i - 1] == y[j - 1] ? 0 : SubstituteCost);

                    // Keep the cheapest of the three ways to reach this cell.
                    distance[i, j] = Math.Min(viaSubstitute, Math.Min(viaDelete, viaInsert));
                }
            }

            return distance[x.Length, y.Length];
        }

        /// <summary>
        /// Converts half-width ASCII characters to their full-width forms.
        /// Printable ASCII U+0021..U+007E maps to U+FF01..U+FF5E and the space maps to the
        /// ideographic space U+3000; every other character is returned unchanged.
        /// </summary>
        static string half_to_whole(string s)
        {
            // Distance between a printable ASCII character and its full-width counterpart.
            const int FullWidthOffset = 0xFEE0;

            if (string.IsNullOrEmpty(s))
            {
                return s;
            }

            char[] chars = s.ToCharArray();
            for (int i = 0; i < chars.Length; i++)
            {
                char c = chars[i];
                if (c == ' ')
                {
                    chars[i] = '\u3000';
                }
                else if (c >= '\u0021' && c <= '\u007E')
                {
                    chars[i] = (char)(c + FullWidthOffset);
                }
            }

            return new string(chars);
        }

        /// <summary>One line of the test file together with its match score.</summary>
        public class eachline
        {
            /// <summary>The text of the line.</summary>
            public string line;

            /// <summary>Score returned by <see cref="match"/> for this line.</summary>
            public int matchpoint;

            /// <summary>Zero-based position of the line among the non-empty lines read.</summary>
            public int num;
        }

        /// <summary>
        /// Test driver: reads a keyword from standard input, scores every non-empty line of
        /// "test.txt" against it, and prints the keyword tokens followed by the best-scoring
        /// lines (at most 20) with their scores and tokens.
        /// </summary>
        static void Main(string[] args)
        {
            const int TopResultCount = 20;

            string keyword = Console.ReadLine() ?? string.Empty;

            List<eachline> lines = new List<eachline>();
            using (StreamReader reader = new StreamReader("test.txt", System.Text.Encoding.Default))
            {
                string sLine;
                while ((sLine = reader.ReadLine()) != null)
                {
                    if (sLine.Length == 0)
                    {
                        continue;
                    }

                    eachline entry = new eachline();
                    entry.line = sLine;
                    entry.matchpoint = match(sLine, keyword);
                    entry.num = lines.Count;
                    lines.Add(entry);
                }
            }

            // Highest score first; OrderByDescending is stable, so equal scores keep file order.
            List<eachline> ranked = lines.OrderByDescending(entry => entry.matchpoint).ToList();

            List<string> keywordlist = ChineseWordSegmentation.word_segmentation(keyword);
            foreach (string token in keywordlist)
            {
                Console.WriteLine(token);
            }

            // Never read past the end of the list when the file has fewer lines than the limit.
            int shown = Math.Min(TopResultCount, ranked.Count);
            for (int i = 0; i < shown; i++)
            {
                Console.WriteLine(ranked[i].line);
                Console.WriteLine(ranked[i].matchpoint);

                List<string> wordlist = ChineseWordSegmentation.word_segmentation(ranked[i].line);
                foreach (string token in wordlist)
                {
                    Console.Write(token + ' ');
                }

                Console.WriteLine(' ');
            }
        }
    }
}
但目前仍然能在语义上提高算法的精确度。
之后需要优化多关键词语义分析计算匹配程度,并测试修改过的代码,修正错误。