编辑距离就是用来计算从原串(s)转换到目标串(t)所需要的最少的插入,删除和替换的数目,在NLP中应用比较广泛,如一些评测方法中就用到了(wer,mWer等),同时也常用来计算你对原文本所作的改动数。
编辑距离的算法是首先由俄国科学家Levenshtein提出的,故又叫Levenshtein Distance。
Levenshtein distance (LD) is a measure of the similarity between two strings, which we will refer to as the source string (s) and the target string (t). The distance is the number of deletions, insertions, or substitutions required to transform s into t. For example,
The greater the Levenshtein distance, the more different the strings are.
Levenshtein distance is named after the Russian scientist Vladimir Levenshtein, who devised the algorithm in 1965. If you can't spell or pronounce Levenshtein, the metric is also sometimes called edit distance.
The Levenshtein distance algorithm has been used in:
Step | Description |
---|---|
1 | Set n to be the length of s. Set m to be the length of t. If n = 0, return m and exit. If m = 0, return n and exit. Construct a matrix containing 0..m rows and 0..n columns. |
2 | Initialize the first row to 0..n. Initialize the first column to 0..m. |
3 | Examine each character of s (i from 1 to n). |
4 | Examine each character of t (j from 1 to m). |
5 | If s[i] equals t[j], the cost is 0. If s[i] doesn't equal t[j], the cost is 1. |
6 | Set cell d[i,j] of the matrix equal to the minimum of: a. The cell immediately above plus 1: d[i-1,j] + 1. b. The cell immediately to the left plus 1: d[i,j-1] + 1. c. The cell diagonally above and to the left plus the cost: d[i-1,j-1] + cost. |
7 | After the iteration steps (3, 4, 5, 6) are complete, the distance is found in cell d[n,m]. |
This section shows how the Levenshtein distance is computed when the source string is "GUMBO" and the target string is "GAMBOL".
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | |||||
A | 2 | |||||
M | 3 | |||||
B | 4 | |||||
O | 5 | |||||
L | 6 |
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | 0 | ||||
A | 2 | 1 | ||||
M | 3 | 2 | ||||
B | 4 | 3 | ||||
O | 5 | 4 | ||||
L | 6 | 5 |
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | 0 | 1 | |||
A | 2 | 1 | 1 | |||
M | 3 | 2 | 2 | |||
B | 4 | 3 | 3 | |||
O | 5 | 4 | 4 | |||
L | 6 | 5 | 5 |
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | 0 | 1 | 2 | ||
A | 2 | 1 | 1 | 2 | ||
M | 3 | 2 | 2 | 1 | ||
B | 4 | 3 | 3 | 2 | ||
O | 5 | 4 | 4 | 3 | ||
L | 6 | 5 | 5 | 4 |
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | 0 | 1 | 2 | 3 | |
A | 2 | 1 | 1 | 2 | 3 | |
M | 3 | 2 | 2 | 1 | 2 | |
B | 4 | 3 | 3 | 2 | 1 | |
O | 5 | 4 | 4 | 3 | 2 | |
L | 6 | 5 | 5 | 4 | 3 |
G | U | M | B | O | ||
0 | 1 | 2 | 3 | 4 | 5 | |
G | 1 | 0 | 1 | 2 | 3 | 4 |
A | 2 | 1 | 1 | 2 | 3 | 4 |
M | 3 | 2 | 2 | 1 | 2 | 3 |
B | 4 | 3 | 3 | 2 | 1 | 2 |
O | 5 | 4 | 4 | 3 | 2 | 1 |
L | 6 | 5 | 5 | 4 | 3 | 2 |
算法示例1:
private int ComputeDistance (string s, string t) { int n=s.Length; int m=t.Length; int[,] distance=new int[n + 1, m + 1]; // matrix int cost=0; if(n == 0) return m; if(m == 0) return n; //init1 for(int i=0; i <= n; distance[i, 0]=i++); for(int j=0; j <= m; distance[0, j]=j++); //find min distance for(int i=1; i <= n; i++) { for(int j=1; j <= m;j++) { cost=(t.Substring(j - 1, 1) == s.Substring(i - 1, 1) ? 0 : 1); distance[i,j]=Min3(distance[i - 1, j] + 1, distance[i, j - 1] + 1, distance[i - 1, j - 1] + cost); } } return distance[n, m]; }
算法示例2:
private int Levenshtein(string str1, string str2)
{
int n = str1.Length;
int m = str2.Length;
int i; //遍历str1的
int j; //遍历str2的
char ch1; //str1的
char ch2; //str2的
int temp; //记录相同字符,在某个矩阵位置值的增量,不是0就是1
if(n == 0)
{
return m;
}
if(m == 0)
{
return n;
}
int[,] d = new int[n+1,m+1];
for(i=0; i<=n; i++)
{ //初始化第一列
d[i,0] = i;
}
for(j=0; j<=m; j++)
{ //初始化第一行
d[0,j] = j;
}
for(i=1; i<=n; i++)
{ //遍历str1
ch1 = str1[i-1];
//去匹配str2
for(j=1; j<=m; j++)
{
ch2 = str2[j-1];
if(ch1 == ch2)
{
temp = 0;
} else
{
temp = 1;
}
//左边+1,上边+1, 左上角+temp取最小
d[i, j] = Min(d[i - 1, j] + 1, d[i, j - 1] + 1, d[i - 1, j - 1] + temp);
}
}
return d[n,m];
}
private int Min(int one, int two, int three)
{
int min = one;
if (two < min)
{
min = two;
}
if (three < min)
{
min = three;
}
return min;
}
private double Sim(String str1, String str2)
{
int ld = Levenshtein(str1, str2);
return 1 - (double)ld / Math.Max(str1.Length, str2.Length);
}
算法示例3:空間復雜度從O(n*m)降到O(2m)
///*****************************
/// Compute Levenshtein distance
/// Memory efficient version
///*****************************
public int iLD(String sRow, String sCol)
{
int RowLen = sRow.Length; // length of sRow
int ColLen = sCol.Length; // length of sCol
int RowIdx; // iterates through sRow
int ColIdx; // iterates through sCol
char Row_i; // ith character of sRow
char Col_j; // jth character of sCol
int cost; // cost
/// Test string length
if (Math.Max(sRow.Length, sCol.Length) > Math.Pow(2, 31))
throw (new Exception("/nMaximum string length in Levenshtein.iLD is " + Math.Pow(2, 31) + "./nYours is " + Math.Max(sRow.Length, sCol.Length) + "."));
// Step 1
if (RowLen == 0)
{
return ColLen;
}
if (ColLen == 0)
{
return RowLen;
}
/// Create the two vectors
int[] v0 = new int[RowLen + 1];
int[] v1 = new int[RowLen + 1];
int[] vTmp;
/// Step 2
/// Initialize the first vector
for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
{
v0[RowIdx] = RowIdx;
}
// Step 3
/// Fore each column
for (ColIdx = 1; ColIdx <= ColLen; ColIdx++)
{
/// Set the 0'th element to the column number
v1[0] = ColIdx;
Col_j = sCol[ColIdx - 1];
// Step 4
/// Fore each row
for (RowIdx = 1; RowIdx <= RowLen; RowIdx++)
{
Row_i = sRow[RowIdx - 1];
// Step 5
if (Row_i == Col_j)
{
cost = 0;
}
else
{
cost = 1;
}
// Step 6
/// Find minimum
int m_min = v0[RowIdx] + 1;
int b = v1[RowIdx - 1] + 1;
int c = v0[RowIdx - 1] + cost;
if (b < m_min)
{
m_min = b;
}
if (c < m_min)
{
m_min = c;
}
v1[RowIdx] = m_min;
}
/// Swap the vectors
vTmp = v0;
v0 = v1;
v1 = vTmp;
}
// Step 7
/// Value between 0 - 100
/// 0==perfect match 100==totaly different
///
/// The vectors where swaped one last time at the end of the last loop,
/// that is why the result is now in v0 rather than in v1
System.Console.WriteLine("iDist=" + v0[RowLen]);
int max = System.Math.Max(RowLen, ColLen);
return ((100 * v0[RowLen]) / max);
}
From:http://hi.baidu.com/xining52113339/blog/item/8a23f1388ddfc523b9998f47.html
http://hi.baidu.com/pecefull0513/blog/item/a746ca1a292b9c118618bfbd.html
http://www.codeproject.com/KB/recipes/improvestringsimilarity.aspx
http://en.wikipedia.org/wiki/Levenshtein_distance
http://www.codeproject.com/KB/recipes/Levenshtein.aspx