参考:http://www.merriampark.com/ld.htm#WHATIS
http://en.wikipedia.org/wiki/Levenshtein_distance
* Java
* C++
* Visual Basic
* Python
Java
public class Distance {
//****************************
// Get minimum of three values
//****************************
private int Minimum (int a, int b, int c) {
int mi;
mi = a;
if (b < mi) {
mi = b;
}
if (c < mi) {
mi = c;
}
return mi;
}
//*****************************
// Compute Levenshtein distance
//*****************************
public int LD (String s, String t) {
int d[][]; // matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
// Step 1
n = s.length ();
m = t.length ();
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
d = new int[n+1][m+1];
// Step 2
for (i = 0; i <= n; i++) {
d[i][0] = i;
}
for (j = 0; j <= m; j++) {
d[0][j] = j;
}
// Step 3
for (i = 1; i <= n; i++) {
s_i = s.charAt (i - 1);
// Step 4
for (j = 1; j <= m; j++) {
t_j = t.charAt (j - 1);
// Step 5
if (s_i == t_j) {
cost = 0;
}
else {
cost = 1;
}
// Step 6
d[i][j] = Minimum (d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1] + cost);
}
}
// Step 7
return d[n][m];
}
}
C++
In C++, the size of an array must be a constant, and this code fragment causes an error at compile time:
int sz = 5;
int arr[sz];
This limitation makes the following C++ code slightly more complicated than it would be if the matrix could simply be declared as a two-dimensional array, with a size determined at run-time.
In C++ it's more idiomatic to use the System Template Library's vector class, as Anders Sewerin Johansen has done in an alternative C++ implementation.
Here is the definition of the class (distance.h):
class Distance
{
public:
int LD (char const *s, char const *t);
private:
int Minimum (int a, int b, int c);
int *GetCellPointer (int *pOrigin, int col, int row, int nCols);
int GetAt (int *pOrigin, int col, int row, int nCols);
void PutAt (int *pOrigin, int col, int row, int nCols, int x);
};
Here is the implementation of the class (distance.cpp):
#include "distance.h"
#include <string.h>
#include <malloc.h>
//****************************
// Get minimum of three values
//****************************
int Distance::Minimum (int a, int b, int c)
{
int mi;
mi = a;
if (b < mi) {
mi = b;
}
if (c < mi) {
mi = c;
}
return mi;
}
//**************************************************
// Get a pointer to the specified cell of the matrix
//**************************************************
int *Distance::GetCellPointer (int *pOrigin, int col, int row, int nCols)
{
return pOrigin + col + (row * (nCols + 1));
}
//*****************************************************
// Get the contents of the specified cell in the matrix
//*****************************************************
int Distance::GetAt (int *pOrigin, int col, int row, int nCols)
{
int *pCell;
pCell = GetCellPointer (pOrigin, col, row, nCols);
return *pCell;
}
//*******************************************************
// Fill the specified cell in the matrix with the value x
//*******************************************************
void Distance::PutAt (int *pOrigin, int col, int row, int nCols, int x)
{
int *pCell;
pCell = GetCellPointer (pOrigin, col, row, nCols);
*pCell = x;
}
//*****************************
// Compute Levenshtein distance
//*****************************
int Distance::LD (char const *s, char const *t)
{
int *d; // pointer to matrix
int n; // length of s
int m; // length of t
int i; // iterates through s
int j; // iterates through t
char s_i; // ith character of s
char t_j; // jth character of t
int cost; // cost
int result; // result
int cell; // contents of target cell
int above; // contents of cell immediately above
int left; // contents of cell immediately to left
int diag; // contents of cell immediately above and to left
int sz; // number of cells in matrix
// Step 1
n = strlen (s);
m = strlen (t);
if (n == 0) {
return m;
}
if (m == 0) {
return n;
}
sz = (n+1) * (m+1) * sizeof (int);
d = (int *) malloc (sz);
// Step 2
for (i = 0; i <= n; i++) {
PutAt (d, i, 0, n, i);
}
for (j = 0; j <= m; j++) {
PutAt (d, 0, j, n, j);
}
// Step 3
for (i = 1; i <= n; i++) {
s_i = s[i-1];
// Step 4
for (j = 1; j <= m; j++) {
t_j = t[j-1];
// Step 5
if (s_i == t_j) {
cost = 0;
}
else {
cost = 1;
}
// Step 6
above = GetAt (d,i-1,j, n);
left = GetAt (d,i, j-1, n);
diag = GetAt (d, i-1,j-1, n);
cell = Minimum (above + 1, left + 1, diag + cost);
PutAt (d, i, j, n, cell);
}
}
// Step 7
result = GetAt (d, n, m, n);
free (d);
return result;
}
Visual Basic
'*******************************
'*** Get minimum of three values
'*******************************
Private Function Minimum(ByVal a As Integer, _
ByVal b As Integer, _
ByVal c As Integer) As Integer
Dim mi As Integer
mi = a
If b < mi Then
mi = b
End If
If c < mi Then
mi = c
End If
Minimum = mi
End Function
'********************************
'*** Compute Levenshtein Distance
'********************************
Public Function LD(ByVal s As String, ByVal t As String) As Integer
Dim d() As Integer ' matrix
Dim m As Integer ' length of t
Dim n As Integer ' length of s
Dim i As Integer ' iterates through s
Dim j As Integer ' iterates through t
Dim s_i As String ' ith character of s
Dim t_j As String ' jth character of t
Dim cost As Integer ' cost
' Step 1
n = Len(s)
m = Len(t)
If n = 0 Then
LD = m
Exit Function
End If
If m = 0 Then
LD = n
Exit Function
End If
ReDim d(0 To n, 0 To m) As Integer
' Step 2
For i = 0 To n
d(i, 0) = i
Next i
For j = 0 To m
d(0, j) = j
Next j
' Step 3
For i = 1 To n
s_i = Mid$(s, i, 1)
' Step 4
For j = 1 To m
t_j = Mid$(t, j, 1)
' Step 5
If s_i = t_j Then
cost = 0
Else
cost = 1
End If
' Step 6
d(i, j) = Minimum(d(i - 1, j) + 1, d(i, j - 1) + 1, d(i - 1, j - 1) + cost)
Next j
Next i
' Step 7
LD = d(n, m)
Erase d
End Function
#!/user/bin/env python
# -*- coding: utf-8 -*-
class arithmetic():
def __init__(self):
pass
''' 【编辑距离算法】 【levenshtein distance】 【字符串相似度算法】 '''
def levenshtein(self,first,second):
if len(first) > len(second):
first,second = second,first
if len(first) == 0:
return len(second)
if len(second) == 0:
return len(first)
first_length = len(first) + 1
second_length = len(second) + 1
distance_matrix = [range(second_length) for x in range(first_length)]
#print distance_matrix
for i in range(1,first_length):
for j in range(1,second_length):
deletion = distance_matrix[i-1][j] + 1
insertion = distance_matrix[i][j-1] + 1
substitution = distance_matrix[i-1][j-1]
if first[i-1] != second[j-1]:
substitution += 1
distance_matrix[i][j] = min(insertion,deletion,substitution)
print distance_matrix
return distance_matrix[first_length-1][second_length-1]
if __name__ == "__main__":
arith = arithmetic()
print arith.levenshtein('GUMBOsdafsadfdsafsafsadfasfadsfasdfasdfs','GAMBOL00000000000dfasfasfdafsafasfasdfdsa'