/Files/rocketfan/editdistance_readme.pdf
先给一个例子:两个字符串 eeba 和 abca 的相似度是多少呢?编辑距离(edit distance)是一个很好的度量:把字符串 a 变换为字符串 b 所需要的最少操作步骤数(每一步为插入、删除或更改一个字符),定义为这两个字符串之间的编辑距离。
对于 eeba 和 abca,它们之间的编辑距离为 3。可以按照上面定义的操作(方案不是唯一的)将 eeba 变到 abca:1. 将第一个 e 变为 a;2. 删除第二个 e;3. 在 b 之后添加 c,共 3 个步骤。
典型的动态规划问题。
EDIT[i,j] 表示字符串 a 从 1 到 i 的子串与字符串 b 从 1 到 j 的子串之间的编辑距离(字符串下标从 1 开始)。
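状态转移方程为(下面的 x、y 即上文的字符串 a、b):
$$EDIT[i,j] = \min\{\,EDIT[i-1,j] + 1,\ EDIT[i,j-1] + 1,\ EDIT[i-1,j-1] + f(x[i],y[j])\,\}$$
边界条件为 EDIT[i,0] = i,EDIT[0,j] = j。
EDIT[i-1,j] + 1 表示对 a 在 i 位置执行删除(delete)操作;
EDIT[i,j-1] + 1 表示插入(insert)操作;
EDIT[i-1,j-1] + f(x[i],y[j]) 表示不变或者是更改(modify)操作,其中如果 x[i] == y[j] 则 f(x[i],y[j]) = 0,否则 f(x[i],y[j]) = 1。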
如果需要记录编辑过程如第一幅图所示,需要用二维数组记录下动态规划过程的路径信息,即记录下前一步骤的位置索引信息。
如下图
//edit_distance.h
/**
 * \file edit_distance.h
 * \author pku_goldenlock
 * \date 2009-8-10
 */

#ifndef _EDIT_DISTANCE_H
#define _EDIT_DISTANCE_H
#include <string>
using std::string;  // kept so that code including this header and using bare `string` still compiles

/**
 * Static helpers for computing the edit distance between two strings, where
 * inserting, deleting or modifying one character each cost 1.
 */
class EditDistanceHelp {
private:
    /**
     * One cell of the dynamic-programming table. Cell (x, y) describes the
     * first x characters of s1 against the first y characters of s2.
     */
    struct ArrayData {
        int dist;   /**< Minimum edit distance up to this cell. */
        int pre_x;  /**< First index (position in s1) of the previous cell on the best path. */
        int pre_y;  /**< Second index (position in s2) of the previous cell on the best path. */
    };

public:
    /**
     * Compute the minimum edit distance and one best alignment of s1 and s2.
     *
     * The aligned forms are appended to rs1 and rs2 (they are not cleared
     * first); a '-' marks a position where the other string has a character
     * with no counterpart. The implementation also prints the predecessor
     * table to std::cout.
     *
     * @param s1,s2   Input strings.
     * @param rs1,rs2 Receive the aligned forms of s1 and s2.
     * @return The edit distance between s1 and s2.
     */
    static int CalcPath(const std::string& s1, const std::string& s2,
                        std::string& rs1, std::string& rs2);

    /**
     * Compute the minimum edit distance only; no path information is kept.
     *
     * @param s1,s2 Input strings.
     * @return The edit distance between s1 and s2.
     */
    static int EditDistance(const std::string& s1, const std::string& s2);

private:
    /**
     * Set all data members of one table cell.
     */
    static void SetArrayData(ArrayData& a, int dist, int pre_x, int pre_y);

    /**
     * Follow the stored path back from cell (index_x, index_y) and append the
     * resulting alignment to rs1 and rs2.
     *
     * @param array   Table holding the info of each position (x, y).
     * @param index_x Current position, s1 part.
     * @param index_y Current position, s2 part.
     * @param s1,s2   The two input strings the table was built from.
     * @param rs1,rs2 Receive the aligned strings.
     */
    static void StoreResult(ArrayData** array, int index_x, int index_y,
                            const std::string& s1, const std::string& s2,
                            std::string& rs1, std::string& rs2);
};

#endif  // end of define _EDIT_DISTANCE_H
//edit_distance.cc
#include "edit_distance.h"

#include <algorithm>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>

using namespace std;
/*
*
6
* find the min edit distance and return the edit distance
7
* will sotre the best path info in string rs1, rs2
8
* s1, s2 is the user given string for caculating the edit distance
9
*/
10
int
EditDistanceHelp::CalcPath(
const
string
&
s1,
const
string
&
s2,
string
&
rs1,
string
&
rs2)
11
{
12
//
first find min dist and store path info
13
int
len1
=
s1.length();
14
int
len2
=
s2.length();
15
16
//
allocate space for array
17
ArrayData
**
array;
18
array
=
new
ArrayData
*
[len1
+
1
];
19
for
(
int
i
=
0
; i
<=
len1; i
++
)
20
array[i]
=
new
ArrayData[len2
+
1
];
21
22
//
kernal for finding the best path and store path info to array
23
for
(
int
i
=
0
; i
<=
len1; i
++
)
24
SetArrayData(array[i][
0
], i, i
-
1
,
0
);
25
for
(
int
j
=
0
; j
<=
len2; j
++
)
26
SetArrayData(array[
0
][j], j,
0
, j
-
1
);
27
int
min_dist;
28
for
(
int
i
=
1
; i
<=
len1; i
++
)
29
for
(
int
j
=
1
; j
<=
len2; j
++
) {
30
if
(array[i
-
1
][j].dist
<
array[i][j
-
1
].dist)
//
can also be <=
31
SetArrayData(array[i][j], array[i
-
1
][j].dist
+
1
, i
-
1
, j);
32
else
33
SetArrayData(array[i][j], array[i][j
-
1
].dist
+
1
, i, j
-
1
);
34
min_dist
=
array[i
-
1
][j
-
1
].dist
+
(s1[i
-
1
]
!=
s2[j
-
1
]);
35
if
(min_dist
<
array[i][j].dist)
//
< is OK but <= make modify high priority
36
SetArrayData(array[i][j], min_dist, i
-
1
, j
-
1
);
37
}
38
39
//
store the best path result to two result string rs1 and rs2
40
StoreResult(array, len1, len2, s1, s2, rs1, rs2);
41
min_dist
=
array[len1][len2].dist;
42
43
//
print array
44
for
(
int
i
=
0
; i
<=
len1; i
++
) {
45
for
(
int
j
=
0
; j
<=
len2; j
++
) {
46
cout
<<
"
(
"
<<
array[i][j].pre_x
<<
"
,
"
<<
setw(
2
)
<<
array[i][j].pre_y
<<
"
)
"
;
47
}
48
cout
<<
endl;
49
}
50
//
free resources of array
51
for
(
int
i
=
0
; i
<=
len1; i
++
)
52
delete array[i];
53
delete array;
54
55
//
return min edit distance
56
return
min_dist;
57
}
58
59
/*
*
60
* find the min edit distance only do not need path info
61
*/
62
int
EditDistanceHelp::EditDistance(
const
string
&
s1,
const
string
&
s2)
63
{
64
using
std::min;
65
int
len1
=
s1.length();
66
int
len2
=
s2.length();
67
int
array[len1
+
1
][len2
+
1
];
68
for
(
int
i
=
0
; i
<=
len1; i
++
)
69
array[i][
0
]
=
i;
70
for
(
int
j
=
1
; j
<=
len2; j
++
)
71
array[
0
][j]
=
j;
72
for
(
int
i
=
1
; i
<=
len1; i
++
)
73
for
(
int
j
=
1
; j
<=
len2; j
++
)
74
array[i][j]
=
min(min(array[i
-
1
][j]
+
1
, array[i][j
-
1
]
+
1
),
75
array[i
-
1
][j
-
1
]
+
(s1[i
-
1
]
!=
s2[j
-
1
]));
76
return
array[len1][len2];
77
}
78
79
/*
*
80
* Set all data members value for one array element
81
*/
82
void
EditDistanceHelp::SetArrayData(ArrayData
&
a,
int
dist,
int
pre_x,
int
pre_y)
83
{
84
a.dist
=
dist;
85
a.pre_x
=
pre_x;
86
a.pre_y
=
pre_y;
87
}
88
89
/*
*
90
* Based on the path info stored in array ,find the best path and store result to string rs1 and rs2
91
*/
92
void
EditDistanceHelp::StoreResult(ArrayData
**
array,
int
index_x,
int
index_y,
93
const
string
&
s1,
const
string
&
s2,
94
string
&
rs1,
string
&
rs2)
95
{
96
if
(index_x
==
0
&&
index_y
==
0
)
97
return
;
98
99
if
((array[index_x][index_y].pre_x
<
index_x)
&&
(array[index_x][index_y].pre_y
<
index_y)) {
100
StoreResult(array, index_x
-
1
, index_y
-
1
, s1, s2, rs1, rs2);
101
rs1
+=
s1[index_x
-
1
];
102
rs2
+=
s2[index_y
-
1
];
103
}
else
if
(array[index_x][index_y].pre_x
<
index_x) {
104
StoreResult(array, index_x
-
1
, index_y, s1, s2, rs1, rs2);
105
rs1
+=
s1[index_x
-
1
];
106
rs2
+=
'
-
'
;
107
}
else
{
108
StoreResult(array, index_x, index_y
-
1
, s1, s2, rs1, rs2);
109
rs1
+=
'
-
'
;
110
rs2
+=
s2[index_y
-
1
];
111
}
112
}
113