计算两个中文字符串相似度——编辑距离算法

Levenshtein 距离,又称编辑距离,指的是两个字符串之间,由一个转换成另一个所需的最少编辑操作次数。

许可的编辑操作包括将一个字符替换成另一个字符,插入一个字符,删除一个字符。

编辑距离算法最早由俄罗斯科学家 Vladimir Levenshtein 于 1965 年提出,故又称 Levenshtein Distance。下文计算相似度时采用的公式为:相似度 = 1 − 编辑距离 / 两个字符串中较长者的长度。

 

1、Java

/**
 * Computes the Levenshtein (edit) distance between two strings with the
 * classic full-matrix dynamic programme and prints a trace of every cell,
 * the final distance and a similarity score to standard output.
 *
 * The similarity is {@code 1 - distance / max(len1, len2)}. Two empty
 * strings are identical, so their similarity is reported as 1.0 instead of
 * the NaN that the division 0 / 0 would otherwise produce.
 *
 * Characters are compared as UTF-16 code units ({@link String#charAt}).
 *
 * @param str1 first string, must not be null
 * @param str2 second string, must not be null
 */
public static void levenshtein(String str1, String str2) {
	int len1 = str1.length();
	int len2 = str2.length();
	// dif[i][j] holds the edit distance between the first i characters of
	// str1 and the first j characters of str2, hence one extra row/column.
	int[][] dif = new int[len1 + 1][len2 + 1];
	// Base cases: turning a prefix into the empty string costs its length.
	for (int a = 0; a <= len1; a++) {
		dif[a][0] = a;
	}
	for (int a = 0; a <= len2; a++) {
		dif[0][a] = a;
	}
	for (int i = 1; i <= len1; i++) {
		for (int j = 1; j <= len2; j++) {

			System.out.println("i = " + i + " j = " + j + " str1 = "
					+ str1.charAt(i - 1) + " str2 = " + str2.charAt(j - 1));
			// Substitution is free when the two characters already match.
			int temp = (str1.charAt(i - 1) == str2.charAt(j - 1)) ? 0 : 1;
			// Cheapest of: substitute (diagonal), insert (left), delete (above).
			dif[i][j] = Math.min(dif[i - 1][j - 1] + temp,
					Math.min(dif[i][j - 1] + 1, dif[i - 1][j] + 1));

			System.out.println("i = " + i + ", j = " + j + ", dif[i][j] = "
					+ dif[i][j]);
		}
	}
	System.out.println("字符串\"" + str1 + "\"与\"" + str2 + "\"的比较");
	// The bottom-right cell is the distance between the complete strings.
	System.out.println("差异步骤:" + dif[len1][len2]);
	// Guard the division: both strings empty means maxLen == 0.
	int maxLen = Math.max(len1, len2);
	float similarity = (maxLen == 0)
			? 1.0f
			: 1 - (float) dif[len1][len2] / maxLen;
	System.out.println("相似度:" + similarity);
}

2、LotusScript

%REM
	Function toCompute
	Description: Returns the similarity of str1 and str2, computed as
	1 - (edit distance / length of the longer string) using the full
	Levenshtein matrix. Only the first 120 characters of each string are
	compared; longer input is cut off at that length. If either string is
	empty the function returns 0.
	Relies on a three-argument min() helper defined outside this function.
%END REM
Function toCompute(str1 As String, str2 As String) As Double

	Dim rowCount As Integer
	Dim colCount As Integer
	Dim longest As Integer
	Dim r As Long
	Dim c As Long
	Dim cost As Long
	' Fixed-size matrix: one extra row and column for the empty prefix.
	Dim dif(0 To 120, 0 To 120) As Integer

	' Guard clause: nothing to compare when either side is empty.
	If str1 = "" Or str2 = "" Then
		toCompute = 0
		Exit Function
	End If

	rowCount = Len(str1)
	colCount = Len(str2)

	' Clamp both lengths to the capacity of the matrix.
	If rowCount > 120 Then rowCount = 120
	If colCount > 120 Then colCount = 120

	If rowCount > colCount Then
		longest = rowCount
	Else
		longest = colCount
	End If

	' Base cases: distance from a prefix to the empty string is its length.
	For r = 0 To rowCount
		dif(r, 0) = r
	Next
	For c = 0 To colCount
		dif(0, c) = c
	Next

	For r = 1 To rowCount
		For c = 1 To colCount

			' Substitution costs nothing when the characters are equal.
			If Mid$(str1, r, 1) = Mid$(str2, c, 1) Then
				cost = 0
			Else
				cost = 1
			End If

			' Cheapest of substitute (diagonal), insert (left), delete (above).
			dif(r, c) = min(dif(r - 1, c - 1) + cost, dif(r, c - 1) + 1, dif(r - 1, c) + 1)

		Next
	Next

	' Bottom-right cell is the distance between the (clamped) strings.
	toCompute = 1 - dif(rowCount, colCount) / longest

End Function

优化:先去掉两个字符串的公共前缀和公共后缀,再只用一维数组(滚动数组)对剩余部分计算编辑距离,从而减少计算量和内存占用。

1、Visual Basic

' Demo module: computes and prints the Levenshtein distance between two
' sample strings using the clsDistance helper class defined below.
Module Module1

    ' Entry point: builds a clsDistance for two fixed sample strings and
    ' writes the resulting edit distance to the console.
    Sub Main()
        Dim first As String = "今天是星期五"
        Dim second As String = "明天星期四"

        Dim calculator As New clsDistance(first, second)
        Dim result As Integer = calculator.CacuDistance()

        Console.WriteLine(result)
    End Sub


    ' Edit-distance calculator. It skips the common prefix and suffix of the
    ' two strings and runs a single-row dynamic programme over what remains,
    ' writing a trace of the intermediate values to the console.
    Public Class clsDistance
        Private mCharA() As Char
        Private mCharB() As Char
        Private mCharALen As Integer
        Private mCharBLen As Integer

        ' Stores both strings as character arrays together with their lengths.
        Public Sub New(ByVal StrA As String, ByVal StrB As String)
            mCharA = StrA.ToCharArray()
            mCharB = StrB.ToCharArray()
            mCharALen = mCharA.Length
            mCharBLen = mCharB.Length
        End Sub


        ' Returns the Levenshtein distance between the two stored strings.
        ' Also prints the lengths, the prefix/suffix offsets and every
        ' intermediate DP value to the console.
        Public Function CacuDistance() As Integer
            ' One side empty: the distance is the length of the other side.
            If mCharALen = 0 Then Return mCharBLen
            If mCharBLen = 0 Then Return mCharALen

            Console.WriteLine(mCharALen)
            Console.WriteLine(mCharBLen)

            ' Last index that exists in both arrays.
            Dim lastShared As Integer = Min(mCharALen, mCharBLen) - 1
            Dim prefixLen As Integer = -1
            Dim suffixLen As Integer = -1

            ' Length of the common prefix = index of the first mismatch.
            For pos As Integer = 0 To lastShared
                If mCharA(pos) <> mCharB(pos) Then
                    prefixLen = pos
                    Exit For
                End If
            Next

            ' No mismatch: the shorter string is a prefix of the longer one.
            If prefixLen = -1 Then Return Math.Abs(mCharALen - mCharBLen)

            ' Length of the common suffix, never reaching back into the prefix.
            For pos As Integer = 0 To lastShared - prefixLen
                If mCharA(mCharALen - pos - 1) <> mCharB(mCharBLen - pos - 1) Then
                    suffixLen = pos
                    Exit For
                End If
            Next

            ' The shorter string is fully covered by prefix + suffix.
            If suffixLen = -1 Then Return Math.Abs(mCharALen - mCharBLen)

            Console.WriteLine("tp1: = " & prefixLen)
            Console.WriteLine("tp2 : = " & suffixLen)

            ' Single DP row over the differing middle part of A; index k
            ' starts out as the cost of deleting k characters.
            Dim top As Integer = mCharALen - prefixLen - suffixLen
            Dim row(top) As Integer

            For k As Integer = 0 To top
                row(k) = k
            Next
            For k As Integer = 0 To top
                Console.WriteLine(" i = " & CStr(k) & " " & row(k))
            Next

            Console.WriteLine("Bound: = " & top)

            Dim diagCost As Integer, leftCost As Integer, cellCost As Integer

            ' Walk the middle part of B from its last character to its first.
            For b As Integer = 0 To mCharBLen - prefixLen - suffixLen - 1
                Dim chB As Char = mCharB(mCharBLen - suffixLen - b - 1)

                diagCost = row(0)
                leftCost = diagCost + 1
                Console.WriteLine("i = " & b & " " & chB)

                ' Walk the middle part of A, also from back to front.
                For a As Integer = 1 To top
                    Dim chA As Char = mCharA(mCharALen - suffixLen - a)

                    Console.WriteLine("j = " & a & " " & chA)

                    If chA = chB Then
                        cellCost = diagCost
                    Else
                        cellCost = Min(row(a), diagCost, leftCost) + 1
                    End If

                    ' Order matters: store the finished left cell, then read
                    ' the old row(a) as the next diagonal before it is
                    ' overwritten on the following iteration.
                    row(a - 1) = leftCost
                    leftCost = cellCost
                    diagCost = row(a)

                    Console.WriteLine("tn1 = " & diagCost)
                    Console.WriteLine("tn2 = " & leftCost)
                    Console.WriteLine("tn3 = " & cellCost)
                Next
                row(top) = leftCost

                Console.WriteLine(top & " " & row(top))
            Next

            For k As Integer = 0 To top
                Console.WriteLine(" i = " & CStr(k) & " " & row(k))
            Next

            ' Last cell of the final row is the distance.
            Return row(top)
        End Function


        ' Returns the smallest of the supplied integers; with no arguments
        ' it returns Nothing, i.e. 0 for an Integer.
        Public Function Min(ByVal ParamArray Num() As Integer) As Integer
            If Num.Length = 0 Then Return Nothing

            Dim smallest As Integer = Num(0)
            For k As Integer = 1 To Num.Length - 1
                If Num(k) < smallest Then smallest = Num(k)
            Next

            Return smallest
        End Function

    End Class

End Module

2、Java

public static int clsDistance(String str1, String str2) {

		int j;
		int i;

		int mCharALen, mCharBLen;

		mCharALen = str1.length();
		mCharBLen = str2.length();

		int tp1 = -1;
		int tp2 = -1;

		j = Math.min(mCharALen , mCharBLen) - 1;

		for (i = 0; i <= j; i++) {
			if (str1.charAt(i) != str2.charAt(i)) {
				tp1 = i;
				break;
			}

		}

		if (tp1 == -1) {
			return Math.abs(mCharBLen - mCharALen);
		}

		for (i = 0; i <= j - tp1; i++) {

			if (str1.charAt(mCharALen - i - 1) != str2.charAt(mCharBLen - i
					- 1)) {
				tp2 = i;
				break;
			}
		}

		if (tp2 == -1) {
			return Math.abs(mCharALen - mCharBLen);
		}
		int taBound = mCharALen - tp1 - tp2;
		
		int tA[] = new int[taBound + 1];

		for (i = 0; i < tA.length; i++) {
			tA[i] = i ;

		}
		System.out.println(Arrays.toString(tA));
		int tN1, tN2, tN3;

		for (i = 0; i < mCharBLen - tp1 - tp2 ; i++) {
			tN1 = tA[0];
			tN2 = tN1 + 1;
			
			System.out.println("\n" + i + " " + str2.charAt(mCharBLen
						- tp2 - i - 1));
			
			for (j = 1; j < tA.length  ; j++) {

				System.out.print(str1.charAt(mCharALen - tp2 - j ) +"	");
				
				if (str1.charAt(mCharALen - tp2 - j  ) == str2.charAt(mCharBLen
						- tp2 - i - 1)) {

					tN3 = tN1;
				} else {
					tN3 = Math.min(tA[j], Math.min(tN1, tN2)) + 1;

				}
				
				tA[j - 1] = tN2;
				tN2 = tN3;
				tN1 = tA[j];
				
				System.out.println("\ntN1 = " + tN1);
				System.out.println("tN2 = " + tN2);
				System.out.println("tN3 = " + tN3);
			}

			tA[tA.length - 1] = tN2;
			System.out.println("\n"+tA[tA.length - 1] );
		}

		System.out.println("\n" +Arrays.toString(tA));
		return tA[tA.length - 1];

	}</span>


3、LotusScript

%REM
	Function clsDistance
	Description: Returns the similarity of str1 and str2, computed as
	1 - (Levenshtein distance / length of the longer string).
	The common prefix and common suffix are skipped; the remaining middle
	sections are compared back to front with a single-row dynamic
	programme. If either string is empty the function returns 0.
	Relies on a three-argument Min() helper defined outside this function.
%END REM
Function clsDistance(str1 As String ,str2 As String) As Double
	Dim mCharALen As Integer
	Dim mCharBLen As Integer
	Dim maxlen As Integer
	Dim shortest As Integer
	Dim i As Integer
	Dim j As Integer
	' Declared separately: "Dim a, b As Integer" would make a a Variant.
	Dim tP1 As Integer
	Dim tP2 As Integer

	' Nothing to compare; this also keeps maxlen from being 0 below.
	If str1 = "" Or str2 = "" Then
		clsDistance = 0
		Exit Function
	End If

	mCharALen = Len(str1)
	mCharBLen = Len(str2)

	If mCharALen > mCharBLen Then
		maxlen = mCharALen
		shortest = mCharBLen
	Else
		maxlen = mCharBLen
		shortest = mCharALen
	End If

	' Length of the common prefix = 0-based index of the first mismatch.
	tP1 = -1
	For i = 0 To shortest - 1
		If Mid$(str1, i + 1, 1) <> Mid$(str2, i + 1, 1) Then
			tP1 = i
			Exit For
		End If
	Next

	' No mismatch: the shorter string is a prefix of the longer one.
	If tP1 = -1 Then
		clsDistance = 1 - Abs(mCharALen - mCharBLen) / maxlen
		Exit Function
	End If

	' Length of the common suffix, never reaching back into the prefix.
	tP2 = -1
	For i = 0 To shortest - 1 - tP1
		If Mid$(str1, mCharALen - i, 1) <> Mid$(str2, mCharBLen - i, 1) Then
			tP2 = i
			Exit For
		End If
	Next

	' The shorter string is fully covered by prefix + suffix.
	If tP2 = -1 Then
		clsDistance = 1 - Abs(mCharALen - mCharBLen) / maxlen
		Exit Function
	End If

	' Lengths of the differing middle sections (both are at least 1 here).
	Dim midA As Integer
	Dim midB As Integer
	midA = mCharALen - tP1 - tP2
	midB = mCharBLen - tP1 - tP2

	' Single DP row sized to the middle of str1 instead of a fixed buffer;
	' tA(k) starts as the cost of deleting k characters.
	Dim tA() As Integer
	ReDim tA(0 To midA)
	For j = 0 To midA
		tA(j) = j
	Next

	Dim tN1 As Integer, tN2 As Integer, tN3 As Integer
	Dim chB As String

	' Walk the middle of str2 from its last character to its first.
	For i = 0 To midB - 1
		chB = Mid$(str2, mCharBLen - tP2 - i, 1)
		tN1 = tA(0)
		tN2 = tN1 + 1

		' Walk the middle of str1, also from back to front.
		For j = 1 To midA

			If Mid$(str1, mCharALen - tP2 - j + 1, 1) = chB Then
				tN3 = tN1
			Else
				tN3 = Min(tA(j), tN1, tN2) + 1
			End If

			' Order matters: store the finished left cell, then read the old
			' tA(j) as the next diagonal before it gets overwritten.
			tA(j - 1) = tN2
			tN2 = tN3
			tN1 = tA(j)
		Next

		tA(midA) = tN2
	Next

	' Last cell of the final row is the edit distance.
	clsDistance = 1 - tA(midA) / maxlen
End Function


 

你可能感兴趣的:(算法,字符串,编辑距离,levenshtein,distance)