字符串算法:最长公共子序列、最短编辑距离等

会慢慢写最长公共子序列、最短编辑距离等与字符串相关的算法,其实也就是数组相关的算法...

一、最长公共子序列

Solve1里递推公式为:

dp[i][j] = 0                                         if i = 0 or j  = 0

dp[i][j] = dp[i-1][j-1]+1                       if s1[i-1] = s2[j-1]

dp[i][j] = max{dp[i-1][j],dp[i][j-1]}       if s1[i-1] != s2[j-1]

这里i、j并不是s1、s2中的字符下标,而是已经扫描过的字符个数:dp[i][j]表示s1前i个字符与s2前j个字符的最长公共子序列的长度

Solve1只打印了最长公共子序列的其中一个,没有考虑多个相同的最长公共子序列

Solve2稍微变换了递推公式:

dp[i][j]表示s1前i+1个字符与s2前j+1个字符的最长公共子序列的长度(这里i、j直接作为字符下标),与Solve1相比没有太大变化

Solve3只用了一维数组,减少了空间复杂度

Solve4是递归写法

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
//#define _DEBUG
/* Return the larger of the two arguments (either one when they are equal). */
int max(int a, int b){
    if(a > b){
        return a;
    }
    return b;
}

/*
 * Solve1: bottom-up longest-common-subsequence (LCS) of s1 and s2.
 *
 * dp[i][j] is the LCS length of the first i characters of s1 and the first
 * j characters of s2 (i and j are prefix lengths, not character indices).
 *
 * Prints "Solve1:<length>" followed by one line holding a single LCS, each
 * character followed by a space, in left-to-right order. When several
 * distinct LCSs exist only one of them is printed.
 *
 * The table is a variable-length array on the stack, so this is only
 * suitable for short inputs.
 */
void Solve1(char s1[], char s2[]){
    int i,j;/* prefix lengths of s1 / s2 consumed so far */
    int len1 = strlen(s1);
    int len2 = strlen(s2);
    int dp[len1+1][len2+1];

    /* An empty prefix shares nothing with anything. */
    for(i = 0; i <= len1; ++i) dp[i][0] = 0;
    for(j = 0; j <= len2; ++j) dp[0][j] = 0;
    for(i = 1; i <= len1; ++i){
        for(j = 1; j <= len2; ++j){
            if(s1[i-1] == s2[j-1]){/* prefix length i ends at index i-1 */
                dp[i][j] = dp[i-1][j-1] + 1;
            }else{
                dp[i][j] = dp[i-1][j] > dp[i][j-1] ? dp[i-1][j] : dp[i][j-1];
            }
        }
    }
    #ifdef _DEBUG
    for(i = 0; i <= len1; ++i){
        for(j = 0; j<= len2; ++j){
            printf("%d ",dp[i][j]);
        }
        printf("\n");
    }
    #endif// _DEBUG
    printf("Solve1:%d\n",dp[len1][len2]);

    /*
     * Walk back from dp[len1][len2] to recover one LCS. The walk stops as
     * soon as either prefix is empty, so s1[i-1], s2[j-1], dp[i][j-1] and
     * dp[i-1][j] are never read with a negative index. Characters are found
     * last-to-first, so the buffer is filled from the back and then printed
     * front-to-back.
     */
    int lcs_len = dp[len1][len2];
    char lcs[lcs_len + 1];
    int k = lcs_len;
    i = len1;
    j = len2;
    while(i > 0 && j > 0){
        if(s1[i-1] == s2[j-1]){
            lcs[--k] = s1[i-1];
            --i;
            --j;
        }else if(dp[i][j-1] > dp[i-1][j]){
            --j;
        }else{
            --i;
        }
    }
    for(k = 0; k < lcs_len; ++k){
        printf("%c ",lcs[k]);
    }
    printf("\n");
}

/*错误
//dp[][0]与dp[0][]的初始化错误:只要s1[0..i]中出现过s2[0],dp[i][0]就应为1(dp[0][j]同理),而不是只在s1[i]==s2[0]时才为1
void Solve2(char s1[], char s2[]){
    int i,j;
    int len1 = strlen(s1);
    int len2 = strlen(s2);
    int dp[len1][len2];

    for(i = 0; i < len1; ++i) dp[i][0] = s1[i]==s2[0]?1:0;
    for(j = 0; j < len2; ++j) dp[0][j] = s1[0]==s2[j]?1:0;
    for(i = 1; i < len1; ++i){
        for(j = 1; j < len2; ++j){
            if(s1[i] == s2[j]){
                dp[i][j] = dp[i-1][j-1] + 1;
            }else{
                dp[i][j] = max(dp[i-1][j],dp[i][j-1]);
            }
        }
    }

    for(i = 0; i < len1; ++i){
        for(j = 0; j< len2; ++j){
            printf("%d ",dp[i][j]);
        }
        printf("\n");
    }
    printf("%d\n",dp[len1-1][len2-1]);
}
*/

/*
 * Solve2: bottom-up LCS length where i and j are character indices.
 *
 * dp[i][j] is the LCS length of s1[0..i] and s2[0..j] (inclusive), so the
 * table is len1 x len2 and the first row/column must be seeded explicitly:
 * once s2[0] has appeared somewhere in s1[0..i], dp[i][0] stays 1 (and the
 * same for dp[0][j]).
 *
 * Prints "Solve2:<length>". If either string is empty the answer is 0 and
 * no table is built (a zero-sized table could not be indexed at all).
 */
void Solve2(char s1[], char s2[]){
    int i,j;/* character indices into s1 / s2 */
    int len1 = strlen(s1);
    int len2 = strlen(s2);

    if(len1 == 0 || len2 == 0){
        printf("Solve2:%d\n",0);
        return;
    }

    int dp[len1][len2];

    dp[0][0] = s1[0]==s2[0]?1:0;
    /* First column: becomes 1 at the first match with s2[0] and stays 1. */
    for(i = 1; i < len1; ++i){
        dp[i][0] = (dp[i-1][0] == 1 || s1[i] == s2[0]) ? 1 : 0;
    }
    /* First row: becomes 1 at the first match with s1[0] and stays 1. */
    for(j = 1; j < len2; ++j){
        dp[0][j] = (dp[0][j-1] == 1 || s1[0] == s2[j]) ? 1 : 0;
    }

    for(i = 1; i < len1; ++i){
        for(j = 1; j < len2; ++j){
            if(s1[i] == s2[j]){
                dp[i][j] = dp[i-1][j-1] + 1;
            }else{
                dp[i][j] = dp[i-1][j] > dp[i][j-1] ? dp[i-1][j] : dp[i][j-1];
            }
        }
    }
    #ifdef _DEBUG
    for(i = 0; i < len1; ++i){
        for(j = 0; j< len2; ++j){
            printf("%d ",dp[i][j]);
        }
        printf("\n");
    }
    #endif // _DEBUG
    printf("Solve2:%d\n",dp[len1-1][len2-1]);
}

/*
 * Solve3: LCS length using a single row of len2+1 cells instead of the
 * full (len1+1) x (len2+1) table.
 *
 * While row i is being filled, dp[j] still holds the previous row's value
 * until it is overwritten, and dp[j-1] already holds the current row's
 * value. The match case needs the previous row's dp[j-1] (the diagonal),
 * which has been overwritten by then, so it is carried along in `diag`.
 * Using the current-row dp[j-1] for a match would let one character of s1
 * match several characters of s2 (e.g. "a" vs "aa" would report 2).
 *
 * Prints "Solve3:<length>".
 */
void Solve3(char s1[], char s2[]){
    int i,j;/* prefix lengths of s1 / s2 */
    int len1 = strlen(s1);
    int len2 = strlen(s2);
    int dp[len2+1];

    for(j = 0; j <= len2; ++j){
        dp[j] = 0;
    }

    for(i = 1; i <= len1; ++i){
        int diag = dp[0];/* previous row, column j-1; column 0 is always 0 */
        for(j = 1; j <= len2; ++j){
            int up = dp[j];/* previous row, column j, saved before overwrite */
            if(s1[i-1] == s2[j-1]){
                dp[j] = diag + 1;
            }else if(dp[j-1] > up){
                dp[j] = dp[j-1];
            }/* else dp[j] already holds the larger value (up) */
            diag = up;
        }
    }
    #ifdef _DEBUG
    for(j = 0; j<= len2; ++j) printf("%d ",dp[j]);
    printf("\n");
    #endif // _DEBUG
    printf("Solve3:%d\n",dp[len2]);
}

/*
 * Solve4: top-down (recursive, memoised) LCS length of the first i
 * characters of s1 and the first j characters of s2.
 *
 * dp is the memo table; a cell equal to -1 means "not computed yet", so the
 * caller must fill it with -1 beforehand. The second dimension is fixed at
 * 20 by the parameter type. The computed value is stored in dp[i][j] and
 * returned.
 */
int Solve4(char s1[], char s2[], int i, int j,int dp[][20]){
    int *cell = &dp[i][j];

    if(i == 0 || j == 0){
        /* An empty prefix has nothing in common with anything. */
        *cell = 0;
    }else if(*cell == -1){
        if(s1[i-1] == s2[j-1]){
            /* Last characters match: extend the LCS of both shorter prefixes. */
            *cell = Solve4(s1,s2,i-1,j-1,dp) + 1;
        }else{
            /* Otherwise drop the last character of one string or the other. */
            int without_s1_last = Solve4(s1,s2,i-1,j,dp);
            int without_s2_last = Solve4(s1,s2,i,j-1,dp);
            *cell = max(without_s1_last, without_s2_last);
        }
    }
    return *cell;
}

/*
 * Demo driver: runs the four LCS implementations on one fixed pair of
 * strings and lets each print its result.
 */
int main(){
    char first[20] = "abcdef";
    char second[20] = "dgajchdef";
    int memo[20][20];
    int recursive_len;

    /* Setting every byte to 0xFF makes each int cell read back as -1, the
       "not computed yet" marker that Solve4 checks for. */
    memset(memo, -1, sizeof(memo));

    Solve1(first, second);
    Solve2(first, second);
    Solve3(first, second);

    recursive_len = Solve4(first, second, strlen(first), strlen(second), memo);
    printf("Solve4:%d", recursive_len);
    return 0;
}

二、编辑距离

LeetCode原题:

72. Edit Distance

Given two words word1 and word2, find the minimum number of steps required to convert word1 to word2. (each operation is counted as 1 step.)

You have the following 3 operations permitted on a word:

a) Insert a character
b) Delete a character
c) Replace a character

递推公式:

dp[i][j]表示长度为i与长度为j的字符串的编辑距离,对应原题即word1(2)中前i(j)个字符组成的字符串:

dp[i][j] = min(dp[i-1][j],dp[i][j-1],dp[i-1][j-1])+1                            if  word1[i-1] != word2[j-1]

dp[i][j] = dp[i-1][j-1]                                                                    if  word1[i-1] == word2[j-1]

也可以写成dp[i][j] = min(dp[i-1][j]+1,    dp[i][j-1]+1,   dp[i-1][j-1] + (word1[i-1]==word2[j-1]?0:1)),但这种写法感觉上似乎并不是很好,因为在word1[i-1] == word2[j-1]时这种写法还在dp[i-1][j]+1,    dp[i][j-1]+1,   dp[i-1][j-1]三者之中取最小,并没有意识到当word1[i-1] == word2[j-1]时dp[i][j] = dp[i-1][j-1]一定成立!!!

下面是碎碎念,可忽略:

开始想着 dp[i-1][j-1] + (word1[i-1]==word2[j-1]?0:1)是否一定会小于或等于dp[i-1][j]+1与dp[i][j-1]+1,如果成立那么动态规划就可以变成贪心算法,但是并不成立,可找到反例。从下面证明中也可以得到答案。

首先证明dp[i-1][j-1]<=dp[i-1][j]+1:

w1'经过dp[i-1][j]步可变为w2,再删除w2末位字符得到w2',所以经过dp[i-1][j]+1步w1'可转换为w2',而dp[i-1][j-1]为w1',w2'的编辑距离,因而dp[i-1][j]+1>=dp[i-1][j-1]一定成立,所以没必要像第二种递推公式那样比较dp[i-1][j]+1,    dp[i][j-1]+1,   dp[i-1][j-1]。

dp[i-1][j-1]<=dp[i-1][j]一定成立吗?

dp[i-1][j-1]<dp[i-1][j]一定成立吗?

两个答案都是不!简单的反例:word1=“A”,word2=“A”,dp[1][0]=1,dp[1][1]=0,dp[i-1][j-1]>dp[i-1][j]

总结:dp[i-1][j-1]<=dp[i-1][j]+1并且等号可成立

以上只是比较了dp[i-1][j-1]与dp[i-1][j]之间的大小,并未讨论dp[i][j]什么情况下与dp[i-1][j-1]相等。

以下是正式的证明递推公式

设字符串w1、w2长度分别为i、j,且w1、w2末位字符相同即w1[i-1]=w2[j-1],w1前i-1个字符组成的字符串为w1',w2前j-1个字符组成的字符串为w2',w1、w2编辑距离为dp[i][j],w1'与w2编辑距离为dp[i-1][j],w1'与w2'编辑距离为dp[i-1][j-1]。

int min(int a, int b){
    return a


你可能感兴趣的:(算法导论)