2019-07-15 字符串的相似度（编程之美）

标签（空格分隔）：

问题描述

许多程序会大量使用字符串。对于不同的字符串，我们希望能够有办法判断其相似程度。我们定义一套操作方法来把两个不相同的字符串变得相同，具体的操作方法为：
1.修改一个字符（如把“a”替换为“b”）;　　
2.增加一个字符（如把“abdd”变为“aebdd”）;
3.删除一个字符（如把“travelling”变为“traveling”）;
比如，对于“abcdefg”和“abcdef”两个字符串来说，我们认为可以通过增加/减少一个“g”的方式来达到目的。上面的两种方案，都仅需要一次。把这个操作所需要的次数定义为两个字符串的距离，而相似度等于“距离+1”的倒数。也就是说，“abcdefg”和“abcdef”的距离为1，相似度为1/2=0.5。
给定任意两个字符串，你是否能写出一个算法来计算它们的相似度呢？

原文的分析与解法

不难看出，两个字符串的距离肯定不超过它们的长度之和（我们可以通过删除操作把两个串都转化为空串）。虽然这个结论对结果没有帮助，但至少可以知道，任意两个字符串的距离都是有限的。我们还是应该集中考虑如何才能把这个问题转化成规模较小的同样的子问题。如果有两个串A=xabcdae和B=xfdfa，它们的第一个字符是相同的，只要计算A[2,...,7]=abcdae和B[2,...,5]=fdfa的距离就可以了。但是如果两个串的第一个字符不相同，那么可以进行如下的操作（lenA和lenB分别是A串和B串的长度）。

1.删除A串的第一个字符，然后计算A[2,...,lenA]和B[1,...,lenB]的距离。
2.删除B串的第一个字符，然后计算A[1,...,lenA]和B[2,...,lenB]的距离。
3.修改A串的第一个字符为B串的第一个字符，然后计算A[2,...,lenA]和B[2,...,lenB]的距离。
4.修改B串的第一个字符为A串的第一个字符，然后计算A[2,...,lenA]和B[2,...,lenB]的距离。
5.增加B串的第一个字符到A串的第一个字符之前，然后计算A[1,...,lenA]和B[2,...,lenB]的距离。
6.增加A串的第一个字符到B串的第一个字符之前，然后计算A[2,...,lenA]和B[1,...,lenB]的距离。
在这个题目中，我们并不在乎两个字符串变得相等之后的字符串是怎样的。所以，可以将上面的6个操作合并为：
1.一步操作之后，再将A[2,...,lenA]和B[1,...,lenB]变成相同字符串。
2.一步操作之后，再将A[2,...,lenA]和B[2,...,lenB]变成相同字符串。
3.一步操作之后，再将A[1,...,lenA]和B[2,...,lenB]变成相同字符串。

原文的解法

#include <iostream>
#include <string>
using namespace std;
// Returns the smallest of the three integers a, b and c.
int minValue(const int &a, const int &b, const int &c);
// Recursively computes the edit distance between strA[pABegin..pAEnd] and
// strB[pBBegin..pBEnd]; both index ranges are inclusive.
int calculateStringDistance(string strA, int pABegin, int pAEnd, string strB,
    int pBBegin, int pBEnd);
// Reads two whitespace-delimited strings from standard input and prints their
// similarity, defined as 1 / (edit distance + 1).
int main()
{
    string first;
    string second;

    cout << "请输入字符串strA" << endl;
    cin >> first;
    cout << endl;

    cout << "请输入字符串strB" << endl;
    cin >> second;
    cout << endl;

    // The recursive solver works on inclusive [begin, end] index ranges, so the
    // end index of each string is its length minus one.
    const int lastIndexA = static_cast<int>(first.length()) - 1;
    const int lastIndexB = static_cast<int>(second.length()) - 1;
    const int steps = calculateStringDistance(first, 0, lastIndexA, second, 0, lastIndexB);

    const double similarity = 1.0 / (steps + 1);
    cout << similarity << endl;

    // Read two more characters before exiting (presumably to keep a console
    // window open until the user presses Enter).
    cin.get();
    cin.get();
    return 0;
}

// Returns the smallest of the three values a, b and c.
int minValue(const int & a, const int & b, const int & c)
{
    int smallest = a;
    if (b < smallest)
    {
        smallest = b;
    }
    if (c < smallest)
    {
        smallest = c;
    }
    return smallest;
}

// Recursive worker behind calculateStringDistance.
// Both strings are taken by const reference: the recursion branches three ways
// on every mismatch, and passing by value would copy both strings on each call.
// No results are cached, so overlapping sub-ranges are recomputed.
static int stringDistanceInRange(const string &strA, int pABegin, int pAEnd,
                                 const string &strB, int pBBegin, int pBEnd)
{
    // A's range is empty: the distance is the number of characters left in B
    // (zero when B's range is empty as well).
    if (pABegin > pAEnd)
    {
        return (pBBegin > pBEnd) ? 0 : (pBEnd - pBBegin + 1);
    }
    // B's range is empty and A's is not (the both-empty case returned above).
    if (pBBegin > pBEnd)
    {
        return pAEnd - pABegin + 1;
    }
    // Equal leading characters cost nothing; move on to the rest of both ranges.
    if (strA[pABegin] == strB[pBBegin])
    {
        return stringDistanceInRange(strA, pABegin + 1, pAEnd, strB, pBBegin + 1, pBEnd);
    }

    // Leading characters differ: spend one operation, then continue with one of
    // the three possible smaller sub-problems and keep the cheapest.
    const int skipInB = stringDistanceInRange(strA, pABegin, pAEnd, strB, pBBegin + 1, pBEnd);
    const int skipInA = stringDistanceInRange(strA, pABegin + 1, pAEnd, strB, pBBegin, pBEnd);
    const int skipInBoth = stringDistanceInRange(strA, pABegin + 1, pAEnd, strB, pBBegin + 1, pBEnd);
    return minValue(skipInB, skipInA, skipInBoth) + 1;   // +1 for the operation just performed
}

// Recursive solution.
// Returns the edit distance (number of single-character replace / insert /
// delete operations) between strA[pABegin..pAEnd] and strB[pBBegin..pBEnd].
// Both ranges are inclusive; a range whose begin exceeds its end is empty.
int calculateStringDistance(string strA, int pABegin, int pAEnd, string strB, int pBBegin, int pBEnd)
{
    return stringDistanceInRange(strA, pABegin, pAEnd, strB, pBBegin, pBEnd);
}

这个问题给出了递归思想的解法，简单直白，但是递归运算的时候做了很多重复性的运算。同时，进行操作的方式有四种，不同的操作方式会得到不同的解值，需要找出其中最优的解，也就是最小的返回值。这种问题类似于最长公共子序列的问题。

动态规划解法

#include <iostream>
#include <string>
using namespace std;

// Returns the smallest of the three integers a, b and c.
int minValue(const int &a, const int &b, const int &c);
// Dynamic-programming edit distance between strA and strB. c is a
// caller-allocated table with (strA_length + 1) rows of (strB_length + 1) ints
// that the function fills in; the return value is c[0][0].
// This declaration must match the five-parameter definition further down:
// main() calls the function before that definition is seen.
int calculateStringDistance(string strA, int strA_length, string strB, int strB_length,
    int **c);

// Reads two whitespace-delimited strings from standard input, computes their
// edit distance with the dynamic-programming table and prints the similarity,
// defined as 1 / (distance + 1).
int main()
{
    string strA, strB;
    cout << "请输入字符串strA" << endl;
    cin >> strA;
    cout << endl;
    cout << "请输入字符串strB" << endl;
    cin >> strB;
    cout << endl;
    const int strA_length = static_cast<int>(strA.length());
    const int strB_length = static_cast<int>(strB.length());

    // The table needs one extra row and column for the "string exhausted" boundary.
    const int rowCount = strA_length + 1;
    const int colCount = strB_length + 1;

    int **c = new int*[rowCount];
    for (int i = 0; i < rowCount; ++i)
    {
        // The trailing "()" value-initializes every cell of the row to 0.
        c[i] = new int[colCount]();
    }

    int distance = calculateStringDistance(strA, strA_length, strB, strB_length, c);

    cout << double (1.0/(distance + 1)) << endl;

    // Free every one of the rowCount rows (not just strA_length of them), then
    // the row-pointer array itself. Both were allocated with new[], so both
    // must be released with delete[].
    for (int i = 0; i < rowCount; ++i)
    {
        delete[] c[i];
    }
    delete[] c;

    cin.get();
    cin.get();
    return 0;
}

// Returns the smallest of the three values a, b and c.
int minValue(const int & a, const int & b, const int & c)
{
    // Pick the smaller of the first two, then compare that against the third.
    const int &smallerOfFirstTwo = (a < b) ? a : b;
    if (smallerOfFirstTwo < c)
    {
        return smallerOfFirstTwo;
    }
    return c;
}

// Bottom-up dynamic-programming edit distance.
// c is a caller-provided table indexed as c[0..strA_length][0..strB_length].
// On return c[i][j] holds the distance between the suffix of strA starting at
// index i and the suffix of strB starting at index j; the function returns
// c[0][0], the distance between the two whole strings.
int calculateStringDistance(string strA, int strA_length, string strB, int strB_length, int **c)
{
    // Boundary values: when one suffix is empty, the distance is the number of
    // characters remaining in the other one.
    c[strA_length][strB_length] = 0;
    for (int col = strB_length - 1; col >= 0; --col)
    {
        c[strA_length][col] = strB_length - col;
    }
    for (int row = strA_length - 1; row >= 0; --row)
    {
        c[row][strB_length] = strA_length - row;
    }

    // Fill the interior from the bottom-right corner towards c[0][0], so every
    // cell a step depends on has already been computed.
    for (int row = strA_length; row-- > 0; )
    {
        int *current = c[row];
        const int *below = c[row + 1];
        for (int col = strB_length; col-- > 0; )
        {
            if (strA[row] == strB[col])
            {
                // Matching characters cost nothing: reuse the diagonal cell.
                current[col] = below[col + 1];
            }
            else
            {
                // Mismatch: one operation plus the cheapest of the three
                // neighbouring sub-problems.
                current[col] = minValue(current[col + 1], below[col], below[col + 1]) + 1;
            }
        }
    }

    return c[0][0];
}

动态规划解题方式，有两点很重要：
1、判断是否满足动态规划的要求（1.最优子结构：判断问题的最优解是否由其子问题的最优解构成，若是，则可能适用动态规划；
2.重叠子问题：递归算法会不断地求解同一个子问题，可以把递归转换为查表，利用已保存的子问题的解来解决当前问题。
）
2、给出状态转换方程（不同的动态规划，问题的状态转换方程不同）

参考

动态规划：https://www.cnblogs.com/kkgreen/archive/2011/06/26/2090702.html
问题求解：https://www.cnblogs.com/yujunyong/articles/2004724.html
动态规划定义详解：https://www.cnblogs.com/wuyuegb2312/p/3281264.html

2019-07-15 字符串的相似度（编程之美）

问题描述

原文的分析与解法

原文的解法

动态规划解法

参考

你可能感兴趣的:(2019-07-15 字符串的相似度（编程之美）)