用Python和C语言实现DNA双序列局部比对（Smith-waterman算法）

双序列局部比对的算法和全局比对算法的步骤相似，只是赋值规则稍有不同，每个单元格的分值由前面对角线的分值和引入一个空位得到的分值的最大值决定，但是分值不能为负值，如果为负值，则将其值设为0.

构建矩阵时先将第一列与第一行的值设为0，而在双序列全局比对中，此处的值为空位罚分的累加
除第一行和第一列以外每个位置的分值由一下四个值的最大值决定
1.S(i-1,j-1)+S(i,j) --匹配和不匹配
2.S(i,j-1)+gap -- 列引入空位
3.S(i-1,j)+gap -- 行引入空位
4.0

打分过程的目的是为了寻找比对的最大值，他代表了比对的结尾处，最大值不一定在最后一行和最后一列，这一点与全局比对不同

回溯过程从最大值开始，沿对角线向左直到碰到一位置(i,j)它的对角线上方，左方，上方的分值都小于匹配得分(在我的代码中是5时)，停止寻找。

局部比对的比对区域较短，但相似性百分比较大

Python代码如下：

mport sys
import numpy
import string

## Score matrix for nucleotide alignment
NUC44=numpy.array([[5,-4,-4,-4,-2],\
                   [-4,5,-4,-4,-2],\
                   [-4,-4,5,-4,-2],\
                   [-4,-4,-4,5,-2],\
                   [-2,-2,-2,-2,-1]])

NBET='ATGCN'

## define the function for calculating score matrix and arrow matrix:
def scoreMat(NUC44,NBET,seq1,seq2,gap=-8):
    len1,len2=len(seq1),len(seq2)
    scorMat=numpy.zeros((len1+1,len2+1),int)
    arrowMat=numpy.zeros((len1+1,len2+1),int)

#    scorMat[0,:]=numpy.arange(len2+1)*gap
#    scorMat[:,0]=numpy.arange(len1+1)*gap
    arrowMat[0,:]=numpy.ones(len2+1)
    arrowMat[1:,0]=numpy.zeros(len1)
    for i in range(1,len1+1):
        for j in range(1,len2+1):
            s_mat=numpy.zeros(4)
            s_mat[0]=scorMat[i-1,j]+gap
            s_mat[1]=scorMat[i,j-1]+gap
            n1,n2=NBET.index(seq1[i-1]),NBET.index(seq2[j-1])
            s_mat[2]=scorMat[i-1,j-1]+NUC44[n1,n2] 

            scorMat[i,j]=numpy.max(s_mat)
            arrowMat[i,j]=numpy.argmax(s_mat)
    return scorMat,arrowMat

## obtain the alignment of the sequences
def DynAlign(scorMat,arrow,seq1,seq2):
    aln_seq1,aln_seq2='',''
    flat_scorMat=numpy.ravel(scorMat)
    v,h=divmod(numpy.argmax(flat_scorMat),len(seq2)+1)
    print (v,h)
    while True:
        if arrow[v,h]==0:
            aln_seq1+=seq1[v-1]
            aln_seq2+='-'
            v-=1
        elif arrow[v,h]==1:
            aln_seq1+='-'
            aln_seq2+=seq2[h-1]
            h-=1
        elif arrow[v,h]==2:
            aln_seq1+=seq1[v-1]
            aln_seq2+=seq2[h-1]
            v-=1
            h-=1
        elif arrow[v,h]==3:
            break
        if (v==0 and h==0) or scorMat[v,h]==0:
            break

    aln1=aln_seq1[::-1]
    aln2=aln_seq2[::-1]

    return aln1,aln2


sq1=input("Please input the sequence 1:")
sq2=input("Please input the sequence 2:")
scoreMatrix,arrowMatrix=scoreMat(NUC44,NBET,sq1,sq2,gap=-8)
aln1,aln2=DynAlign(scoreMatrix,arrowMatrix,sq1,sq2)

print('The score matrix is:')
print (scoreMatrix)
print ('The arrow matrix is:')
print (arrowMatrix)
print ('The aligned sequences are:')
print (aln1)
print (aln2)

C语言代码如下：

#include 
#include 
char seq1_out[100] = "\0", seq2_out[100]= "\0";//定义比对后的输出一维数组
int scores[50][50] = {0};//定义得分矩阵
int score_array[5][5] = { {5,-4,-4,-4,-2},{-4,5,-4,-4,-2},{-4,-4,5,-4,-2},{-4,-4,-4,5,-2},{-2,-2,-2,-2,-1} };//定义打分矩阵
int main()
{
    void output(char seq1[], char seq2[]);//引用输出函数
    void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8);//引用打分函数
    char seq1[100] = "\0", seq2[100] = "\0", normol[] = "ATCGN";
    int i, j;
    //输入待比对序列
    printf("Please input the sequence 1 :");
    scanf("%s", seq1);
    printf("Please input the sequence 2 :");
    scanf("%s", seq2);
    scoresMat(seq1, seq2, normol);
    printf("the score array is:\n");
    for (i = 0; i <= strlen(seq2); i++)
        for (j = 0; j <= strlen(seq1); j++)
            if (j == strlen(seq1))
                printf("%d\n", scores[i][j]);
            else
                printf("%d\t", scores[i][j]);
    output(seq1, seq2);
    int out_len1 = strlen(seq1_out), out_len2 = strlen(seq2_out);
    for (i = out_len1 - 1; i >= 0; i--)
        printf("%c", seq1_out[i]);
    printf("\n");
    for (i = out_len2 - 1; i >= 0; i--)
        printf("%c", seq2_out[i]);
    printf("\n");
    return 0;
}

void scoresMat(char seq1[], char seq2[], char normol[],int gap=-8)//打分函数
{
    int find_index(char c, char normol[]);
    int max(int score1, int score2, int score3);
    int index1, index2;
    int len1 = strlen(seq1), len2 = strlen(seq2);
    int i = 0,j=0,k=0;
    //将打分矩阵的第一行和第一列分值设为0
    for (i = 1; i <= len1; i++)
        scores[0][i] = 0;
    for (i = 1; i <= len2; i++)
        scores[i][0] = 0;
    for(i=1;i<=len2;i++)
        for (j = 1; j <= len1; j++)
        {
            index1 = 0, index2 = 0;
            int score1 = 0, score2 = 0, score3 = 0,final_score=0;
            index1 = find_index(seq2[i-1], normol);//当i=1时，其实是第二条序列的第一个碱基，在序列中的索引为0，所以要减一
            index2 = find_index(seq1[j-1], normol);
            score1 = scores[i-1][j-1]+score_array[index1][index2];
            score2 = scores[i - 1][j] + gap;
            score3 = scores[i][j - 1] + gap;
            final_score = max(score1, score2, score3);
            if (final_score > 0)
                scores[i][j] = final_score;
            else
                scores[i][j] = 0;
        }
}

int max(int score1, int score2, int score3)//求三个数最大值函数
{
    if (score1 > score2)
        if (score1 > score3)
            return score1;
        else
            return score3;
    else
        if (score2 > score3)
            return score2;
        else
            return score3;
}
int find_index(char c, char normol[])//寻找碱基在normol中的索引函数
{
    int i=0, index=0;
    for (i = 0; i < strlen(normol); i++)
        if (c == normol[i])
            index = i;
    return index;
}


void output(char seq1[], char seq2[])//反向输出最佳匹配序列函数
{
    int out_num,seq1_num,seq2_num;
    int i, j;
    int score_sum = 0;
    int max(int score1, int score2, int score3);
    int len1 = strlen(seq1), len2 = strlen(seq2);
        //找出打分矩阵中最置作为反向寻找最佳比对的起点
    int start_i = 0, start_j = 0, max_score;
    max_score = scores[0][0];
    for (i = 0; i <= len2; i++)
        for (j = 0; j <= len1; j++)
            if (max_score <= scores[i][j])
            {
                max_score = scores[i][j];
                start_i = i;
                start_j = j;
            }
//开始反向寻找最佳匹配
    out_num = 0;
    seq1_num = start_j - 1;
    seq2_num = start_i - 1;
    score_sum += scores[start_i][start_j];
    seq1_out[out_num] =seq1[seq1_num]; seq1_num--;
    seq2_out[out_num] = seq2[seq2_num]; seq2_num--;
    out_num++;
    int score_max = 0, score1, score2 , score3;//score1为（i,j）左边的分数，score2为(i,j)上边的分数，score3为（i,j)对角线上方的分数
    score1 = scores[start_i][start_j - 1];
    score2 = scores[start_i - 1][start_j];
    score3 = scores[start_i - 1][start_j - 1];
    while (score1>=5 || score2>=5 || score3>=5)
    {
        if (score3 == score_max)
        {
            start_i = start_i - 1;
            start_j = start_j - 1;
            seq1_out[out_num] = seq1[seq1_num];
            seq2_out[out_num] = seq2[seq2_num];
            out_num++; seq1_num--; seq2_num--;
        }
        if (score1 == score_max && score2!=score_max)
        {
            start_j = start_j - 1;
            seq1_out[out_num] = seq1[seq1_num];
            seq2_out[out_num] = '-';
            out_num++; seq1_num--;
        }
        if (score2 == score_max&&score1!=score_max)
        {
            start_i = start_i - 1;
            seq2_out[out_num] = seq2[seq2_num];
            seq1_out[out_num] = '-';
            out_num++; seq2_num--;
        }
        if ( score2 == score_max && score1== score_max)
        {
            start_i = start_i - 1;
            seq2_out[out_num] = seq2[seq2_num];
            seq1_out[out_num] = '-';
            out_num++; seq2_num--;
        }
        score1 = scores[start_i][start_j - 1];
        score2 = scores[start_i - 1][start_j];
        score3 = scores[start_i - 1][start_j - 1];
        score_max = max(score1, score2, score3);
        score_sum += score_max;
    }
    printf("The best scores is %d\n", score_sum);
}

运行结果如下图：

用Python和C语言实现DNA双序列局部比对（Smith-waterman算法）_第1张图片

image.png

代码仅供参考和学习，如果有问题，请联系我，我们可以一起讨论！！！