用Python和C语言实现DNA双序列全局比对(Needleman-Wnnsch算法)

双序列全局比对主要是依据Needleman-Wnnsch算法来进行

整个过程分为三步
1.设置一个矩阵:第一条序列长m,沿x轴排列,第二条序列长n,沿y轴排列
2.设置打分矩阵,根据适当的打分公式来对对应的碱基进行打分,有四种情况:1.两碱基完全匹配2.不匹配3.第一条序列引入空位4.第二条序列引入空位
3.反向寻找最佳比对,通过回溯法,从打分矩阵的右下方最大得分处,往左上方寻找最大值。所谓动态规划表示最佳路径是通过逐步延长最佳子路径得到的,即比对的每一步都选择获得分值高的碱基对作为比对的方法

总目标是沿着矩阵对角线找到最佳分值的路径,这条路径确定了最佳比对

Python代码

import numpy
import string

## Score matrix for nucleotide alignment
NUC44=numpy.array([[5,-4,-4,-4,-2],\
                   [-4,5,-4,-4,-2],\
                   [-4,-4,5,-4,-2],\
                   [-4,-4,-4,5,-2],\
                   [-2,-2,-2,-2,-1]])

NBET='ATGCN'

## define the function for calculating score matrix and arrow matrix:
def scoreMat(NUC44,NBET,seq1,seq2,gap=-8):
    len1,len2=len(seq1),len(seq2)
    scorMat=numpy.zeros((len1+1,len2+1),int)
    arrowMat=numpy.zeros((len1+1,len2+1),int)

    scorMat[0,:]=numpy.arange(len2+1)*gap
    scorMat[:,0]=numpy.arange(len1+1)*gap
    arrowMat[0,:]=numpy.ones(len2+1)
    arrowMat[1:,0]=numpy.zeros(len1)
    for i in range(1,len1+1):
        for j in range(1,len2+1):
            s_mat=numpy.zeros(3)
            s_mat[0]=scorMat[i-1,j]+gap
            s_mat[1]=scorMat[i,j-1]+gap
            n1,n2=NBET.index(seq1[i-1]),NBET.index(seq2[j-1])
            s_mat[2]=scorMat[i-1,j-1]+NUC44[n1,n2] 

            scorMat[i,j]=numpy.max(s_mat)
            arrowMat[i,j]=numpy.argmax(s_mat)
    return scorMat,arrowMat

## obtain the alignment of the sequences
def DynAlign(scorMat,arrow,seq1,seq2):
    aln_seq1,aln_seq2='',''
    flat_scorMat=numpy.ravel(scorMat)
    v,h=divmod(numpy.argmax(flat_scorMat),len(seq2)+1)
    print(v,h)
    while True:
        if arrow[v,h]==0:
            aln_seq1+=seq1[v-1]
            aln_seq2+='-'
            v-=1
        elif arrow[v,h]==1:
            aln_seq1+='-'
            aln_seq2+=seq2[h-1]
            h-=1
        elif arrow[v,h]==2:
            aln_seq1+=seq1[v-1]
            aln_seq2+=seq2[h-1]
            v-=1
            h-=1
        if v==0 and h==0:
            break

    aln1=aln_seq1[::-1]
    aln2=aln_seq2[::-1]

    return aln1,aln2


sq1=input("Please input the sequence 1:")
sq2=input("Please input the sequence 2:")
scoreMatrix,arrowMatrix=scoreMat(NUC44,NBET,sq1,sq2,gap=-8)
aln1,aln2=DynAlign(scoreMatrix,arrowMatrix,sq1,sq2)

print('The score matrix is:')
print(scoreMatrix)
print('The arrow matrix is:')
print(arrowMatrix)
print('The aligned sequences are:')
print(aln1)
print(aln2)

C语言代码

#include 
#include 
char seq1_out[100] = "\0", seq2_out[100] = "\0";//定义比对后的输出一维数组
int scores[50][50] = { 0 };//定义得分矩阵
int score_array[5][5] = { {5,-4,-4,-4,-2},{-4,5,-4,-4,-2},{-4,-4,5,-4,-2},{-4,-4,-4,5,-2},{-2,-2,-2,-2,-1} };//定义打分矩阵
int main()
{
    void output(char seq1[], char seq2[]);//引用输出函数
    void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8);//引用打分函数
    char seq1[100] = "\0", seq2[100] = "\0", normol[] = "ATCGN";
    int i, j;
    //输入待比对序列
    printf("Please input the sequence 1 :");
    scanf("%s", seq1);
    printf("Please input the sequence 2 :");
    scanf("%s", seq2);
    scoresMat(seq1, seq2, normol);
    printf("the score array is:\n");
    for (i = 0; i <= strlen(seq2); i++)
        for (j = 0; j <= strlen(seq1); j++)
            if (j == strlen(seq1))
                printf("%d\n", scores[i][j]);
            else
                printf("%d\t", scores[i][j]);
    output(seq1, seq2);
    int out_len1 = strlen(seq1_out), out_len2 = strlen(seq2_out);
    for (i = out_len1 - 1; i >= 0; i--)
        printf("%c", seq1_out[i]);
    printf("\n");
    for (i = out_len2 - 1; i >= 0; i--)
        printf("%c", seq2_out[i]);
    printf("\n");
    return 0;
}

void scoresMat(char seq1[], char seq2[], char normol[], int gap = -8)//打分函数
{
    int find_index(char c, char normol[]);
    int max(int score1, int score2, int score3);
    int index1, index2;
    int len1 = strlen(seq1), len2 = strlen(seq2);
    int i = 0, j = 0, k = 0, temp = 0;
    for (i = 1, temp = -8; i <= len1; i++, temp += gap)//空位罚分为-8
        scores[0][i] = temp;
    for (i = 1, temp = -8; i <= len2; i++, temp += gap)//空位罚分为-8
        scores[i][0] = temp;
    for (i = 1; i <= len2; i++)
        for (j = 1; j <= len1; j++)
        {
            index1 = 0, index2 = 0;
            int score1 = 0, score2 = 0, score3 = 0, final_score = 0;
            index1 = find_index(seq2[i - 1], normol);//当i=1时,其实是第二条序列的第一个碱基,在序列中的索引为0,所以要减一
            index2 = find_index(seq1[j - 1], normol);
            score1 = scores[i - 1][j - 1] + score_array[index1][index2];
            score2 = scores[i - 1][j] + gap;
            score3 = scores[i][j - 1] + gap;
            final_score = max(score1, score2, score3);
            scores[i][j] = final_score;
        }
}

int max(int score1, int score2, int score3)//求三个数最大值函数
{
    if (score1 > score2)
        if (score1 > score3)
            return score1;
        else
            return score3;
    else
        if (score2 > score3)
            return score2;
        else
            return score3;
}
int find_index(char c, char normol[])//寻找碱基在normol中的索引函数
{
    int i = 0, index = 0;
    for (i = 0; i < strlen(normol); i++)
        if (c == normol[i])
            index = i;
    return index;
}


void output(char seq1[], char seq2[])//反向输出最佳匹配序列函数
{
    int out_num, seq1_num = strlen(seq1) - 1, seq2_num = strlen(seq2) - 1;
    int i, j;
    int score_sum = 0;
    int max(int score1, int score2, int score3);
    int len1 = strlen(seq1), len2 = strlen(seq2);
    //找出最后一行数值最大的数的位置
    int max_score = scores[len2][0], max_j = 0;
    for (j = 1; j <= len1; j++)
        if (max_score <= scores[len2][j])
        {
            max_score = scores[len2][j];
            max_j = j;
        }
    int col_max = scores[len2][max_j];
    //找出最后一列数值最大的数的位置
    int max_score1 = scores[0][len1], max_i = 0;
    for (i = 1; i <= len2; i++)
        if (max_score1 < scores[i][len1])
        {
            max_score1 = scores[i][len1];
            max_i = i;
        }
    int row_max = scores[max_i][len1];
    //找出最后一列和最后一行最大的数,作为反向找出最佳匹配的起始点
    int start_i, start_j;
    if (col_max >= row_max)
    {
        start_i = len2;
        start_j = max_j;
    }
    if (col_max < row_max)
    {
        start_i = max_i;
        start_j = len1;
    }

    //开始反向寻找最佳匹配
    if (start_i < len2)
    {
        for (i = start_i + 1; i <= len2; i++)
            score_sum += scores[i][len1];
        for (out_num = 0; out_num < len2 - start_i; out_num++)
        {
            seq1_out[out_num] = '-';
            seq2_out[out_num] = seq2[seq2_num];
            seq2_num--;
        }
    }
    if (start_j < len1)
    {
        for (j = start_j + 1; j<= len1; j++)
            score_sum += scores[len2][j];
        for (out_num = 0; out_num < len1 - start_j; out_num++)
        {
            seq2_out[out_num] = '-';
            seq1_out[out_num] = seq1[seq1_num];
            seq1_num--;
        }
    }
    if (start_i == len2 && start_j == len1)
    {
        out_num = 0;
        seq2_out[out_num] = seq2[seq2_num];
        seq1_out[out_num] = seq1[seq1_num];
        out_num++; seq1_num--; seq2_num--;
    }
    score_sum += scores[start_i][start_j];
    while (start_i > 1 && start_j > 1 && seq1_num > 0 && seq2_num > 0)
    {
        int score_max = 0, score1 = 0, score2 = 0, score3 = 0;//score1为(i,j)左边的分数,score2为(i,j)上边的分数,score3为(i,j)对角线上方的分数
        score1 = scores[start_i][start_j - 1];
        score2 = scores[start_i - 1][start_j];
        score3 = scores[start_i - 1][start_j - 1];
        score_max = max(score1, score2, score3);
        score_sum += score_max;
        if (score3 == score_max)
        {
            start_i = start_i - 1;
            start_j = start_j - 1;
            seq1_out[out_num] = seq1[seq1_num];
            seq2_out[out_num] = seq2[seq2_num];
            out_num++; seq1_num--; seq2_num--;
        }
        if (score1 == score_max && score2 != score_max)
        {
            start_j = start_j - 1;
            seq1_out[out_num] = seq1[seq1_num];
            seq2_out[out_num] = '-';
            out_num++; seq1_num--;
        }
        if (score2 == score_max && score1 != score_max)
        {
            start_i = start_i - 1;
            seq2_out[out_num] = seq2[seq2_num];
            seq1_out[out_num] = '-';
            out_num++; seq2_num--;
        }
        if (score2 == score_max && score1 == score_max)
        {
            start_i = start_i - 1;
            seq2_out[out_num] = seq2[seq2_num];
            seq1_out[out_num] = '-';
            out_num++; seq2_num--;
        }
    }
    if (start_i == 1 && start_j != 1)
        while (seq1_num >= 0)
        {
            seq1_out[out_num] = seq1[seq1_num];
            seq2_out[out_num] = '-';
            seq1_num--; out_num++;
        }
    if (start_j == 1 && start_i != 1)
        while (seq2_num >= 0)
        {
            seq2_out[out_num] = seq2[seq2_num];
            seq1_out[out_num] = '-';
            seq2_num--; out_num++;
        }
    if (start_j == 1 && start_i == 1)
    {
        seq2_out[out_num] = seq2[0];
        seq1_out[out_num] = seq1[0];
    }
    printf("The best scores is %d\n", score_sum);
}

结果如下图:


用Python和C语言实现DNA双序列全局比对(Needleman-Wnnsch算法)_第1张图片
image.png

代码仅供参考和学习,如果有问题,请联系我,我们可以一起讨论!!!

你可能感兴趣的:(用Python和C语言实现DNA双序列全局比对(Needleman-Wnnsch算法))