perl:DNA序列翻译成氨基酸序列的若干方法,直接法,简并法,哈希法,以及perl中的uc和lc函数(上)


1.直接转换法

use warnings;
use strict;

#利用perl来进行DNA序列到氨基酸序列的翻译,我们来介绍一下几种方法:


#第一种方法:
#DNA序列和氨基酸序列通过密码子来联系,密码子一共有64个(其中61个编码氨基酸,3个是终止密码子),氨基酸有20种
#第一种方法也就是最简单的方法,就是建立一一对应的关系

# codon2aa - Version 1: direct, one-test-per-codon translation.
#
# Translates a single DNA codon into its one-letter amino acid code by
# comparing it against each of the 64 codons in turn. Matching is
# case-insensitive.
#
# Parameters:
#   $codon - the codon to translate; expected to be exactly three letters.
# Returns:
#   The one-letter amino acid code, or '_' for a stop codon.
# Dies:
#   With the message 'Bad codon "<input>"!!' when the argument is not one of
#   the 64 listed codons (this includes undefined, shorter and longer input).
#
# This version is slow: a call may have to try up to 64 separate pattern
# matches before it finds the right codon.
sub codon2aa
{
    my ($codon) = @_;

    # A missing argument is handled as an empty (and therefore bad) codon
    # instead of producing an "uninitialized value" warning per test.
    $codon = '' unless defined $codon;

    # Every pattern is anchored with \A ... \z so that only the whole
    # argument can match; a longer string that merely contains a codon
    # (for example "ATCA") is rejected rather than silently translated.
       if ( $codon =~ /\ATCA\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\ATCC\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\ATCG\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\ATCT\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\ATTC\z/i )    { return 'F' }    # Phenylalanine
    elsif ( $codon =~ /\ATTT\z/i )    { return 'F' }    # Phenylalanine
    elsif ( $codon =~ /\ATTA\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ATTG\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ATAC\z/i )    { return 'Y' }    # Tyrosine
    elsif ( $codon =~ /\ATAT\z/i )    { return 'Y' }    # Tyrosine
    elsif ( $codon =~ /\ATAA\z/i )    { return '_' }    # Stop codon
    elsif ( $codon =~ /\ATAG\z/i )    { return '_' }    # Stop codon
    elsif ( $codon =~ /\ATGC\z/i )    { return 'C' }    # Cysteine
    elsif ( $codon =~ /\ATGT\z/i )    { return 'C' }    # Cysteine
    elsif ( $codon =~ /\ATGA\z/i )    { return '_' }    # Stop codon
    elsif ( $codon =~ /\ATGG\z/i )    { return 'W' }    # Tryptophan
    elsif ( $codon =~ /\ACTA\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ACTC\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ACTG\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ACTT\z/i )    { return 'L' }    # Leucine
    elsif ( $codon =~ /\ACCA\z/i )    { return 'P' }    # Proline
    elsif ( $codon =~ /\ACCC\z/i )    { return 'P' }    # Proline
    elsif ( $codon =~ /\ACCG\z/i )    { return 'P' }    # Proline
    elsif ( $codon =~ /\ACCT\z/i )    { return 'P' }    # Proline
    elsif ( $codon =~ /\ACAC\z/i )    { return 'H' }    # Histidine
    elsif ( $codon =~ /\ACAT\z/i )    { return 'H' }    # Histidine
    elsif ( $codon =~ /\ACAA\z/i )    { return 'Q' }    # Glutamine
    elsif ( $codon =~ /\ACAG\z/i )    { return 'Q' }    # Glutamine
    elsif ( $codon =~ /\ACGA\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\ACGC\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\ACGG\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\ACGT\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\AATA\z/i )    { return 'I' }    # Isoleucine
    elsif ( $codon =~ /\AATC\z/i )    { return 'I' }    # Isoleucine
    elsif ( $codon =~ /\AATT\z/i )    { return 'I' }    # Isoleucine
    elsif ( $codon =~ /\AATG\z/i )    { return 'M' }    # Methionine
    elsif ( $codon =~ /\AACA\z/i )    { return 'T' }    # Threonine
    elsif ( $codon =~ /\AACC\z/i )    { return 'T' }    # Threonine
    elsif ( $codon =~ /\AACG\z/i )    { return 'T' }    # Threonine
    elsif ( $codon =~ /\AACT\z/i )    { return 'T' }    # Threonine
    elsif ( $codon =~ /\AAAC\z/i )    { return 'N' }    # Asparagine
    elsif ( $codon =~ /\AAAT\z/i )    { return 'N' }    # Asparagine
    elsif ( $codon =~ /\AAAA\z/i )    { return 'K' }    # Lysine
    elsif ( $codon =~ /\AAAG\z/i )    { return 'K' }    # Lysine
    elsif ( $codon =~ /\AAGC\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\AAGT\z/i )    { return 'S' }    # Serine
    elsif ( $codon =~ /\AAGA\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\AAGG\z/i )    { return 'R' }    # Arginine
    elsif ( $codon =~ /\AGTA\z/i )    { return 'V' }    # Valine
    elsif ( $codon =~ /\AGTC\z/i )    { return 'V' }    # Valine
    elsif ( $codon =~ /\AGTG\z/i )    { return 'V' }    # Valine
    elsif ( $codon =~ /\AGTT\z/i )    { return 'V' }    # Valine
    elsif ( $codon =~ /\AGCA\z/i )    { return 'A' }    # Alanine
    elsif ( $codon =~ /\AGCC\z/i )    { return 'A' }    # Alanine
    elsif ( $codon =~ /\AGCG\z/i )    { return 'A' }    # Alanine
    elsif ( $codon =~ /\AGCT\z/i )    { return 'A' }    # Alanine
    elsif ( $codon =~ /\AGAC\z/i )    { return 'D' }    # Aspartic Acid
    elsif ( $codon =~ /\AGAT\z/i )    { return 'D' }    # Aspartic Acid
    elsif ( $codon =~ /\AGAA\z/i )    { return 'E' }    # Glutamic Acid
    elsif ( $codon =~ /\AGAG\z/i )    { return 'E' }    # Glutamic Acid
    elsif ( $codon =~ /\AGGA\z/i )    { return 'G' }    # Glycine
    elsif ( $codon =~ /\AGGC\z/i )    { return 'G' }    # Glycine
    elsif ( $codon =~ /\AGGG\z/i )    { return 'G' }    # Glycine
    elsif ( $codon =~ /\AGGT\z/i )    { return 'G' }    # Glycine
    else
    {
        # die (rather than print + bare exit) so the failure produces a
        # non-zero exit status and can be trapped with eval. The trailing
        # newline keeps the text written to STDERR exactly as before.
        die "Bad codon \"$codon\"!!\n";
    }
}



2.第二种版本:简并法

#下面是第二种方法,这里需要一点生物学知识
#我们可以看看第一种方法中
#/GGA/   =>Glycine
#/GGC/   =>Glycine
#/GGG/   =>Glycine
#/GGT/   =>glycine
#上面四个密码子虽然第三位不同,但是它们编码的都是同一种氨基酸(甘氨酸)
#这就是密码子的简并性
#下面我们就利用这一点和正则表达式的 . 来匹配

# codon2aa - Version 2: translation using codon degeneracy.
#
# Translates a single DNA codon into its one-letter amino acid code. Codons
# that encode the same amino acid are grouped into one pattern, using a
# character class or '.' for the positions that may vary. Matching is
# case-insensitive.
#
# Parameters:
#   $codon - the codon to translate; expected to be exactly three letters.
# Returns:
#   The one-letter amino acid code, or '_' for a stop codon.
# Dies:
#   With the message 'Bad codon "<input>"!!' when no pattern matches the
#   whole argument (this includes undefined, shorter and longer input).
#
# Note: '.' matches any single character except a newline, so for the
# four-fold degenerate families the third position is not restricted to
# A/C/G/T (e.g. "GCN" is translated as 'A').

sub codon2aa
{
    my ($codon) = @_;

    # A missing argument is handled as an empty (and therefore bad) codon
    # instead of producing an "uninitialized value" warning per test.
    $codon = '' unless defined $codon;

    # Every pattern is anchored with \A ... \z so that only the whole
    # argument can match; alternations are wrapped in (?: ) so the anchors
    # apply to every branch. Without the anchors a longer string such as
    # "AGCT" would be translated from whichever fragment matched first.
       if ( $codon =~ /\AGC.\z/i)               { return 'A' }    # Alanine
    elsif ( $codon =~ /\ATG[TC]\z/i)            { return 'C' }    # Cysteine
    elsif ( $codon =~ /\AGA[TC]\z/i)            { return 'D' }    # Aspartic Acid
    elsif ( $codon =~ /\AGA[AG]\z/i)            { return 'E' }    # Glutamic Acid
    elsif ( $codon =~ /\ATT[TC]\z/i)            { return 'F' }    # Phenylalanine
    elsif ( $codon =~ /\AGG.\z/i)               { return 'G' }    # Glycine
    elsif ( $codon =~ /\ACA[TC]\z/i)            { return 'H' }    # Histidine
    elsif ( $codon =~ /\AAT[TCA]\z/i)           { return 'I' }    # Isoleucine
    elsif ( $codon =~ /\AAA[AG]\z/i)            { return 'K' }    # Lysine
    elsif ( $codon =~ /\A(?:TT[AG]|CT.)\z/i)    { return 'L' }    # Leucine
    elsif ( $codon =~ /\AATG\z/i)               { return 'M' }    # Methionine
    elsif ( $codon =~ /\AAA[TC]\z/i)            { return 'N' }    # Asparagine
    elsif ( $codon =~ /\ACC.\z/i)               { return 'P' }    # Proline
    elsif ( $codon =~ /\ACA[AG]\z/i)            { return 'Q' }    # Glutamine
    elsif ( $codon =~ /\A(?:CG.|AG[AG])\z/i)    { return 'R' }    # Arginine
    elsif ( $codon =~ /\A(?:TC.|AG[TC])\z/i)    { return 'S' }    # Serine
    elsif ( $codon =~ /\AAC.\z/i)               { return 'T' }    # Threonine
    elsif ( $codon =~ /\AGT.\z/i)               { return 'V' }    # Valine
    elsif ( $codon =~ /\ATGG\z/i)               { return 'W' }    # Tryptophan
    elsif ( $codon =~ /\ATA[TC]\z/i)            { return 'Y' }    # Tyrosine
    elsif ( $codon =~ /\A(?:TA[AG]|TGA)\z/i)    { return '_' }    # Stop codon
    else
    {
        # die (rather than print + bare exit) so the failure produces a
        # non-zero exit status and can be trapped with eval. The trailing
        # newline keeps the text written to STDERR exactly as before.
        die "Bad codon \"$codon\"!!\n";
    }
}



3.第三种方法:哈希法

#第三种方法
#也就是运用哈希
#我们将所有的密码子作为hash的key,然后将代表的氨基酸作为hash的value
#然后进行匹配

# codon2aa - Version 3: hash lookup.
#
# Translates a single DNA codon into its one-letter amino acid code by
# looking it up in a table keyed by the upper-case codon. Matching is
# case-insensitive because the argument is upper-cased before the lookup.
#
# Parameters:
#   $codon - the codon to translate; expected to be exactly three letters.
# Returns:
#   The one-letter amino acid code, or '_' for a stop codon.
# Dies:
#   With the message 'Bad codon "<INPUT>"!!' (input shown upper-cased) when
#   the argument is not a key of the table.
#
# The table and the subroutine share a BEGIN block: the table is filled once
# at compile time and stays private to codon2aa, instead of being rebuilt on
# every call, and it is ready even if codon2aa is called from code that
# appears earlier in the file.
BEGIN
{
    my %genetic_code = (

    'TCA' => 'S',    # Serine
    'TCC' => 'S',    # Serine
    'TCG' => 'S',    # Serine
    'TCT' => 'S',    # Serine
    'TTC' => 'F',    # Phenylalanine
    'TTT' => 'F',    # Phenylalanine
    'TTA' => 'L',    # Leucine
    'TTG' => 'L',    # Leucine
    'TAC' => 'Y',    # Tyrosine
    'TAT' => 'Y',    # Tyrosine
    'TAA' => '_',    # Stop
    'TAG' => '_',    # Stop
    'TGC' => 'C',    # Cysteine
    'TGT' => 'C',    # Cysteine
    'TGA' => '_',    # Stop
    'TGG' => 'W',    # Tryptophan
    'CTA' => 'L',    # Leucine
    'CTC' => 'L',    # Leucine
    'CTG' => 'L',    # Leucine
    'CTT' => 'L',    # Leucine
    'CCA' => 'P',    # Proline
    'CCC' => 'P',    # Proline
    'CCG' => 'P',    # Proline
    'CCT' => 'P',    # Proline
    'CAC' => 'H',    # Histidine
    'CAT' => 'H',    # Histidine
    'CAA' => 'Q',    # Glutamine
    'CAG' => 'Q',    # Glutamine
    'CGA' => 'R',    # Arginine
    'CGC' => 'R',    # Arginine
    'CGG' => 'R',    # Arginine
    'CGT' => 'R',    # Arginine
    'ATA' => 'I',    # Isoleucine
    'ATC' => 'I',    # Isoleucine
    'ATT' => 'I',    # Isoleucine
    'ATG' => 'M',    # Methionine
    'ACA' => 'T',    # Threonine
    'ACC' => 'T',    # Threonine
    'ACG' => 'T',    # Threonine
    'ACT' => 'T',    # Threonine
    'AAC' => 'N',    # Asparagine
    'AAT' => 'N',    # Asparagine
    'AAA' => 'K',    # Lysine
    'AAG' => 'K',    # Lysine
    'AGC' => 'S',    # Serine
    'AGT' => 'S',    # Serine
    'AGA' => 'R',    # Arginine
    'AGG' => 'R',    # Arginine
    'GTA' => 'V',    # Valine
    'GTC' => 'V',    # Valine
    'GTG' => 'V',    # Valine
    'GTT' => 'V',    # Valine
    'GCA' => 'A',    # Alanine
    'GCC' => 'A',    # Alanine
    'GCG' => 'A',    # Alanine
    'GCT' => 'A',    # Alanine
    'GAC' => 'D',    # Aspartic Acid
    'GAT' => 'D',    # Aspartic Acid
    'GAA' => 'E',    # Glutamic Acid
    'GAG' => 'E',    # Glutamic Acid
    'GGA' => 'G',    # Glycine
    'GGC' => 'G',    # Glycine
    'GGG' => 'G',    # Glycine
    'GGT' => 'G',    # Glycine
    );

    sub codon2aa
    {
        my ($codon) = @_;

        # A missing argument is handled as an empty (and therefore bad)
        # codon instead of producing an "uninitialized value" warning.
        $codon = '' unless defined $codon;

        # uc = uppercase, lc = lowercase: uc converts every lower-case
        # letter to upper case (lc does the reverse). The table keys are
        # upper case, so normalise the argument before the lookup.
        $codon = uc $codon;

        return $genetic_code{$codon} if exists $genetic_code{$codon};

        # die (rather than print + bare exit) so the failure produces a
        # non-zero exit status and can be trapped with eval. The trailing
        # newline keeps the text written to STDERR exactly as before.
        die "Bad codon \"$codon\"!!\n";
    }
}


当然这里面,hash的速度是最快的。

所以我们更推荐第三种方法。




你可能感兴趣的:(编程jin,perl,编程经验,perl,subroutine,正则表达式,生物,gcc,c)