perl实现蛋白质翻译以及蛋白质个数统计

这个程序,用perl语言实现了RNA序列翻译蛋白质序列的过程。

考虑了 AG/GU 可变剪接(alternative splicing)的影响,以及起始密码子(start codon)和终止密码子(stop codon)

统计了RNA序列可生产蛋白质长度

引用请注明出处

 

#!/usr/bin/perl -w
# Program name: statProtein_splicing.pl
# Author      : sunchen
# Contact     : [email protected]
# Date        : 10/22/2011
# Last Update : 10/27/2011
# Reference   : Please cite our following papers when you are using this script.

# Description : 
#===============================================================================================================
use warnings;
use strict;
use Getopt::Long;

#my %opts;
#GetOptions(\%opts,"dir:s");
#my $usage= <<"USAGE";
#	Program: $0
#	INPUT:
#		-dir        full path of file

#	OUTPUT:
#USAGE
#die $usage unless ($opts{dir} && -e $opts{dir});

#open DIR, $opts{dir};

# Codon table: maps every three-letter RNA codon (alphabet A, C, G, U) to a
# one-letter amino-acid code. Two kinds of entries are special markers rather
# than residues: "AUG" maps to "START" and the three stop codons map to "END".
# The table is assembled from one row per residue so each row can be checked
# against a reference table at a glance.
my %Aa;
for my $group (
	[ "F"     => qw(UUU UUC) ],
	[ "L"     => qw(UUA UUG CUU CUC CUA CUG) ],
	[ "I"     => qw(AUU AUC AUA) ],
	[ "START" => qw(AUG) ],
	[ "V"     => qw(GUU GUC GUA GUG) ],
	[ "S"     => qw(UCU UCC UCA UCG AGU AGC) ],
	[ "P"     => qw(CCU CCC CCA CCG) ],
	[ "T"     => qw(ACU ACC ACA ACG) ],
	[ "A"     => qw(GCU GCC GCA GCG) ],
	[ "Y"     => qw(UAU UAC) ],
	[ "END"   => qw(UAA UAG UGA) ],
	[ "H"     => qw(CAU CAC) ],
	[ "Q"     => qw(CAA CAG) ],
	[ "N"     => qw(AAU AAC) ],
	[ "K"     => qw(AAA AAG) ],
	[ "D"     => qw(GAU GAC) ],
	[ "E"     => qw(GAA GAG) ],
	[ "C"     => qw(UGU UGC) ],
	[ "W"     => qw(UGG) ],
	[ "R"     => qw(CGU CGC CGA CGG AGA AGG) ],
	[ "G"     => qw(GGU GGC GGA GGG) ],
)
{
	my ($residue, @codons) = @{$group};
	# Hash-slice assignment: every codon in the row gets the same residue.
	@Aa{@codons} = ($residue) x @codons;
}


# Input and output files. Three-argument open with an explicit failure check:
# without it a missing input file would only produce readline warnings and an
# empty report. The bareword handles DIR and OUT are kept because the rest of
# the script reads from DIR and closes both by those names.
# OUT is opened for writing (truncating any existing file); the visible code
# only writes to it from commented-out debug prints.
open(DIR, "<", "E:/lab/4_AUG")
	or die "Cannot open E:/lab/4_AUG for reading: $!";
open(OUT, ">", "E:/lab/out.txt")
	or die "Cannot open E:/lab/out.txt for writing: $!";


# ---- File-level state shared between the main loop and detect_protein ----

my $protein = 0;      # total number of proteins counted so far
my $coden = 0;        # codon count of the fragment currently being translated
my @array;            # scratch: characters of the fragment being translated
my $array_len = 0;    # scratch: number of characters in @array
my $if_exist = 0;     # 1 once a protein has been counted for the current line
my $seq = "";         # the input line currently being scanned

my $start_point = 0;  # not used by the code visible in this file
my $end_point = 0;    # not used by the code visible in this file

my $order = 0;        # running count of input lines read

# Histogram: $coden_size[N] is the number of proteins whose codon count is N.
# Slots that were never incremented stay undef.
my @coden_size;
# detect_protein($string)
#
# Translate the reading frame that starts at the first "AUG" in $string and,
# if that frame reaches a stop codon, count it as one protein.
#
# Parameters:
#   $string - RNA fragment to translate (letters A, C, G, U).
#
# Behaviour:
#   * Does nothing if a protein was already counted for the current input
#     line ($if_exist == 1) or if $string contains no "AUG".
#   * Codons are read in steps of three from just after the start codon until
#     a stop codon is hit or fewer than three bases remain.
#   * A codon that is not in %Aa (for example one containing a base other
#     than A/C/G/U) is written as "X" instead of being dropped with an
#     "uninitialized value" warning; it still counts as one codon.
#   * Only a frame terminated by a stop codon is recorded.
#
# Side effects (file-level variables):
#   $coden      - reset, then left at the number of codons read, including
#                 the start codon and the stop codon.
#   @array,
#   $array_len  - overwritten with the characters of $string and their count.
#   $protein    - incremented when a protein is recorded.
#   $if_exist   - set to 1 when a protein is recorded.
#   @coden_size - $coden_size[$coden] incremented when a protein is recorded.
#
# Output: when the recorded protein has exactly 72 codons, its sequence is
# printed to STDOUT followed by an empty line.
#
# Returns: nothing meaningful.
sub detect_protein
{
	my ($string) = @_;

	# At most one protein is counted per input line.
	return if $if_exist == 1;

	$coden = 0;
	@array = split "", $string;
	$array_len = @array;

	my $position = index($string, "AUG");
	return if $position == -1;

	# The start codon itself is the first residue.
	my $to_print = "M";
	$coden++;

	my $terminated = 0;
	for (my $i = $position + 3; $i < $array_len - 2; $i += 3)
	{
		my $pro = $array[$i] . $array[$i + 1] . $array[$i + 2];
		my $residue = $Aa{$pro};
		$coden++;

		if (!defined $residue)
		{
			# Codon not in the table: keep the frame and mark the residue.
			$to_print .= "X";
		}
		elsif ($residue eq "END")
		{
			$to_print .= "\n";
			$terminated = 1;
			last;
		}
		elsif ($residue eq "START")
		{
			# An in-frame AUG after the start codon is an ordinary methionine.
			$to_print .= "M";
		}
		else
		{
			$to_print .= $residue;
		}
	}

	# A frame that runs off the end of the fragment is not a protein.
	return unless $terminated;

	$protein++;
	$if_exist = 1;
	if ($coden == 72)
	{
		print $to_print . "\n";
	}
	$coden_size[$coden]++;
	return;
}

# Main scan. Each input line is treated as one RNA sequence. Every "AG" is a
# candidate left boundary and every "GU" a candidate right boundary; the text
# strictly between an AG and a later GU is handed to detect_protein. Scanning
# of a line stops as soon as detect_protein has counted a protein for it.

# Thresholds carried over unchanged from the original script; their
# biological rationale is not documented in this file.
my $max_ag_offset    = 270;   # AG sites beyond this offset are not tried
my $min_fragment_len = 30;    # minimum number of bases between AG and GU

while(<DIR>)
{
	$order++;
	$seq = $_;
	$if_exist = 0;

	# All "AG" offsets, in ascending order. The -1 that index() returns when
	# the search is exhausted is deliberately NOT stored: treating it as a
	# site at offset -1 would translate the line from offset 1 even though
	# no AG precedes that position.
	my @ag_site = ();
	my $splicing_site_ag = index($seq, "AG");
	while ($splicing_site_ag != -1)
	{
		push(@ag_site, $splicing_site_ag);
		$splicing_site_ag = index($seq, "AG", $splicing_site_ag + 2);
	}

	# All "GU" offsets, in descending order (searched from the end).
	my @gu_site = ();
	my $splicing_site_gu = rindex($seq, "GU");
	while ($splicing_site_gu != -1)
	{
		push(@gu_site, $splicing_site_gu);
		# Nothing can start before offset 0; stop here rather than hand a
		# negative position to rindex().
		last if $splicing_site_gu == 0;
		$splicing_site_gu = rindex($seq, "GU", $splicing_site_gu - 1);
	}

	for my $ag (@ag_site)
	{
		last if $ag > $max_ag_offset;
		last if $if_exist == 1;

		# Walk the GU sites from the lowest offset upwards and use the
		# first one that lies far enough to the right of this AG.
		for my $gu (reverse @gu_site)
		{
			next unless $ag < $gu;

			my $fragment_len = $gu - $ag - 2;   # bases between "AG" and "GU"
			next if $fragment_len < $min_fragment_len;

			detect_protein(substr($seq, $ag + 2, $fragment_len));

			# Only one GU partner is tried per AG site.
			last;
		}
	}
}
# Report. First column block: every codon-count value from 0 up to the largest
# one seen, one per line. After the separator: the number of proteins recorded
# for each of those codon counts, in the same order, with 0 for counts that
# never occurred.
print "$_\n" for 0 .. $#coden_size;

print "###########################################\n";

for my $codon_count (0 .. $#coden_size)
{
	my $proteins_seen = $coden_size[$codon_count];
	print defined($proteins_seen) ? "$proteins_seen\n" : "0\n";
}

close DIR;
close OUT;
 

你可能感兴趣的:(perl)