EST gb转fa格式(GenBank转FASTA)与提取GTF的noncoding区域

两个程序比较简单,高手免看了,就是想保存一下,以后说不定直接就用了。。。

EST的数据库是GenBank格式,找了一圈没发现转换的程序,就自己写了一个,还算可以,就是没有用多线程来做。。。

#!/usr/bin/env perl
# Convert a GenBank flat file (e.g. an EST database dump) to FASTA.
#
# For every record the DEFINITION text (which may span several lines), the
# ACCESSION value and the VERSION line are combined into one FASTA header:
#     >gi|<GI>|gb|<accession.version>|<ACCESSION> <DEFINITION>
# When the VERSION line carries no "GI:" field the header becomes:
#     >gb|<accession.version>|<ACCESSION> <DEFINITION>
# The sequence lines that follow ORIGIN are written with their leading
# position number and all spaces removed, one output line per input line.
#
# Options:
#   -i FILE   input GenBank file; a name ending in ".gz" is read via "gzip -dc"
#   -o FILE   output FASTA file; STDOUT is used when -o is omitted
use strict;
use warnings;
use Getopt::Long;

my $usage = <<"USAGE";

	Usage : perl $0 [options] <-i>
	ModifyTime : 09/10/2013
	Note: 

	Options:
	-i	input file
	-o	output file, default STDOUT

	eg: perl $0 -i est.gb

USAGE

my %opt;
GetOptions(\%opt, "i=s", "o=s") or die $usage;
die $usage unless(defined $opt{i} and length $opt{i});

# Open the input with a lexical handle. The gzip pipe uses the list form of
# open so the file name is never interpreted by a shell.
my $in;
if($opt{i} =~ /\.gz$/)
{
	open($in, '-|', 'gzip', '-dc', $opt{i}) or die "Cannot run gzip on $opt{i}: $!\n";
}else{
	open($in, '<', $opt{i}) or die "Cannot open $opt{i}: $!\n";
}

# Honour -o when given; otherwise keep writing to STDOUT.
my $out;
if(defined $opt{o} and length $opt{o})
{
	open($out, '>', $opt{o}) or die "Cannot write $opt{o}: $!\n";
}else{
	$out = \*STDOUT;
}

# Start as empty strings so a record without DEFINITION/ACCESSION does not
# trigger "uninitialized" warnings when the header is printed.
my ($id, $def) = ('', '');

while(my $line = <$in>)
{
	chomp $line;
	if($line =~ /^DEFINITION\s+(.*)/)
	{
		$def = $1;
		# DEFINITION may continue on indented lines; keep appending until
		# the ACCESSION line, which ends the definition and supplies the id.
		while(defined(my $more = <$in>))
		{
			chomp $more;
			if($more =~ /^ACCESSION\s+(.*)/)
			{
				$id = $1;
				last;
			}
			$more =~ s/^\s+/ /;
			$def .= $more;
		}
	}elsif($line =~ /^VERSION\s+(\w+\.\w+)(?:\s+GI:(.*))?/){
		# The GI field is optional so that records lacking it still get a
		# header instead of producing header-less sequence lines.
		my ($version, $gi) = ($1, $2);
		if(defined $gi)
		{
			print {$out} ">gi|$gi|gb|$version|$id $def\n";
		}else{
			print {$out} ">gb|$version|$id $def\n";
		}
	}elsif($line =~ /^ORIGIN/){
		# Sequence lines look like "        1 acgt acgt ...". The first line
		# that does not match (normally the "//" terminator) ends the record.
		while(defined(my $seq_line = <$in>))
		{
			chomp $seq_line;
			last unless $seq_line =~ /^\s+\d+\s+(.*)/;
			my $bases = $1;
			$bases =~ s/ //g;
			print {$out} "$bases\n";
		}
	}
	# Every other line carries nothing needed for the FASTA output.
}

close($in) or die "Error while reading $opt{i}: " . ($! || "exit status $?") . "\n";
# Buffered write errors only surface at close, so check it.
close($out) or die "Error while writing output: $!\n";
提取gtf的noncoding区域:

#!/usr/bin/env perl
# Report the regions of each contig that are NOT covered by any feature of a
# GTF file.
#
# Column 1 (contig name), column 4 (start) and column 5 (end) of every GTF
# line are read. Overlapping features on a contig are merged and the gaps
# before, between and after the merged blocks are written as
#     <contig> TAB <start> TAB <end>
#
# Coordinates: start and end are both 0-based and both inclusive (GTF start
# s becomes end s-2 for the preceding gap; GTF end e becomes the start of the
# following gap). NOTE(review): standard BED uses an exclusive end; the
# inclusive convention of the original script is kept so existing consumers
# see the same numbers - confirm which one downstream tools expect.
#
# The region after the last feature needs the contig length, which is taken
# from a "size<N>" suffix of the contig name; without that suffix the
# trailing region is skipped with a warning.
#
# Options:
#   -i FILE   input GTF; a name ending in ".gz" is read via "gzip -dc"
#   -o FILE   output file, default "<input>.bed"
#   -help|h   print usage
use strict;
use warnings;
use Getopt::Long;

my $usage = <<"USAGE";

	Usage : perl $0 [options] <-i>
	ModifyTime : 09/06/2013
	Note: features are sorted per contig internally;
	      contig names must end in size<length> to get the trailing region

	Options:
	-i	input file
	-o	output file, default <input file>.bed
	-help|h

	eg: perl $0 -i genes.gtf

USAGE

my %opt;
GetOptions(\%opt, "i=s", "o=s", "help|h") or die $usage;
# "help|h" stores both spellings under the key "help".
die $usage if(!defined $opt{i} or !length $opt{i} or $opt{help});
$opt{o} ||= "$opt{i}.bed";

# Lexical handles and three-argument open; the gzip pipe uses the list form
# so the file name is never interpreted by a shell.
my $in;
if($opt{i} =~ /\.gz$/)
{
	open($in, '-|', 'gzip', '-dc', $opt{i}) or die "Cannot run gzip on $opt{i}: $!\n";
}else{
	open($in, '<', $opt{i}) or die "Cannot open $opt{i}: $!\n";
}
open(my $out, '>', $opt{o}) or die "Cannot write $opt{o}: $!\n";

# contig name => list of [start, end] pairs (1-based, inclusive, as in GTF)
my %intervals_of;
while(my $line = <$in>)
{
	chomp $line;
	# Skip blank lines and "#" comment/header lines.
	next if $line =~ /^\s*(?:#|$)/;
	my @field = split ' ', $line;
	# Skip anything that lacks numeric start/end columns.
	next if @field < 5 or $field[3] !~ /^\d+$/ or $field[4] !~ /^\d+$/;
	push @{$intervals_of{$field[0]}}, [$field[3], $field[4]];
}
close($in) or die "Error while reading $opt{i}: " . ($! || "exit status $?") . "\n";

# Sorted contig order makes the output reproducible between runs.
foreach my $ct(sort keys %intervals_of)
{
	# Sort by start (then end) so unsorted input gives the same result as
	# sorted input.
	my @sorted = sort { $a->[0] <=> $b->[0] or $a->[1] <=> $b->[1] } @{$intervals_of{$ct}};

	# 0-based index of the first base not covered by the features seen so
	# far; numerically equal to the 1-based end of the current merged block.
	my $gap_start = 0;
	foreach my $iv(@sorted)
	{
		my ($s, $e) = @$iv;
		if($gap_start < $s)
		{
			# Feature begins beyond the covered block: the bases in between
			# form a gap. Adjacent features (or a feature starting at base 1)
			# give an empty gap, which is not printed.
			my $gap_end = $s - 2;
			print {$out} "$ct\t$gap_start\t$gap_end\n" if $gap_start <= $gap_end;
			$gap_start = $e;
		}elsif($e > $gap_start){
			# Overlapping feature that extends the covered block.
			$gap_start = $e;
		}
		# Otherwise the feature lies inside the covered block: nothing to do.
	}

	# Trailing region up to the last base of the contig.
	if($ct =~ /size(\d+)$/)
	{
		my $last_base = $1 - 1;
		print {$out} "$ct\t$gap_start\t$last_base\n" if $gap_start <= $last_base;
	}else{
		warn "No size<length> suffix in contig name '$ct'; trailing region skipped\n";
	}
}
# Buffered write errors only surface at close, so check it.
close($out) or die "Error while writing $opt{o}: $!\n";



你可能感兴趣的:(生物信息学)