生信笔记系列之序列提取--根据 GTF 提取转录本

今日再来一发,根据 gtf 文件提取 fasta 序列。

#!/usr/bin/perl -w
use strict;
use File::Basename;
use Getopt::Long;

sub usage{
    print <  [options]
Options:
    -g, --gtf       *   refernece gtf file
    -f, --fasta     *   reference fasta file
    -p, --prefix        prefix of output file       [default: gtfname]
    -o, --outdir        output directory            [default: ./ ]
    help                        print this help information
e.g
    perl $0 -g unreference.gtf -f hg38.fa -p fetched -o ./
    perl $0 --gtf unreference.gtf --fasta hg38.fa --prefix fetched --outdir ./
USAGE
    exit 0;
}

my ($help,$outdir,$prefix,$fasta,$gtf);
GetOptions(
    "gtf|g=s"    => \$gtf,
    "fasta|f=s"  => \$fasta,
    "prefix|p:s" => \$prefix,
    "outdir|o:s" => \$outdir,
    "help|?"     => \$help
);
die &usage if (!defined $gtf || !defined $fasta || defined $help);

$prefix ||= basename($gtf);
$outdir ||= ".";
mkdir $outdir if (not -e $outdir);

## read gtf file
my %gtf;
my %exon;
my %transcript;
my %fasta;

open GTF,$gtf or die "Can't open the gtf file $!";
while (){
    chomp;
    my @a = split /\t/,$_;
    if ($a[2] eq "exon"){
        (my $gen) = $_ =~ /\s+gene_id\s+"(\S+)";/;
        (my $tra) = $_ =~ /\s+transcript_id\s+"(\S+)";/;
        (my $num) = $_ =~ /\s+exon_number\s+"(\S+)";/;
#        push @{$exon{$id}},$_;
#        my $n = @{$exon{$id}};
        $transcript{$tra}{"strand"} = $a[6];
        $transcript{$tra}{"geneid"} = $gen ;
        $gtf{$a[0]}{$tra}{$num}{"start"}  = $a[3] - 1;
        $gtf{$a[0]}{$tra}{$num}{"length"} = $a[4] - $a[3] + 1;
    } else {
        next;
    }

}
close GTF;

open FAO,">$outdir/$prefix.fa"  || die "Can't write the file $outdir/$prefix.fa $! \n";
open G2T,">$outdir/gene2tr.txt" || die "Can't write the file $outdir/gene2tr.txt $!\n";
open FA,$fasta || die "Can't open the reference fasta file $!\n";
$/ = ">";
;
while (){
    my ($info,$seq) = split(/\n/,$_,2);
    $seq =~ s/\n+//g;
    my @b = split(/\s/,$info);
    my $z = $gtf{$b[0]};
    foreach my $k (sort keys %$z){

        my $strand = $transcript{$k}{"strand"};
        my $name   = $transcript{$k}{"geneid"};
        my $x = $z->{$k};

        foreach my $j (keys %$x ) {
            my $start  = $x->{$j}->{"start"};
            my $length = $x->{$j}->{"length"};
            $fasta{$k} .= substr($seq,$start,$length);
        }

        my $fa = $fasta{$k};

        if ($strand eq "-" ) {
            $fa = reverse $fa;
            $fa =~ tr/AaGgCcTt/TtCcGgAa/;
        }
        $fa = out_fasta($fa,60);

        print FAO ">$k\t$name\n$fa\n";
        print G2T "$name\t$k\n";
    }
}
close FA;
close FAO;
close G2T;

sub out_fasta{
    my ($seq,$num) = @_;
    my $len = length $seq;
    $seq =~ s/([A-Za-z]{$num})/$1\n/g;
    chop($seq) unless $len % $num;
    return $seq;
}

__END__

其实转了一圈,才发现我做的工作,前人已经做过了,而且比我做的好。该程序类似于 cufflinks 程序中 gffread 的程序
gffread 用法如下:

     gffread  -w out.fa -g reference.fa in.gtf

汗颜~还是要多学习!

你可能感兴趣的:(生信笔记系列之序列提取--根据 GTF 提取转录本)