NCBI gff/gff3 转换为 gtf 格式

适用于 NCBI 数据库中 gff 格式文件转换为 gtf 格式,使用 perl 代码实现:

#!/usr/bin/perl -w
use strict;
use Getopt::Long;
use Data::Dumper;

my ($help, $infile, $outdir, $gff2gtf, $gene2tr);
GetOptions(
    "infile=s"  => \$infile,
    "outdir:s"  => \$outdir,
    "gene2rt!"  => \$gene2tr,
    "gff2gtf!"  => \$gff2gtf,
    "help|?"    => \$help
);
die &usage if (!defined $infile || defined $help);
$outdir ||= ".";system("mkdir -p $outdir");


if (defined $gff2gtf){
    my ($gene, $gene_info, $mRNA, $exon, $cds) = &gff2gtf($infile);

    foreach my $g (sort keys %$gene){
        my $out_g = $gene_info->{$g};
        my $gene_id = $gene_info->{$g}->[4];
        my $gene_type = $gene_info->{$g}->[5];
        
        print "$out_g->[0]\tGeneBang\tgene\t$out_g->[1]\t$out_g->[2]\t.\t$out_g->[3]\t.\tgene_id \"$gene_id\"; gi \"$g\"; gene_biotype \"$gene_type\";\n";

        foreach my $m (sort keys %{$mRNA->{$g}}){
            if (exists $mRNA->{$g}->{$m}->{'mRNA'}){
                my $out_m = $mRNA->{$g}->{$m}->{'mRNA'};
                my $transcript_id = $out_m->[4];
                print "$out_m->[0]\tGeneBang\ttranscript\t$out_m->[1]\t$out_m->[2]\t.\t$out_m->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\";\n";

                if (exists $exon->{$m}){
                    foreach my $e (sort keys %{$exon->{$m}}){
                        my $out_e = $exon->{$m}->{$e};
                        print "$out_e->[0]\tGeneBang\texon\t$out_e->[1]\t$out_e->[2]\t.\t$out_e->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\"; exon_id \"$e\"; \n";
                    }
                }
                
                if (exists $cds->{$m}){
                    foreach my $c (sort keys %{$cds->{$m}}){
                        my @out_pos = @{$cds->{$m}->{$c}->{pos}};
                        my $protein_id = $cds->{$m}->{$c}->{pep};
                        foreach my $pos (@out_pos){
                            print "$out_m->[0]\tGeneBang\tCDS\t$pos\t.\t$out_m->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$transcript_id\"; gi \"$g\"; transcript_biotype \"$gene_type\"; cds_id \"$c\"; protein_id \"$protein_id\"; \n";
                        }
                    }
                }


            }elsif(exists $mRNA->{$g}->{$m}->{'ncRNA'} ){
                my $out_n = $mRNA->{$g}->{$m}->{'ncRNA'};
                my $bio_type = $out_n->[4];
                print "$out_n->[0]\tGeneBang\ttranscript\t$out_n->[1]\t$out_n->[2]\t.\t$out_n->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$m\"; gi \"$g\"; transcript_biotype \"$bio_type\";\n";

                if (exists $exon->{$m}){
                    foreach my $e (sort keys %{$exon->{$m}}){
                        my $out_e = $exon->{$m}->{$e};
                        print "$out_e->[0]\tGeneBang\texon\t$out_e->[1]\t$out_e->[2]\t.\t$out_e->[3]\t.\tgene_id \"$gene_id\"; transcript_id \"$m\"; gi \"$g\"; transcript_biotype \"$bio_type\"; exon_id \"$e\"; \n";
                    }
                }
            }else{
                next;
            }
        }
    }
}


if (defined $gene2tr){

    open OUT, ">$outdir/gene2tr.txt" || die $!;
    my $gene2tr = &gene2tr("$infile");
    foreach my $k (sort keys %{$gene2tr}){
        foreach (@{$gene2tr->{$k}}){
            print OUT "$k\t$_\n";
        }
    }
    close OUT;
}


sub gff2gtf{
    my $gff = shift @_;
    my (%gene, %gene_info, %mRNA, %exon, %cds);
    open GFF, "< $gff" || die $!;
    while(){
        chomp;
        next if /^$|^#/;
        my @a = split /\t/, $_;

        if ($a[2] =~ /gene/i){
            my ($id) = $a[8] =~ /ID=(gene\d+)/;
            my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
            my ($gene_type) = $a[8] =~ /gene_biotype=([^;]+)/;
            $gene_info{$gene} = [$a[0], $a[3], $a[4], $a[6], $id, $gene_type];
            $gene{$gene} = $id;

        }elsif($a[2] =~ /mRNA|lnc_RNA|trascript/i){
            my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
            my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
            my ($transcript) = $a[8] =~ /transcript_id=([^;]+)/;
            $mRNA{$gene}{$rna_id}{mRNA} = [$a[0], $a[3], $a[4], $a[6], $transcript];

        }elsif($a[2] =~ /exon/){
            my ($exon_id) = $a[8] =~ /ID=(id\d+)/;
            my ($rna_id) = $a[8] =~ /Parent=([^;]+)/;
            $exon{$rna_id}{$exon_id} = [$a[0], $a[3], $a[4], $a[6]];

        }elsif($a[2] =~ /CDS/i){
            my ($cds_id) = $a[8] =~ /ID=(cds\d+)/;
            my ($rna_id) = $a[8] =~ /Parent=([^;]+)/;
            my ($protein)= $a[8] =~ /protein_id=([^;]+)/;
            $cds{$rna_id}{$cds_id}{pep} = $protein;
            push @{$cds{$rna_id}{$cds_id}{pos}},"$a[3]\t$a[4]";

        }elsif($a[2] =~ /rRNA/i){
            my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
            my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
            $mRNA{$gene}{$rna_id}{ncRNA} = [$a[0], $a[3], $a[4], $a[6], "rRNA"];
            
        }elsif($a[2] =~ /tRNA/i){
            my ($rna_id) = $a[8] =~ /ID=(rna\d+)/;
            my ($gene) = $a[8] =~ /Dbxref=GeneID:(\d+)/;
            $mRNA{$gene}{$rna_id}{ncRNA} = [$a[0], $a[3], $a[4], $a[6], "tRNA"];
            
        }else{
            next;
        }
    }
    return (\%gene, \%gene_info, \%mRNA, \%exon, \%cds);
}


sub gene2tr{
    my $gff = shift @_;
    my %out;
    open GFF, "<$gff" || die $!;
    while(){
        chomp;
        next if (/^$|^#/);
        my @a = split /\t/, $_;
        if ($a[2] =~ /mRNA|transcript|lnc_RNA/i){
            my ($gene) = $a[8] =~ /Parent=(gene\d+)/;
            my ($transcript) = $a[8] =~ /transcript_id=(\w+_\d+\.\d+)/;
            push @{$out{$gene}}, $transcript;
        }else{
            next;
        }
    }
    close GFF;
    
    return \%out;
}

sub usage{
    print STDERR<

你可能感兴趣的:(NCBI gff/gff3 转换为 gtf 格式)