输入的gff3文件是使用gmap软件比对得到的。
输入文件gff3的格式如下:
chr1A IWGSCv1.0_gmap gene 11740 12074 . + . ID=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1.path1;Name=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1
chr1A IWGSCv1.0_gmap mRNA 11740 12074 . + . ID=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1.mrna1;Name=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1;Parent=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1.path1;coverage=100.0;identity=100.0;matches=335;mismatches=0;indels=0;unknowns=0
chr1A IWGSCv1.0_gmap exon 11740 12074 100 + . ID=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1.mrna1.exon1;Name=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1;Parent=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1.mrna1;Target=TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1 1 335 +
chr1A IWGSCv1.0_gmap gene 22427 24851 . - . ID=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1.path1;Name=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1
chr1A IWGSCv1.0_gmap mRNA 22427 24851 . - . ID=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1.mrna1;Name=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1;Parent=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1.path1;coverage=100.0;identity=100.0;matches=2425;mismatches=0;indels=0;unknowns=0
chr1A IWGSCv1.0_gmap exon 22427 24851 100 - . ID=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1.mrna1.exon1;Name=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1;Parent=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1.mrna1;Target=TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1 1 2425 +
chr1A IWGSCv1.0_gmap gene 28794 39054 . + . ID=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.path1;Name=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1
chr1A IWGSCv1.0_gmap mRNA 28794 39054 . + . ID=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.mrna1;Name=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1;Parent=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.path1;coverage=100.0;identity=100.0;matches=1624;mismatches=0;indels=0;unknowns=0
chr1A IWGSCv1.0_gmap exon 28794 28929 100 + . ID=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.mrna1.exon1;Name=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1;Parent=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.mrna1;Target=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1 1 136 +
chr1A IWGSCv1.0_gmap exon 37567 39054 100 + . ID=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.mrna1.exon2;Name=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1;Parent=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1.mrna1;Target=TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1 137 1624 +
转换成gtf的格式,类似下边的结果:
chr1A IWGSCv1.0_gmap transcript 11740 12074 . + . transcript_id "TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1"; gene_id "TRIAE_CS42_1AS_TGACv1_023354_AA0082670";
chr1A IWGSCv1.0_gmap exon 11740 12074 100 + . transcript_id "TRIAE_CS42_1AS_TGACv1_023354_AA0082670.1"; gene_id "TRIAE_CS42_1AS_TGACv1_023354_AA0082670";exon_number 1;
chr1A IWGSCv1.0_gmap transcript 22427 24851 . - . transcript_id "TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1"; gene_id "TRIAE_CS42_1AS_TGACv1_024449_AA0082770";
chr1A IWGSCv1.0_gmap exon 22427 24851 100 - . transcript_id "TRIAE_CS42_1AS_TGACv1_024449_AA0082770.1"; gene_id "TRIAE_CS42_1AS_TGACv1_024449_AA0082770";exon_number 1;
chr1A IWGSCv1.0_gmap transcript 28794 39054 . + . transcript_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570";
chr1A IWGSCv1.0_gmap exon 28794 28929 100 + . transcript_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570";exon_number 1;
chr1A IWGSCv1.0_gmap exon 37567 39054 100 + . transcript_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021338_AA0081570";exon_number 2;
chr1A IWGSCv1.0_gmap transcript 59624 60578 . - . transcript_id "TRIAE_CS42_1AS_TGACv1_021658_AA0082030.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021658_AA0082030";
chr1A IWGSCv1.0_gmap exon 59624 60578 99 - . transcript_id "TRIAE_CS42_1AS_TGACv1_021658_AA0082030.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021658_AA0082030";exon_number 1;
chr1A IWGSCv1.0_gmap transcript 86763 89148 . - . transcript_id "TRIAE_CS42_1AS_TGACv1_021895_AA0082240.1"; gene_id "TRIAE_CS42_1AS_TGACv1_021895_AA0082240";
转换的脚本如下:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__author__ = 'shengwei ma'
__author_email__ = '[email protected]'
import sys

# Input file used when no path is given on the command line.
DEFAULT_GFF3_PATH = 'TGACv1.cdna.gff3'


def _parse_attributes(field):
    """Return the GFF3 attribute column (``key=value;key=value``) as a dict."""
    attrs = {}
    for item in field.split(';'):
        key, sep, value = item.strip().partition('=')
        if sep:
            attrs[key] = value
    return attrs


def gff3_to_gtf_lines(lines):
    """Convert GFF3 records to GTF records.

    Only ``gene`` and ``exon`` features produce output; a ``gene`` record is
    emitted as a GTF ``transcript`` record, every other feature type is
    dropped.  Blank lines, ``#`` comment/directive lines and lines with fewer
    than nine tab-separated columns are skipped.

    Args:
        lines: iterable of GFF3 text lines (for example an open file).

    Yields:
        One GTF line per converted record, without a trailing newline.

    Raises:
        ValueError: if a ``gene`` or ``exon`` record has no ``ID`` attribute.
    """
    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith('#'):
            continue
        lin = stripped.split('\t')
        if len(lin) < 9 or lin[2] not in ('gene', 'exon'):
            continue

        attrs = _parse_attributes(lin[8])
        if 'ID' not in attrs:
            raise ValueError('%s record without an ID attribute: %r'
                             % (lin[2], stripped))
        feature_id = attrs['ID']

        if lin[2] == 'gene':
            # Drop the ".path<N>" suffix by name rather than by a fixed
            # width, so any number of digits in <N> is handled.
            transcript_id = feature_id.rsplit('.path', 1)[0]
            feature = 'transcript'
            extra = ''
        else:
            # Drop ".mrna<N>.exon<M>"; a fixed-width slice breaks as soon
            # as <M> reaches 10.
            transcript_id = feature_id.rsplit('.mrna', 1)[0]
            feature = 'exon'
            extra = 'exon_number %s;' % feature_id.rsplit('exon', 1)[-1]

        # The gene id is the Name attribute up to its first '.'; fall back
        # to the transcript id when Name is absent.
        gene_id = attrs.get('Name', transcript_id).split('.')[0]

        columns = [lin[0], lin[1], feature] + lin[3:8]
        attributes = 'transcript_id "%s"; gene_id "%s";%s' % (
            transcript_id, gene_id, extra)
        yield '\t'.join(columns + [attributes])


def main(argv=None):
    """Convert the GFF3 file named in ``argv[0]`` and print GTF to stdout.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  When empty, ``DEFAULT_GFF3_PATH`` is read.
    """
    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else DEFAULT_GFF3_PATH
    with open(path, 'r') as handle:
        for record in gff3_to_gtf_lines(handle):
            sys.stdout.write(record + '\n')


if __name__ == '__main__':
    main()