统计外显子个数与内含子长度(Python)

采用面向对象的方式实现。
只有当一个基因含有两个或两个以上外显子时，才会存在内含子。
输入为 GFF3 文件。

import re

class gene():
    """Record for one gene and the exons that have been attached to it.

    All attributes start out empty and are filled in by the caller.
    """

    def __init__(self):
        # Identifier of the gene; None until the caller assigns it.
        self.id = None
        # Exon counter, incremented by the caller for each exon added.
        self.exonnum = 0
        # (start, end) tuples, kept ordered by start coordinate by insert().
        self.exon = []
        # Start and end coordinates of the gene itself.
        self.start = 0
        self.end = 0
        # (left, right) coordinate pairs filled in by calcintron().
        self.intron = []

    def calcintron(self):
        """Recompute ``self.intron`` from the gene bounds and ``self.exon``.

        The result holds, in order: (gene start, first exon start), one
        (exon end, next exon start) pair per adjacent exon pair, and
        (last exon end, gene end).

        The list is cleared first, so calling this method repeatedly does
        not accumulate duplicate pairs. When no exons have been inserted the
        list is left empty instead of raising IndexError.

        NOTE(review): the first and last pairs are the flanks between the
        gene bounds and the outermost exons rather than gaps between two
        exons -- confirm that counting them as introns is intended.
        """
        # Clear in place so any outside reference to the list stays valid.
        self.intron.clear()
        if not self.exon:
            return None
        self.intron.append((self.start, self.exon[0][0]))
        self.intron.extend(
            (left[1], right[0])
            for left, right in zip(self.exon, self.exon[1:])
        )
        self.intron.append((self.exon[-1][1], self.end))
        return None

    def insert(self, tup, index = 0):
        """Insert exon ``tup`` into ``self.exon`` ordered by start coordinate.

        Args:
            tup: A (start, end) tuple.
            index: Position at which the linear search begins.
        """
        # Advance past every exon that starts strictly before the new one;
        # an exon with an equal start ends up after the newly inserted one.
        while index < len(self.exon) and tup[0] > self.exon[index][0]:
            index += 1
        self.exon.insert(index, tup)
        return None

def read(key, line):
    """Extract one field from a tab-separated, nine-column feature line.

    Args:
        key: A 1-based column number (int, 1 to 9), or the string 'ID' to
            get the leading ``ID=...`` token of the last column.
        line: One line of the input file, with or without its line ending.

    Returns:
        False when the line does not split into exactly nine columns (for
        example comment or blank lines). For an int key, the text of that
        column. For 'ID', the matched text including the ``ID=`` prefix, or
        None when the last column does not start with such a token. Any
        other key yields None.
    """
    # Split the argument itself, not a module-level variable, and strip the
    # line ending first so that a final line without a newline still yields
    # the same nine columns as every other line.
    lis = re.split(r'[\f\n\r\t\v]+', line.rstrip('\f\n\r\t\v'))
    if len(lis) != 9:
        return False
    if isinstance(key, int):
        return lis[key-1]
    if key == 'ID':
        # NOTE(review): the pattern stops at the first character outside
        # [A-Za-z0-9_], so an ID containing '.', ':' or '-' is truncated --
        # confirm this is intended for the input files in use.
        found = re.match('ID=[A-Za-z0-9_]+', lis[-1])
        return found.group() if found else None
    return None

# Genes in the order they appear in the file. Each exon line is attached to
# the most recently seen gene line.
lis = []
temp = None
# The context manager closes the input file even if parsing raises.
with open('work6_input2.gff3', 'r') as f:
    # The loop variable keeps the name `each` in case helper code in this
    # file refers to the current line through that module-level name.
    for each in f:
        # Column 3 holds the feature type; read() gives False for lines
        # that are not feature records, so they match neither branch.
        typ = read(3, each)
        if typ == 'gene':
            # The ID is only looked up on gene lines, so lines of other
            # types that carry no ID are never passed to the ID lookup.
            ID = read('ID', each)
            temp = gene()
            temp.id = ID
            temp.start = int(read(4, each))
            temp.end = int(read(5, each))
            lis.append(temp)
        elif typ == 'exon' and temp is not None:
            # An exon seen before any gene has nothing to attach to and is
            # skipped instead of raising NameError.
            temp.exonnum += 1
            # Insert the exon in order of its start coordinate.
            temp.insert((int(read(4, each)), int(read(5, each))))

for each in lis:
    if each.exonnum >= 2:
        each.calcintron()
        # NOTE(review): each length is the plain difference of the stored
        # pair; if the coordinates are 1-based and inclusive the gap between
        # two exons would be one less -- confirm which is intended.
        leng = [right - left for left, right in each.intron]
        print('{}的外显子数量为{}, 内含子长度为{}'.format(each.id, each.exonnum, leng))
    else:
        print('%s的外显子数量为%d'%(each.id, each.exonnum))

你可能感兴趣的:(统计外显子个数与内含子长度(Python))