Python 计算基因组外显子长度(第一题)

题目来自生信技能树
统计人类外显子长度
外显子坐标文件可从以下地址下载:

ftp://ftp.ncbi.nlm.nih.gov/pub/CCDS/current_human/CCDS.current.txt

import os
import re
from collections import OrderedDict
from operator import itemgetter

# Approach 1: sum the length of every distinct exon interval.
# An exon is identified by "<first column>:<start>-<end>"; an interval that
# appears on several rows is only counted once. Intervals that overlap
# without being identical are still counted in full.

# Raw string: "\p" is not a valid escape sequence, and a plain literal
# triggers a warning on recent Python versions.
os.chdir(r"D:\python")
ha1 = {}  # exon keys already counted
exonLength = 0
with open("CCDS.current.txt", "rt") as f:
    for line in f:
        # Lines starting with '#' are skipped (header/comment lines).
        if line.startswith("#"):
            continue
        line = line.rstrip()
        lst = line.split('\t')
        # The second-to-last column holds the exon list; '-' means none.
        if lst[-2] == '-':
            continue
        # Strip the surrounding square brackets from "[a-b, c-d, ...]".
        lst[-2] = re.sub(r'\[|\]', '', lst[-2])
        exons = lst[-2].split(", ")
        for exon in exons:
            bounds = exon.split('-')
            start = int(bounds[0])
            end = int(bounds[1])
            coordinate = lst[0] + ':' + exon
            if coordinate not in ha1:
                ha1[coordinate] = 1
                # NOTE(review): this treats `end` as exclusive. If the file's
                # end coordinates are inclusive, each exon is short by one
                # base and this should be `end - start + 1` -- confirm
                # against the CCDS README before changing.
                exonLength += end - start
print(exonLength)
419272
# Approach 2: count every distinct base position covered by any exon.
# Each position is keyed as "<first column>:<position>" so that bases
# shared by overlapping exons are only counted once.
ha1 = {}  # base-position keys already counted
exonLength = 0
with open("CCDS.current.txt", "rt") as f:
    for line in f:
        # Lines starting with '#' are skipped (header/comment lines).
        if line.startswith("#"):
            continue
        line = line.rstrip()
        lst = line.split('\t')
        # The second-to-last column holds the exon list; '-' means none.
        if lst[-2] == '-':
            continue
        # Strip the surrounding square brackets from "[a-b, c-d, ...]".
        lst[-2] = re.sub(r'\[|\]', '', lst[-2])
        exons = lst[-2].split(", ")
        prefix = lst[0] + ':'  # loop-invariant part of every key on this row
        for exon in exons:
            bounds = exon.split('-')
            start = int(bounds[0])
            end = int(bounds[1])
            # NOTE(review): range(start, end) leaves out `end` itself. If the
            # file's end coordinates are inclusive this should be
            # range(start, end + 1) -- confirm against the CCDS README.
            for i in range(start, end):
                coordinate = prefix + str(i)
                if coordinate not in ha1:
                    # Record the position; without this the membership test
                    # above never fails and shared bases are counted again.
                    ha1[coordinate] = 1
                    exonLength += 1
print(exonLength)
451017

你可能感兴趣的:(python计算基因组外显子长度(第一题))