创建transcript to gene mapping file

使用bioconductor会很简单。
我们只需要:

mart_11 <- biomaRt::useMart(biomart = "ENSEMBL_MART_ENSEMBL", dataset = "hsapiens_gene_ensembl")
t2g <- biomaRt::getBM(attributes = c("ensembl_transcript_id", "ensembl_gene_id", "external_gene_name"), mart = mart_11)
t2g <- dplyr::rename(t2g, target_id = ensembl_transcript_id, ens_gene = ensembl_gene_id, ext_gene = external_gene_name)
write.table(t2g,file = "homo_transcrip_to_genes.txt",sep="\t",col.names = F,row.names = F,quote=F)

之后就可以在salmon,kallisto等软件中使用啦。

但是,得到的文件可能并不是和我们在ensemble上下载的gtf相对应的,所以不能使用。这是我们使用python来处理。

def create_transcript_list(input, use_name = True, use_version = True):
    r = {}
    for line in input:
        if len(line) == 0 or line[0] == '#':
            continue
        l = line.strip().split('\t')
        if l[2] == 'transcript':
            info = l[8]
            d = {}
            for x in info.split('; '):
                x = x.strip()
                p = x.find(' ')
                if p == -1:
                    continue
                k = x[:p]
                p = x.find('"',p)
                p2 = x.find('"',p+1)
                v = x[p+1:p2]
                d[k] = v


            if 'transcript_id' not in d or 'gene_id' not in d:
                continue

            tid = d['transcript_id']
            gid = d['gene_id']
            if use_version:
                if 'transcript_version' not in d or 'gene_version' not in d:
                    continue

                tid += '.' + d['transcript_version']
                gid += '.' + d['gene_version']
            gname = None
            if use_name:
                if 'gene_name' not in d:
                    continue
                gname = d['gene_name']

            if tid in r:
                continue

            r[tid] = (gid, gname)
    return r
def print_output(output, r, use_name = True):
    for tid in r:
        if use_name:
            output.write("%s\t%s\t%s\n"%(tid, r[tid][0], r[tid][1]))
        else:
            output.write("%s\t%s\n"%(tid, r[tid][0]))

with open('./homo_ercc.gtf') as file:
    r = create_transcript_list(file, use_name = True, use_version = True)
with open('human_transcript_to_gene.tsv', "w+") as output:
    print_output(output, r, use_name = True)
print('Created human_transcript_to_gene.tsv file')

你可能感兴趣的:(创建transcript to gene mapping file)