根据GFF3文件从基因组文件中获取全部基因的上游1000的启动子序列

def reverse_complement(seq):

  ntComplement = {'A':'T','T':'A','G':'C','C':'G','N':'N'}

  RevSeqList = list(reversed(seq))

  RevComSeqList = [ntComplement[k] for k in RevSeqList]

  RevComSeq = ''.join(RevComSeqList)

  return RevComSeq

gff_file = open('Sitalica_312_v2.2.gene.gff3')

genome_file = open('Sitalica_312_v2.fa')

out_file = open('Si.full.code.sequences.fasta','w')

chl_fasta = {}

for row in genome_file:

if ">" in row:

fsy = row.split()

chl_gene = fsy[0].replace('>', '')

fsy_sequence = ''

else:

row.strip()

sequences = row.split()

store = sequences[0]

fsy_sequence = fsy_sequence + store

chl_fasta[chl_gene] = fsy_sequence

genome_file.seek(0,0)

for line in gff_file:

fsy1 = line.split()

sign = fsy1[2]

if sign == 'mRNA':

if fsy1[6] == '+':

site_number = int(fsy1[3])-1000

if fsy1[0] in chl_fasta:

result = ">" + fsy1[8].split(';')[0].replace('ID=', '') + "\n" + chl_fasta[fsy1[0]][site_number:int(fsy1[3])] + "\n"

out_file.write(result)

# print(">" + fsy1[8] + "\n" + chl_fasta[fsy1[0]][site_number:fsy1[3]])

# genome_file.seek(0,0)

if fsy1[6] == '-':

site_number = int(fsy1[4])+1000

if fsy1[0] in chl_fasta:

word = chl_fasta[fsy1[0]][site_number:int(fsy1[3])]

reverse_complement(word)

result = ">" + fsy1[8].split(':')[0].replace('ID=', '') + "\n" + word + "\n"

out_file.write(result)

# print(">" + fsy1[8] + "\n" + chl_fasta[fsy1[0]][fsy1[4]:site_number])

这个程序写的不是很好,运行起来很慢,哎!

你可能感兴趣的:(根据GFF3文件从基因组文件中获取全部基因的上游1000的启动子序列)