# 使用Python程序读取fasta文件reads

#!/usr/bin/python3

import sys
import re
from reverse_seq import *

def format_seq(seq,num):
    """Wrap a sequence string into fixed-width lines.

    Args:
        seq: Sequence text to wrap.
        num: Maximum number of characters per output line; must be >= 1.

    Returns:
        ``seq`` split into consecutive chunks of at most ``num`` characters,
        each terminated by a newline. An empty ``seq`` yields an empty
        string.

    Raises:
        ValueError: If ``num`` is less than 1.
    """
    if num < 1:
        raise ValueError("num must be a positive integer, got %r" % (num,))
    # Iterate over chunk start offsets instead of looping len // num + 1
    # times: the latter emits a stray blank line whenever len(seq) is an
    # exact multiple of num (or seq is empty), which corrupts FASTA output.
    return "".join(
        seq[start:start + num] + "\n" for start in range(0, len(seq), num)
    )

if __name__ == '__main__':
    # Reads a FASTA file and writes each record back out with a shortened
    # header (accession only) and its sequence passed through translate_seq,
    # reversed, and wrapped at 70 characters per line.
    #
    # Example header this script expects:
    # >lcl|JAKRYI020000014.1_cds_KAI4295232.1_1 [locus_tag=L6164_035299] [protein=hypothetical protein] [protein_id=KAI4295232.1] [location=join(11145..11254,12514..12625,13047..13118,13225..13436,13525..13633)] [gbkey=CDS]
    #
    # Group 1 captures the accession with its version, e.g.
    # "JAKRYI020000014.1". ``\d+`` keeps multi-digit versions intact, and the
    # trailing ``.*`` lets a header end right after the version.
    header_pattern = re.compile(r">\w{3}\|(\w+\.\d+).*")

    # ``with`` guarantees both files are closed even if parsing fails midway.
    # The output file name (including its spelling) is kept unchanged so that
    # anything consuming that path keeps working.
    with open(r"D:\sequence.fasta","r") as seq_file, \
            open(r"D:\sequence_rervese_complement.fasta","w") as out_file:
        seq_parts = []  # sequence lines of the record currently being read
        for line_no, eachline in enumerate(seq_file, 1):
            eachline = eachline.strip()
            if not eachline:
                # Blank lines carry no data; indexing them would fail.
                continue
            if eachline.startswith(">"):
                line_match = header_pattern.match(eachline)
                if line_match is None:
                    raise ValueError(
                        "line %d: unrecognised FASTA header: %r"
                        % (line_no, eachline)
                    )
                print("1:",line_match.group())
                print("2:", line_match.group(1))
                reads_name = line_match.group(1)
                if seq_parts:
                    # A new header closes the previous record: emit its
                    # sequence before starting the next one.
                    # translate_seq comes from the star import of
                    # reverse_seq; presumably it complements each base --
                    # TODO confirm against that module.
                    trans_seq = translate_seq("".join(seq_parts))
                    rseq = trans_seq[::-1]
                    out_file.write(format_seq(rseq,70))
                    seq_parts = []  # reset for the next record
                # Read name: only the accession is kept.
                out_file.write(r">" + reads_name + "\n")
            else:
                seq_parts.append(eachline)

        # Flush the final record; skipped when there is no pending sequence
        # (e.g. an empty input file) so no stray blank line is written.
        if seq_parts:
            trans_seq = translate_seq("".join(seq_parts))
            rseq = trans_seq[::-1]
            out_file.write(format_seq(rseq,70))

你可能感兴趣的:(python,开发语言)