多序列比对sto格式转a3m

多序列比对(Multiple Sequence Alignment,MSA)是对多个生物序列进行对齐的过程,以揭示它们之间的共同模式和结构。在生物信息学中,有多种文件格式用于存储多序列比对的结果,其中包括Stockholm (.sto) 和 A3M (.a3m) 格式。

注释信息: Stockholm文件通常包含更多的注释信息(例如以 #=GS 开头的序列描述行),提供关于序列和比对的额外信息。相比之下,A3M文件主要关注于序列本身:相对于查询序列的插入残基用小写字母表示,插入列中的空位则被省略。

应用领域: Stockholm文件广泛用于多种生物信息学工具,如HMMER。A3M文件通常用于蛋白质结构预测和深度学习模型的训练,如AlphaFold。

from typing import Sequence, Iterable, Optional

def _convert_sto_seq_to_a3m(
      query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
    for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
        if is_query_res_non_gap:
            yield sequence_res
        elif sequence_res != '-':
            yield sequence_res.lower()
    
def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None,
                             remove_first_row_gaps: bool = True) -> str:
    """Convert an MSA in Stockholm format to the A3M format.

    Args:
        stockholm_format: Full text of a Stockholm alignment. Alignment rows
            that share a name (interleaved blocks) are concatenated.
        max_sequences: If truthy, keep at most this many distinct sequences,
            in order of first appearance; rows for further names are ignored.
        remove_first_row_gaps: If True, the first sequence is treated as the
            query and every column where it has a '-' becomes an A3M
            insertion (lowercase residues, gaps dropped). If False, rows are
            emitted with only their '.' characters removed.

    Returns:
        A3M text: for each kept sequence a ">name description" header line
        followed by the sequence line, ending with a newline.

    Raises:
        StopIteration: If `remove_first_row_gaps` is True and the input
            contains no alignment rows.
        ValueError: If an alignment row has no sequence part, or a '#=GS'
            row has fewer than three fields.
    """
    lines = stockholm_format.splitlines()

    # Pass 1: collect the aligned sequence parts per name, preserving the
    # order in which names first appear.
    seq_parts = {}
    for line in lines:
        # Ignore blank lines, markup and end symbols - the remainder are
        # alignment sequence parts.
        if not line.strip() or line.startswith(('#', '//')):
            continue
        seqname, aligned_seq = line.split(maxsplit=1)
        if seqname not in seq_parts:
            if max_sequences and len(seq_parts) >= max_sequences:
                continue
            seq_parts[seqname] = []
        seq_parts[seqname].append(aligned_seq)
    sequences = {name: ''.join(parts) for name, parts in seq_parts.items()}

    # Pass 2: collect the DE (description) annotation of each kept sequence.
    descriptions = {}
    for line in lines:
        if len(descriptions) == len(sequences):
            # Every kept sequence already has its description.
            break
        if line[:4] != '#=GS':
            continue
        # Description row - example format is:
        # #=GS UniRef90_Q9H5Z4/4-78            DE [subseq from] cDNA: FLJ22755 ...
        columns = line.split(maxsplit=3)
        seqname, feature = columns[1:3]
        value = columns[3] if len(columns) == 4 else ''
        # Descriptions of names that are not in the alignment are skipped so
        # they cannot trigger the early exit above before all kept sequences
        # have been described.
        if feature != 'DE' or seqname not in sequences:
            continue
        descriptions[seqname] = value

    # Convert sto format to a3m row by row.
    query_non_gaps = None
    if remove_first_row_gaps:
        # The query sequence is assumed to be the first sequence.
        query_sequence = next(iter(sequences.values()))
        query_non_gaps = [res != '-' for res in query_sequence]
        # NOTE(review): the mask is built from the query row with '.' still
        # present, while '.' is stripped from every row below before the mask
        # is applied; this looks like it assumes alignments without '.'
        # characters - confirm against the producers of the input.

    a3m_sequences = {}
    for seqname, sto_sequence in sequences.items():
        # Dots are optional in a3m format and are commonly removed.
        out_sequence = sto_sequence.replace('.', '')
        if remove_first_row_gaps:
            out_sequence = ''.join(
                _convert_sto_seq_to_a3m(query_non_gaps, out_sequence))
        # Store the row whether or not query gaps were removed.
        a3m_sequences[seqname] = out_sequence

    fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
                    for k in a3m_sequences)
    return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline

# Demo: load a Stockholm alignment from the working directory, echo it, then
# print its A3M conversion below a separator line.
with open("test_aln.sto") as f:
    sto_str = f.read()

print(sto_str, "====", sep="\n")

a3m_str = convert_stockholm_to_a3m(sto_str)
print(a3m_str)

你可能感兴趣的:(生物信息学)