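# Convert multiple sequence alignment data (in `Msa` class format) into a feature dict whose keys are strings (str) and whose values are NumPy arrays.
### MSA features
# Feature dict built from Msa-format data.
# make_msa_features vectorizes Msa-format data: it takes a list of Msa objects and returns an MSA feature dict holding the integer-encoded MSA, the deletion matrix, etc.
# Keys: ('deletion_matrix_int', 'msa', 'num_alignments', 'msa_species_identifiers')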
from typing import MutableMapping, Sequence, Optional
import numpy as np
import dataclasses
import re
## Mapping type used to store features (feature name -> NumPy array).
# MutableMapping[str, np.ndarray]: keys are strings (str), values are NumPy arrays (np.ndarray).
FeatureDict = MutableMapping[str, np.ndarray]
# Nested integer sequence used for `Msa.deletion_matrix`; `Msa.__post_init__`
# requires one inner sequence per MSA sequence.
DeletionMatrix = Sequence[Sequence[int]]
@dataclasses.dataclass(frozen=True)
class Msa:
    """Immutable container for the contents of a parsed MSA file.

    The three fields are parallel: they must all have the same length, which
    is enforced when the instance is created.
    """
    sequences: Sequence[str]
    deletion_matrix: DeletionMatrix
    descriptions: Sequence[str]

    def __post_init__(self):
        """Reject instances whose three fields differ in length.

        Raises:
          ValueError: if the fields do not all have the same length.
        """
        num_sequences = len(self.sequences)
        num_deletion_rows = len(self.deletion_matrix)
        num_descriptions = len(self.descriptions)
        if num_sequences == num_deletion_rows == num_descriptions:
            return
        raise ValueError(
            'All fields for an MSA must have the same length. '
            f'Got {num_sequences} sequences, '
            f'{num_deletion_rows} rows in the deletion matrix and '
            f'{num_descriptions} descriptions.')

    def __len__(self):
        """Return the number of sequences in the MSA."""
        return len(self.sequences)

    def truncate(self, max_seqs: int):
        """Return a new Msa holding only the leading `max_seqs` entries.

        The same slice is applied to all three fields, so they stay parallel.
        """
        head = slice(None, max_seqs)
        return Msa(
            sequences=self.sequences[head],
            deletion_matrix=self.deletion_matrix[head],
            descriptions=self.descriptions[head],
        )
# Lookup table from a one-letter residue code (or the gap character '-') to an
# integer id. make_msa_features uses it to turn each MSA sequence into a list
# of ints. Several letters deliberately share an id with another letter; those
# are marked below.
# NOTE(review): presumably B/Z/U are folded onto the closest standard residue
# and id 20 means "unknown residue" -- confirm against the upstream definition.
HHBLITS_AA_TO_ID = {
    'A': 0,
    'B': 2,  # same id as 'D'
    'C': 1,
    'D': 2,
    'E': 3,
    'F': 4,
    'G': 5,
    'H': 6,
    'I': 7,
    'J': 20,  # same id as 'X'
    'K': 8,
    'L': 9,
    'M': 10,
    'N': 11,
    'O': 20,  # same id as 'X'
    'P': 12,
    'Q': 13,
    'R': 14,
    'S': 15,
    'T': 16,
    'U': 1,  # same id as 'C'
    'V': 17,
    'W': 18,
    'X': 20,
    'Y': 19,
    'Z': 3,  # same id as 'E'
    '-': 21,  # gap character
}
# Sequences coming from UniProtKB database come in the
# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
_UNIPROT_PATTERN = re.compile(
r"""
^
# UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
(?:tr|sp)
\|
# A primary accession number of the UniProtKB entry.
(?P[A-Za-z0-9]{6,10})
# Occasionally there is a _0 or _1 isoform suffix, which we ignore.
(?:_\d)?
\|
# TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
# protein ID code.
(?:[A-Za-z0-9]+)
_
# A mnemonic species identification code.
(?P([A-Za-z0-9]){1,5})
# Small BFD uses a final value after an underscore, which we ignore.
(?:_\d+)?
$
""",
re.VERBOSE)
@dataclasses.dataclass(frozen=True)
class Identifiers:
    """Immutable record of the identifiers parsed from an MSA description.

    Attributes:
      species_id: species identifier; defaults to the empty string, which is
        what callers get when nothing could be parsed.
    """
    species_id: str = ''
def _extract_sequence_identifier(description: str) -> Optional[str]:
"""Extracts sequence identifier from description. Returns None if no match."""
split_description = description.split()
if split_description:
return split_description[0].partition('/')[0]
else:
return None
def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
    """Pull the species code out of an MSA sequence identifier.

    The identifier is stripped of surrounding whitespace and matched against
    `_UNIPROT_PATTERN`. An example of a sequence identifier:
    `tr|A0A146SKV9|A0A146SKV9_FUNHE`

    Args:
      msa_sequence_identifier: a sequence identifier.

    Returns:
      An `Identifiers` instance whose species_id is the pattern's
      `SpeciesIdentifier` group, or a default `Identifiers()` (empty
      species_id) when the identifier does not match the pattern.
    """
    cleaned = msa_sequence_identifier.strip()
    found = _UNIPROT_PATTERN.search(cleaned)
    if found is None:
        return Identifiers()
    return Identifiers(species_id=found.group('SpeciesIdentifier'))
def get_identifiers(description: str) -> Identifiers:
    """Derive the extra MSA identifiers from a sequence description.

    Args:
      description: a description line of an MSA sequence.

    Returns:
      The `Identifiers` parsed from the description's sequence identifier, or
      a default `Identifiers()` when no sequence identifier can be extracted.
    """
    sequence_identifier = _extract_sequence_identifier(description)
    if sequence_identifier is not None:
        return _parse_sequence_identifier(sequence_identifier)
    return Identifiers()
def make_msa_features(msas: Sequence[Msa]) -> FeatureDict:
    """Build the MSA feature dict from one or more parsed alignments.

    Sequences are pooled over all MSAs in order. A sequence string that has
    already been seen (in this or an earlier MSA) is skipped together with its
    deletion-matrix row and its description.

    Args:
      msas: the alignments to merge; must be non-empty, and every alignment
        must contain at least one sequence.

    Returns:
      A dict with the keys
        'deletion_matrix_int': int32 array of the kept deletion-matrix rows.
        'msa': int32 array of the kept sequences encoded via HHBLITS_AA_TO_ID.
        'num_alignments': int32 array holding the number of kept sequences,
          repeated once per residue of the first sequence of the first MSA.
        'msa_species_identifiers': object array of UTF-8 encoded species ids
          (empty bytes when no species could be parsed).

    Raises:
      ValueError: if `msas` is empty or any MSA in it has no sequences.
      KeyError: if a sequence holds a character missing from HHBLITS_AA_TO_ID.
    """
    if not msas:
        raise ValueError('At least one MSA must be provided.')

    encoded_rows = []
    deletion_rows = []
    species_labels = []
    already_seen = set()

    for msa_index, msa in enumerate(msas):
        if not msa:
            raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
        for row, sequence in enumerate(msa.sequences):
            # Keep only the first occurrence of each sequence; duplicates are
            # dropped along with their deletion_matrix row and description.
            if sequence not in already_seen:
                already_seen.add(sequence)
                # Encode the sequence as integer ids.
                encoded_rows.append([HHBLITS_AA_TO_ID[res] for res in sequence])
                deletion_rows.append(msa.deletion_matrix[row])
                species = get_identifiers(msa.descriptions[row]).species_id
                species_labels.append(species.encode('utf-8'))

    residue_count = len(msas[0].sequences[0])
    alignment_count = len(encoded_rows)

    # All deletion_matrix rows (across every MSA) must have the same length,
    # otherwise building the array below fails.
    return {
        'deletion_matrix_int': np.array(deletion_rows, dtype=np.int32),
        'msa': np.array(encoded_rows, dtype=np.int32),
        'num_alignments': np.array(
            [alignment_count] * residue_count, dtype=np.int32),
        'msa_species_identifiers': np.array(species_labels, dtype=np.object_),
    }
# Demo: build two small Msa objects by hand and featurize them. The
# descriptions are plain labels rather than UniProt-style identifiers.
m_seq1 = ["A-AL-L","AT-LAL","S-ALLI"] # sequences as they look after multiple sequence alignment
m_del_matrix1 = [[0,0,0,0,0,0],[0,1,0,0,1,0],[0,0,0,0,1,0]]
m_descriptions1 = ["seq1","seq2","seq3"]
m_seq2 = ["AAAL-L","AALL-L","LAALLL"] # sequences as they look after multiple sequence alignment
m_del_matrix2 = [[0,0,0,0,0,0],[0,0,0,0,0,0],[0,0,0,0,1,0]]
m_descriptions2 = ["seqA","seqB","seqC"]
# Instantiate the Msa objects.
test_msa1 = Msa(m_seq1, m_del_matrix1, m_descriptions1)
test_msa2 = Msa(m_seq2, m_del_matrix2, m_descriptions2)
#print(test_msa)
# The input is a list of Msa objects.
test_msa_lst = [test_msa1, test_msa2]
#test_msa_lst = [test_msa1]
#print(test_msa_lst)
test_msa_features_dict = make_msa_features(test_msa_lst)
print("test_msa_features:",test_msa_features_dict)