Msa类格式数据特征字典

多序列比对数据(Msa类格式)转化为特征字典:键的类型是字符串 (str),值的类型是 NumPy 数组。

### msa特征

# Msa类格式数据特征字典

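# make_msa_features vectorizes Msa-format data: it takes a list of Msa objects
# and returns a feature dict whose values are NumPy arrays (the integer-encoded
# MSA, the deletion matrix, etc.). The dict keys are:
# ('deletion_matrix_int', 'msa', 'num_alignments', 'msa_species_identifiers')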

from typing import MutableMapping, Sequence, Optional
import numpy as np
import dataclasses
import re

## Mapping type used to store features (feature name -> NumPy array).
# MutableMapping[str, np.ndarray]: keys are strings (str), values are NumPy
# arrays (np.ndarray).
FeatureDict = MutableMapping[str, np.ndarray]

# Nested integer sequence: a sequence of rows, each row a sequence of ints.
DeletionMatrix = Sequence[Sequence[int]]


@dataclasses.dataclass(frozen=True)
class Msa:
  """Immutable container for a parsed multiple sequence alignment.

  Attributes:
    sequences: the aligned sequence strings, one per row.
    deletion_matrix: one row of integers per sequence.
    descriptions: one description string per sequence.

  All three fields must have the same length; this is checked on construction.
  """
  sequences: Sequence[str]
  deletion_matrix: DeletionMatrix
  descriptions: Sequence[str]

  def __post_init__(self):
    """Rejects instances whose three fields differ in length."""
    num_sequences = len(self.sequences)
    num_deletion_rows = len(self.deletion_matrix)
    num_descriptions = len(self.descriptions)
    lengths_differ = (num_sequences != num_deletion_rows or
                      num_deletion_rows != num_descriptions)
    if lengths_differ:
      raise ValueError(
          'All fields for an MSA must have the same length. '
          f'Got {num_sequences} sequences, '
          f'{num_deletion_rows} rows in the deletion matrix and '
          f'{num_descriptions} descriptions.')

  def __len__(self):
    """Returns the number of sequences in the alignment."""
    return len(self.sequences)

  def truncate(self, max_seqs: int):
    """Returns a new Msa holding only the first `max_seqs` rows."""
    # One slice object applied to all three fields keeps them in lockstep.
    leading = slice(None, max_seqs)
    return Msa(
        sequences=self.sequences[leading],
        deletion_matrix=self.deletion_matrix[leading],
        descriptions=self.descriptions[leading])


# Maps each one-letter residue code (plus the gap character '-') to the
# integer id used when encoding MSA rows.
#   * The 20 letters A C D E F G H I K L M N P Q R S T V W Y take ids 0-19
#     in alphabetical order.
#   * B, U and Z reuse the ids of D, C and E respectively.
#   * J, O and X all share id 20; the gap '-' is 21.
HHBLITS_AA_TO_ID = {
    'A': 0, 'B': 2, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5,
    'H': 6, 'I': 7, 'J': 20, 'K': 8, 'L': 9, 'M': 10, 'N': 11,
    'O': 20, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'U': 1,
    'V': 17, 'W': 18, 'X': 20, 'Y': 19, 'Z': 3, '-': 21,
}


# Sequences coming from UniProtKB database come in the
# `db|UniqueIdentifier|EntryName` format, e.g. `tr|A0A146SKV9|A0A146SKV9_FUNHE`
# or `sp|P0C2L1|A3X1_LOXLA` (for TREMBL/Swiss-Prot respectively).
#
# The two capturing parts are *named* groups, `(?P<name>...)`, so callers can
# read them with `match.group('AccessionIdentifier')` and
# `match.group('SpeciesIdentifier')`. Without the `<name>` part the pattern is
# not valid regex syntax and `re.compile` raises at import time.
_UNIPROT_PATTERN = re.compile(
    r"""
    ^
    # UniProtKB/TrEMBL or UniProtKB/Swiss-Prot
    (?:tr|sp)
    \|
    # A primary accession number of the UniProtKB entry.
    (?P<AccessionIdentifier>[A-Za-z0-9]{6,10})
    # Occasionally there is a _0 or _1 isoform suffix, which we ignore.
    (?:_\d)?
    \|
    # TREMBL repeats the accession ID here. Swiss-Prot has a mnemonic
    # protein ID code.
    (?:[A-Za-z0-9]+)
    _
    # A mnemonic species identification code.
    (?P<SpeciesIdentifier>([A-Za-z0-9]){1,5})
    # Small BFD uses a final value after an underscore, which we ignore.
    (?:_\d+)?
    $
    """,
    re.VERBOSE)


@dataclasses.dataclass(frozen=True)
class Identifiers:
  """Immutable record of the identifiers parsed from an MSA description."""
  # Species code; defaults to the empty string.
  species_id: str = dataclasses.field(default='')


def _extract_sequence_identifier(description: str) -> Optional[str]:
  """Returns the sequence identifier found at the start of a description.

  The identifier is the first whitespace-separated token of `description`,
  cut at its first '/' (anything from the slash onwards is dropped).

  Args:
    description: a description line of an MSA row.

  Returns:
    The identifier, or None when `description` is empty or only whitespace.
  """
  tokens = description.split()
  if not tokens:
    return None
  identifier, _, _ = tokens[0].partition('/')
  return identifier


def _parse_sequence_identifier(msa_sequence_identifier: str) -> Identifiers:
  """Extracts the species code from an MSA sequence identifier.

  The identifier is matched against `_UNIPROT_PATTERN`. An example of an
  identifier in that format: `tr|A0A146SKV9|A0A146SKV9_FUNHE`.

  Args:
    msa_sequence_identifier: a sequence identifier; leading and trailing
      whitespace is stripped before matching.

  Returns:
    An `Identifiers` instance whose `species_id` is the pattern's
    `SpeciesIdentifier` group, or a default `Identifiers` (empty
    `species_id`) when the identifier does not match.
  """
  stripped_identifier = msa_sequence_identifier.strip()
  found = _UNIPROT_PATTERN.search(stripped_identifier)
  if found is None:
    return Identifiers()
  return Identifiers(species_id=found.group('SpeciesIdentifier'))


def get_identifiers(description: str) -> Identifiers:
  """Derives the `Identifiers` for one MSA row from its description.

  Args:
    description: a description line of an MSA row.

  Returns:
    The result of parsing the sequence identifier extracted from
    `description`, or a default `Identifiers` when no identifier could be
    extracted.
  """
  extracted = _extract_sequence_identifier(description)
  if extracted is not None:
    return _parse_sequence_identifier(extracted)
  return Identifiers()


def make_msa_features(msas: Sequence[Msa]) -> FeatureDict:
  """Builds the MSA feature dict from one or more alignments.

  Rows are gathered across all `msas` in order. A sequence string that was
  already emitted (by this or an earlier MSA) is skipped together with its
  deletion row and description, so only first occurrences are kept.

  Args:
    msas: the alignments to merge. Must be non-empty, and every MSA must hold
      at least one sequence.

  Returns:
    A dict with the keys:
      'deletion_matrix_int': int32 array of the kept deletion rows.
      'msa': int32 array of the kept sequences, encoded residue by residue
        through HHBLITS_AA_TO_ID.
      'num_alignments': int32 array holding the number of kept rows, repeated
        once per character of the first sequence of the first MSA.
      'msa_species_identifiers': object array of UTF-8 encoded species ids
        (b'' where no species id was parsed from the description).

  Raises:
    ValueError: if `msas` is empty or any MSA contains no sequences.
  """
  if not msas:
    raise ValueError('At least one MSA must be provided.')

  encoded_rows = []
  deletion_rows = []
  species_labels = []
  emitted = set()
  for msa_index, msa in enumerate(msas):
    if not msa:
      raise ValueError(f'MSA {msa_index} must contain at least one sequence.')
    for row, aligned_sequence in enumerate(msa.sequences):
      # Drop duplicate sequences along with their deletion row and description.
      if aligned_sequence not in emitted:
        emitted.add(aligned_sequence)

        # Integer-encode the sequence.
        encoded_rows.append(
            [HHBLITS_AA_TO_ID[residue] for residue in aligned_sequence])

        deletion_rows.append(msa.deletion_matrix[row])
        row_identifiers = get_identifiers(msa.descriptions[row])
        species_labels.append(row_identifiers.species_id.encode('utf-8'))

  first_sequence = msas[0].sequences[0]
  kept_rows = len(encoded_rows)

  # The deletion rows of the different MSAs must all have the same length,
  # otherwise the array conversion raises an error.
  return {
      'deletion_matrix_int': np.array(deletion_rows, dtype=np.int32),
      'msa': np.array(encoded_rows, dtype=np.int32),
      'num_alignments': np.array(
          [kept_rows] * len(first_sequence), dtype=np.int32),
      'msa_species_identifiers': np.array(species_labels, dtype=np.object_),
  }


# Example data: two small alignments of three aligned sequences each, with
# matching deletion rows and descriptions.
m_seq1 = ["A-AL-L", "AT-LAL", "S-ALLI"]
m_del_matrix1 = [
    [0, 0, 0, 0, 0, 0],
    [0, 1, 0, 0, 1, 0],
    [0, 0, 0, 0, 1, 0],
]
m_descriptions1 = ["seq1", "seq2", "seq3"]

m_seq2 = ["AAAL-L", "AALL-L", "LAALLL"]
m_del_matrix2 = [
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 0, 0],
    [0, 0, 0, 0, 1, 0],
]
m_descriptions2 = ["seqA", "seqB", "seqC"]

# Instantiate one Msa per alignment.
test_msa1 = Msa(
    sequences=m_seq1,
    deletion_matrix=m_del_matrix1,
    descriptions=m_descriptions1)
test_msa2 = Msa(
    sequences=m_seq2,
    deletion_matrix=m_del_matrix2,
    descriptions=m_descriptions2)

# make_msa_features takes a list of Msa objects.
test_msa_lst = [test_msa1, test_msa2]

test_msa_features_dict = make_msa_features(test_msa_lst)
print("test_msa_features:", test_msa_features_dict)

你可能感兴趣的:(python,生物信息学)