提取模版特征

根据 hit.query 和 hit.hit_sequence 的比对映射(mapping),对模版结构(hit.hit_sequence 所在的链)中每个残基的原子进行索引,使其与原始查询序列(original query sequence)中相应残基的位置一致;查询序列中在 hit.hit_sequence 里没有对应残基的位置记为缺失(“-”)。原子坐标取自 mmCIF 文件;返回的元组包含模版特征字典,以及(在模版被重新比对时的)警告信息。

import dataclasses
from Bio import PDB
from typing import Any, Tuple, Dict, Mapping, Sequence, Optional, List
import numpy as np
import re
import contextlib
import tempfile
import shutil
import os
import subprocess
import time
import string
import pickle
from absl import logging

# Identifier of a chain; used as the key of MmcifObject.chain_to_seqres.
ChainId = str
# Header metadata of a parsed structure (a string-keyed mapping).
PdbHeader = Mapping[str, Any]
# Biopython structure object holding the parsed coordinates.
PdbStructure = PDB.Structure.Structure
# One-letter amino acid sequence of a chain.
SeqRes = str

# deletion_matrix[i][j] is the number of residues deleted from aligned
# sequence i at residue position j (see parse_a3m).
DeletionMatrix = Sequence[Sequence[int]]

# Atom names tracked per residue. The position of a name in this list is the
# column used for that atom in the per-residue coordinate and mask arrays.
atom_types = (
    'N CA C CB O CG CG1 CG2 OG OG1 SG CD '
    'CD1 CD2 ND1 ND2 OD1 OD2 SD CE CE1 CE2 CE3 '
    'NE NE1 NE2 OE1 OE2 CH2 NH1 NH2 OH CZ CZ2 '
    'CZ3 NZ OXT'
).split()
# Reverse lookup: atom name -> column index.
atom_order = dict(zip(atom_types, range(len(atom_types))))
atom_type_num = len(atom_types)  # := 37.


# One-letter residue code -> integer id used for one-hot encoding.
# Several letters deliberately share an id: B with D (2), U with C (1),
# Z with E (3), and J/O with X (20). The gap character '-' gets id 21.
HHBLITS_AA_TO_ID = {
    'A': 0, 'B': 2, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5,
    'H': 6, 'I': 7, 'J': 20, 'K': 8, 'L': 9, 'M': 10, 'N': 11,
    'O': 20, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'U': 1,
    'V': 17, 'W': 18, 'X': 20, 'Y': 19, 'Z': 3, '-': 21,
}


@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Class representing a template hit.

  Instances are immutable (frozen dataclass).
  """
  # Position of the hit in the search output -- presumably its rank; TODO
  # confirm against the parser that builds these objects.
  index: int
  # Hit description. The demo script in this file reads its first
  # whitespace-separated token as '<pdb code>_<chain id>'.
  name: str
  # NOTE(review): presumably the number of aligned columns reported by the
  # search tool -- confirm.
  aligned_cols: int
  # NOTE(review): presumably the summed alignment probabilities reported by
  # the search tool; may be None -- confirm.
  sum_probs: Optional[float]
  # Aligned query segment of the hit (distinct from the full query sequence).
  query: str
  # Aligned template segment; the demo script strips '-' characters from it
  # to obtain the ungapped template sequence.
  hit_sequence: str
  # NOTE(review): presumably per-column indices into the query / template
  # sequences -- neither field is read in this file; confirm.
  indices_query: List[int]
  indices_hit: List[int]
    
    
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
  """Location of a residue inside a parsed structure (immutable).

  _get_atom_positions combines residue_number and insertion_code with a
  hetero flag to look the residue up in a Biopython chain.
  """
  chain_id: str
  residue_number: int
  insertion_code: str
    

@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
  """What the structure contains at one SEQRES index (immutable)."""
  # Where the residue sits in the structure. Only dereferenced when
  # is_missing is False -- presumably None for missing residues; TODO confirm.
  position: Optional[ResiduePosition]
  # Residue name -- not read in this file.
  name: str
  # True when the residue has no coordinates in the structure; such residues
  # keep all-zero positions and masks in _get_atom_positions.
  is_missing: bool
  # First element of the Biopython residue id tuple used for chain lookup.
  hetflag: str
    
    
@dataclasses.dataclass(frozen=True)
class MmcifObject:
  """Representation of a parsed mmCIF file.

  Instances are immutable (frozen dataclass).

  Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
      files being processed.
    header: Biopython header.
    structure: Biopython structure.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
      {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
      SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                        1: ResidueAtPosition,
                                                        ...}}
    raw_string: The raw string used to construct the MmcifObject.
  """
  file_id: str
  header: PdbHeader
  structure: PdbStructure
  # Keys are chain ids; the same ids are used to select a chain from
  # `structure` in _get_atom_positions.
  chain_to_seqres: Mapping[ChainId, SeqRes]
  seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
  raw_string: Any


class Error(Exception):
  """Base class for exceptions."""


class NoChainsError(Error):
  """An error indicating that the template mmCIF didn't have any chains."""


class SequenceNotInTemplateError(Error):
  """An error indicating that template mmCIF didn't contain the sequence."""


class NoAtomDataInTemplateError(Error):
  """An error indicating that template mmCIF didn't contain atom positions."""


class TemplateAtomMaskAllZerosError(Error):
  """An error indicating that the template atom mask was all zeros."""


class QueryToTemplateAlignError(Error):
  """An error indicating that the query can't be aligned to the template."""


class MultipleChainsError(Error):
  """An error indicating that the structure did not hold exactly one chain
  with the requested id."""


@dataclasses.dataclass(frozen=True)
class Msa:
  """Parsed multiple sequence alignment (immutable).

  The three fields are parallel: entry i of each describes the same aligned
  sequence. Construction fails with ValueError when their lengths differ.
  """
  sequences: Sequence[str]
  deletion_matrix: DeletionMatrix
  descriptions: Sequence[str]

  def __post_init__(self):
    """Validates that all three fields have the same number of entries."""
    num_sequences = len(self.sequences)
    num_deletion_rows = len(self.deletion_matrix)
    num_descriptions = len(self.descriptions)
    if len({num_sequences, num_deletion_rows, num_descriptions}) != 1:
      raise ValueError(
          'All fields for an MSA must have the same length. '
          f'Got {num_sequences} sequences, '
          f'{num_deletion_rows} rows in the deletion matrix and '
          f'{num_descriptions} descriptions.')

  def __len__(self):
    """Returns the number of sequences in the alignment."""
    return len(self.sequences)

  def truncate(self, max_seqs: int):
    """Returns a new Msa holding only the first `max_seqs` entries."""
    head = slice(None, max_seqs)
    return Msa(sequences=self.sequences[head],
               deletion_matrix=self.deletion_matrix[head],
               descriptions=self.descriptions[head])


class CaDistanceError(Error):
  """An error indicating that a CA atom distance exceeds a threshold.

  Raised by _check_residue_distances when two consecutive residues that both
  have a CA atom are further apart than the allowed maximum.
  """


def _check_residue_distances(all_positions: np.ndarray,
                             all_positions_mask: np.ndarray,
                             max_ca_ca_distance: float):
  """Verifies that consecutive residues with a CA atom are close enough.

  Only pairs of directly adjacent residues are compared, and only when both
  have their CA atom unmasked; a masked residue breaks the chain of
  comparisons.

  Args:
    all_positions: Per-residue atom coordinates, indexed
      [residue, atom column, xyz].
    all_positions_mask: Per-residue atom mask, indexed [residue, atom column].
    max_ca_ca_distance: Largest accepted CA-CA distance.

  Raises:
    CaDistanceError: If an adjacent pair exceeds `max_ca_ca_distance`.
  """
  ca_column = atom_order['CA']
  # CA coordinates of the directly preceding residue, or None when that
  # residue is masked (or there is no preceding residue).
  previous_ca = None
  for res_idx, (coords, mask) in enumerate(
      zip(all_positions, all_positions_mask)):
    if not bool(mask[ca_column]):
      previous_ca = None
      continue
    current_ca = coords[ca_column]
    if previous_ca is not None:
      distance = np.linalg.norm(current_ca - previous_ca)
      if distance > max_ca_ca_distance:
        raise CaDistanceError(
            'The distance between residues %d and %d is %f > limit %f.' % (
                res_idx, res_idx + 1, distance, max_ca_ca_distance))
    previous_ca = current_ca
    
    
def _find_template_in_pdb(
    template_chain_id: str,
    template_sequence: str,
    mmcif_object: MmcifObject) -> Tuple[str, str, int]:
  """Locates the template sequence inside one chain of the mmCIF object.

  Three strategies are tried, in this order; the first success wins:
    1. The chain with id `template_chain_id` contains the template sequence
       as an exact substring.
    2. Any chain contains the template sequence as an exact substring.
    3. Any chain matches the template sequence fuzzily, where 'X' on either
       side acts as a wildcard.

  Args:
    template_chain_id: The template chain ID.
    template_sequence: The template chain sequence.
    mmcif_object: The PDB object to search for the template in.

  Returns:
    A tuple with:
    * The full sequence of the chain in which the template was found.
    * The ID of that chain.
    * The offset at which the template sequence starts in the chain sequence.

  Raises:
    SequenceNotInTemplateError: If none of the three strategies finds a match.
  """
  pdb_id = mmcif_object.file_id
  chain_to_seqres = mmcif_object.chain_to_seqres

  # Strategy 1: exact (sub)sequence match in the requested chain.
  requested = chain_to_seqres.get(template_chain_id)
  if requested and template_sequence in requested:
    logging.info(
        'Found an exact template match %s_%s.', pdb_id, template_chain_id)
    return requested, template_chain_id, requested.find(template_sequence)

  # Strategy 2: exact (sub)sequence match in any chain.
  for chain_id, candidate in chain_to_seqres.items():
    if not candidate or template_sequence not in candidate:
      continue
    logging.info('Found a sequence-only match %s_%s.', pdb_id, chain_id)
    return candidate, chain_id, candidate.find(template_sequence)

  # Strategy 3: fuzzy match. A template 'X' matches anything; any other
  # template residue matches itself or an 'X' in the chain. Non-capturing
  # groups (?:_) are used to avoid the 100 named groups limit.
  pattern = re.compile(''.join(
      '.' if aa == 'X' else '(?:%s|X)' % aa for aa in template_sequence))
  for chain_id, candidate in chain_to_seqres.items():
    found = pattern.search(candidate)
    if not found:
      continue
    logging.info('Found a fuzzy sequence-only match %s_%s.', pdb_id, chain_id)
    return candidate, chain_id, found.start()

  # Nothing matched.
  raise SequenceNotInTemplateError(
      'Could not find the template sequence in %s_%s. Template sequence: %s, '
      'chain_to_seqres: %s' % (pdb_id, template_chain_id, template_sequence,
                               mmcif_object.chain_to_seqres))


def parse_a3m(a3m_string: str) -> Msa:
  """Parses sequences and deletion matrix from a3m format alignment.

  Lowercase letters are treated as insertions relative to the query: they
  are counted into the deletion matrix and removed from the returned
  sequences.

  Args:
    a3m_string: The string contents of a a3m file. The first sequence in the
      file should be the query sequence.

  Returns:
    An Msa whose fields are:
      * sequences: The aligned sequences with lowercase letters removed.
        These might contain duplicates.
      * deletion_matrix: A list of lists where `deletion_matrix[i][j]` is the
        number of lowercase letters that directly precede the j-th
        non-lowercase character of sequence i.
      * descriptions: One description per sequence, from the a3m file.
  """
  raw_sequences, descriptions = parse_fasta(a3m_string)

  deletion_matrix = []
  for raw_sequence in raw_sequences:
    row = []
    pending = 0  # Lowercase letters seen since the last aligned column.
    for char in raw_sequence:
      if char.islower():
        pending += 1
        continue
      row.append(pending)
      pending = 0
    deletion_matrix.append(row)

  # Drop every lowercase letter to obtain the aligned sequences.
  drop_lowercase = str.maketrans('', '', string.ascii_lowercase)
  aligned = [raw.translate(drop_lowercase) for raw in raw_sequences]
  return Msa(sequences=aligned,
             deletion_matrix=deletion_matrix,
             descriptions=descriptions)


def _realign_pdb_template_to_query(
    old_template_sequence: str,
    template_chain_id: str,
    mmcif_object: MmcifObject,
    old_mapping: Mapping[int, int],
    kalign_binary_path: str) -> Tuple[str, Mapping[int, int]]:
  """Aligns template from the mmcif_object to the query.

  In case PDB70 contains a different version of the template sequence, we need
  to perform a realignment to the actual sequence that is in the mmCIF file.
  This method performs such realignment, but returns the new sequence and
  mapping only if the sequence in the mmCIF file is 90% identical to the old
  sequence.

  Note that the old_template_sequence comes from the hit, and contains only that
  part of the chain that matches with the query while the new_template_sequence
  is the full chain.

  Args:
    old_template_sequence: The template sequence that was returned by the PDB
      template search (typically done using HHSearch).
    template_chain_id: The template chain id was returned by the PDB template
      search (typically done using HHSearch). This is used to find the right
      chain in the mmcif_object chain_to_seqres mapping.
    mmcif_object: A mmcif_object which holds the actual template data.
    old_mapping: A mapping from the query sequence to the template sequence.
      This mapping will be used to compute the new mapping from the query
      sequence to the actual mmcif_object template sequence by aligning the
      old_template_sequence and the actual template sequence.
    kalign_binary_path: The path to a kalign executable.

  Returns:
    A tuple (new_template_sequence, new_query_to_template_mapping) where:
    * new_template_sequence is the actual template sequence that was found in
      the mmcif_object.
    * new_query_to_template_mapping is the new mapping from the query to the
      actual template found in the mmcif_object. Query positions whose old
      template residue has no counterpart in the new sequence map to -1.

  Raises:
    QueryToTemplateAlignError:
    * If the chain can't be found in the mmcif_object.
    * If there was an error thrown by the alignment tool.
    * Or if the actual template sequence differs by more than 10% from the
      old_template_sequence.
  """
  aligner = Kalign(binary_path=kalign_binary_path)
  new_template_sequence = mmcif_object.chain_to_seqres.get(
      template_chain_id, '')

  # Sometimes the template chain id is unknown. But if there is only a single
  # sequence within the mmcif_object, it is safe to assume it is that one.
  if not new_template_sequence:
    if len(mmcif_object.chain_to_seqres) == 1:
      logging.info('Could not find %s in %s, but there is only 1 sequence, so '
                   'using that one.',
                   template_chain_id,
                   mmcif_object.file_id)
      new_template_sequence = list(mmcif_object.chain_to_seqres.values())[0]
    else:
      raise QueryToTemplateAlignError(
          f'Could not find chain {template_chain_id} in {mmcif_object.file_id}. '
          'If there are no mmCIF parsing errors, it is possible it was not a '
          'protein chain.')

  try:
    parsed_a3m = parse_a3m(
        aligner.align([old_template_sequence, new_template_sequence]))
    old_aligned_template, new_aligned_template = parsed_a3m.sequences
  except Exception as e:
    # Chain the original exception so the underlying cause (missing binary,
    # too-short sequence, malformed output, ...) stays visible in tracebacks.
    raise QueryToTemplateAlignError(
        'Could not align old template %s to template %s (%s_%s). Error: %s' %
        (old_template_sequence, new_template_sequence, mmcif_object.file_id,
         template_chain_id, str(e))) from e

  logging.info('Old aligned template: %s\nNew aligned template: %s',
               old_aligned_template, new_aligned_template)

  # Walk the two aligned rows column by column, tracking the ungapped index
  # in each sequence, and record which old residue lines up with which new one.
  old_to_new_template_mapping = {}
  old_template_index = -1
  new_template_index = -1
  num_same = 0
  for old_template_aa, new_template_aa in zip(
      old_aligned_template, new_aligned_template):
    if old_template_aa != '-':
      old_template_index += 1
    if new_template_aa != '-':
      new_template_index += 1
    if old_template_aa != '-' and new_template_aa != '-':
      old_to_new_template_mapping[old_template_index] = new_template_index
      if old_template_aa == new_template_aa:
        num_same += 1

  # Require at least 90 % sequence identity wrt to the shorter of the sequences.
  if float(num_same) / min(
      len(old_template_sequence), len(new_template_sequence)) < 0.9:
    raise QueryToTemplateAlignError(
        'Insufficient similarity of the sequence in the database: %s to the '
        'actual sequence in the mmCIF file %s_%s: %s. We require at least '
        '90 %% similarity wrt to the shorter of the sequences. This is not a '
        'problem unless you think this is a template that should be included.' %
        (old_template_sequence, mmcif_object.file_id, template_chain_id,
         new_template_sequence))

  # Compose query -> old template -> new template; -1 marks "no counterpart".
  new_query_to_template_mapping = {}
  for query_index, old_template_index in old_mapping.items():
    new_query_to_template_mapping[query_index] = (
        old_to_new_template_mapping.get(old_template_index, -1))

  new_template_sequence = new_template_sequence.replace('-', '')

  return new_template_sequence, new_query_to_template_mapping


def _get_atom_positions(
    mmcif_object: MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]:
  """Collects atom coordinates and masks for every SEQRES residue of a chain.

  Args:
    mmcif_object: Parsed mmCIF object holding the structure.
    auth_chain_id: Id of the chain to extract.
    max_ca_ca_distance: Largest accepted distance between the CA atoms of
      adjacent residues, forwarded to _check_residue_distances.

  Returns:
    A tuple (all_positions, all_positions_mask) with shapes
    [num_res, atom_type_num, 3] and [num_res, atom_type_num]. Residues that
    are missing from the structure keep all-zero rows.

  Raises:
    MultipleChainsError: If the structure does not contain exactly one chain
      with id `auth_chain_id`.
    CaDistanceError: If adjacent CA atoms are too far apart.
  """
  num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])

  matching_chains = [candidate
                     for candidate in mmcif_object.structure.get_chains()
                     if candidate.id == auth_chain_id]
  if len(matching_chains) != 1:
    raise MultipleChainsError(
        f'Expected exactly one chain in structure with id {auth_chain_id}.')
  chain = matching_chains[0]

  # Columns needed for the arginine naming fix below.
  cd = atom_order['CD']
  nh1 = atom_order['NH1']
  nh2 = atom_order['NH2']

  all_positions = np.zeros([num_res, atom_type_num, 3])
  all_positions_mask = np.zeros([num_res, atom_type_num],
                                dtype=np.int64)
  for res_index in range(num_res):
    # Per-residue scratch buffers; they stay zero for missing residues.
    pos = np.zeros([atom_type_num, 3], dtype=np.float32)
    mask = np.zeros([atom_type_num], dtype=np.float32)
    res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
    if not res_at_position.is_missing:
      location = res_at_position.position
      res = chain[(res_at_position.hetflag,
                   location.residue_number,
                   location.insertion_code)]
      for atom in res.get_atoms():
        atom_name = atom.get_name()
        x, y, z = atom.get_coord()
        if atom_name in atom_order:
          column = atom_order[atom_name]
          pos[column] = [x, y, z]
          mask[column] = 1.0
        elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE':
          # Put the coordinates of the selenium atom in the sulphur column.
          column = atom_order['SD']
          pos[column] = [x, y, z]
          mask[column] = 1.0

      # Fix naming errors in arginine residues where NH2 is incorrectly
      # assigned to be closer to CD than NH1.
      if (res.get_resname() == 'ARG' and
          mask[[cd, nh1, nh2]].all() and
          (np.linalg.norm(pos[nh1] - pos[cd]) >
           np.linalg.norm(pos[nh2] - pos[cd]))):
        # Fancy indexing on the right-hand side copies, so this is a swap.
        pos[[nh1, nh2]] = pos[[nh2, nh1]]
        mask[[nh1, nh2]] = mask[[nh2, nh1]]

    all_positions[res_index] = pos
    all_positions_mask[res_index] = mask

  _check_residue_distances(
      all_positions, all_positions_mask, max_ca_ca_distance)
  return all_positions, all_positions_mask


def sequence_to_onehot(
    sequence: str,
    mapping: Mapping[str, int],
    map_unknown_to_x: bool = False) -> np.ndarray:
  """One-hot encodes an amino acid sequence.

  Args:
    sequence: An amino acid sequence.
    mapping: A dictionary mapping amino acids to integers.
    map_unknown_to_x: If True, every character must be an uppercase letter
      and letters missing from the mapping are encoded as 'X' (which must be
      in the mapping). If False, every character must be in the mapping.

  Returns:
    An int32 numpy array of shape (seq_len, num_unique_aas) with a single 1
    per row.

  Raises:
    ValueError: If the mapping values are not exactly 0..num_unique_aas-1, or
      if `map_unknown_to_x` is set and the sequence contains a character that
      is not an uppercase letter.
    KeyError: If `map_unknown_to_x` is not set and a character is missing
      from the mapping.
  """
  num_entries = max(mapping.values()) + 1

  distinct_ids = sorted(set(mapping.values()))
  if distinct_ids != list(range(num_entries)):
    raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
                     'without any gaps. Got: %s' % sorted(mapping.values()))

  encoded = np.zeros((len(sequence), num_entries), dtype=np.int32)

  for row, residue in enumerate(sequence):
    if not map_unknown_to_x:
      column = mapping[residue]
    elif residue.isalpha() and residue.isupper():
      column = mapping.get(residue, mapping['X'])
    else:
      raise ValueError(f'Invalid character in the sequence: {residue}')
    encoded[row, column] = 1

  return encoded


def extract_template_features(
    mmcif_object: MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str) -> Tuple[Dict[str, Any], Optional[str]]:
  """Parses atom positions in the target structure and aligns with the query.

  Atoms for each residue in the template structure are indexed to coincide
  with their corresponding residue in the query sequence, according to the
  alignment mapping provided. Query positions without a template residue are
  reported as gaps ('-') with all-zero coordinates and masks.

  Args:
    mmcif_object: mmcif_parsing.MmcifObject representing the template.
    pdb_id: PDB code for the template.
    mapping: Dictionary mapping indices in the query sequence to indices in
      the template sequence. A value of -1 means "no template residue" and
      the corresponding query position is left as a gap.
    template_sequence: String describing the amino acid sequence for the
      template protein.
    query_sequence: String describing the amino acid sequence for the query
      protein (the full original query, not the aligned segment of the hit).
    template_chain_id: String ID describing which chain in the structure proto
      should be used.
    kalign_binary_path: The path to a kalign executable used for template
        realignment.

  Returns:
    A tuple with:
    * A dictionary containing the extra features derived from the template
      protein structure. Every per-residue value has the length of
      `query_sequence`.
    * A warning message if the hit was realigned to the actual mmCIF sequence.
      Otherwise None.

  Raises:
    NoChainsError: If the mmcif object doesn't contain any chains.
    SequenceNotInTemplateError: If the given chain id / sequence can't
      be found in the mmcif object.
    QueryToTemplateAlignError: If the actual template in the mmCIF file
      can't be aligned to the query.
    NoAtomDataInTemplateError: If the mmcif object doesn't contain
      atom positions.
    TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
      unmasked residues.
  """
  if mmcif_object is None or not mmcif_object.chain_to_seqres:
    raise NoChainsError('No chains in PDB: %s_%s' % (pdb_id, template_chain_id))

  warning = None
  try:
    seqres, chain_id, mapping_offset = _find_template_in_pdb(
        template_chain_id=template_chain_id,
        template_sequence=template_sequence,
        mmcif_object=mmcif_object)
  except SequenceNotInTemplateError:
    # If PDB70 contains a different version of the template, we use the sequence
    # from the mmcif_object.
    chain_id = template_chain_id
    warning = (
        f'The exact sequence {template_sequence} was not found in '
        f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.')
    logging.warning(warning)
    # This throws an exception if it fails to realign the hit.
    seqres, mapping = _realign_pdb_template_to_query(
        old_template_sequence=template_sequence,
        template_chain_id=template_chain_id,
        mmcif_object=mmcif_object,
        old_mapping=mapping,
        kalign_binary_path=kalign_binary_path)
    logging.info('Sequence in %s_%s: %s successfully realigned to %s',
                 pdb_id, chain_id, template_sequence, seqres)
    # The template sequence changed.
    template_sequence = seqres
    # No mapping offset, the query is aligned to the actual sequence.
    mapping_offset = 0
  else:
    logging.debug('Template found in chain %s at offset %d: %s',
                  chain_id, mapping_offset, seqres)

  try:
    # Essentially set to infinity - we don't want to reject templates unless
    # they're really really bad.
    # all_atom_positions covers every residue of the selected chain, not only
    # the part aligned to the query.
    all_atom_positions, all_atom_mask = _get_atom_positions(
        mmcif_object, chain_id, max_ca_ca_distance=150.0)
  except (CaDistanceError, KeyError) as ex:
    raise NoAtomDataInTemplateError(
        'Could not get atom data (%s_%s): %s' % (pdb_id, chain_id, str(ex))
        ) from ex

  # Start with one "missing" entry per query residue; mapped positions are
  # overwritten below.
  num_query_res = len(query_sequence)
  templates_all_atom_positions = [
      np.zeros((atom_type_num, 3)) for _ in range(num_query_res)]
  templates_all_atom_masks = [
      np.zeros(atom_type_num) for _ in range(num_query_res)]
  output_templates_sequence = ['-'] * num_query_res

  # Copy template atoms onto the query positions they are aligned to.
  # mapping_offset is where template_sequence starts inside the chain.
  used_template_indices = []
  for k, v in mapping.items():
    if v == -1:
      # Realignment marks residues without a counterpart with -1. Using it
      # as an index would silently copy the chain's last residue.
      continue
    template_index = v + mapping_offset
    templates_all_atom_positions[k] = all_atom_positions[template_index]
    templates_all_atom_masks[k] = all_atom_mask[template_index]
    output_templates_sequence[k] = template_sequence[v]
    used_template_indices.append(template_index)

  # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
  if np.sum(templates_all_atom_masks) < 5:
    # Guard against an empty mapping so the intended error is the one raised.
    if used_template_indices:
      residue_range = '%d-%d' % (min(used_template_indices),
                                 max(used_template_indices))
    else:
      residue_range = 'none'
    raise TemplateAtomMaskAllZerosError(
        'Template all atom mask was all zeros: %s_%s. Residue range: %s' %
        (pdb_id, chain_id, residue_range))

  output_templates_sequence = ''.join(output_templates_sequence)

  templates_aatype = sequence_to_onehot(
      output_templates_sequence, HHBLITS_AA_TO_ID)

  return (
      {
          'template_all_atom_positions': np.array(templates_all_atom_positions),
          'template_all_atom_masks': np.array(templates_all_atom_masks),
          'template_sequence': output_templates_sequence.encode(),
          'template_aatype': np.array(templates_aatype),
          'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(),
      },
      warning)


def _to_a3m(sequences: Sequence[str]) -> str:
  """Converts sequences to an a3m file."""
  names = ['sequence %d' % i for i in range(1, len(sequences) + 1)]
  a3m = []
  for sequence, name in zip(sequences, names):
    a3m.append(u'>' + name + u'\n')
    a3m.append(sequence + u'\n')
  return ''.join(a3m)


@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
  """Yields a fresh temporary directory and removes it afterwards.

  The directory is created under `base_dir` (or the system default when
  None). It is deleted when the `with` block exits, whether normally or via
  an exception; errors during deletion are ignored.
  """
  scratch_dir = tempfile.mkdtemp(dir=base_dir)
  with contextlib.ExitStack() as cleanup:
    # Registered callbacks run on every exit path of this `with` block.
    cleanup.callback(shutil.rmtree, scratch_dir, ignore_errors=True)
    yield scratch_dir


@contextlib.contextmanager
def timing(msg: str):
  """Logs the start of a block and, on normal exit, how long it took.

  Args:
    msg: Label inserted into both log lines.

  If the block raises, the exception propagates and no "Finished" line is
  logged.
  """
  logging.info('Started %s', msg)
  started_at = time.time()
  yield
  elapsed = time.time() - started_at
  logging.info('Finished %s in %.3f seconds', msg, elapsed)


def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
  """Splits FASTA text into sequences and their descriptions.

  Arguments:
    fasta_string: The string contents of a FASTA file.

  Returns:
    A tuple of two lists:
    * A list of sequences, each with its lines joined together.
    * A list of sequence descriptions (the header lines without the leading
      '>'), in the same order as the sequences.
  """
  sequences = []
  descriptions = []
  for raw_line in fasta_string.splitlines():
    text = raw_line.strip()
    if not text:
      continue  # Blank lines carry no information.
    if text.startswith('>'):
      # A header opens a new record with an (initially) empty sequence.
      descriptions.append(text[1:])
      sequences.append('')
    else:
      # Sequence data always belongs to the most recently opened record.
      sequences[-1] += text

  return sequences, descriptions


class Kalign:
  """Python wrapper of the Kalign binary."""

  def __init__(self, *, binary_path: str):
    """Initializes the Python Kalign wrapper.

    The existence of the binary is not checked here; a wrong path only
    surfaces when `align` tries to launch it.

    Args:
      binary_path: The path to the Kalign binary. A leading '~' is expanded
        to the user's home directory, because the binary is launched without
        a shell and would otherwise not be found.
    """
    self.binary_path = os.path.expanduser(binary_path)

  def align(self, sequences: Sequence[str]) -> str:
    """Aligns the sequences and returns the alignment in A3M string.

    Args:
      sequences: A list of query sequence strings. The sequences have to be at
        least 6 residues long (Kalign requires this). Note that the order in
        which you give the sequences might alter the output slightly as
        different alignment tree might get constructed.

    Returns:
      A string with the alignment in a3m format.

    Raises:
      RuntimeError: If Kalign exits with a non-zero return code.
      ValueError: If any of the sequences is less than 6 residues long.
      OSError: If the Kalign binary cannot be launched (e.g. wrong path).
    """
    logging.info('Aligning %d sequences', len(sequences))

    for s in sequences:
      if len(s) < 6:
        raise ValueError('Kalign requires all sequences to be at least 6 '
                         'residues long. Got %s (%d residues).' % (s, len(s)))

    # Input and output files live in a scratch directory that is removed
    # when the block exits.
    with tmpdir_manager() as query_tmp_dir:
      input_fasta_path = os.path.join(query_tmp_dir, 'input.fasta')
      output_a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

      with open(input_fasta_path, 'w') as f:
        f.write(_to_a3m(sequences))

      cmd = [
          self.binary_path,
          '-i', input_fasta_path,
          '-o', output_a3m_path,
          '-format', 'fasta',
      ]

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

      with timing('Kalign query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()
        logging.info('Kalign stdout:\n%s\n\nstderr:\n%s\n',
                     stdout.decode('utf-8'), stderr.decode('utf-8'))

      if retcode:
        raise RuntimeError('Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (stdout.decode('utf-8'), stderr.decode('utf-8')))

      with open(output_a3m_path) as f:
        a3m = f.read()

      logging.debug('Kalign a3m output:\n%s', a3m)

      return a3m


#### Prepare the input data.

### Load the pickled Sequence[TemplateHit] search results.
# NOTE: pickle.load can execute arbitrary code; only load files that you
# created yourself.
with open('test_pdb_hits.pkl', 'rb') as file:
  test_pdb_hits = pickle.load(file)

#print(test_pdb_hits)

# Use the first hit as the demo record (a TemplateHit instance).
hit = test_pdb_hits[0]
print(hit)

# The first token of hit.name has the form '<pdb code>_<chain id>'.
#print(hit.name)

hit_pdb_code = hit.name.split()[0].split("_")[0] # "1J1V"
hit_chain_id = hit.name.split()[0].split("_")[1] # "A"

#print(hit_pdb_code)
#print(hit_chain_id)

#print(f"hit.query:{hit.query}")
#print(f"hit.hit_sequence:{hit.hit_sequence}")


# How the mmCIF file was obtained: download '1j1v.cif' from the server into
# the current working directory, then parse it into an MmcifObject.
#from Bio.PDB import *
#pdbl = PDBList()
#pdbl.retrieve_pdb_file(hit_pdb_code, pdir = '.', file_format = 'mmCif')


# Load the previously parsed and pickled MmcifObject for 1J1V.
with open('mmcif_objec_1j1v.pkl', 'rb') as file:
  test_mmcif_object = pickle.load(file)

#print(test_mmcif_objec)

# Pin the demo to the template whose MmcifObject was loaded above.
hit_pdb_code = "1J1V"
hit_chain_id = "A"

# The template sequence is the aligned hit sequence without gap characters.
template_sequence = hit.hit_sequence.replace('-', '')
#print(template_sequence)

input_fasta_file = 'Q94K49.fasta'
## Read query_sequence (str) from the FASTA file, skipping header lines.
with open(input_fasta_file) as f:
  query_sequence = "".join(
      line.strip() for line in f if not line.startswith(">"))


# How the query -> template index mapping was produced and saved:
#mapping =  build_query_to_hit_index_mapping(hit.query,
#                                            hit.hit_sequence,
#                                            hit.indices_hit,
#                                            hit.indices_query,
#                                            query_sequence)
#with open('test_mapping_1j1v.pkl', 'wb') as file:
#  pickle.dump(mapping, file)

# Load the pickled index mapping (dict).
with open('test_mapping_1j1v.pkl', 'rb') as file:
  test_mapping = pickle.load(file)

# subprocess does not expand '~', so resolve it to the home directory here.
kalign_binary_path = os.path.expanduser(
    "~/anaconda3/envs/protein_design/bin/kalign")
## Uncomment to inspect the arguments.
#print(f"test_mmcif_object:{test_mmcif_object}")
#print(f"hit_pdb_code:{hit_pdb_code}")
#print(f"test_mapping:{test_mapping}")
#print(f"template_sequence:{template_sequence}")
#print(f"query_sequence:{query_sequence}")
#print(f"hit_chain_id:{hit_chain_id}")
#print(f"kalign_binary_path:{kalign_binary_path}")

features, realign_warning = extract_template_features(mmcif_object=test_mmcif_object,
                                                      pdb_id=hit_pdb_code,
                                                      mapping=test_mapping,
                                                      template_sequence=template_sequence,
                                                      query_sequence=query_sequence,
                                                      template_chain_id=hit_chain_id,
                                                      kalign_binary_path=kalign_binary_path)


print(features)
print(realign_warning)

你可能感兴趣的:(生物信息学)