qq_27390023

提取模版特征

根据hit.query和hit.hit_sequence的对齐映射(mapping)，对hit.hit_sequence结构中每个残差的原子进行索引，使其与查询序列中(original query sequence)相应的残差一致, hit.hit_sequence中没有位置计为缺失（“-”），根据mmcif的信息，返回的元组包含模版特征字典。

import dataclasses
from Bio import PDB
from typing import Any, Tuple, Dict, Mapping, Sequence, Optional, List
import numpy as np
import re
import contextlib
import tempfile
import shutil
import os
import subprocess
import time
import string
import pickle
from absl import logging

ChainId = str
PdbHeader = Mapping[str, Any]
PdbStructure = PDB.Structure.Structure
SeqRes = str

DeletionMatrix = Sequence[Sequence[int]]

atom_types = [
    'N', 'CA', 'C', 'CB', 'O', 'CG', 'CG1', 'CG2', 'OG', 'OG1', 'SG', 'CD',
    'CD1', 'CD2', 'ND1', 'ND2', 'OD1', 'OD2', 'SD', 'CE', 'CE1', 'CE2', 'CE3',
    'NE', 'NE1', 'NE2', 'OE1', 'OE2', 'CH2', 'NH1', 'NH2', 'OH', 'CZ', 'CZ2',
    'CZ3', 'NZ', 'OXT'
]
atom_order = {atom_type: i for i, atom_type in enumerate(atom_types)}
atom_type_num = len(atom_types)  # := 37.


HHBLITS_AA_TO_ID = {
    'A': 0,
    'B': 2,
    'C': 1,
    'D': 2,
    'E': 3,
    'F': 4,
    'G': 5,
    'H': 6,
    'I': 7,
    'J': 20,
    'K': 8,
    'L': 9,
    'M': 10,
    'N': 11,
    'O': 20,
    'P': 12,
    'Q': 13,
    'R': 14,
    'S': 15,
    'T': 16,
    'U': 1,
    'V': 17,
    'W': 18,
    'X': 20,
    'Y': 19,
    'Z': 3,
    '-': 21,
}


@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Class representing a template hit."""
  index: int
  name: str
  aligned_cols: int
  sum_probs: Optional[float]
  query: str
  hit_sequence: str
  indices_query: List[int]
  indices_hit: List[int]
    
    
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
  chain_id: str
  residue_number: int
  insertion_code: str
    

@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
  position: Optional[ResiduePosition]
  name: str
  is_missing: bool
  hetflag: str
    
    
@dataclasses.dataclass(frozen=True)
class MmcifObject:
  """Representation of a parsed mmCIF file.

  Contains:
    file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
      files being processed.
    header: Biopython header.
    structure: Biopython structure.
    chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
      {'A': 'ABCDEFG'}
    seqres_to_structure: Dict; for each chain_id contains a mapping between
      SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
                                                        1: ResidueAtPosition,
                                                        ...}}
    raw_string: The raw string used to construct the MmcifObject.
  """
  file_id: str
  header: PdbHeader
  structure: PdbStructure
  chain_to_seqres: Mapping[ChainId, SeqRes]
  seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
  raw_string: Any


class Error(Exception):
  """Base class for exceptions."""


class SequenceNotInTemplateError(Error):
  """An error indicating that template mmCIF didn't contain the sequence."""


class QueryToTemplateAlignError(Error):
  """An error indicating that the query can't be aligned to the template."""


@dataclasses.dataclass(frozen=True)
class Msa:
  """Class representing a parsed MSA file."""
  sequences: Sequence[str]
  deletion_matrix: DeletionMatrix
  descriptions: Sequence[str]

  def __post_init__(self):
    if not (len(self.sequences) ==
            len(self.deletion_matrix) ==
            len(self.descriptions)):
      raise ValueError(
          'All fields for an MSA must have the same length. '
          f'Got {len(self.sequences)} sequences, '
          f'{len(self.deletion_matrix)} rows in the deletion matrix and '
          f'{len(self.descriptions)} descriptions.')

  def __len__(self):
    return len(self.sequences)

  def truncate(self, max_seqs: int):
    return Msa(sequences=self.sequences[:max_seqs],
               deletion_matrix=self.deletion_matrix[:max_seqs],
               descriptions=self.descriptions[:max_seqs])


class CaDistanceError(Error):
  """An error indicating that a CA atom distance exceeds a threshold."""


def _check_residue_distances(all_positions: np.ndarray,
                             all_positions_mask: np.ndarray,
                             max_ca_ca_distance: float):
  """Checks if the distance between unmasked neighbor residues is ok."""
  ca_position = atom_order['CA']
  prev_is_unmasked = False
  prev_calpha = None
  for i, (coords, mask) in enumerate(zip(all_positions, all_positions_mask)):
    this_is_unmasked = bool(mask[ca_position])
    if this_is_unmasked:
      this_calpha = coords[ca_position]
      if prev_is_unmasked:
        distance = np.linalg.norm(this_calpha - prev_calpha)
        if distance > max_ca_ca_distance:
          raise CaDistanceError(
              'The distance between residues %d and %d is %f > limit %f.' % (
                  i, i + 1, distance, max_ca_ca_distance))
      prev_calpha = this_calpha
    prev_is_unmasked = this_is_unmasked
    
    
def _find_template_in_pdb(
    template_chain_id: str,
    template_sequence: str,
    mmcif_object: MmcifObject) -> Tuple[str, str, int]:
  """Tries to find the template chain in the given pdb file.

  This method tries the three following things in order:
    1. Tries if there is an exact match in both the chain ID and the sequence.
       If yes, the chain sequence is returned. Otherwise:
    2. Tries if there is an exact match only in the sequence.
       If yes, the chain sequence is returned. Otherwise:
    3. Tries if there is a fuzzy match (X = wildcard) in the sequence.
       If yes, the chain sequence is returned.
  If none of these succeed, a SequenceNotInTemplateError is thrown.

  Args:
    template_chain_id: The template chain ID.
    template_sequence: The template chain sequence.
    mmcif_object: The PDB object to search for the template in.

  Returns:
    A tuple with:
    * The chain sequence that was found to match the template in the PDB object.
    * The ID of the chain that is being returned.
    * The offset where the template sequence starts in the chain sequence.

  Raises:
    SequenceNotInTemplateError: If no match is found after the steps described
      above.
  """
  # Try if there is an exact match in both the chain ID and the (sub)sequence.
  pdb_id = mmcif_object.file_id
  chain_sequence = mmcif_object.chain_to_seqres.get(template_chain_id)
  # 精确匹配到template_chain_id的chain_sequence
  if chain_sequence and (template_sequence in chain_sequence):
    logging.info(
        'Found an exact template match %s_%s.', pdb_id, template_chain_id)
    mapping_offset = chain_sequence.find(template_sequence)
    return chain_sequence, template_chain_id, mapping_offset

  # Try if there is an exact match in the (sub)sequence only.
  # 遍历mmcif_object链，只要找到template_sequence就可以
  for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
    if chain_sequence and (template_sequence in chain_sequence):
      logging.info('Found a sequence-only match %s_%s.', pdb_id, chain_id)
      mapping_offset = chain_sequence.find(template_sequence)
      return chain_sequence, chain_id, mapping_offset

  # Return a chain sequence that fuzzy matches (X = wildcard) the template.
  # Make parentheses unnamed groups (?:_) to avoid the 100 named groups limit.
  # 模糊匹配
  regex = ['.' if aa == 'X' else '(?:%s|X)' % aa for aa in template_sequence]
  regex = re.compile(''.join(regex))
  for chain_id, chain_sequence in mmcif_object.chain_to_seqres.items():
    match = re.search(regex, chain_sequence)
    if match:
      logging.info('Found a fuzzy sequence-only match %s_%s.', pdb_id, chain_id)
      mapping_offset = match.start()
      return chain_sequence, chain_id, mapping_offset

  # No hits, raise an error.
  raise SequenceNotInTemplateError(
      'Could not find the template sequence in %s_%s. Template sequence: %s, '
      'chain_to_seqres: %s' % (pdb_id, template_chain_id, template_sequence,
                               mmcif_object.chain_to_seqres))


def parse_a3m(a3m_string: str) -> Msa:
  """Parses sequences and deletion matrix from a3m format alignment.

  Args:
    a3m_string: The string contents of a a3m file. The first sequence in the
      file should be the query sequence.

  Returns:
    A tuple of:
      * A list of sequences that have been aligned to the query. These
        might contain duplicates.
      * The deletion matrix for the alignment as a list of lists. The element
        at `deletion_matrix[i][j]` is the number of residues deleted from
        the aligned sequence i at residue position j.
      * A list of descriptions, one per sequence, from the a3m file.
  """
  sequences, descriptions = parse_fasta(a3m_string)
  deletion_matrix = []
  for msa_sequence in sequences:
    deletion_vec = []
    deletion_count = 0
    for j in msa_sequence:
      if j.islower():
        deletion_count += 1
      else:
        deletion_vec.append(deletion_count)
        deletion_count = 0
    deletion_matrix.append(deletion_vec)

  # Make the MSA matrix out of aligned (deletion-free) sequences.
  deletion_table = str.maketrans('', '', string.ascii_lowercase)
  aligned_sequences = [s.translate(deletion_table) for s in sequences]
  return Msa(sequences=aligned_sequences,
             deletion_matrix=deletion_matrix,
             descriptions=descriptions)


def _realign_pdb_template_to_query(
    old_template_sequence: str,
    template_chain_id: str,
    mmcif_object: MmcifObject,
    old_mapping: Mapping[int, int],
    kalign_binary_path: str) -> Tuple[str, Mapping[int, int]]:
  """Aligns template from the mmcif_object to the query.

  In case PDB70 contains a different version of the template sequence, we need
  to perform a realignment to the actual sequence that is in the mmCIF file.
  This method performs such realignment, but returns the new sequence and
  mapping only if the sequence in the mmCIF file is 90% identical to the old
  sequence.

  Note that the old_template_sequence comes from the hit, and contains only that
  part of the chain that matches with the query while the new_template_sequence
  is the full chain.

  Args:
    old_template_sequence: The template sequence that was returned by the PDB
      template search (typically done using HHSearch).
    template_chain_id: The template chain id was returned by the PDB template
      search (typically done using HHSearch). This is used to find the right
      chain in the mmcif_object chain_to_seqres mapping.
    mmcif_object: A mmcif_object which holds the actual template data.
    old_mapping: A mapping from the query sequence to the template sequence.
      This mapping will be used to compute the new mapping from the query
      sequence to the actual mmcif_object template sequence by aligning the
      old_template_sequence and the actual template sequence.
    kalign_binary_path: The path to a kalign executable.

  Returns:
    A tuple (new_template_sequence, new_query_to_template_mapping) where:
    * new_template_sequence is the actual template sequence that was found in
      the mmcif_object.
    * new_query_to_template_mapping is the new mapping from the query to the
      actual template found in the mmcif_object.

  Raises:
    QueryToTemplateAlignError:
    * If there was an error thrown by the alignment tool.
    * Or if the actual template sequence differs by more than 10% from the
      old_template_sequence.
  """
  aligner = Kalign(binary_path=kalign_binary_path)
  new_template_sequence = mmcif_object.chain_to_seqres.get(
      template_chain_id, '')

  # Sometimes the template chain id is unknown. But if there is only a single
  # sequence within the mmcif_object, it is safe to assume it is that one.
  if not new_template_sequence:
    if len(mmcif_object.chain_to_seqres) == 1:
      logging.info('Could not find %s in %s, but there is only 1 sequence, so '
                   'using that one.',
                   template_chain_id,
                   mmcif_object.file_id)
      new_template_sequence = list(mmcif_object.chain_to_seqres.values())[0]
    else:
      raise QueryToTemplateAlignError(
          f'Could not find chain {template_chain_id} in {mmcif_object.file_id}. '
          'If there are no mmCIF parsing errors, it is possible it was not a '
          'protein chain.')

  try:
    parsed_a3m = parse_a3m(
        aligner.align([old_template_sequence, new_template_sequence]))
    old_aligned_template, new_aligned_template = parsed_a3m.sequences
  except Exception as e:
    raise QueryToTemplateAlignError(
        'Could not align old template %s to template %s (%s_%s). Error: %s' %
        (old_template_sequence, new_template_sequence, mmcif_object.file_id,
         template_chain_id, str(e)))

  logging.info('Old aligned template: %s\nNew aligned template: %s',
               old_aligned_template, new_aligned_template)

  old_to_new_template_mapping = {}
  old_template_index = -1
  new_template_index = -1
  num_same = 0
  for old_template_aa, new_template_aa in zip(
      old_aligned_template, new_aligned_template):
    if old_template_aa != '-':
      old_template_index += 1
    if new_template_aa != '-':
      new_template_index += 1
    if old_template_aa != '-' and new_template_aa != '-':
      old_to_new_template_mapping[old_template_index] = new_template_index
      if old_template_aa == new_template_aa:
        num_same += 1

  # Require at least 90 % sequence identity wrt to the shorter of the sequences.
  if float(num_same) / min(
      len(old_template_sequence), len(new_template_sequence)) < 0.9:
    raise QueryToTemplateAlignError(
        'Insufficient similarity of the sequence in the database: %s to the '
        'actual sequence in the mmCIF file %s_%s: %s. We require at least '
        '90 %% similarity wrt to the shorter of the sequences. This is not a '
        'problem unless you think this is a template that should be included.' %
        (old_template_sequence, mmcif_object.file_id, template_chain_id,
         new_template_sequence))

  new_query_to_template_mapping = {}
  for query_index, old_template_index in old_mapping.items():
    new_query_to_template_mapping[query_index] = (
        old_to_new_template_mapping.get(old_template_index, -1))

  new_template_sequence = new_template_sequence.replace('-', '')

  return new_template_sequence, new_query_to_template_mapping


def _get_atom_positions(
    mmcif_object: MmcifObject,
    auth_chain_id: str,
    max_ca_ca_distance: float) -> Tuple[np.ndarray, np.ndarray]:
  """Gets atom positions and mask from a list of Biopython Residues."""
  num_res = len(mmcif_object.chain_to_seqres[auth_chain_id])

  relevant_chains = [c for c in mmcif_object.structure.get_chains()
                     if c.id == auth_chain_id]
  if len(relevant_chains) != 1:
    raise MultipleChainsError(
        f'Expected exactly one chain in structure with id {auth_chain_id}.')
  chain = relevant_chains[0]

  all_positions = np.zeros([num_res, atom_type_num, 3])
  all_positions_mask = np.zeros([num_res, atom_type_num],
                                dtype=np.int64)
  for res_index in range(num_res):
    pos = np.zeros([atom_type_num, 3], dtype=np.float32)
    mask = np.zeros([atom_type_num], dtype=np.float32)
    res_at_position = mmcif_object.seqres_to_structure[auth_chain_id][res_index]
    if not res_at_position.is_missing:
      res = chain[(res_at_position.hetflag,
                   res_at_position.position.residue_number,
                   res_at_position.position.insertion_code)]
      for atom in res.get_atoms():
        atom_name = atom.get_name()
        x, y, z = atom.get_coord()
        if atom_name in atom_order.keys():
          pos[atom_order[atom_name]] = [x, y, z]
          mask[atom_order[atom_name]] = 1.0
        elif atom_name.upper() == 'SE' and res.get_resname() == 'MSE':
          # Put the coordinates of the selenium atom in the sulphur column.
          pos[atom_order['SD']] = [x, y, z]
          mask[atom_order['SD']] = 1.0

      # Fix naming errors in arginine residues where NH2 is incorrectly
      # assigned to be closer to CD than NH1.
      cd = atom_order['CD']
      nh1 = atom_order['NH1']
      nh2 = atom_order['NH2']
      if (res.get_resname() == 'ARG' and
          all(mask[atom_index] for atom_index in (cd, nh1, nh2)) and
          (np.linalg.norm(pos[nh1] - pos[cd]) >
           np.linalg.norm(pos[nh2] - pos[cd]))):
        pos[nh1], pos[nh2] = pos[nh2].copy(), pos[nh1].copy()
        mask[nh1], mask[nh2] = mask[nh2].copy(), mask[nh1].copy()

    all_positions[res_index] = pos
    all_positions_mask[res_index] = mask
  _check_residue_distances(
      all_positions, all_positions_mask, max_ca_ca_distance)
  return all_positions, all_positions_mask


def sequence_to_onehot(
    sequence: str,
    mapping: Mapping[str, int],
    map_unknown_to_x: bool = False) -> np.ndarray:
  """Maps the given sequence into a one-hot encoded matrix.

  Args:
    sequence: An amino acid sequence.
    mapping: A dictionary mapping amino acids to integers.
    map_unknown_to_x: If True, any amino acid that is not in the mapping will be
      mapped to the unknown amino acid 'X'. If the mapping doesn't contain
      amino acid 'X', an error will be thrown. If False, any amino acid not in
      the mapping will throw an error.

  Returns:
    A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of
    the sequence.

  Raises:
    ValueError: If the mapping doesn't contain values from 0 to
      num_unique_aas - 1 without any gaps.
  """
  num_entries = max(mapping.values()) + 1

  if sorted(set(mapping.values())) != list(range(num_entries)):
    raise ValueError('The mapping must have values from 0 to num_unique_aas-1 '
                     'without any gaps. Got: %s' % sorted(mapping.values()))

  one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)

  for aa_index, aa_type in enumerate(sequence):
    if map_unknown_to_x:
      if aa_type.isalpha() and aa_type.isupper():
        aa_id = mapping.get(aa_type, mapping['X'])
      else:
        raise ValueError(f'Invalid character in the sequence: {aa_type}')
    else:
      aa_id = mapping[aa_type]
    one_hot_arr[aa_index, aa_id] = 1

  return one_hot_arr


def extract_template_features(
    mmcif_object: MmcifObject,
    pdb_id: str,
    mapping: Mapping[int, int],
    template_sequence: str,
    query_sequence: str,
    template_chain_id: str,
    kalign_binary_path: str) -> Tuple[Dict[str, Any], Optional[str]]:
  """Parses atom positions in the target structure and aligns with the query.

  Atoms for each residue in the template structure are indexed to coincide
  with their corresponding residue in the query sequence, according to the
  alignment mapping provided.

  Args:
    mmcif_object: mmcif_parsing.MmcifObject representing the template.
    pdb_id: PDB code for the template.
    mapping: Dictionary mapping indices in the query sequence to indices in
      the template sequence.
    template_sequence: String describing the amino acid sequence for the
      template protein.
    query_sequence: String describing the amino acid sequence for the query
      protein.
    template_chain_id: String ID describing which chain in the structure proto
      should be used.
    kalign_binary_path: The path to a kalign executable used for template
        realignment.

  Returns:
    A tuple with:
    * A dictionary containing the extra features derived from the template
      protein structure.
    * A warning message if the hit was realigned to the actual mmCIF sequence.
      Otherwise None.

  Raises:
    NoChainsError: If the mmcif object doesn't contain any chains.
    SequenceNotInTemplateError: If the given chain id / sequence can't
      be found in the mmcif object.
    QueryToTemplateAlignError: If the actual template in the mmCIF file
      can't be aligned to the query.
    NoAtomDataInTemplateError: If the mmcif object doesn't contain
      atom positions.
    TemplateAtomMaskAllZerosError: If the mmcif object doesn't have any
      unmasked residues.
  """
  if mmcif_object is None or not mmcif_object.chain_to_seqres:
    raise NoChainsError('No chains in PDB: %s_%s' % (pdb_id, template_chain_id))

  warning = None
  try:
    seqres, chain_id, mapping_offset = _find_template_in_pdb(
        template_chain_id=template_chain_id,
        template_sequence=template_sequence,
        mmcif_object=mmcif_object)
    
    print(seqres, chain_id, mapping_offset)
    
   
  except SequenceNotInTemplateError:
    # If PDB70 contains a different version of the template, we use the sequence
    # from the mmcif_object.
    chain_id = template_chain_id
    warning = (
        f'The exact sequence {template_sequence} was not found in '
        f'{pdb_id}_{chain_id}. Realigning the template to the actual sequence.')
    logging.warning(warning)
    # This throws an exception if it fails to realign the hit.
    seqres, mapping = _realign_pdb_template_to_query(
        old_template_sequence=template_sequence,
        template_chain_id=template_chain_id,
        mmcif_object=mmcif_object,
        old_mapping=mapping,
        kalign_binary_path=kalign_binary_path)
    logging.info('Sequence in %s_%s: %s successfully realigned to %s',
                 pdb_id, chain_id, template_sequence, seqres)
    # The template sequence changed.
    template_sequence = seqres
    # No mapping offset, the query is aligned to the actual sequence.
    mapping_offset = 0

  try:
    # Essentially set to infinity - we don't want to reject templates unless
    # they're really really bad.
    
    # 注：all_atom_positions 指定链所有氨基酸原子的坐标，包括hit.hit_sequence
    all_atom_positions, all_atom_mask = _get_atom_positions(
        mmcif_object, chain_id, max_ca_ca_distance=150.0)
  except (CaDistanceError, KeyError) as ex:
    raise NoAtomDataInTemplateError(
        'Could not get atom data (%s_%s): %s' % (pdb_id, chain_id, str(ex))
        ) from ex

  # np.split函数用于将数组沿指定轴分割为多个子数组
  all_atom_positions = np.split(all_atom_positions, all_atom_positions.shape[0])
  all_atom_masks = np.split(all_atom_mask, all_atom_mask.shape[0])

  output_templates_sequence = []
  templates_all_atom_positions = []
  templates_all_atom_masks = []

  # templates_all_atom_positions 和 query_sequence等长，
  # 代表query_sequence对应位置的氨基酸原子的坐标
  # 注： query_sequence为要预测结构的原始蛋白质序列，不是hit.query
  for _ in query_sequence:
    # Residues in the query_sequence that are not in the template_sequence:
    templates_all_atom_positions.append(
        np.zeros((atom_type_num, 3)))
    templates_all_atom_masks.append(np.zeros(atom_type_num))
    output_templates_sequence.append('-')
  
  # 模版上原子坐标对应到query序列中，
  # mapping_offset为hit.hit_sequence在mmcif中相应链chain_sequence中相对位置
  for k, v in mapping.items():
    template_index = v + mapping_offset
    templates_all_atom_positions[k] = all_atom_positions[template_index][0]
    templates_all_atom_masks[k] = all_atom_masks[template_index][0]
    output_templates_sequence[k] = template_sequence[v]

  # Alanine (AA with the lowest number of atoms) has 5 atoms (C, CA, CB, N, O).
  if np.sum(templates_all_atom_masks) < 5:
    raise TemplateAtomMaskAllZerosError(
        'Template all atom mask was all zeros: %s_%s. Residue range: %d-%d' %
        (pdb_id, chain_id, min(mapping.values()) + mapping_offset,
         max(mapping.values()) + mapping_offset))

  output_templates_sequence = ''.join(output_templates_sequence)

  templates_aatype = sequence_to_onehot(
      output_templates_sequence, HHBLITS_AA_TO_ID)

  return (
      {    # 值向量都和query sequence等长
          'template_all_atom_positions': np.array(templates_all_atom_positions),
          'template_all_atom_masks': np.array(templates_all_atom_masks),
          'template_sequence': output_templates_sequence.encode(),
          'template_aatype': np.array(templates_aatype),
          'template_domain_names': f'{pdb_id.lower()}_{chain_id}'.encode(),
      },
      warning)


def _to_a3m(sequences: Sequence[str]) -> str:
  """Converts sequences to an a3m file."""
  names = ['sequence %d' % i for i in range(1, len(sequences) + 1)]
  a3m = []
  for sequence, name in zip(sequences, names):
    a3m.append(u'>' + name + u'\n')
    a3m.append(sequence + u'\n')
  return ''.join(a3m)


@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
  """Context manager that deletes a temporary directory on exit."""
  tmpdir = tempfile.mkdtemp(dir=base_dir)
  try:
    yield tmpdir
  finally:
    shutil.rmtree(tmpdir, ignore_errors=True)


@contextlib.contextmanager
def timing(msg: str):
  logging.info('Started %s', msg)
  tic = time.time()
  yield
  toc = time.time()
  logging.info('Finished %s in %.3f seconds', msg, toc - tic)


def parse_fasta(fasta_string: str) -> Tuple[Sequence[str], Sequence[str]]:
  """Parses FASTA string and returns list of strings with amino-acid sequences.

  Arguments:
    fasta_string: The string contents of a FASTA file.

  Returns:
    A tuple of two lists:
    * A list of sequences.
    * A list of sequence descriptions taken from the comment lines. In the
      same order as the sequences.
  """
  sequences = []
  descriptions = []
  index = -1
  for line in fasta_string.splitlines():
    line = line.strip()
    if line.startswith('>'):
      index += 1
      descriptions.append(line[1:])  # Remove the '>' at the beginning.
      sequences.append('')
      continue
    elif not line:
      continue  # Skip blank lines.
    sequences[index] += line

  return sequences, descriptions


class Kalign:
  """Python wrapper of the Kalign binary."""

  def __init__(self, *, binary_path: str):
    """Initializes the Python Kalign wrapper.

    Args:
      binary_path: The path to the Kalign binary.

    Raises:
      RuntimeError: If Kalign binary not found within the path.
    """
    self.binary_path = binary_path

  def align(self, sequences: Sequence[str]) -> str:
    """Aligns the sequences and returns the alignment in A3M string.

    Args:
      sequences: A list of query sequence strings. The sequences have to be at
        least 6 residues long (Kalign requires this). Note that the order in
        which you give the sequences might alter the output slightly as
        different alignment tree might get constructed.

    Returns:
      A string with the alignment in a3m format.

    Raises:
      RuntimeError: If Kalign fails.
      ValueError: If any of the sequences is less than 6 residues long.
    """
    logging.info('Aligning %d sequences', len(sequences))
    
    print("in kalign function")
    
    for s in sequences:
      if len(s) < 6:
        raise ValueError('Kalign requires all sequences to be at least 6 '
                         'residues long. Got %s (%d residues).' % (s, len(s)))

    with tmpdir_manager() as query_tmp_dir:
      input_fasta_path = os.path.join(query_tmp_dir, 'input.fasta')
      output_a3m_path = os.path.join(query_tmp_dir, 'output.a3m')

      with open(input_fasta_path, 'w') as f:
        f.write(_to_a3m(sequences))

      cmd = [
          self.binary_path,
          '-i', input_fasta_path,
          '-o', output_a3m_path,
          '-format', 'fasta',
      ]

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE)

      with timing('Kalign query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()
        logging.info('Kalign stdout:\n%s\n\nstderr:\n%s\n',
                     stdout.decode('utf-8'), stderr.decode('utf-8'))

      if retcode:
        raise RuntimeError('Kalign failed\nstdout:\n%s\n\nstderr:\n%s\n'
                           % (stdout.decode('utf-8'), stderr.decode('utf-8')))

      with open(output_a3m_path) as f:
        a3m = f.read()
      
      print(f"kalign a3m:{a3m}")
    
      return a3m


####准备输入数据

### 读入Sequence[TemplateHit]数据
with open('test_pdb_hits.pkl', 'rb') as file:
  # 使用 pickle.load 从文件中加载对象
  test_pdb_hits = pickle.load(file)

#print(test_pdb_hits)

#取第一条作为演示数据
hit = test_pdb_hits[0] # TemplateHit实例
print(hit)

# 下载mmcif文件，解析成MmcifObject
#print(hit.name)

hit_pdb_code = hit.name.split()[0].split("_")[0] # "1J1V"
hit_chain_id = hit.name.split()[0].split("_")[1] # "A"

#print(hit_pdb_code)
#print(hit_chain_id)

#print(f"hit.query:{hit.query}")
#print(f"hit.hit_sequence:{hit.hit_sequence}")

  
#from Bio.PDB import *
#pdbl = PDBList()
# #它将从服务器下载指定的文件('1j1v.cif')并将其存储在当前工作目录中。
#pdbl.retrieve_pdb_file(hit_pdb_code, pdir = '.', file_format = 'mmCif')


# 解析mmCIF 格式字符串并保存 

with open('mmcif_objec_1j1v.pkl', 'rb') as file:
  # 使用 pickle.load 从文件中加载对象
  test_mmcif_object = pickle.load(file)

#print(test_mmcif_objec)

hit_pdb_code = "1J1V"
hit_chain_id = "A"

template_sequence = hit.hit_sequence.replace('-', '')
#print(template_sequence)
 
input_fasta_file = 'Q94K49.fasta'
## 从fasta文件提取 query_sequence（str格式）
query_sequence = ""
with open(input_fasta_file) as f:
  for line in f.readlines():
    if line.startswith(">"):
      continue
    query_sequence += line.strip()


#mapping =  build_query_to_hit_index_mapping(hit.query, 
#                                            hit.hit_sequence, 
#                                            hit.indices_hit, 
#                                            hit.indices_query,
#                                            query_sequence)


# 索引对应dict 
#with open('test_mapping_1j1v.pkl', 'wb') as file:
#  pickle.dump(mapping, file)

# 打开二进制文件以进行读取
with open('test_mapping_1j1v.pkl', 'rb') as file:
    # 使用 pickle.load 从文件中加载对象
    test_mapping = pickle.load(file)

#kalign_binary_path = "/Users/zhengxueming/anaconda3/envs/protein_design/bin/kalign"
kalign_binary_path = "~/anaconda3/envs/protein_design/bin/kalign"
## 参数查看
#print(f"test_mmcif_object:{test_mmcif_object}")
#print(f"hit_pdb_code:{hit_pdb_code}")
#print(f"test_mapping:{test_mapping}")
#print(f"template_sequence:{template_sequence}")
#print(f"query_sequence:{query_sequence}")
#print(f"hit_chain_id:{hit_chain_id}")
#print(f"kalign_binary_path:{kalign_binary_path}")

features, realign_warning = extract_template_features(mmcif_object=test_mmcif_object,
                                                      pdb_id=hit_pdb_code,
                                                      mapping=test_mapping,
                                                      template_sequence=template_sequence,
                                                      query_sequence=query_sequence,
                                                      template_chain_id=hit_chain_id,
                                                      kalign_binary_path=kalign_binary_path)


print(features)
print(realign_warning)

推荐一份生物信息学入门很好的参考材料小明的数据分析笔记本
链接是https://bioinformatics.uconn.edu/resources-and-events/tutorials-2/这个是康涅狄格大学（UniversityofConnecticut）提供的一份教程，主要的内容包括1、生物信息学中经常用到的文件格式image.png2、linux操作系统和R语言的基础知识image.png3、转录组数据的处理流程image.png这里包括有参
【机器学习】朴素贝叶斯方法的概率图表示以及贝叶斯统计中的共轭先验方法 Lossya 机器学习概率论人工智能朴素贝叶斯共轭先验
引言朴素贝叶斯方法是一种基于贝叶斯定理的简单概率模型，它假设特征之间相互独立。文章目录引言一、朴素贝叶斯方法的概率图表示1.1节点表示1.2边表示1.3无其他连接1.4总结二、朴素贝叶斯的应用场景2.1文本分类2.2推荐系统2.3医疗诊断2.4欺诈检测2.5情感分析2.6邮件过滤2.7信息检索2.8生物信息学三、朴素贝叶斯的优点四、朴素贝叶斯的局限性4.1特征独立性假设4.2敏感于输入数据的表示4
零基础入门生信数据分析——导读呆猪儿生信之转录组——上游分析生信之转录组——下游分析学习方法 r语言数据分析数据库数据挖掘需求分析大数据
零基础入门生信数据分析——导读生信数据分析，即生物信息学数据分析，是一个涵盖了生物学、计算机科学、数学和统计学等多个领域的交叉学科。它主要利用计算机算法和统计方法对生物学数据进行处理、分析和解释，以揭示生物分子、细胞、组织和生物体等各个层次的生物学规律和机制。本帖主要是为生信数据分析的各个分析点提供跳转链接（简单说就是提供了一个目录供大家选择自己想要的知识点可以直接跳转）关联的生信数据分析的分析点
NCBI BLAST+：分析生物内在编码的工具 belldeep 生物信息学 Blast 生物数据分析
在生物信息学的广阔领域中，NCBI（NationalCenterforBiotechnologyInformation，美国国立生物技术信息中心）开发的BLAST（BasicLocalAlignmentSearchTool，基本局部比对搜索工具）无疑是一把不可或缺的分析工具。NCBIBLAST+，作为其最新版本2.16.0+，为科研工作者提供了一套强大的序列比对和搜索功能，帮助解析生命现象背后的遗
【图论简介】 WA-自动机图论深度优先算法架构后端前端面试
图论简介图论是一门数学分支，主要研究图（Graph）的性质、结构和应用。图论在计算机科学、网络理论、优化问题、生物信息学等多个领域都有广泛的应用。本文将简要介绍图论的基本概念、常见算法及其在实际中的应用。一、图的基本概念图（Graph）：图是由一组顶点（Vertices）和连接顶点的边（Edges）组成的结构。可以表示为(G=(V,E))，其中(V)是顶点的集合，(E)是边的集合。根据边的不同属性
生信圆桌：专业生信服务器与平台服务的提供者生信圆桌x生信云服务器服务器人工智能运维
生信圆桌是一个专注于提供生物信息学（生信）服务器和平台服务的领先企业，致力于为全球科研机构、企业和独立研究者提供高性能的生信分析解决方案。随着生物信息学研究对计算资源的需求日益增加，生信圆桌凭借其先进的服务器技术和专业的服务团队，成为了生信领域中不可或缺的合作伙伴。访问生信圆桌,使用生信云。高效分析少走弯路www.tebteb.cc生信圆桌的核心服务高性能生信服务器定制：生信圆桌为客户提供定制化的
用Python实现生信分析——基序（Motif）识别详解写代码的M教授生信分析 python 开发语言
1.什么是基序（Motif）？在生物信息学中，基序（Motif）是指在生物序列（如DNA、RNA或蛋白质序列）中具有特定功能或结构的短序列片段。基序通常在生物进化中得到保留，因为它们在生物学功能中起着重要作用。例如，在DNA序列中，基序可能是一个转录因子结合位点；在蛋白质序列中，基序可能是一个具有特定功能的结构域。基序识别是指从一组生物序列中识别出保守的短序列片段，这对于功能预测、基因调控网络分析
数据结构与算法——动态规划 passion更好数据结构 C++动态规划算法
目录引言最优子结构重叠子问题打家劫舍（LeetCode198题）经典例题1.爬楼梯（LeetCode70题）2.斐波那契数列（LeetCode126题）3.最长公共子序列（LeetCode95题）引言动态规划（DynamicProgramming,简称DP）是一种在数学、计算机科学、经济学和生物信息学等领域广泛使用的算法设计技术。它通过把原问题分解为相对简单的子问题的方式，来求解复杂问题。动态规划
深度学习——概念引入韶光流年都束之高阁深度学习日记深度学习人工智能职场和发展
深度学习深度学习简介深度学习分类根据网络结构划分：循环神经网络卷积神经网络根据学习方式划分：监督学习无监督学习半监督学习根据应用领域划分：计算机视觉自然语言处理语音识别生物信息学深度学习简介深度学习（DeepLearning，DL）是机器学习领域中的一个新的研究方向，主要是通过学习样本数据的内在规律和表示层次，让机器能够具有类似于人类的分析学习能力。深度学习的最终目标是让机器能够识别和解释各种数据
考研调剂：中医生命科学菌心说双脑论
科学网—考研调剂——欢迎研究生调剂到我们的招生专业方向“中西医结合基础”：中医药与肠道菌群、生物信息学等交叉学科-张成岗的博文http://blog.sciencenet.cn/home.php?mod=space&uid=40692&do=blog&id=1281078欢迎各位有志于从事中医生命科学、解码中医、中西医结合以及医学与数学、计算机科学等交叉学科研究的青年才俊加入我们的研究团队，共同见
2020-04-07 liuyang2020
学习小组Day2笔记--linux入门（刘阳）1.为什么学习linux大多数人用的是可视化界面，便捷的windows，linux用户量比较少，但是需要知道，linux的功能相当的强大，对于数据处理、程序运行方面的优势，那是其它的系统无法比拟的，生物信息学数据处理对电脑要求较高，因此学习linux，，嘿嘿，大势所趋。2.linux操作2.1登录远程登录linux服务器，好像有很多连接软件，今天尝试应
Python在生物信息学中的应用：有序字典简说基因-专业生信合作伙伴 python 开发语言
我们知道，通过{}创建的字典是无序的。如何创建有序字典呢？解决方案可以使用collections模块中的OrderedDict类。当对字典做迭代时，它会严格按照元素添加的顺序进行。例如：from collection import OrderedDictd=OrderedDict()d['1st'] = 1d['2nd'] = 2d['3rd'] = 3d['4th']=4forkeyind:
Python在生物信息学中的应用：同时对数据做转换和换算简说基因-专业生信合作伙伴 python 开发语言
我们需要调用一个换算（reduction）函数，例如sum()、min()、max()等，但首先得对数据做转换或筛选。解决方案一种优雅的方式能将数据换算和转换结合在一起，即在函数中使用生成器表达式。例如，要计算平方和，可以这样：nums=[1,2,3,4,5]s=sum(x*xforxinnums)更多的例子：#Determineifany.pyfilesexistinadirectoryimpo
Python在生物信息学中的应用：列表推导式简说基因-专业生信合作伙伴 python windows 开发语言
列表中有一些数据，我们想提取或删除某些值，该怎么办？解决方案最简单的方法是使用列表推导式（listcomprehension）。例如：>>>mylist=[1,4,-5,10,-7,2,3,-1]>>>[nforninmylistifn>0][1,4,10,2,3]>>>[nforninmylistifn>>列表推导式的使用需要注意其内存占用，当原始列表比较大时，其内存占用较高，可以使用生成器表达
最长公共子序列(LCS) 算法
定义(维基百科)在一个序列集合中（通常为两个序列）查找所有序列中最长的子序列。这与查找最长公共子串的问题不同的地方是：子序列不需要在原序列中占用连续的位置。最长公共子序列问题是一个经典的计算机科学问题，也是数据比较程序，比如Diff工具和生物信息学应用的基础。它也被广泛地应用在版本控制，比如Git用来调和文件之间的改变解决方案这类问题通常都是采用动态规划的思想来解决，核心就是构造出动态解决方程。以
自学生物信息学 gtt儿_生物信息学习
我是生物工程专业出身，在大三保研时选择了生物信息的道路，到现在为止已经在行业里摸爬滚打了6年的时间，在这6年的学习之路上疑惑过，也迷茫过，特此把我学习的过程以及遇到的问题总结出来以让大家避免出现同样的问题。在我学习生物信息过程的基础上带着大家顺畅的走一遍。在学习生物信息学之前，我们先来了解一下什么是生物信息学。生物信息学，顾名思义，生物学和信息学的结合。生物学，这个对大家比较简单，基本入生信行的同
我们能成为孩子的上帝吗—— 谁来管理非法行医的贺建奎闲月农
贺建奎，原南方科技大学副教授，毕业于美国斯坦福大学，拥有多学科交叉的背景，并在基因测序仪研究，CRISPR基因编辑，生物信息学等多个领域取得研究突破。2018年11月26日，贺建奎“基因编辑婴儿”事件引发轩然大波。2018年12月19日，贺建奎入选《Nature》年度十大科学人物。2019年4月18日，上榜美国《时代》杂志（Time）2019年度全球百位最具影响力人物榜单。2019年12月30日，
2022-01-27 学习生信的小兔子
参考：生物信息学100个基础问题——第1~5题答案公布-知乎(zhihu.com)掌握FASTQ格式特点第2行就是测序得到的序列信息，一般用ATCGN来表示，其中N用于荧光信号干扰无法判断到底是哪个碱基时的代表符号；第3行以“+”开始，可以储存一些附加信息，但目前的测序fastq文件这一行一般是空的。第4行储存的是质量信息，与第2行的碱基序列是一一对应的，其中的每一个符号对应的ASCII值是经过换
金域医学：医检行业顶级学术委员会成立，钟南山院士任主席里昂杰森
4位院士领衔23位顶级专家加盟,金域医学“最强大脑”助力中国医学检验2017年12月1日，国内第三方医学检验行业的开拓者和引领者广州金域医学检验集团在广州国际生物岛总部，召开金域医学学术委员会成立大会暨金域学术汇报会由呼吸系统疾病专家、中国工程院院士钟南山出任委员会主席，医学遗传学家、中国工程院院士曾溢滔，生物信息学家、中国科学院院士陈润生，以及我国著名肾脏病专家、中国科学院院士侯凡凡出任委员会顾
机器学习系列——（十九）层次聚类飞影铠甲机器学习机器学习聚类人工智能
引言在机器学习和数据挖掘领域，聚类算法是一种重要的无监督学习方法，它试图将数据集中的样本分组，使得同一组内的样本相似度高，不同组间的样本相似度低。层次聚类（HierarchicalClustering）是聚类算法中的一种，以其独特的层次分解方式，在各种应用场景中得到广泛应用，如生物信息学、图像分析、社交网络分析等。一、概述层次聚类算法主要分为两大类：凝聚的层次聚类（AgglomerativeHie
东南大学-生物信息学 wangchuang2017
http://www.lmbe.seu.edu.cn/chenyuan/xsun/bioinfomatics/Web/Index.html目录image第1章生物信息学引论第2章生物信息学的生物学基础第3章序列比较第4章生物分子数据库第5章基因组信息分析第6章系统发生分析第7章蛋白质结构预测第8章基因表达数据分析附录常用基本词汇表
TCGA新版数据库表达矩阵提取医学和生信笔记
本文首发于公众号：医学和生信笔记医学和生信笔记，专注R语言在临床医学中的使用，R语言数据分析和可视化。主要分享R语言做医学统计学、meta分析、网络药理学、临床预测模型、机器学习、生物信息学等。现在使用TCGAbiolinks下载转录组数据后，直接是一个SummarizedExperiment对象，这个对象非常重要且好用。因为里面直接包含了表达矩阵、样本信息、基因信息，可以非常方便的通过内置函数直
R语言可视化学习笔记之ggridges包生信宝典 R 生物信息生物信息可视化
作者：严涛浙江大学作物遗传育种在读研究生（生物信息学方向）伪码农，R语言爱好者，爱开源。严涛老师的绘图教程还有：gganimate|诺奖文章里面的动图布局教程来了！！ggplot2学习笔记之图形排列R包ggseqlogo|置换序列分析图ggplot2高效实用指南（可视化脚本，工具，套路，配色）简介ggridges。主要包用来绘制山峦图产品尤其的英文针对时间或者空间分布****可视化。具有十分好的效
microRNA数据库与预测、功能分析软件大全 Seurat_Satija
在microRNA的研究中，生物信息学发挥越来越重要的作用，以下是microRNA相关的数据库与预测、功能分析软件，绝对值得收藏。1.miRBase:http://www.mirbase.orgmiRBase序列数据库是一个提供包括已发表的miRNA序列数据、注释、预测基因靶标等信息的全方位数据库，是存储miRNA信息最主要的公共数据库之一。miRBase提供便捷的网上查询服务，允许用户使用关键词
从列表中删除元素|自学生信Python（第十六天）天明豆豆
从列表中删除元素Python有从数据结构对象，如列表和字典中去除数据项的函数。写在前面的话：本人是一枚生物学的学生，由于对生物信息学特别感兴趣，于是想自学生物信息学（新手莫怪）。了解到生物信息学要有编程基础，尤其是要会一门编程语言，例如：R语言、Python、Perl等，还要熟悉Linux系统，作为生信小白，听说Python挺简单的，于是就自学了Python，花了两天时间了解了Python的基础语
「转录组」从环境配置之conda 旮旯蜗牛_c299
image.png什么是condaconda：开源包管理系统和环境管理系统，用于安装多个版本的软件包及其依赖关系，并在它们之间轻松切换。系统：适用Linux，OSX和Windows。For：为Python程序创建的，但可以打包和分发任何软件。【生物信息学频道bioconda】Anaconda是一个开源的Python发行版本，包含了conda、python等180多个科学包及其依赖项。因为包含了大量
生信绘图：在线绘制序列 Logo 图 Ningbo_JiaYT 统计绘图生物信息学 R 学习方法
本文介绍通过WebLogo网站在线绘制序列Logo图（序列分析图）。网站链接：WebLogo3-About(threeplusone.com)1序列Logo图序列Logo是一种常用于可视化DNA、RNA或氨基酸序列中保守性和模式的图形化方法。它是由生物信息学领域中的生物学家TomSchneider和R.MichaelStephens在1990年首次引入的。序列Logo通过显示序列中每个位置上不同碱
理解生物信息学FASTA格式陈佶1
在生物信息学中，FASTA格式是一种基于文本的、用于表示核苷酸序列或氨基酸序列的格式。在这种格式中碱基对或氨基酸用单个字母来表示，且允许在序列前添加序列名及注释。FASTA文件以序列表示和序列作为一个基本单元，各行记录信息如下：第一行是由大于号">"开头的任意文字说明，用于序列标记，为了保证后续分析软件能够区分每条序列，单个序列的标识必须具有唯一性。；从第二行开始为序列本身，只允许使用既定的核苷酸
Cytoscape软件下载、安装、插件学习[基础教程] 小杜的生信筆記 R语言精美图形绘制教程数据分析 Cytoscape 网络图富集分析信息可视化生物信息学 r语言
写在前面今天分享的内容是自己遇到问题后，咨询社群里面的同学，帮忙解决的总结。关于Cytoscape，对于做组学或生物信息学的同学基本是陌生的，可能有的同学用这个软件作图是非常溜的，做出来的网络图也是十分的好看，“可玩性”很高，就像前面分享的aPEAR包一样aPEAR包绘制功能富集网络图。自己在前面写论文的时候也是一直在使用，以前使用的版本是3.3.0的版本。但是，时间一长，很多操作都忘记。今天，在
支持向量机小森( ﹡ˆoˆ﹡ ) 机器学习算法支持向量机算法机器学习
支持向量机（SupportVectorMachine，SVM）是一个非常优雅的算法，具有非常完善的数学理论，常用于数据分类，也可以用于数据的回归预测中。支持向量机在许多领域都有广泛的应用，如文本分类、图像识别、生物信息学、金融预测等。支持向量机的应用：（1）文本分类：支持向量机可以用于文本分类任务，如垃圾邮件过滤、情感分析、主题分类等。通过对文本数据进行预处理，提取特征，然后使用支持向量机进行训练
tomcat基础与部署发布暗黑小菠萝 Tomcat java web
从51cto搬家了，以后会更新在这里方便自己查看。做项目一直用tomcat，都是配置到eclipse中使用，这几天有时间整理一下使用心得，有一些自己配置遇到的细节问题。 Tomcat：一个Servlets和JSP页面的容器，以提供网站服务。一、Tomcat安装安装方式：①运行.exe安装包 &n
网站架构发展的过程 ayaoxinchao 数据库应用服务器网站架构
1.初始阶段网站架构：应用程序、数据库、文件等资源在同一个服务器上 2.应用服务和数据服务分离：应用服务器、数据库服务器、文件服务器 3.使用缓存改善网站性能：为应用服务器提供本地缓存，但受限于应用服务器的内存容量，可以使用专门的缓存服务器，提供分布式缓存服务器架构 4.使用应用服务器集群改善网站的并发处理能力：使用负载均衡调度服务器，将来自客户端浏览器的访问请求分发到应用服务器集群中的任何
[信息与安全]数据库的备份问题 comsci 数据库
如果你们建设的信息系统是采用中心-分支的模式,那么这里有一个问题如果你的数据来自中心数据库,那么中心数据库如果出现故障,你的分支机构的数据如何保证安全呢? 是否应该在这种信息系统结构的基础上进行改造,容许分支机构的信息系统也备份一个中心数据库的文件呢? &n
使用maven tomcat plugin插件debug关联源代码商人shang maven debug 查看源码 tomcat-plugin
*首先需要配置好'''maven-tomcat7-plugin'''，参见[[Maven开发Web项目]]的'''Tomcat'''部分。 *配置好后，在[[Eclipse]]中打开'''Debug Configurations'''界面，在'''Maven Build'''项下新建当前工程的调试。在'''Main'''选项卡中点击'''Browse Workspace...'''选择需要开发的
大访问量高并发 oloz 大访问量高并发
大访问量高并发的网站主要压力还是在于数据库的操作上，尽量避免频繁的请求数据库。下面简要列出几点解决方案： 01、优化你的代码和查询语句，合理使用索引 02、使用缓存技术例如memcache、ecache将不经常变化的数据放入缓存之中 03、采用服务器集群、负载均衡分担大访问量高并发压力 04、数据读写分离 05、合理选用框架，合理架构(推荐分布式架构)。
cache 服务器小猪猪08 cache
Cache 即高速缓存.那么cache是怎么样提高系统性能与运行速度呢？是不是在任何情况下用cache都能提高性能？是不是cache用的越多就越好呢？我在近期开发的项目中有所体会，写下来当作总结也希望能跟大家一起探讨探讨，有错误的地方希望大家批评指正。　　1.Cache 是怎么样工作的? 　　Cache 是分配在服务器上
mysql存储过程香水浓 mysql
Description:插入大量测试数据 use xmpl; drop procedure if exists mockup_test_data_sp; create procedure mockup_test_data_sp( in number_of_records int ) begin declare cnt int; declare name varch
CSS的class、id、css文件名的常用命名规则 agevs JavaScript UI 框架 Ajax css
CSS的class、id、css文件名的常用命名规则 (一)常用的CSS命名规则　　头：header 　　内容：content/container 　　尾：footer 　　导航：nav 　　侧栏：sidebar 　　栏目：column 　　页面外围控制整体布局宽度：wrapper 　　左右中：left right
全局数据源 AILIKES java tomcat mysql jdbc JNDI
实验目的：为了研究两个项目同时访问一个全局数据源的时候是创建了一个数据源对象，还是创建了两个数据源对象。 1：将diuid和mysql驱动包（druid-1.0.2.jar和mysql-connector-java-5.1.15.jar）copy至%TOMCAT_HOME%/lib下；2：配置数据源，将JNDI在%TOMCAT_HOME%/conf/context.xml中配置好,格式如下：&l
MYSQL的随机查询的实现方法 baalwolf mysql
MYSQL的随机抽取实现方法。举个例子，要从tablename表中随机提取一条记录，大家一般的写法就是：SELECT * FROM tablename ORDER BY RAND() LIMIT 1。但是，后来我查了一下MYSQL的官方手册，里面针对RAND()的提示大概意思就是，在ORDER BY从句里面不能使用RAND()函数，因为这样会导致数据列被多次扫描。但是在MYSQL 3.23版本中，
JAVA的getBytes()方法 bijian1013 java eclipse unix OS
在Java中，String的getBytes()方法是得到一个操作系统默认的编码格式的字节数组。这个表示在不同OS下，返回的东西不一样！ String.getBytes(String decode)方法会根据指定的decode编码返回某字符串在该编码下的byte数组表示，如： byte[] b_gbk = "
AngularJS中操作Cookies bijian1013 JavaScript AngularJS Cookies
如果你的应用足够大、足够复杂，那么你很快就会遇到这样一咱种情况：你需要在客户端存储一些状态信息，这些状态信息是跨session(会话)的。你可能还记得利用document.cookie接口直接操作纯文本cookie的痛苦经历。幸运的是，这种方式已经一去不复返了，在所有现代浏览器中几乎
[Maven学习笔记五]Maven聚合和继承特性 bit1129 maven
Maven聚合在实际的项目中，一个项目通常会划分为多个模块，为了说明问题，以用户登陆这个小web应用为例。通常一个web应用分为三个模块： 1. 模型和数据持久化层user-core, 2. 业务逻辑层user-service以 3. web展现层user-web， user-service依赖于user-core user-web依赖于user-core和use
【JVM七】JVM知识点总结 bit1129 jvm
1. JVM运行模式 1.1 JVM运行时分为-server和-client两种模式，在32位机器上只有client模式的JVM。通常，64位的JVM默认都是使用server模式，因为server模式的JVM虽然启动慢点，但是，在运行过程，JVM会尽可能的进行优化 1.2 JVM分为三种字节码解释执行方式：mixed mode, interpret mode以及compiler
linux下查看nginx、apache、mysql、php的编译参数 ronin47
在linux平台下的应用，最流行的莫过于nginx、apache、mysql、php几个。而这几个常用的应用，在手工编译完以后，在其他一些情况下（如：新增模块），往往想要查看当初都使用了那些参数进行的编译。这时候就可以利用以下方法查看。 1、nginx [root@361way ~]# /App/nginx/sbin/nginx -V nginx: nginx version: nginx/
unity中运用Resources.Load的方法？ brotherlamp unity视频 unity资料 unity自学 unity unity教程
问：unity中运用Resources.Load的方法？答：Resources.Load是unity本地动态加载资本所用的方法,也即是你想动态加载的时分才用到它,比方枪弹,特效,某些实时替换的图像什么的,主张此文件夹不要放太多东西,在打包的时分,它会独自把里边的一切东西都会集打包到一同,不论里边有没有你用的东西,所以大多数资本应该是自个建文件放置 1、unity实时替换的物体即是依据环境条件
线段树-入门 bylijinnan java 算法线段树
/** * 线段树入门 * 问题：已知线段[2,5] [4,6] [0,7]；求点2,4,7分别出现了多少次 * 以下代码建立的线段树用链表来保存，且树的叶子结点类似[i,i] * * 参考链接：http://hi.baidu.com/semluhiigubbqvq/item/be736a33a8864789f4e4ad18 * @author lijinna
全选与反选 chicony 全选
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"> <html> <head> <title>全选与反选</title>
vim一些简单记录 chenchao051 vim
mac在/usr/share/vim/vimrc linux在/etc/vimrc 1、问：后退键不能删除数据，不能往后退怎么办？答：在vimrc中加入set backspace=2 2、问：如何控制tab键的缩进？答：在vimrc中加入set tabstop=4 (任何
Sublime Text 快捷键 daizj 快捷键 sublime
[size=large][/size]Sublime Text快捷键：Ctrl+Shift+P：打开命令面板Ctrl+P：搜索项目中的文件Ctrl+G：跳转到第几行Ctrl+W：关闭当前打开文件Ctrl+Shift+W：关闭所有打开文件Ctrl+Shift+V：粘贴并格式化Ctrl+D：选择单词，重复可增加选择下一个相同的单词Ctrl+L：选择行，重复可依次增加选择下一行Ctrl+Shift+L：
php 引用(&)详解 dcj3sjt126com PHP
在PHP 中引用的意思是：不同的名字访问同一个变量内容. 与Ｃ语言中的指针是有差别的．Ｃ语言中的指针里面存储的是变量的内容在内存中存放的地址变量的引用 PHP 的引用允许你用两个变量来指向同一个内容复制代码代码如下: <? $a="ABC"; $b =&$a; echo
SVN中trunk,branches,tags用法详解 dcj3sjt126com SVN
Subversion有一个很标准的目录结构，是这样的。比如项目是proj，svn地址为svn://proj/，那么标准的svn布局是svn://proj/|+-trunk+-branches+-tags这是一个标准的布局，trunk为主开发目录，branches为分支开发目录，tags为tag存档目录（不允许修改）。但是具体这几个目录应该如何使用，svn并没有明确的规范，更多的还是用户自己的习惯。
对软件设计的思考 e200702084 设计模式数据结构算法 ssh 活动
软件设计的宏观与微观软件开发是一种高智商的开发活动。一个优秀的软件设计人员不仅要从宏观上把握软件之间的开发，也要从微观上把握软件之间的开发。宏观上，可以应用面向对象设计，采用流行的SSH架构，采用web层，业务逻辑层，持久层分层架构。采用设计模式提供系统的健壮性和可维护性。微观上，对于一个类，甚至方法的调用，从计算机的角度模拟程序的运行情况。了解内存分配，参数传
同步、异步、阻塞、非阻塞 geeksun 非阻塞
同步、异步、阻塞、非阻塞这几个概念有时有点混淆，在此文试图解释一下。同步：发出方法调用后，当没有返回结果，当前线程会一直在等待（阻塞）状态。场景：打电话，营业厅窗口办业务、B/S架构的http请求-响应模式。异步：方法调用后不立即返回结果，调用结果通过状态、通知或回调通知方法调用者或接收者。异步方法调用后，当前线程不会阻塞，会继续执行其他任务。实现：
Reverse SSH Tunnel 反向打洞實錄 hongtoushizi ssh
實際的操作步驟： # 首先，在客戶那理的機器下指令連回我們自己的 Server，並設定自己 Server 上的 12345 port 會對應到幾器上的 SSH port ssh -NfR 12345:localhost:22 [email protected] # 然後在 myhost 的機器上連自己的 12345 port，就可以連回在客戶那的機器 ssh localhost -p 1
Hibernate中的缓存 Josh_Persistence 一级缓存 Hiberante缓存查询缓存二级缓存
Hibernate中的缓存一、Hiberante中常见的三大缓存：一级缓存，二级缓存和查询缓存。 Hibernate中提供了两级Cache，第一级别的缓存是Session级别的缓存，它是属于事务范围的缓存。这一级别的缓存是由hibernate管理的，一般情况下无需进行干预；第二级别的缓存是SessionFactory级别的缓存，它是属于进程范围或群集范围的缓存。这一级别的缓存
对象关系行为模式之延迟加载 home198979 PHP 架构延迟加载
形象化设计模式实战 HELLO!架构一、概念 Lazy Load：一个对象，它虽然不包含所需要的所有数据，但是知道怎么获取这些数据。延迟加载貌似很简单，就是在数据需要时再从数据库获取，减少数据库的消耗。但这其中还是有不少技巧的。二、实现延迟加载实现Lazy Load主要有四种方法：延迟初始化、虚
xml 验证 pengfeicao521 xml xml解析
有些字符，xml不能识别，用jdom或者dom4j解析的时候就报错 public static void testPattern() { // 含有非法字符的串 String str = "Jamey친Ñ&#1282
div设置半透明效果 spjich css 半透明
为div设置如下样式： div{filter:alpha(Opacity=80);-moz-opacity:0.5;opacity: 0.5;} 说明： 1、filter：对win IE设置半透明滤镜效果，filter:alpha(Opacity=80)代表该对象80%半透明，火狐浏览器不认2、-moz-opaci
你真的了解单例模式么？ w574240966 java 单例设计模式 jvm
单例模式，很多初学者认为单例模式很简单，并且认为自己已经掌握了这种设计模式。但事实上，你真的了解单例模式了么。一，单例模式的5中写法。（回字的四种写法，哈哈。） 1，懒汉式（1）线程不安全的懒汉式 public cla

提取模版特征

你可能感兴趣的:(生物信息学)