为每条额外MSA序列寻找最近邻的采样MSA序列

分别计算每条额外MSA序列(protein['extra_msa'])与每条采样MSA序列(protein['msa'])之间的相似性(相同位置上一致的氨基酸越多,两条序列越相似),从而为每条额外MSA序列找到与其最相似的采样MSA序列。

import pickle
import tensorflow as tf
import ml_collections
import numpy as np


def shape_list(x):
  """Describe a tensor's shape, preferring static sizes over dynamic ones.

  Behaves like `x.shape.as_list()`, except that every dimension reported as
  `None` is replaced by the matching element of `tf.shape(x)`.

  Args:
    x: A tensor, or any value accepted by `tf.convert_to_tensor`.

  Returns:
    If the rank is unknown, the dynamic shape tensor `tf.shape(x)`.
    Otherwise a Python list with one entry per dimension: the static size
    when it is known, else the corresponding element of `tf.shape(x)`.
  """
  tensor = tf.convert_to_tensor(x)
  static_shape = tensor.get_shape()

  # With an unknown rank there are no per-dimension entries to inspect, so
  # the dynamic shape tensor is all that can be returned.
  if static_shape.dims is None:
    return tf.shape(tensor)

  dynamic_shape = tf.shape(tensor)
  # Fall back to the dynamic size only for dimensions that are not static.
  return [
      dynamic_shape[axis] if size is None else size
      for axis, size in enumerate(static_shape.as_list())
  ]


def data_transforms_curry1(f):
  """Curry `f` so that its first argument is supplied last.

  Args:
    f: A callable whose first positional argument is the value to transform.

  Returns:
    A function accepting every argument of `f` except the first. Calling it
    returns a one-argument function `x -> f(x, *args, **kwargs)`.
  """

  def bind_remaining(*args, **kwargs):
    def apply(x):
      return f(x, *args, **kwargs)

    return apply

  return bind_remaining


@data_transforms_curry1
def nearest_neighbor_clusters(protein, gap_agreement_weight=0.):
  """Map each extra MSA sequence to the sampled MSA sequence it matches best.

  Similarity is a weighted count of positions at which an extra sequence and
  a sampled sequence carry the same class. Only identity is scored; the
  properties of the amino acids are not taken into account.

  Because of the `data_transforms_curry1` decorator, this is invoked as
  `nearest_neighbor_clusters(gap_agreement_weight=...)(protein)`.

  Args:
    protein: Dict of features. Reads 'msa', 'msa_mask', 'extra_msa' and
      'extra_msa_mask'; the masks are indexed as rank-2 arrays.
    gap_agreement_weight: Weight given to two sequences agreeing on the gap
      class. Defaults to 0 because such agreement could be spurious.

  Returns:
    The same `protein` dict, with 'extra_cluster_assignment' added: an int32
    tensor holding, for each extra sequence, the index of the sampled
    sequence with the highest agreement score.
  """
  num_classes = 23

  # Per-class agreement weights: 1 for the first 21 classes (presumably the
  # 20 amino acids plus "unknown" -- confirm against the residue encoding),
  # `gap_agreement_weight` for the gap class, and 0 for the last class so
  # that agreeing on the BERT mask never counts. A full BLOSUM matrix could
  # in theory be used here instead.
  class_weights = tf.concat(
      [tf.ones(21), gap_agreement_weight * tf.ones(1), np.zeros(1)], 0)

  # Masked one-hot encodings; the mask gains a trailing axis so that it
  # broadcasts over the class dimension.
  sampled_mask = protein['msa_mask'][:, :, None]
  sampled_one_hot = sampled_mask * tf.one_hot(protein['msa'], num_classes)
  extra_mask = protein['extra_msa_mask'][:, :, None]
  extra_one_hot = extra_mask * tf.one_hot(protein['extra_msa'], num_classes)

  num_sampled, num_res, _ = shape_list(sampled_one_hot)
  num_extra, _, _ = shape_list(extra_one_hot)

  # Equivalent to
  #   tf.einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot, weights)
  # but expressed as one matmul over flattened (residue, class) axes to avoid
  # a possible memory or computation blowup. The residue count taken from the
  # sampled MSA is reused to flatten the extra MSA.
  flat_extra = tf.reshape(extra_one_hot, [num_extra, num_res * num_classes])
  flat_weighted_sampled = tf.reshape(
      sampled_one_hot * class_weights, [num_sampled, num_res * num_classes])
  agreement = tf.matmul(flat_extra, flat_weighted_sampled, transpose_b=True)

  # One row per extra sequence: pick the sampled sequence that scores highest.
  protein['extra_cluster_assignment'] = tf.argmax(
      agreement, axis=1, output_type=tf.int32)

  return protein


# Load the pickled feature dict for the example protein.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# point this at a file from a trusted source.
with open("Human_HBB_tensor_dict_masked.pkl", 'rb') as f:
  protein = pickle.load(f)

# Build the transform with its default gap weight, then apply it.
assign_to_clusters = nearest_neighbor_clusters()
protein = assign_to_clusters(protein)

# Show the available feature keys, then the shape and contents of the
# newly added cluster assignment.
print(protein.keys())
cluster_assignment = protein['extra_cluster_assignment']
print(cluster_assignment.shape)
print(cluster_assignment)

你可能感兴趣的:(生物信息学,python,tensorflow)