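# Compute the similarity between each extra MSA sequence (protein['extra_msa'])
# and each sampled MSA sequence (protein['msa']) -- the more positions at which
# two sequences carry the same amino acid, the more similar they are -- and
# thereby find, for every extra MSA sequence, its most similar sampled sequence.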
import pickle
import tensorflow as tf
import ml_collections
import numpy as np
def shape_list(x):
    """Return the dimensions of a tensor, preferring static values.

    Behaves like `x.shape.as_list()`, except that a dimension whose size is
    not statically known is reported as the corresponding element of
    `tf.shape(x)` instead of `None`.

    Args:
      x: A tensor, or any value accepted by `tf.convert_to_tensor`.

    Returns:
      `tf.shape(x)` itself when the rank of `x` is unknown. Otherwise a list
      with one entry per dimension: the static size where it is known, else
      the matching element of the dynamic shape tensor.
    """
    x = tf.convert_to_tensor(x)
    static_shape = x.get_shape()

    # With an unknown rank there is no per-dimension static information, so
    # the dynamic shape tensor is the only thing that can be returned.
    if static_shape.dims is None:
        return tf.shape(x)

    dynamic_shape = tf.shape(x)
    # Fall back to the dynamic value only for dimensions that are None.
    return [
        dynamic_shape[axis] if size is None else size
        for axis, size in enumerate(static_shape.as_list())
    ]
def data_transforms_curry1(f):
    """Decorator that defers the first argument of `f`.

    The decorated name is called with every argument except the first and
    returns a one-argument callable; invoking that callable with `x`
    evaluates `f(x, *args, **kwargs)`.

    Args:
      f: The function whose first argument is supplied last.

    Returns:
      A function taking the remaining arguments of `f` and returning a
      callable of the single argument `x`.
    """
    def bind_rest(*args, **kwargs):
        def apply(x):
            return f(x, *args, **kwargs)
        return apply
    return bind_rest
@data_transforms_curry1
def nearest_neighbor_clusters(protein, gap_agreement_weight=0.):
    """Assign every extra MSA sequence to its best-matching sampled sequence.

    The match score between two sequences is a weighted count of the
    unmasked positions at which both carry the same class id, so a higher
    score means a smaller weighted Hamming distance.

    Args:
      protein: Mapping read for the keys 'msa', 'msa_mask', 'extra_msa' and
        'extra_msa_mask'. The code treats 'msa' and 'extra_msa' as rank-2
        (sequence, residue) arrays of class ids in [0, 23) that share the
        same residue dimension, with masks of matching shape.
      gap_agreement_weight: Weight given to agreement on class id 21 (the
        gap class, per the parameter name). Defaults to 0.

    Returns:
      The same `protein` mapping, with 'extra_cluster_assignment' set to an
      int32 tensor holding, for each extra sequence, the index of the
      sampled sequence with the highest agreement score.
    """
    num_classes = 23

    # Per-class agreement weights. In theory a full BLOSUM matrix could be
    # used; for now only gap agreement is down-weighted because it may be
    # spurious. The first 21 classes (residue types plus X, going by the
    # original note) count fully, class 21 (gap) gets gap_agreement_weight,
    # and class 22 (the BERT mask token) never contributes.
    weights = tf.concat(
        [tf.ones(21), gap_agreement_weight * tf.ones(1), np.zeros(1)], 0)

    def masked_one_hot(ids_key, mask_key):
        # The mask gains a trailing axis so it broadcasts over the classes.
        return (protein[mask_key][:, :, None] *
                tf.one_hot(protein[ids_key], num_classes))

    sample_one_hot = masked_one_hot('msa', 'msa_mask')
    extra_one_hot = masked_one_hot('extra_msa', 'extra_msa_mask')

    num_seq, num_res, _ = shape_list(sample_one_hot)
    extra_num_seq, _, _ = shape_list(extra_one_hot)
    flat_dim = num_res * num_classes

    # Equivalent to
    #   tf.einsum('mrc,nrc,c->mn', sample_one_hot, extra_one_hot, weights)
    # but expressed as one matmul over flattened (residue, class) axes to
    # avoid a possible memory or computation blowup.
    # NOTE: similarity only counts identical classes; physico-chemical
    # properties of the amino acids are not considered and could be added.
    extra_flat = tf.reshape(extra_one_hot, [extra_num_seq, flat_dim])
    weighted_sample_flat = tf.reshape(
        sample_one_hot * weights, [num_seq, flat_dim])
    agreement = tf.matmul(extra_flat, weighted_sample_flat, transpose_b=True)

    # For each extra sequence, pick the sampled sequence it agrees with most.
    protein['extra_cluster_assignment'] = tf.argmax(
        agreement, axis=1, output_type=tf.int32)
    return protein
# Demo: load a pickled feature dictionary, compute the cluster assignment of
# its extra MSA sequences and print the result.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only run this against a trusted file.
with open("Human_HBB_tensor_dict_masked.pkl", 'rb') as f:
    protein = pickle.load(f)

# The curried transform is first built with its default arguments, then
# applied to the feature dictionary.
assign_clusters = nearest_neighbor_clusters()
protein = assign_clusters(protein)
print(protein.keys())

cluster_ids = protein['extra_cluster_assignment']
print(cluster_ids.shape)
print(cluster_ids)