蛋白质的 extra_msa 特征已经经过 nearest_neighbor_clusters、summarize_clusters 等处理。下面通过 make_msa_feat 函数创建 protein['msa_feat'] 以及 protein['target_feat'] 特征。
import tensorflow as tf
import numpy as np
def data_transforms_curry1(f):
    """Supply all arguments but the first.

    Decorator that turns ``f(x, *args, **kwargs)`` into a two-step call:
    ``f_curried(*args, **kwargs)`` captures every argument except the first
    and returns a one-argument callable; calling that callable with ``x``
    evaluates ``f(x, *args, **kwargs)``.

    Args:
        f: Callable whose first positional parameter is supplied last.

    Returns:
        A callable accepting the remaining positional and keyword arguments
        of ``f`` and returning a function of the first argument only.
    """

    def fc(*args, **kwargs):
        # Defer the call until the first argument is provided.
        return lambda x: f(x, *args, **kwargs)

    return fc
@data_transforms_curry1
def make_msa_feat(protein):
    """Create and concatenate MSA features.

    Because of the ``data_transforms_curry1`` decorator this is used as
    ``make_msa_feat()(protein)``: the first call takes no extra arguments and
    returns the transform that is applied to the feature dict.

    Args:
        protein: Mutable mapping from feature name to tensor. The keys
            'between_segment_residues', 'aatype', 'msa' and
            'deletion_matrix' are always read. When 'cluster_profile' is
            present, 'cluster_deletion_mean' is read as well.
            'extra_deletion_matrix' is optional.

    Returns:
        The same ``protein`` mapping, updated in place with 'msa_feat' and
        'target_feat'. When 'extra_deletion_matrix' is present,
        'extra_has_deletion' and 'extra_deletion_value' are added too.
    """

    def squash_deletions(deletions):
        # atan(d / 3) * 2 / pi: for non-negative inputs this maps
        # monotonically into [0, 1).
        return tf.atan(deletions / 3.) * (2. / np.pi)

    # Whether there is a domain break. Always zero for chains, but keeping
    # for compatibility with domain datasets.
    has_break = tf.clip_by_value(
        tf.cast(protein['between_segment_residues'], tf.float32),
        0, 1)

    # 21 one-hot classes for the target sequence -- presumably the residue
    # vocabulary plus an "unknown" class; confirm against the caller.
    aatype_1hot = tf.one_hot(protein['aatype'], 21, axis=-1)

    # Last dimension of target_feat: 1 (break flag) + 21 (one-hot) = 22.
    target_feat = [
        tf.expand_dims(has_break, axis=-1),
        aatype_1hot,  # Everyone gets the original sequence.
    ]

    # 23 one-hot classes for MSA entries -- presumably the residue classes
    # plus extra tokens such as gap/mask; confirm against the caller.
    msa_1hot = tf.one_hot(protein['msa'], 23, axis=-1)
    # Binary "any deletion here" indicator alongside the squashed value.
    has_deletion = tf.clip_by_value(protein['deletion_matrix'], 0., 1.)
    deletion_value = squash_deletions(protein['deletion_matrix'])

    msa_feat = [
        msa_1hot,
        tf.expand_dims(has_deletion, axis=-1),
        tf.expand_dims(deletion_value, axis=-1),
    ]

    # Cluster summaries are only appended when they have been computed.
    if 'cluster_profile' in protein:
        deletion_mean_value = squash_deletions(
            protein['cluster_deletion_mean'])
        msa_feat.extend([
            protein['cluster_profile'],
            tf.expand_dims(deletion_mean_value, axis=-1),
        ])

    # Extra-MSA deletion features are stored as separate entries rather than
    # being concatenated into msa_feat.
    if 'extra_deletion_matrix' in protein:
        protein['extra_has_deletion'] = tf.clip_by_value(
            protein['extra_deletion_matrix'], 0., 1.)
        protein['extra_deletion_value'] = squash_deletions(
            protein['extra_deletion_matrix'])

    protein['msa_feat'] = tf.concat(msa_feat, axis=-1)
    protein['target_feat'] = tf.concat(target_feat, axis=-1)
    return protein
# make_msa_feat is curried: calling it with no extra arguments yields the
# transform, which is then applied to the feature dict.
msa_feat_transform = make_msa_feat()
protein = msa_feat_transform(protein)

# Shapes noted for this example:
#   msa_feat    -> [num_seq, num_resi, 49]
#   target_feat -> [num_resi, 22]
for feature_key in ('msa_feat', 'target_feat'):
    print(protein[feature_key].shape)