执行 make_fixed_size 函数:根据 pad_size_map,将蛋白质序列、多序列比对及模板特征中
NUM_RES、NUM_MSA_SEQ、NUM_EXTRA_SEQ、NUM_TEMPLATES 对应的维度用 0 填充到指定的大小。
主要函数tf.pad 是 TensorFlow 中的一个函数,用于在张量的周围或指定位置进行填充。
tf.pad(tensor, paddings, mode='CONSTANT', constant_values=0, name=None)
import pickle
import tensorflow as tf
import ml_collections
# Sentinel strings used inside the per-feature shape schema
# (CONFIG.data.eval.feat) to label the axes of a feature. make_fixed_size maps
# each of them to a concrete target length when it pads a feature.
NUM_RES = 'num residues placeholder'  # mapped to the `num_res` argument
NUM_MSA_SEQ = 'msa placeholder'  # mapped to the `msa_cluster_size` argument
NUM_EXTRA_SEQ = 'extra msa placeholder'  # mapped to `extra_msa_size`
NUM_TEMPLATES = 'num templates placeholder'  # mapped to `num_templates`
# Nested configuration tree. The `data` branch holds feature-processing
# settings (the part this script actually reads); the `model` branch holds
# network hyper-parameters that are not referenced elsewhere in this file.
CONFIG = ml_collections.ConfigDict({
    'data': {
        'common': {
            'masked_msa': {
                'profile_prob': 0.1,
                'same_prob': 0.1,
                'uniform_prob': 0.1
            },
            # Used below as the padded length of the NUM_EXTRA_SEQ axis.
            'max_extra_msa': 1024,
            'msa_cluster_features': True,
            'num_recycle': 3,
            # When True, the driver code subtracts `max_templates` from
            # `max_msa_clusters` before padding the NUM_MSA_SEQ axis.
            'reduce_msa_clusters_by_max_templates': False,
            'resample_msa_in_recycling': True,
            'template_features': [
                'template_all_atom_positions', 'template_sum_probs',
                'template_aatype', 'template_all_atom_masks',
                'template_domain_names'
            ],
            'unsupervised_features': [
                'aatype', 'residue_index', 'sequence', 'msa', 'domain_name',
                'num_alignments', 'seq_length', 'between_segment_residues',
                'deletion_matrix'
            ],
            'use_templates': False,
        },
        'eval': {
            # Feature name -> one entry per tensor axis. A NUM_* placeholder
            # marks an axis that make_fixed_size pads to a configured length;
            # None marks an axis whose size is left as it is; an empty list
            # describes a rank-0 feature.
            'feat': {
                'aatype': [NUM_RES],
                'all_atom_mask': [NUM_RES, None],
                'all_atom_positions': [NUM_RES, None, None],
                'alt_chi_angles': [NUM_RES, None],
                'atom14_alt_gt_exists': [NUM_RES, None],
                'atom14_alt_gt_positions': [NUM_RES, None, None],
                'atom14_atom_exists': [NUM_RES, None],
                'atom14_atom_is_ambiguous': [NUM_RES, None],
                'atom14_gt_exists': [NUM_RES, None],
                'atom14_gt_positions': [NUM_RES, None, None],
                'atom37_atom_exists': [NUM_RES, None],
                'backbone_affine_mask': [NUM_RES],
                'backbone_affine_tensor': [NUM_RES, None],
                'bert_mask': [NUM_MSA_SEQ, NUM_RES],
                'chi_angles': [NUM_RES, None],
                'chi_mask': [NUM_RES, None],
                'extra_deletion_value': [NUM_EXTRA_SEQ, NUM_RES],
                'extra_has_deletion': [NUM_EXTRA_SEQ, NUM_RES],
                'extra_msa': [NUM_EXTRA_SEQ, NUM_RES],
                'extra_msa_mask': [NUM_EXTRA_SEQ, NUM_RES],
                'extra_msa_row_mask': [NUM_EXTRA_SEQ],
                'is_distillation': [],
                'msa_feat': [NUM_MSA_SEQ, NUM_RES, None],
                'msa_mask': [NUM_MSA_SEQ, NUM_RES],
                'msa_row_mask': [NUM_MSA_SEQ],
                'pseudo_beta': [NUM_RES, None],
                'pseudo_beta_mask': [NUM_RES],
                'random_crop_to_size_seed': [None],
                'residue_index': [NUM_RES],
                'residx_atom14_to_atom37': [NUM_RES, None],
                'residx_atom37_to_atom14': [NUM_RES, None],
                'resolution': [],
                'rigidgroups_alt_gt_frames': [NUM_RES, None, None],
                'rigidgroups_group_exists': [NUM_RES, None],
                'rigidgroups_group_is_ambiguous': [NUM_RES, None],
                'rigidgroups_gt_exists': [NUM_RES, None],
                'rigidgroups_gt_frames': [NUM_RES, None, None],
                'seq_length': [],
                'seq_mask': [NUM_RES],
                'target_feat': [NUM_RES, None],
                'template_aatype': [NUM_TEMPLATES, NUM_RES],
                'template_all_atom_masks': [NUM_TEMPLATES, NUM_RES, None],
                'template_all_atom_positions': [
                    NUM_TEMPLATES, NUM_RES, None, None],
                'template_backbone_affine_mask': [NUM_TEMPLATES, NUM_RES],
                'template_backbone_affine_tensor': [
                    NUM_TEMPLATES, NUM_RES, None],
                'template_mask': [NUM_TEMPLATES],
                'template_pseudo_beta': [NUM_TEMPLATES, NUM_RES, None],
                'template_pseudo_beta_mask': [NUM_TEMPLATES, NUM_RES],
                'template_sum_probs': [NUM_TEMPLATES, None],
                'true_msa': [NUM_MSA_SEQ, NUM_RES]
            },
            'fixed_size': True,
            # NOTE(review): the remark below says the top templates are
            # wanted, yet subsampling is switched on -- confirm the value.
            'subsample_templates': True,  # We want top templates.
            'masked_msa_replace_fraction': 0.15,
            # Used below as the padded length of the NUM_MSA_SEQ axis.
            'max_msa_clusters': 512,
            # Used below as the padded length of the NUM_TEMPLATES axis.
            'max_templates': 4,
            'num_ensemble': 1,
            # Used below as the padded length of the NUM_RES axis.
            'crop_size': 100,
        },
    },
    'model': {
        'embeddings_and_evoformer': {
            'evoformer_num_block': 48,
            'evoformer': {
                'msa_row_attention_with_pair_bias': {
                    'dropout_rate': 0.15,
                    'gating': True,
                    'num_head': 8,
                    'orientation': 'per_row',
                    'shared_dropout': True
                },
                'msa_column_attention': {
                    'dropout_rate': 0.0,
                    'gating': True,
                    'num_head': 8,
                    'orientation': 'per_column',
                    'shared_dropout': True
                },
                'msa_transition': {
                    'dropout_rate': 0.0,
                    'num_intermediate_factor': 4,
                    'orientation': 'per_row',
                    'shared_dropout': True
                },
                'outer_product_mean': {
                    'first': False,
                    'chunk_size': 128,
                    'dropout_rate': 0.0,
                    'num_outer_channel': 32,
                    'orientation': 'per_row',
                    'shared_dropout': True
                },
                'triangle_attention_starting_node': {
                    'dropout_rate': 0.25,
                    'gating': True,
                    'num_head': 4,
                    'orientation': 'per_row',
                    'shared_dropout': True
                },
                'triangle_attention_ending_node': {
                    'dropout_rate': 0.25,
                    'gating': True,
                    'num_head': 4,
                    'orientation': 'per_column',
                    'shared_dropout': True
                },
                'triangle_multiplication_outgoing': {
                    'dropout_rate': 0.25,
                    'equation': 'ikc,jkc->ijc',
                    'num_intermediate_channel': 128,
                    'orientation': 'per_row',
                    'shared_dropout': True,
                    'fuse_projection_weights': False,
                },
                'triangle_multiplication_incoming': {
                    'dropout_rate': 0.25,
                    'equation': 'kjc,kic->ijc',
                    'num_intermediate_channel': 128,
                    'orientation': 'per_row',
                    'shared_dropout': True,
                    'fuse_projection_weights': False,
                },
                'pair_transition': {
                    'dropout_rate': 0.0,
                    'num_intermediate_factor': 4,
                    'orientation': 'per_row',
                    'shared_dropout': True
                }
            },
            'extra_msa_channel': 64,
            'extra_msa_stack_num_block': 4,
            'max_relative_feature': 32,
            'msa_channel': 256,
            'pair_channel': 128,
            'prev_pos': {
                'min_bin': 3.25,
                'max_bin': 20.75,
                'num_bins': 15
            },
            'recycle_features': True,
            'recycle_pos': True,
            'seq_channel': 384,
            'template': {
                'attention': {
                    'gating': False,
                    'key_dim': 64,
                    'num_head': 4,
                    'value_dim': 64
                },
                'dgram_features': {
                    'min_bin': 3.25,
                    'max_bin': 50.75,
                    'num_bins': 39
                },
                'embed_torsion_angles': False,
                'enabled': False,
                'template_pair_stack': {
                    'num_block': 2,
                    'triangle_attention_starting_node': {
                        'dropout_rate': 0.25,
                        'gating': True,
                        'key_dim': 64,
                        'num_head': 4,
                        'orientation': 'per_row',
                        'shared_dropout': True,
                        'value_dim': 64
                    },
                    'triangle_attention_ending_node': {
                        'dropout_rate': 0.25,
                        'gating': True,
                        'key_dim': 64,
                        'num_head': 4,
                        'orientation': 'per_column',
                        'shared_dropout': True,
                        'value_dim': 64
                    },
                    'triangle_multiplication_outgoing': {
                        'dropout_rate': 0.25,
                        'equation': 'ikc,jkc->ijc',
                        'num_intermediate_channel': 64,
                        'orientation': 'per_row',
                        'shared_dropout': True,
                        'fuse_projection_weights': False,
                    },
                    'triangle_multiplication_incoming': {
                        'dropout_rate': 0.25,
                        'equation': 'kjc,kic->ijc',
                        'num_intermediate_channel': 64,
                        'orientation': 'per_row',
                        'shared_dropout': True,
                        'fuse_projection_weights': False,
                    },
                    'pair_transition': {
                        'dropout_rate': 0.0,
                        'num_intermediate_factor': 2,
                        'orientation': 'per_row',
                        'shared_dropout': True
                    }
                },
                'max_templates': 4,
                'subbatch_size': 128,
                'use_template_unit_vector': False,
            }
        },
        'global_config': {
            'deterministic': False,
            'multimer_mode': False,
            'subbatch_size': 4,
            'use_remat': False,
            'zero_init': True,
            'eval_dropout': False,
        },
        'heads': {
            'distogram': {
                'first_break': 2.3125,
                'last_break': 21.6875,
                'num_bins': 64,
                'weight': 0.3
            },
            'predicted_aligned_error': {
                # `num_bins - 1` bins uniformly space the
                # [0, max_error_bin A] range.
                # The final bin covers [max_error_bin A, +infty]
                # 31A gives bins with 0.5A width.
                'max_error_bin': 31.,
                'num_bins': 64,
                'num_channels': 128,
                'filter_by_resolution': True,
                'min_resolution': 0.1,
                'max_resolution': 3.0,
                'weight': 0.0,
            },
            'experimentally_resolved': {
                'filter_by_resolution': True,
                'max_resolution': 3.0,
                'min_resolution': 0.1,
                'weight': 0.01
            },
            'structure_module': {
                'num_layer': 8,
                'fape': {
                    'clamp_distance': 10.0,
                    'clamp_type': 'relu',
                    'loss_unit_distance': 10.0
                },
                'angle_norm_weight': 0.01,
                'chi_weight': 0.5,
                'clash_overlap_tolerance': 1.5,
                'compute_in_graph_metrics': True,
                'dropout': 0.1,
                'num_channel': 384,
                'num_head': 12,
                'num_layer_in_transition': 3,
                'num_point_qk': 4,
                'num_point_v': 8,
                'num_scalar_qk': 16,
                'num_scalar_v': 16,
                'position_scale': 10.0,
                'sidechain': {
                    'atom_clamp_distance': 10.0,
                    'num_channel': 128,
                    'num_residual_block': 2,
                    'weight_frac': 0.5,
                    'length_scale': 10.,
                },
                'structural_violation_loss_weight': 1.0,
                'violation_tolerance_factor': 12.0,
                'weight': 1.0
            },
            'predicted_lddt': {
                'filter_by_resolution': True,
                'max_resolution': 3.0,
                'min_resolution': 0.1,
                'num_bins': 50,
                'num_channels': 128,
                'weight': 0.01
            },
            'masked_msa': {
                'num_output': 23,
                'weight': 2.0
            },
        },
        'num_recycle': 3,
        'resample_msa_in_recycling': True
    },
})
def shape_list(x):
    """Describe the shape of `x`, preferring statically known sizes.

    Behaves like `x.shape.as_list()`, except that every axis whose size is
    not known statically is reported as a slice of `tf.shape(x)` instead of
    `None`.

    Args:
      x: A tensor, or any value `tf.convert_to_tensor` accepts.

    Returns:
      When the rank is known: a list with one entry per axis, each either a
      Python integer (static size) or the matching element of `tf.shape(x)`.
      When even the rank is unknown: the `tf.shape(x)` tensor itself.
    """
    tensor = tf.convert_to_tensor(x)
    static_shape = tensor.get_shape()

    # Without a known rank there is nothing to enumerate; fall back to the
    # fully dynamic shape.
    if static_shape.dims is None:
        return tf.shape(tensor)

    dynamic_shape = tf.shape(tensor)
    # The dynamic shape is only indexed for axes whose static size is missing.
    return [
        dynamic_shape[axis] if size is None else size
        for axis, size in enumerate(static_shape.as_list())
    ]
# Shortcuts into the data section of the configuration.
common_cfg = CONFIG.data.common
eval_cfg = CONFIG.data.eval
# Plain-dict copy of the per-feature axis schema. Both names refer to the
# same dict object: `crop_feats` is what gets passed to make_fixed_size and
# `shape_schema` is what gets passed to select_feat.
crop_feats = shape_schema = dict(eval_cfg.feat)
def data_transforms_curry1(f):
    """Turn `f(x, *args, **kwargs)` into a two-step call.

    The returned function accepts every argument of `f` except the first and
    hands back a one-argument callable; calling that with `x` finally runs
    `f(x, *args, **kwargs)`.

    Args:
      f: Callable whose first positional parameter is supplied last.

    Returns:
      A function `fc(*args, **kwargs)` that returns a callable taking `x`.
    """
    def fc(*args, **kwargs):
        def apply_to(x):
            # The deferred first argument goes in front of the captured ones.
            return f(x, *args, **kwargs)
        return apply_to
    return fc
@data_transforms_curry1
def make_fixed_size(protein, shape_schema, msa_cluster_size, extra_msa_size,
                    num_res, num_templates=0):
    """Pad every feature at its end so the placeholder axes have fixed sizes.

    Because of the decorator, callers first pass every argument except
    `protein` and then apply the result to the feature dict:
    `make_fixed_size(shape_schema=..., ...)(protein)`.

    Args:
      protein: Mapping from feature name to tensor; updated in place.
      shape_schema: Mapping from feature name to a per-axis list holding a
        NUM_* placeholder for axes to pad and anything else (e.g. None) for
        axes to leave alone.
      msa_cluster_size: Target length of NUM_MSA_SEQ axes.
      extra_msa_size: Target length of NUM_EXTRA_SEQ axes.
      num_res: Target length of NUM_RES axes.
      num_templates: Target length of NUM_TEMPLATES axes. A falsy target
        (such as the default 0) leaves the axis at its current size.

    Returns:
      The same `protein` mapping, with padded tensors stored back into it.

    Raises:
      AssertionError: If a feature's rank differs from its schema's length.
    """
    pad_size_map = {
        NUM_RES: num_res,
        NUM_MSA_SEQ: msa_cluster_size,
        NUM_EXTRA_SEQ: extra_msa_size,
        NUM_TEMPLATES: num_templates,
    }

    for k, tensor in protein.items():
        # Don't transfer this to the accelerator.
        if k == 'extra_cluster_assignment':
            continue

        shape = tensor.shape.as_list()
        # Per-axis placeholders describing this feature.
        schema = shape_schema[k]
        assert len(shape) == len(schema), (
            f'Rank mismatch between shape and shape schema for {k}: '
            f'{shape} vs {schema}')

        # Target size of every axis: the configured size when the schema
        # entry is a known placeholder with a truthy target, otherwise the
        # size the axis already has.
        target_shape = []
        for current_size, placeholder in zip(shape, schema):
            target_shape.append(pad_size_map.get(placeholder) or current_size)

        # Rank-0 features have nothing to pad.
        if not target_shape:
            continue

        # Pad only after the existing data; the amount added is the target
        # size minus the current (dynamic) size of that axis.
        trailing_pad = [
            (0, target - tf.shape(tensor)[axis])
            for axis, target in enumerate(target_shape)
        ]
        protein[k] = tf.pad(tensor, trailing_pad, name=f'pad_to_fixed_{k}')
        protein[k].set_shape(target_shape)

    return protein
def select_feat(protein, feature_list):
    """Return a new dict with only the entries of `protein` named in `feature_list`.

    Args:
      protein: Mapping from feature name to value.
      feature_list: Container supporting `in`; holds the names to keep.

    Returns:
      A new dict with the retained items in their original iteration order.
    """
    kept = {}
    for name, value in protein.items():
        if name in feature_list:
            kept[name] = value
    return kept
# NOTE(review): `protein` is never assigned in the visible part of this file;
# presumably the feature dict is loaded beforehand (pickle is imported but not
# used in the visible code) -- confirm before running.
# Keep only the features the schema describes; make_fixed_size looks each
# feature up in the schema and would fail on a name that is missing from it.
protein = select_feat(protein, shape_schema)

# Length the NUM_MSA_SEQ axis is padded to: the cluster budget, reduced by the
# template count when the corresponding flag is enabled.
pad_msa_clusters = eval_cfg.max_msa_clusters
if common_cfg.reduce_msa_clusters_by_max_templates:
    pad_msa_clusters -= eval_cfg.max_templates

protein = make_fixed_size(
    shape_schema=crop_feats,
    msa_cluster_size=pad_msa_clusters,
    extra_msa_size=common_cfg.max_extra_msa,
    num_res=eval_cfg.crop_size,
    num_templates=eval_cfg.max_templates,
)(protein)

# Show the retained feature names and a few padded shapes.
print(protein.keys())
print(protein['msa_mask'].shape)
print(protein['extra_msa'].shape)
print(protein['template_all_atom_masks'].shape)