mmCIF(macromolecular Crystallographic Information File)是一种用于存储生物大分子结构数据的标准文件格式。它是 PDB(Protein Data Bank)数据文件格式的一种扩展,用于存储 X 射线晶体学和核磁共振测定的生物大分子的结构信息。
mmCIF 使用一种名为 CIF(Crystallographic Information File)的文本格式,但在生物大分子领域中,通常将其称为 mmCIF。与传统的 PDB 格式相比,mmCIF 提供了更灵活、结构化的数据表示方式,允许更详细和准确地描述生物大分子的结构、实验条件、结晶学信息等。
mmCIF 文件包含数据和元数据,包括原子坐标、晶体学信息、实验条件、分子结构、结合物信息等。每个数据项都有相应的描述,使得文件更容易理解和解释。mmCIF 还支持字典定义,用于规定可用的数据项和它们的含义,使得不同研究中的数据可以一致地描述和共享。
https://mmcif.wwpdb.org/docs/pdb_to_pdbx_correspondences.html
from Bio import PDB
from typing import Mapping, Any, Optional, Sequence, Tuple
#from typing import Any, Mapping, Optional, Sequence, Tuple
import dataclasses
import functools
import io
from absl import logging
import collections
from Bio.Data import SCOPData
### mmCIF string 解析
PdbStructure = PDB.Structure.Structure
PdbHeader = Mapping[str, Any]
ChainId = str
SeqRes = str
MmCIFDict = Mapping[str, Sequence[str]]
# 使用 @dataclasses.dataclass 装饰器,
# 可以轻松地创建一个包含字段、__init__ 方法、__repr__ 方法等的类,而不必手动编写这些代码。
# 当使用 frozen=True 参数时,生成的类将不再允许修改其实例的字段值。
# Used to map SEQRES index to a residue in the structure.
@dataclasses.dataclass(frozen=True)
class ResiduePosition:
chain_id: str
residue_number: int
insertion_code: str
@dataclasses.dataclass(frozen=True)
class ResidueAtPosition:
position: Optional[ResiduePosition]
name: str
is_missing: bool
hetflag: str
@dataclasses.dataclass(frozen=True)
class Monomer:
id: str
num: int
# Note - mmCIF format provides no guarantees on the type of author-assigned
# sequence numbers. They need not be integers.
@dataclasses.dataclass(frozen=True)
class AtomSite:
residue_name: str
author_chain_id: str
mmcif_chain_id: str
author_seq_num: str
mmcif_seq_num: int
insertion_code: str
hetatm_atom: str
model_num: int
@dataclasses.dataclass(frozen=True)
class MmcifObject:
"""Representation of a parsed mmCIF file.
Contains:
file_id: A meaningful name, e.g. a pdb_id. Should be unique amongst all
files being processed.
header: Biopython header.
structure: Biopython structure.
chain_to_seqres: Dict mapping chain_id to 1 letter amino acid sequence. E.g.
{'A': 'ABCDEFG'}
seqres_to_structure: Dict; for each chain_id contains a mapping between
SEQRES index and a ResidueAtPosition. e.g. {'A': {0: ResidueAtPosition,
1: ResidueAtPosition,
...}}
raw_string: The raw string used to construct the MmcifObject.
"""
file_id: str
header: PdbHeader
structure: PdbStructure
chain_to_seqres: Mapping[ChainId, SeqRes]
seqres_to_structure: Mapping[ChainId, Mapping[int, ResidueAtPosition]]
raw_string: Any
@dataclasses.dataclass(frozen=True)
class ParsingResult:
"""Returned by the parse function.
Contains:
mmcif_object: A MmcifObject, may be None if no chain could be successfully
parsed.
errors: A dict mapping (file_id, chain_id) to any exception generated.
"""
mmcif_object: Optional[MmcifObject]
errors: Mapping[Tuple[str, str], Any]
def _get_atom_site_list(parsed_info: MmCIFDict) -> Sequence[AtomSite]:
"""Returns list of atom sites; contains data not present in the structure."""
return [AtomSite(*site) for site in zip( # pylint:disable=g-complex-comprehension
parsed_info['_atom_site.label_comp_id'],
parsed_info['_atom_site.auth_asym_id'],
parsed_info['_atom_site.label_asym_id'],
parsed_info['_atom_site.auth_seq_id'],
parsed_info['_atom_site.label_seq_id'],
parsed_info['_atom_site.pdbx_PDB_ins_code'],
parsed_info['_atom_site.group_PDB'],
parsed_info['_atom_site.pdbx_PDB_model_num'],
)]
def _get_first_model(structure: PdbStructure) -> PdbStructure:
"""Returns the first model in a Biopython structure."""
return next(structure.get_models())
def _get_header(parsed_info: MmCIFDict) -> PdbHeader:
"""Returns a basic header containing method, release date and resolution."""
header = {}
experiments = mmcif_loop_to_list('_exptl.', parsed_info)
header['structure_method'] = ','.join([
experiment['_exptl.method'].lower() for experiment in experiments])
# Note: The release_date here corresponds to the oldest revision. We prefer to
# use this for dataset filtering over the deposition_date.
if '_pdbx_audit_revision_history.revision_date' in parsed_info:
header['release_date'] = get_release_date(parsed_info)
else:
logging.warning('Could not determine release_date: %s',
parsed_info['_entry.id'])
header['resolution'] = 0.00
for res_key in ('_refine.ls_d_res_high', '_em_3d_reconstruction.resolution',
'_reflns.d_resolution_high'):
if res_key in parsed_info:
try:
raw_resolution = parsed_info[res_key][0]
header['resolution'] = float(raw_resolution)
except ValueError:
logging.debug('Invalid resolution format: %s', parsed_info[res_key])
return header
def mmcif_loop_to_list(prefix: str,
parsed_info: MmCIFDict) -> Sequence[Mapping[str, str]]:
"""Extracts loop associated with a prefix from mmCIF data as a list.
Reference for loop_ in mmCIF:
http://mmcif.wwpdb.org/docs/tutorials/mechanics/pdbx-mmcif-syntax.html
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a list of dicts; each dict represents 1 entry from an mmCIF loop.
"""
cols = []
data = []
for key, value in parsed_info.items():
if key.startswith(prefix):
cols.append(key)
data.append(value)
assert all([len(xs) == len(data[0]) for xs in data]), (
'mmCIF error: Not all loops are the same length: %s' % cols)
return [dict(zip(cols, xs)) for xs in zip(*data)]
def mmcif_loop_to_dict(prefix: str,
index: str,
parsed_info: MmCIFDict,
) -> Mapping[str, Mapping[str, str]]:
"""Extracts loop associated with a prefix from mmCIF data as a dictionary.
Args:
prefix: Prefix shared by each of the data items in the loop.
e.g. '_entity_poly_seq.', where the data items are _entity_poly_seq.num,
_entity_poly_seq.mon_id. Should include the trailing period.
index: Which item of loop data should serve as the key.
parsed_info: A dict of parsed mmCIF data, e.g. _mmcif_dict from a Biopython
parser.
Returns:
Returns a dict of dicts; each dict represents 1 entry from an mmCIF loop,
indexed by the index column.
"""
entries = mmcif_loop_to_list(prefix, parsed_info)
return {entry[index]: entry for entry in entries}
def get_release_date(parsed_info: MmCIFDict) -> str:
"""Returns the oldest revision date."""
revision_dates = parsed_info['_pdbx_audit_revision_history.revision_date']
return min(revision_dates)
def _get_protein_chains(
*, parsed_info: Mapping[str, Any]) -> Mapping[ChainId, Sequence[Monomer]]:
"""Extracts polymer information for protein chains only.
Args:
parsed_info: _mmcif_dict produced by the Biopython parser.
Returns:
A dict mapping mmcif chain id to a list of Monomers.
"""
# Get polymer information for each entity in the structure.
entity_poly_seqs = mmcif_loop_to_list('_entity_poly_seq.', parsed_info)
polymers = collections.defaultdict(list)
for entity_poly_seq in entity_poly_seqs:
polymers[entity_poly_seq['_entity_poly_seq.entity_id']].append(
Monomer(id=entity_poly_seq['_entity_poly_seq.mon_id'],
num=int(entity_poly_seq['_entity_poly_seq.num'])))
# Get chemical compositions. Will allow us to identify which of these polymers
# are proteins.
chem_comps = mmcif_loop_to_dict('_chem_comp.', '_chem_comp.id', parsed_info)
# Get chains information for each entity. Necessary so that we can return a
# dict keyed on chain id rather than entity.
struct_asyms = mmcif_loop_to_list('_struct_asym.', parsed_info)
entity_to_mmcif_chains = collections.defaultdict(list)
for struct_asym in struct_asyms:
chain_id = struct_asym['_struct_asym.id']
entity_id = struct_asym['_struct_asym.entity_id']
entity_to_mmcif_chains[entity_id].append(chain_id)
# Identify and return the valid protein chains.
valid_chains = {}
for entity_id, seq_info in polymers.items():
chain_ids = entity_to_mmcif_chains[entity_id]
# Reject polymers without any peptide-like components, such as DNA/RNA.
if any(['peptide' in chem_comps[monomer.id]['_chem_comp.type'].lower()
for monomer in seq_info]):
for chain_id in chain_ids:
valid_chains[chain_id] = seq_info
return valid_chains
def _is_set(data: str) -> bool:
"""Returns False if data is a special mmCIF character indicating 'unset'."""
return data not in ('.', '?')
@functools.lru_cache(16, typed=False)
def parse(*,
file_id: str,
mmcif_string: str,
catch_all_errors: bool = True) -> ParsingResult:
"""Entry point, parses an mmcif_string.
Args:
file_id: A string identifier for this file. Should be unique within the
collection of files being processed.
mmcif_string: Contents of an mmCIF file.
catch_all_errors: If True, all exceptions are caught and error messages are
returned as part of the ParsingResult. If False exceptions will be allowed
to propagate.
Returns:
A ParsingResult.
"""
errors = {}
try:
# PDB.MMCIFParser 类用于解析 mmCIF(macromolecular Crystallographic Information File)格式
# 的蛋白质结构数据。
parser = PDB.MMCIFParser(QUIET=True)
#print(type(parser))
#print(dir(parser))
# 在内存中创建一个类文件对象,该对象可以像文件一样进行读写操作,
# 但实际上是在内存中进行操作,而不是在磁盘上创建文件。
handle = io.StringIO(mmcif_string)
#print("handle value:",handle.getvalue())
# parser.get_structure(structure_id, file_path) 方法用于
# 从一个 PDB 文件中读取结构数据并创建一个 Structure 对象。
full_structure = parser.get_structure('', handle)
#print(type(full_structure))
#print("full_structure:",full_structure)
#for structure in next(full_structure.get_models()):
# print(type(structure))
# print(structure)
#model = full_structure.get_models()
#models = list(model)
#print("模型:")
#print(models)
#print("模型的数量:")
#print(len(models))
first_model_structure = _get_first_model(full_structure)
#print("first_model_structure:",first_model_structure)
# Extract the _mmcif_dict from the parser, which contains useful fields not
# reflected in the Biopython structure.
parsed_info = parser._mmcif_dict # pylint:disable=protected-access
#print("parsed_info:",parsed_info)
# Ensure all values are lists, even if singletons.
for key, value in parsed_info.items():
if not isinstance(value, list):
parsed_info[key] = [value]
header = _get_header(parsed_info) # 方法、日期、分辨率
#print(header)
# Determine the protein chains, and their start numbers according to the
# internal mmCIF numbering scheme (likely but not guaranteed to be 1).
valid_chains = _get_protein_chains(parsed_info=parsed_info)
# valid_chains 字典类型,键为链名,值为Monomer实例(氨基酸名称和编号)
# {'A': [Monomer(id='MET', num=1), Monomer(id='GLY', num=2),...],...}
if not valid_chains:
return ParsingResult(
None, {(file_id, ''): 'No protein chains found in this file.'})
# 每条链的起始氨基酸编号
seq_start_num = {chain_id: min([monomer.num for monomer in seq])
for chain_id, seq in valid_chains.items()}
# Loop over the atoms for which we have coordinates. Populate two mappings:
# -mmcif_to_author_chain_id (maps internal mmCIF chain ids to chain ids used
# the authors / Biopython).
# -seq_to_structure_mappings (maps idx into sequence to ResidueAtPosition).
mmcif_to_author_chain_id = {}
seq_to_structure_mappings = {}
for atom in _get_atom_site_list(parsed_info):
if atom.model_num != '1':
# We only process the first model at the moment.
continue
mmcif_to_author_chain_id[atom.mmcif_chain_id] = atom.author_chain_id
# HETATM 表示非标准的、异构的、或非生物分子的原子。
# ATOM 标记表示蛋白质或核酸原子
if atom.mmcif_chain_id in valid_chains: #蛋白质链中的原子
hetflag = ' '
if atom.hetatm_atom == 'HETATM':
# Water atoms are assigned a special hetflag of W in Biopython. We
# need to do the same, so that this hetflag can be used to fetch
# a residue from the Biopython structure by id.
if atom.residue_name in ('HOH', 'WAT'):
hetflag = 'W'
else:
hetflag = 'H_' + atom.residue_name
insertion_code = atom.insertion_code
if not _is_set(atom.insertion_code):
insertion_code = ' '
position = ResiduePosition(chain_id=atom.author_chain_id,
residue_number=int(atom.author_seq_num),
insertion_code=insertion_code)
# 重新索引(从0开始编码),蛋白质结构数据中每条链的起始氨基酸编码不总是从1开始
# 如 6->5, 1->0
#print("atom",atom)
# 查看AtomSite类, atom.mmcif_seq_num 为 _atom_site.label_seq_id
seq_idx = int(atom.mmcif_seq_num) - seq_start_num[atom.mmcif_chain_id]
#print("seq_start_num[atom.mmcif_chain_id]",seq_start_num[atom.mmcif_chain_id])
#print(int(atom.mmcif_seq_num))
#print(seq_idx)
current = seq_to_structure_mappings.get(atom.author_chain_id, {})
current[seq_idx] = ResidueAtPosition(position=position,
name=atom.residue_name,
is_missing=False,
hetflag=hetflag)
seq_to_structure_mappings[atom.author_chain_id] = current
#print("seq_to_structure_mappings",seq_to_structure_mappings)
# {'A': {5: ResidueAtPosition(position=ResiduePosition(chain_id='A', residue_number=13, insertion_code=' '), name='ARG', is_missing=False, hetflag=' '),
# 6: ResidueAtPosition(position=ResiduePosition(chain_id='A', residue_number=14, insertion_code=' '), name='ASN', is_missing=False, hetflag=' '),
# ...}, ...}
# Add missing residue information to seq_to_structure_mappings.
for chain_id, seq_info in valid_chains.items():
# 包括不在结构中的氨基酸
# print(seq_info)
# break
author_chain = mmcif_to_author_chain_id[chain_id]
current_mapping = seq_to_structure_mappings[author_chain]
#print("current_mapping:",current_mapping)
for idx, monomer in enumerate(seq_info):
#print ("idx:",idx)
#print (monomer)
if idx not in current_mapping:
current_mapping[idx] = ResidueAtPosition(position=None,
name=monomer.id,
is_missing=True,
hetflag=' ')
#print(current_mapping[idx])
#print("----")
# seq_to_structure_mappings[author_chain] = current_mapping ## 为什么不加这一句也可以??
# print("seq_to_structure_mappings['A'][0]:",seq_to_structure_mappings['A'][0])
author_chain_to_sequence = {}
for chain_id, seq_info in valid_chains.items():
author_chain = mmcif_to_author_chain_id[chain_id]
seq = []
for monomer in seq_info:
code = SCOPData.protein_letters_3to1.get(monomer.id, 'X')
seq.append(code if len(code) == 1 else 'X')
seq = ''.join(seq)
#print("seq:", seq)
author_chain_to_sequence[author_chain] = seq
#print(author_chain_to_sequence)
#print("mmcif_object construction")
#print("file_id:", file_id)
#print("header:", header)
#print("first_model_structure:", first_model_structure)
#print("author_chain_to_sequence:", author_chain_to_sequence)
# print("seq_to_structure_mappings:", seq_to_structure_mappings)
#print("parsed_info:", parsed_info)
mmcif_object = MmcifObject(file_id=file_id,
header=header,
structure=first_model_structure,
chain_to_seqres=author_chain_to_sequence,
seqres_to_structure=seq_to_structure_mappings,
raw_string=parsed_info)
# print("mmcif_object:",mmcif_object)
return ParsingResult(mmcif_object=mmcif_object, errors=errors)
except Exception as e: # pylint:disable=broad-except
errors[(file_id, '')] = e
if not catch_all_errors:
raise
return ParsingResult(mmcif_object=None, errors=errors)
### 从 https://www.rcsb.org 下载好8jlp.cif 文件
with open("8jlp.cif") as f:
mmcif_str = f.read()
result = parse(file_id = '8jlp.cif', mmcif_string = mmcif_str)
print(result)