AnnData ,Annotated Data,是一种类似矩阵的数据,用于单细胞分析,已成为 Python 中该领域的标准数据结构。
AnnData 基于键的存储方式用起来简单,便于可重复性分析,转换为 R 语言的单细胞分析数据结构也十分容易。
假设我们有 n n n 个观测(observations),每个观测可表示为 d d d 维向量,每个维度对应一个变量或特征(variable)。
这个 n × d n \times d n×d 矩阵特殊在于带索引。
pip install anndata
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32) # 泊松分布构建稀疏矩阵
adata = ad.AnnData(counts) # 初始化AnnData
print(adata) # 基本统计信息
# AnnData object with n_obs × n_vars = 100 × 2000
print(adata.X) # 访问稀疏矩阵的数据
# (0, 1) 1.0
# (0, 3) 1.0
# (0, 4) 1.0
# ......
adata.obs_names = [f'Cell_{i:d}' for i in range(adata.n_obs)]
adata.var_names = [f'Gene_{i:d}' for i in range(adata.n_vars)]
print(adata.obs_names[:5]) # 输出前五个观测名,即X轴
print(adata.var_names[:5]) # 输出前五个特征名,即Y轴
# Index(['Cell_0', 'Cell_1', 'Cell_2', 'Cell_3', 'Cell_4'], dtype='object')
# Index(['Gene_0', 'Gene_1', 'Gene_2', 'Gene_3', 'Gene_4'], dtype='object')
print(adata[['Cell_1', 'Cell_10'], ['Gene_5', 'Gene_1900']]) # 构建子集
# View of AnnData object with n_obs × n_vars = 2 × 2
adata.obs
和 adata.var
都是 Pandas DataFramespandas.Categorical()
:表示经典 R / S-plus 方式的分类变量,可以降低数据存储提升计算速度import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
ct = np.random.choice(['B', 'T', 'Monocyte'], size=(adata.n_obs,)) # 随机生成cell_type
adata.obs['cell_type'] = pd.Categorical(ct) # 为了效率,用pd.Categorical()
print(adata.obs)
# cell_type
# 0 B
# 1 Monocyte
# 2 T
# ......
# [100 rows x 1 columns]
print(adata)
# AnnData object with n_obs × n_vars = 100 × 2000
# obs: 'cell_type'
obsm
和 varm
添加元数据.obs
容易绘制import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
ct = np.random.choice(['B', 'T', 'Monocyte'], size=(adata.n_obs,)) # 随机生成cell_type
adata.obs['cell_type'] = pd.Categorical(ct) # 为了效率,用pd.Categorical()
adata.obsm['X_umap'] = np.random.normal(0, 1, size=(adata.n_obs, 2))
adata.varm['gene_stuff'] = np.random.normal(0, 1, size=(adata.n_vars, 5))
print(adata.obsm)
# AxisArrays with keys: X_umap
print(adata)
# AnnData object with n_obs × n_vars = 100 × 2000
# obs: 'cell_type'
# obsm: 'X_umap'
# varm: 'gene_stuff'
.uns
可存储非结构化元数据,可以是任意数据。import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.uns['random'] = [1, 2, 3]
print(adata.uns)
# OverloadedDict, wrapping:
# OrderedDict([('random', [1, 2, 3])])
# With overloaded keys:
# ['neighbors'].
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.layers['log_transformed'] = np.log1p(adata.X)
print(adata)
# AnnData object with n_obs × n_vars = 100 × 2000
# layers: 'log_transformed'
import numpy as np
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.obs_names = [f'Cell_{i:d}' for i in range(adata.n_obs)]
adata.var_names = [f'Gene_{i:d}' for i in range(adata.n_vars)]
adata.layers['log_transformed'] = np.log1p(adata.X)
print(adata.to_df(layer='log_transformed')) # 可以看到保留了索引
AnnData
对应文件格式为 h5ad
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.obs_names = [f'Cell_{i:d}' for i in range(adata.n_obs)]
adata.var_names = [f'Gene_{i:d}' for i in range(adata.n_vars)]
ct = np.random.choice(['B', 'T', 'Monocyte'], size=(adata.n_obs,))
adata.obs['cell_type'] = pd.Categorical(ct)
adata.obsm['X_umap'] = np.random.normal(0, 1, size=(adata.n_obs, 2))
adata.varm['gene_stuff'] = np.random.normal(0, 1, size=(adata.n_vars, 5))
adata.uns['random'] = [1, 2, 3]
adata.layers['log_transformed'] = np.log1p(adata.X)
adata.write('my_results.h5ad', compression='gzip')
在 Linux 下执行命令:h5ls my_results.h5ad
X Group
layers Group
obs Group
obsm Group
obsp Group
uns Group
var Group
varm Group
varp Group
.copy()
获取真实 AnnData 对象。但这样做没什么必要,因为调用 .[]
都会在内部调用 .copy()
[]
,整型索引类似 pandas 的 .iloc
,字符串索引类似 .loc
import numpy as np
import pandas as pd
import anndata as ad
from scipy.sparse import csr_matrix
counts = csr_matrix(np.random.poisson(1, size=(100, 2000)), dtype=np.float32)
adata = ad.AnnData(counts)
adata.obs_names = [f'Cell_{i:d}' for i in range(adata.n_obs)]
adata.var_names = [f'Gene_{i:d}' for i in range(adata.n_vars)]
obs_meta = pd.DataFrame({
'time_yr': np.random.choice([0, 2, 4, 8], adata.n_obs),
'subject_id': np.random.choice(['subject 1', 'subject 2', 'subject 4', 'subject 8'], adata.n_obs),
'instrument_type': np.random.choice(['type a', 'type b'], adata.n_obs),
'site': np.random.choice(['site x', 'site y'], adata.n_obs),
},
index=adata.obs.index, # 同观测的索引
)
adata = ad.AnnData(adata.X, obs=obs_meta, var=adata.var) # 构建新的AnnData
print(adata)
# AnnData object with n_obs × n_vars = 100 × 2000
# obs: 'time_yr', 'subject_id', 'instrument_type', 'site'
print(adata[:5, ['Gene_1', 'Gene_3']]) # 生成的是视图
# View of AnnData object with n_obs × n_vars = 5 × 2
# obs: 'time_yr', 'subject_id', 'instrument_type', 'site'
adata_subset = adata[:5, ['Gene_1', 'Gene_3']].copy() # 全新AnnData
print(adata[:3, 'Gene_1'].X.toarray().tolist())
adata[:3, 'Gene_1'].X = [0, 0, 0] # 视图也可以设值
print(adata[:3, 'Gene_1'].X.toarray().tolist())
# [[0.0], [0.0], [0.0]]
adata_subset = adata[:3, ['Gene_1', 'Gene_2']] # 访问视图的某些部分,内容会被自动复制,并生成一个数据存储对象
print(adata_subset)
# View of AnnData object with n_obs × n_vars = 3 × 2
# obs: 'time_yr', 'subject_id', 'instrument_type', 'site'
adata_subset.obs['foo'] = range(3) # 现在adata_subset不再是adata的引用
print(adata_subset)
# AnnData object with n_obs × n_vars = 3 × 2
# obs: 'time_yr', 'subject_id', 'instrument_type', 'site', 'foo'
print(adata[adata.obs.time_yr.isin([2, 4])].obs.head())
如果 h5ad
文件非常大,可以使用 backed
模式部分读入内存
import anndata as ad
adata = ad.read('my_results.h5ad', backed='r')
print(adata.isbacked)
# True
print(adata.filename)
# my_results.h5ad
adata.file.close()
功能:以指定格式打印数据集信息。
用法:h5ls [OPTIONS] file [OBJECTS...]
参数
简写 | 全写 | 功能 |
---|---|---|
-a | –address | 打印原始数据地址,必须和 -v 或 --verbose 一起使用 |
-d | –data | 打印数据集的值 |
–enable-error-stack | 打印错误栈 | |
–follow-symlinks | 使用符号链接显示目标对象信息 | |
–no-dangling-links | 检查不解析为现有对象的符号链接,必须和 --follow-symlinks 一起使用 | |
-f | –full | 打印完整路径名称 |
-g | –group | 显示组的信息 |
-l | –label | 标记复合数据集的成员 |
-r | –recursive | 递归列出所有组 |
-s | –string | 以 ASCII 格式打印 1 字节数据集 |
-S | –simple | 使用机器可读的输出格式 |
-wN | –width=N | 设置输出的列数 |
-v | –verbose | 生成更多信息 |
-V | –version | 打印版本号 |
–vfd=DRIVER | 指定虚拟文件驱动 | |
-x | –hexdump | 以十六进制格式显示原始数据 |
OBJECTS | 每个对象由一个HDF5文件名(可选地,后面跟着一个斜杠)和文件中的一个对象名称组成(如果文件中没有指定对象,则显示根组的内容)。 |
常用命令
h5ls x.h5ad # 查看基础信息
h5ls -d x.h5ad/obs/louvain # 查看/obs/louvain的数据
h5ls -d x.h5ad/obs/__categories/louvain # 查看/obs/louvain的数据
h5ls -d -S x.h5ad/obs/louvain | sort | uniq # 对/obs/louvain的数据去重
h5ls -d -S x.h5ad/obs/louvain | sort | uniq -c # 对/obs/louvain的数据去重并显示出现次数
功能:显示 HDF5 内容。
用法:h5dump [OPTIONS] file
参数
简写 | 全写 | 功能 |
---|---|---|
-h | –help | 帮助文档 |
-B | –bootblock | Print the content of the boot block. (This option is not yet implemented.) |
-H | –header | 标题 |
-A | 属性头和属性值 | |
-i | –object-ids | 对象ids(貌似是全部打印) |
-r | –string | Print 1-bytes integer datasets as ASCII. |
-V | –version | 版本号 |
-a P | –attribute=P | 指定属性 |
-d P | –dataset=P | 指定数据集 |
-f D | –filedriver=D | Specify which driver to open the file with. |
-g P | –group=P | 指定组 |
-l P | –soft-link=P | Print the value(s) of the specified soft link. |
-o F | –output=F | Output raw data into file F. |
-t T | –datatype=T | Print the specified named datatype. |
-w N | –width=N | 输出的列数 |
-x | –xml | Output XML using XML schema (default) instead of DDL. |
-u | –use-dtd | Output XML using XML DTD instead of DDL. |
-D U | –xml-dtd=U | In XML output, refer to the DTD or schema at U instead of the default schema/DTD. |
-X S | –xml-dns=S | In XML output, (XML Schema) use qualified names in the XML: “:”: no namespace, default: “hdf5:” |
-s L | –start=L | 子集选择开始的偏移量 |
-S L | –stride=L | Hyperslab stride. Default: 1 in all dimensions. |
-c L | –count=L | 块的数量 |
-k L | –block=L | Size of block in hyperslab. Default: 1 in all dimensions. |
– | Indicate that all following arguments are non-options. E.g., to dump a file called `-f’, use h5dump – -f. | |
file | The file to be examined. |
常用命令
h5dump -H x.h5ad # 打印标题
h5dump -g obs x.h5ad # 打印Group为obs的数据
h5dump -g /obs/__categories x.h5ad # 打印Group为obs的数据
h5dump -d X -c "10,10" x.h5ad # 打印Dataset为X的(10, 10)数据
h5dump -d /obs/louvain x.h5ad # 打印/obs/louvain的数据
h5dump -d /obs/louvain -c 100 x.h5ad # 打印/obs/louvain的数据
h5dump -a /obs/_index x.h5ad # 打印属性
功能:比较
HDFView
X[0]
是第一个细胞的所有基因的表达量,X[:, 0]
是第一个基因在左右细胞的表达量