单个样本分析接上一篇:Python图文复现2022||02-数据介绍与下载,结合视频观看效果更佳~
视频相关代码如下:
- 视频如下:https://www.youtube.com/watch?v=uvyG9yLuNSE
- 代码:https://github.com/mousepixels/sanbomics_scripts
- 主代码:https://github.com/mousepixels/sanbomics_scripts/blob/main/single_cell_analysis_complete_class.ipynb
读取数据
先定义一个函数,批量运行多个样本:这里一定要注意缩进问题
def pp(csv_path):
    """Load one sample, remove predicted doublets and apply per-cell QC filters.

    The function makes two passes over the same CSV:

    1. A reduced matrix (genes seen in >= 10 cells, then the top 2000 highly
       variable genes) is used to train scVI and SOLO and to predict doublets.
    2. The full matrix is read again, cells called as doublets are dropped,
       and the remaining cells are filtered on gene count, mitochondrial
       percentage and ribosomal percentage.

    Relies on the module-level names ``sc``, ``scvi``, ``np`` and
    ``ribo_genes``. ``ribo_genes[0]`` is expected to hold ribosomal gene
    symbols; it is defined outside this function, so make sure it is loaded
    before calling.

    Args:
        csv_path: Path to a count CSV named like
            ``raw_counts/GSM5226574_C51ctr_raw_counts.csv``. The second
            underscore-separated field of the *file name* (``C51ctr``) becomes
            the sample label.

    Returns:
        An AnnData object (a real copy, not a view) holding the surviving
        cells. ``obs`` carries ``Sample``, ``doublet`` and the QC metrics
        added by ``sc.pp.calculate_qc_metrics``.
    """
    import os  # local import: keeps the function usable regardless of cell order

    # ---- Pass 1: doublet prediction on a reduced matrix --------------------
    adata = sc.read_csv(csv_path).T  # transpose the matrix as stored on disk
    sc.pp.filter_genes(adata, min_cells=10)
    sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True, flavor='seurat_v3')
    scvi.model.SCVI.setup_anndata(adata)
    vae = scvi.model.SCVI(adata)
    vae.train()
    solo = scvi.external.SOLO.from_scvi_model(vae)
    solo.train()

    df = solo.predict()
    df['prediction'] = solo.predict(soft=False)
    # NOTE(review): drops the last two characters of every index label,
    # presumably a suffix such as "-0" appended by SOLO -- confirm against the
    # installed scvi-tools version, otherwise real barcode characters are cut.
    df.index = df.index.map(lambda x: x[:-2])
    # Only keep confident calls: predicted doublet AND doublet score exceeding
    # the singlet score by more than 1.
    df['dif'] = df.doublet - df.singlet
    doublets = df[(df.prediction == 'doublet') & (df.dif > 1)]

    # ---- Pass 2: QC on the full matrix -------------------------------------
    # Re-read the file: the object above was subset in place to 2000 genes.
    adata = sc.read_csv(csv_path).T
    # Take the label from the file name only, so underscores in directory
    # names cannot shift the field index.
    adata.obs['Sample'] = os.path.basename(csv_path).split('_')[1]
    adata.obs['doublet'] = adata.obs.index.isin(doublets.index)
    # .copy() turns the view into a real object before it is modified below.
    adata = adata[~adata.obs.doublet].copy()

    sc.pp.filter_cells(adata, min_genes=200)  # get rid of cells with fewer than 200 genes
    # Case-insensitive match: human symbols are "MT-...", mouse are "mt-...".
    # The previous lower-case-only test matched nothing on human data, which
    # left pct_counts_mt at 0 and disabled the mitochondrial filter.
    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
    adata.var['ribo'] = adata.var_names.isin(ribo_genes[0].values)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt', 'ribo'], percent_top=None, log1p=False, inplace=True)

    # Drop the top 2% of cells by detected genes, plus cells with >= 20%
    # mitochondrial or >= 2% ribosomal counts. The metrics are not recomputed
    # between filters, so one combined mask is equivalent to filtering in turn.
    upper_lim = np.quantile(adata.obs.n_genes_by_counts.values, .98)
    keep = (
        (adata.obs.n_genes_by_counts < upper_lim)
        & (adata.obs.pct_counts_mt < 20)
        & (adata.obs.pct_counts_ribo < 2)
    )
    return adata[keep].copy()
这个过程比较久,会依次读取GSE171524数据集中的26例样本,并进行上面的函数里面定义的分析。
这个过程中就可以跑去听听视频了。
import os

# NOTE: change `dir` to your own data directory (keep the trailing slash,
# paths below are built by plain string concatenation).
dir = '/path/data/GSE171524/'
raw_dir = dir + 'raw_counts/'

# Only count matrices belong in the batch: skip stray entries such as hidden
# files or notebook checkpoints, which would make pp() fail after a long run.
# Sorting makes the sample order (and hence the cell order) reproducible.
csv_files = sorted(
    name for name in os.listdir(raw_dir)
    if name.endswith(('.csv', '.csv.gz'))
)

# Run the per-sample preprocessing; each call trains two models, so this is slow.
out = [pp(raw_dir + file) for file in csv_files]
将多个样本连接合并在一起,合并后共有105264个细胞:
# Stack the per-sample AnnData objects collected in `out` into a single object.
adata = sc.concat(out)
# Bare expression: in a notebook this echoes the object summary shown below.
adata
AnnData object with n_obs × n_vars = 105264 × 29236
obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
var: 'n_cells'
过滤并保存:
# Drop genes that are found in fewer than 10 cells of the combined object.
sc.pp.filter_genes(adata, min_cells = 10)
from scipy.sparse import csr_matrix
# Store X as a SciPy CSR sparse matrix before writing it to disk.
adata.X = csr_matrix(adata.X)
# Bare expression: echoes the matrix summary in a notebook.
adata.X
# Write the combined object into the data directory.
adata.write_h5ad(dir+'combined.h5ad')
预处理
先读取上次保存的数据:共105264个细胞
import scanpy as sc
import scvi
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# NOTE: change `dir` to your own data directory
dir = '/path/data/GSE171524/'
# Reload the combined object that was written to combined.h5ad.
adata = sc.read_h5ad(dir+'combined.h5ad')
# Bare expression: echoes the object summary in a notebook.
adata
AnnData object with n_obs × n_vars = 105264 × 29236
obs: 'Sample', 'doublet', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo'
var: 'n_cells'
每个样本中的各种指标统计:
# Group the per-cell metadata by sample and count the non-null entries of each column.
adata.obs.groupby('Sample').count()
共26个样本:
低表达过滤以及数据标准化
# Drop genes that are found in fewer than 100 cells.
sc.pp.filter_genes(adata, min_cells = 100)
# Keep a copy of the matrix before normalisation; it is handed to scVI later via layer="counts".
adata.layers['counts'] = adata.X.copy()
# Normalise each cell to a total of 1e4, then apply log1p.
sc.pp.normalize_total(adata, target_sum = 1e4)
sc.pp.log1p(adata)
# Snapshot the normalised, log-transformed object in .raw.
adata.raw = adata
# Inspect the per-cell metrics
adata.obs.head()
结果图:
去除Sample间的批次效应以及聚类
这个过程运行时间也会有丢丢长
# Register the stored count layer with scVI. The sample is passed as a
# categorical covariate, and sequencing depth plus the mitochondrial and
# ribosomal percentages as continuous covariates.
scvi.model.SCVI.setup_anndata(
    adata,
    layer="counts",
    categorical_covariate_keys=["Sample"],
    continuous_covariate_keys=['pct_counts_mt', 'total_counts', 'pct_counts_ribo'],
)
model = scvi.model.SCVI(adata)
# may take a while without GPU
model.train()

# Store the model outputs on the object: the latent space for clustering and
# the model-normalised expression for plotting.
adata.obsm['X_scVI'] = model.get_latent_representation()
adata.layers['scvi_normalized'] = model.get_normalized_expression(library_size=1e4)

# Neighbour graph, UMAP and Leiden clustering are all computed on the scVI latent space.
sc.pp.neighbors(adata, use_rep='X_scVI')
sc.tl.umap(adata)
sc.tl.leiden(adata, resolution=0.5)

# show=False keeps the figure open. Without it scanpy shows (and the notebook
# backend closes) the figure, and the savefig below writes a blank image.
sc.pl.umap(adata, color=['leiden', 'Sample'], frameon=False, show=False)
plt.savefig(dir+"01-intergration_umap.png")
结果图:
保存数据:
# Write the integrated object (scVI latent space, UMAP and Leiden labels included) to disk.
adata.write_h5ad(dir + 'integrated.h5ad')
差异表达分析
使用没有经过批次校正的数据做差异表达分析
# Re-cluster at a higher resolution; this replaces the earlier 'leiden' labels.
sc.tl.leiden(adata, resolution=1)
# Rank marker genes for every Leiden cluster.
sc.tl.rank_genes_groups(adata, 'leiden')

# Plot the top 20 genes per cluster.
# show=False keeps the figure open. Without it scanpy shows (and the notebook
# backend closes) the figure, and the savefig below writes a blank image.
sc.pl.rank_genes_groups(adata, n_genes=20, sharey=False, show=False)
plt.savefig(dir+"02-intergration_rank_genes_groups.png", dpi=300)
部分cluster的top基因:
# Collect the ranked-gene results of all clusters into one table.
markers = sc.get.rank_genes_groups_df(adata, None)

# Keep genes that are both significant and up-regulated in their cluster.
is_significant = markers['pvals_adj'] < 0.05
is_upregulated = markers['logfoldchanges'] > 0.5
markers = markers.loc[is_significant & is_upregulated]
markers

# Save the filtered table
markers.to_csv(dir + 'markers.csv')
差异结果:
使用模型标准化后的值做差异表达分析:
# Differential expression from the trained scVI model, grouped by Leiden cluster.
markers_scvi = model.differential_expression(groupby='leiden')

# Apply the thresholds: flagged as DE at FDR 0.05 and mean log fold change above 0.5.
passes_fdr = markers_scvi['is_de_fdr_0.05']
passes_lfc = markers_scvi['lfc_mean'] > 0.5
markers_scvi = markers_scvi.loc[passes_fdr & passes_lfc]
markers_scvi

# Save the filtered table
markers_scvi.to_csv(dir + 'scvi_markers.csv')
结果:
重新聚类后的结果可视化:总共得到34个cluster
# UMAP coloured by cluster, with the cluster labels drawn on the plot.
# show=False keeps the figure open. Without it scanpy shows (and the notebook
# backend closes) the figure, and the savefig below writes a blank image.
sc.pl.umap(adata, color=['leiden'], frameon=False, legend_loc="on data", show=False)
plt.savefig(dir+"02-intergration_umap_1.png")

# How many clusters are there?
adata.obs['leiden']
#Name: leiden, Length: 105264, dtype: category
#Categories (34, object): ['0', '1', '2', '3', ..., '30', '31', '32', '33']
结果图:
细胞类型注释
文章中进行了三次注释,第一次注释大类,主要为9个类:
这几个类的基因文章中没有提供,就用我们自己收集的基因来进行注释好了。
在视频资源中,视频speaker老师给了一张图:是目前免疫细胞分类很详细的一张图了:
https://learn.cellsignal.com/hubfs/landing-pages/2019/18-IMM-18284/18-IMM_18284-Human%20Markers%20PWHO-digital.pdf
基因可视化:
# Canonical marker genes per broad lineage, used as a lookup when colouring
# the UMAP. The list has its own name so that it does not overwrite
# `markers`, the differential-expression table that is queried below and
# stored on the object later.
marker_genes = [
    'EPCAM', 'KRT8', 'KRT18', 'KRT19',  # epithelial cells
    'CD68', 'CTSS', 'FCN1', 'CD163',  # myeloid cells
    'ACTA2', 'DCN', 'ACTB',  # fibroblasts
    'PECAM1', 'CD34', 'VWF',  # endothelial cells
    'PTPRC', 'CD3D', 'CD3E', 'CD3G', 'CD8A', 'CD4',  # T and natural killer (NK) lymphocytes
    'CD79A', 'MS4A1', 'CD19',  # B lymphocytes and plasma cells
    'SNAP25', 'SYT1',  # neuronal cells
    'CPA3',  # mast cells
    'CST3', 'LAMP3', 'HLA-DQB2', 'HLA-DPB1', 'BIRC3',  # antigen-presenting cells (APCs; primarily dendritic cells)
    'MKI67', 'TOP2A',  # Cycling
    'HBB', 'HBD',  # Erythroid
]

# Look up one gene, or all marker rows of one cluster, in the DE table.
markers[markers.names=="HBB"]
markers[markers.group=="30"]

# Plot the model-normalised expression of one gene; vmax caps the colour
# scale and can be tuned to make the signal stand out.
# show=False keeps the figure open. Without it scanpy shows (and the notebook
# backend closes) the figure, and the savefig below writes a blank image.
sc.pl.umap(adata, color=["HBB"], frameon=False, layer='scvi_normalized', vmax=4, show=False)
plt.savefig(dir+"03-intergration_umap_Erythroid_1.png")
以marker为基础,绘制如下类似图,vmax参数可以进行调整让结果看起来更明显,注释不出来的可以看看每个cluster高表达的基因,查一查功能就立马可以推断出来了:
结合marker表达以及cluster特异性高表达基因:
细胞注释如下:
# Print an empty mapping template with one entry per existing cluster.
# Iterating the actual categories avoids the extra, non-existent key that a
# hard-coded range produced.
for x in adata.obs['leiden'].cat.categories:
    print(f'"{x}":"", ')

# Manual annotation: Leiden cluster label -> cell type.
cell_type = {
    "0": "T Cell",
    "1": "Epithelial Cell",
    "2": "Myeloid Cell",
    "3": "Fibroblast",
    "4": "Myeloid Cell",
    "5": "T Cell",
    "6": "Myeloid Cell",
    "7": "Epithelial Cell",
    "8": "Endothelial",
    "9": "Fibroblast",
    "10": "Myeloid Cell",
    "11": "pDC",
    "12": "Epithelial Cell",
    "13": "Fibroblast",
    "14": "Fibroblast",
    "15": "Epithelial Cell",
    "16": "Epithelial Cell",
    "17": "Myeloid Cell",
    "18": "Epithelial Cell",
    "19": "Epithelial Cell",
    "20": "Neuronal Cell",
    "21": "Epithelial Cell",
    "22": "B Cell",
    "23": "Mast Cell",
    "24": "T Cell",
    "25": "pDC",
    "26": "Myeloid Cell",
    "27": "Endothelial",
    "28": "Fibroblast",
    "29": "Myeloid Cell",
    "30": "Erythroid",
    "31": "B Cell",
    "32": "Neuronal Cell",
    "33": "Myeloid Cell",
}

# Clustering can yield a different number of clusters on another run; any
# cluster missing from the dict would silently become NaN in the mapping.
unmapped = sorted(set(adata.obs['leiden'].cat.categories) - set(cell_type))
if unmapped:
    print(f"Warning: clusters without a cell-type label: {unmapped}")

adata.obs['cell type'] = adata.obs.leiden.map(cell_type)

# show=False keeps the figure open so that tight_layout and savefig act on
# the UMAP rather than on a fresh, empty figure.
sc.pl.umap(adata, color=['cell type'], frameon=False, legend_loc="on data", show=False)
plt.tight_layout()
plt.savefig(dir+"04-intergration_umap_anno.png", dpi=300)
adata

# Keep the marker tables together with the annotated object.
adata.uns['scvi_markers'] = markers_scvi
adata.uns['markers'] = markers

# Save the annotated object and the trained model
adata.write_h5ad(dir + 'integrated_anno.h5ad')
model.save(dir + 'model.model')
细胞注释结果如下:
这里注释出来了一群文献中没有的红细胞(Erythroid)。
下次进行详细注释~~~