在scanpy对数据进行聚类操作后,往往需要对簇进行细胞类型标注。本文采取了一种暴力搜索的方法,通过将簇本身的差异基因作为marker基因,和现有的marker基因进行比对,确定簇的细胞类型。(本质上就是数组比对)
代码如下:
# 载入包
import pandas as pd
import scanpy as sc
def make_list(lenth, cluster_num):
    """Build a single-column DataFrame of cycling cluster indices.

    Args:
        lenth: Number of rows to generate. (The spelling is kept so that
            existing keyword callers keep working.)
        cluster_num: Number of clusters; values cycle through
            ``0 .. cluster_num - 1``.

    Returns:
        A ``pandas.DataFrame`` with one column whose i-th value is
        ``i % cluster_num``.
    """
    return pd.DataFrame([i % cluster_num for i in range(lenth)])
def to_matrix(dic):
    """Flatten a ``rank_genes_groups``-style result into one long table.

    Each entry of ``dic`` named below is converted to a DataFrame (one column
    per cluster) and flattened row by row, so consecutive output rows cycle
    through the clusters.

    Args:
        dic: Mapping with the keys ``names``, ``pvals``, ``logfoldchanges``,
            ``pvals_adj`` and ``scores``; each value must be accepted by
            ``pandas.DataFrame`` and all must have the same 2-D shape.

    Returns:
        A DataFrame with the columns ``names``, ``pvals``,
        ``logfoldchanges``, ``pvals_adj``, ``scores`` and ``cluster``, where
        ``cluster`` is the positional (0-based) column index the row came
        from.
    """
    fields = ['names', 'pvals', 'logfoldchanges', 'pvals_adj', 'scores']
    n_rows, cluster_num = pd.DataFrame(dic['names']).values.shape

    # Row-major flatten: values of cluster 0, 1, ..., k-1 for rank 0, then
    # the same for rank 1, and so on.
    parts = [
        pd.DataFrame(pd.DataFrame(dic[field]).values.reshape(-1))
        for field in fields
    ]
    # The cluster index therefore repeats 0..k-1 once per original row.
    parts.append(pd.DataFrame(list(range(cluster_num)) * n_rows))

    print(tuple(part.shape for part in parts))
    matrix = pd.concat(parts, axis=1)
    matrix.columns = fields + ['cluster']
    print('matrix shape:' + f'{matrix.shape}')
    return matrix
# Input: the marker table after QC filtering
def identify_cell(df_c, dic_m):
    """Label each cluster with the best-matching cell type from known markers.

    For every cluster, each candidate cell type gets two numbers:

    * match ratio - the fraction of its marker genes found among the
      cluster's genes;
    * mean score - the mean of the matched genes' log fold changes after
      min-max normalisation within the cluster.

    The cell type with the highest match ratio wins; the mean score breaks
    ties. The outcome does not depend on the order of ``dic_m``.

    Args:
        df_c: DataFrame with at least the columns ``names``,
            ``logfoldchanges`` and ``cluster``. Cluster ids are compared
            against ``0 .. k-1`` where ``k`` is the number of distinct ids.
        dic_m: Mapping of cell-type name -> list of marker gene names.

    Returns:
        A list with one cell-type name per cluster index; ``'unknown'`` for
        clusters where no marker gene matched. Empty input gives ``[]``.

    NOTE(review): if upstream filtering drops every row of some cluster, the
    remaining ids are no longer ``0 .. k-1`` and the highest-numbered clusters
    are not examined - confirm the caller's filtering keeps all clusters.
    """
    # Number of clusters
    cluster_num = len(set(df_c['cluster'].values))
    cell_types = list(dic_m.keys())
    # Final cell type per cluster
    cell_type_list = ['unknown'] * cluster_num
    # Per cluster, per cell type: [cell_type, match ratio, mean normalised
    # log fold change of the matched genes]
    match_scores = [
        [[cell_type, 0.0, 0] for cell_type in cell_types]
        for _ in range(cluster_num)
    ]

    for cluster_index in range(cluster_num):
        # Work on a copy so the normalisation never writes into df_c.
        cluster = df_c[df_c['cluster'] == cluster_index].copy()

        # Min-max normalise the fold changes within the cluster. A zero (or
        # undefined) span would give NaN, so fall back to 0.0 in that case.
        lfc = cluster['logfoldchanges']
        lfc_min = lfc.min()
        span = lfc.max() - lfc_min
        if span > 0:
            cluster['logfoldchanges'] = (lfc - lfc_min) / span
        else:
            cluster['logfoldchanges'] = 0.0

        # gene -> normalised fold change; also gives O(1) membership tests.
        lfc_by_gene = dict(zip(cluster['names'], cluster['logfoldchanges']))

        # Score every candidate cell type against this cluster.
        for cell_type_index, cell_type in enumerate(cell_types):
            print(cluster_index, cell_type_index, cell_type)
            entry = match_scores[cluster_index][cell_type_index]
            print(entry)
            cell_type_genes = dic_m[cell_type]
            match_g = [g for g in cell_type_genes if g in lfc_by_gene]
            if not match_g:
                continue
            entry[1] = len(match_g) / len(cell_type_genes)
            scores = [float(lfc_by_gene[g]) for g in match_g]
            print(f'scores_list:{scores}')
            scores = sum(scores) / len(scores)
            print(f'scores:{scores}')
            entry[2] = scores

        # Pick the winner: highest match ratio, then highest mean score.
        best_ratio = 0.0
        best_score = 0.0
        for name, ratio, score in match_scores[cluster_index]:
            print(name)
            if ratio > best_ratio or (ratio == best_ratio and score > best_score):
                best_ratio, best_score = ratio, score
                cell_type_list[cluster_index] = name

    print(match_scores)
    print(len(match_scores), len(match_scores[0]) if match_scores else 0)
    print(cell_type_list)
    return cell_type_list
if __name__ == '__main__':
    # Load the AnnData file; adata.uns['rank_genes_groups'] is read below,
    # so the differential-expression ranking must already be stored in it.
    adata = sc.read_h5ad(
        '../../project/jupyter_project/logs/UM22_713.h5ad')

    # Flatten the ranking result into one long per-gene table.
    s_matrix = to_matrix(adata.uns['rank_genes_groups'])
    print(s_matrix, '\n', s_matrix.shape)
    # To save the table separately:
    #   s_matrix.to_csv(r'logs/matrix.csv', header=False, index=False)
    dfc = s_matrix

    # Prior marker genes per cell type - placeholder values, fill in your own.
    dfm = {'cells type1': ['gene1', 'gene2'],
           'cells type2': ['gene1', 'gene3'],}

    # Keep only genes with log fold change >= 0.25 before matching.
    cluster_cell_types = identify_cell(dfc[dfc.logfoldchanges >= 0.25], dfm)
这里的前置条件是需要先调用
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
计算簇的marker基因
注意
dfm是作为先验的marker基因字典,需要大家自己填充。
s_matrix是由to_matrix生成的各簇marker基因表格(每行一个基因,包含names、pvals、logfoldchanges、pvals_adj、scores以及所属簇的编号cluster),可以单独保存。
本文的上游分析流程参考了这篇博主的文章:传送门
具体工作是在“根据已知的细胞标记,注释细胞类型”这一步中,计算得出了new_cluster_names = [
'CD4 T', 'CD14 Monocytes',
'B', 'CD8 T',
'NK', 'FCGR3A Monocytes',
'Dendritic', 'Megakaryocytes']
即用于标注的细胞类型标签。