scanpy细胞类型标注(marker基因对比)

在scanpy对数据进行聚类操作后,往往需要对簇进行细胞类型标注。本文采取了一种暴力搜索的方法,通过将簇本身的差异基因作为marker基因,和现有的marker基因进行比对,确定簇的细胞类型。(本质上就是数组比对)

比对逻辑如下所示:
(原文此处为比对逻辑的流程图:scanpy细胞类型标注(marker基因对比)_第1张图片,图片未包含在本文本中。)

代码如下:

# 载入包
import pandas as pd
import scanpy as sc


def make_list(lenth, cluster_num):
    """Build a one-column DataFrame of cycling cluster indices.

    Row ``i`` holds ``i % cluster_num``, so the values run
    ``0, 1, ..., cluster_num - 1`` and then repeat.

    Args:
        lenth: Number of rows to generate.
        cluster_num: Length of the repeating cycle.

    Returns:
        A ``pandas.DataFrame`` with ``lenth`` rows and a single column.
    """
    return pd.DataFrame([position % cluster_num for position in range(lenth)])

def to_matrix(dic):
    """Flatten a ranked-genes result into one long table.

    Each of the five per-gene fields in ``dic`` is converted to a 2-D table
    and flattened in row-major order, so consecutive entries walk across the
    columns of one row before moving to the next row. A ``cluster`` column
    holding the column position each entry came from is appended.

    Args:
        dic: Mapping with the keys ``names``, ``pvals``, ``logfoldchanges``,
            ``pvals_adj`` and ``scores`` (the caller in this file passes
            ``adata.uns['rank_genes_groups']``).

    Returns:
        A ``pandas.DataFrame`` with the columns ``names``, ``pvals``,
        ``logfoldchanges``, ``pvals_adj``, ``scores`` and ``cluster``.
    """
    fields = ['names', 'pvals', 'logfoldchanges', 'pvals_adj', 'scores']

    # The column count of the 'names' table is used as the number of clusters.
    cluster_num = pd.DataFrame(dic['names']).values.shape[1]

    # One single-column frame per field, flattened row by row.
    parts = [
        pd.DataFrame(pd.DataFrame(dic[field]).values.reshape(-1))
        for field in fields
    ]
    # Row-major flattening means entry i came from column i % cluster_num.
    parts.append(make_list(len(parts[0]), cluster_num))
    print(tuple(part.shape for part in parts))

    matrix = pd.concat(parts, axis=1)
    matrix.columns = fields + ['cluster']
    print(f'matrix shape:{matrix.shape}')
    return matrix

# Input: the QC-filtered marker table (as produced by to_matrix).
def identify_cell(df_c, dic_m):
    """Assign a cell type to every cluster by comparing marker genes.

    For each cluster, the cluster's genes are compared with the known marker
    genes of every cell type. Two numbers are recorded per (cluster, type):

    * the match ratio: matched marker genes / total marker genes of the type;
    * the score: mean of the matched genes' log fold changes after min-max
      normalisation within the cluster.

    The type with the highest match ratio wins; equal ratios are broken by
    the higher score. A cluster without any matching marker stays
    ``'unknown'``.

    Args:
        df_c: DataFrame with the columns ``names``, ``logfoldchanges`` and
            ``cluster``. ``cluster`` must hold 0-based integer indices.
        dic_m: Mapping of cell-type name to a list of marker gene names.

    Returns:
        A list of cell-type names indexed by cluster index. Its length is
        ``max(cluster) + 1`` so that positions stay aligned with the cluster
        indices even when some cluster has no rows left after filtering;
        such clusters are reported as ``'unknown'``. An empty ``df_c``
        yields an empty list.
    """
    # Size the result by the largest index rather than by the number of
    # distinct labels: a cluster that lost all its rows during filtering
    # must not shift the clusters that come after it.
    cluster_num = 0 if df_c.empty else int(df_c['cluster'].max()) + 1
    cell_types = list(dic_m)
    # Final cell type per cluster.
    cell_type_list = ['unknown'] * cluster_num
    # Per cluster, one [cell_type, match_ratio, score] entry per cell type.
    match_scores = [
        [[cell_type, 0.0, 0] for cell_type in cell_types]
        for _ in range(cluster_num)
    ]

    for cluster_index in range(cluster_num):
        cluster = df_c[df_c['cluster'] == cluster_index]

        # Min-max normalise the log fold changes within the cluster. This is
        # computed on a separate Series so the caller's frame (or a slice of
        # it) is never written to. When all values are equal the range is 0
        # and every gene gets 0.0 instead of NaN.
        lfc = cluster['logfoldchanges']
        lfc_min = lfc.min()
        lfc_range = lfc.max() - lfc_min
        normalized = lfc - lfc_min
        if lfc_range > 0:
            normalized = normalized / lfc_range
        # gene name -> normalised log fold change; also gives O(1) membership.
        gene_lfc = dict(zip(cluster['names'].values, normalized.values))

        # Fill in the match ratio and score of every cell type.
        for cell_type_index, cell_type in enumerate(cell_types):
            print(cluster_index, cell_type_index, cell_type)
            entry = match_scores[cluster_index][cell_type_index]
            print(entry)
            cell_type_genes = dic_m[cell_type]
            match_g = [g for g in cell_type_genes if g in gene_lfc]
            if not match_g:
                continue
            entry[1] = len(match_g) / len(cell_type_genes)
            scores = [float(gene_lfc[g]) for g in match_g]
            print(f'scores_list:{scores}')
            scores = sum(scores) / len(scores)
            print(f'scores:{scores}')
            entry[2] = scores

        # Pick the best type: highest match ratio first, score as tie-break.
        # Both "best" values are updated together so a later type that is
        # worse on the match ratio can never overwrite the current winner.
        best_ratio = 0.0
        best_score = 0.0
        for cell_type, ratio, score in match_scores[cluster_index]:
            print(cell_type)
            if ratio <= 0:
                continue
            if ratio > best_ratio or (ratio == best_ratio and score > best_score):
                best_ratio = ratio
                best_score = score
                cell_type_list[cluster_index] = cell_type

    print(match_scores)
    print(len(match_scores), len(match_scores[0]) if match_scores else 0)
    print(cell_type_list)
    return cell_type_list


if __name__ == '__main__':
    # Load the AnnData file; it is expected to already contain
    # adata.uns['rank_genes_groups'].
    adata = sc.read_h5ad('../../project/jupyter_project/logs/UM22_713.h5ad')

    # Flatten the ranked-genes result into one long table and show it.
    s_matrix = to_matrix(adata.uns['rank_genes_groups'])
    print(s_matrix, '\n', s_matrix.shape)
    # Optional: persist the table, e.g.
    # s_matrix.to_csv(r'logs/matrix.csv', header=False, index=False)

    # Prior knowledge: cell type -> known marker genes (placeholder values,
    # to be filled in by the user).
    dfm = {
        'cells type1': ['gene1', 'gene2'],
        'cells type2': ['gene1', 'gene3'],
    }

    # Keep only genes with a log fold change of at least 0.25, then annotate.
    dfc = s_matrix
    identify_cell(dfc[dfc.logfoldchanges >= 0.25], dfm)

这里的前置条件是需要先调用

sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

计算簇的marker基因

注意

dfm是作为先验的marker基因字典,需要大家自己填充。

s_matrix是由to_matrix生成的各个簇的marker基因表格(包含names、pvals、logfoldchanges、pvals_adj、scores、cluster六列),可以单独保存。

本文的上游分析流程参考了这篇博主的文章:传送门

具体来说,本文的工作对应该文章中“根据已知的细胞标记,注释细胞类型”这一步,计算得出了用于标注的细胞类型标签,例如:

new_cluster_names = [
    'CD4 T', 'CD14 Monocytes',
    'B', 'CD8 T',
    'NK', 'FCGR3A Monocytes',
    'Dendritic', 'Megakaryocytes']

你可能感兴趣的:(Bioinformation,python)