MultiVI NaN in latent representation

scvi-tools 开发者提供的这个 tutorial(https://docs.scvi-tools.org/en/stable/tutorials/notebooks/MultiVI_tutorial.html)演示的是 scRNA-seq 和 scATAC-seq 中存在 unpaired 数据时的处理方式;对于全部是 paired 的数据,照搬该流程不能正确运行(latent representation 中会出现 NaN)。我在 GitHub 上提出问题之后,scvi-tools 的开发者 Adam Gayoso 立即进行了解答,并不厌其烦地帮我解决了 fully paired 多组学数据的运行问题,在此非常感谢。以下是可以正确运行的代码:

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import os
import scanpy as sc
import scvi
from os.path import join
import anndata as ad
import metrics
# Fix the scvi-tools global seed so repeated runs are comparable.
scvi.settings.seed = 420

# Root directory that holds the demo input tables for both modalities.
data_root = '/home/user/JupyterNotebook/Seurat'

# Load the RNA table. It is transposed below before being handed to AnnData,
# so rows are presumably features and columns barcodes -- confirm against the
# actual TSV layout.
RNA = pd.read_csv(
    join(data_root, 'Demo/Demo_rna_intersect_dc.tsv'),
    sep='\t',
)
RNA.head()

# Transposed view: one row per original column, one column per original row.
RNA_T = RNA.T
X = RNA_T.values

# Per-row annotations: index named 'barcode' plus a constant batch id.
obs = pd.DataFrame(
    {'batch_id': 1},
    index=pd.Index(RNA_T.index.tolist(), name='barcode'),
)

# Per-column annotations: the feature label is stored both as the index and
# as an explicit 'ID' column, together with a constant modality tag.
var = pd.DataFrame(
    {'ID': RNA_T.columns.tolist(), 'modality': 'Gene Expression'},
    index=RNA.index.tolist(),
)

rna_adata = ad.AnnData(X, obs=obs, var=var)
rna_adata.var_names_make_unique()

# Load the ATAC table. As with the RNA table it is transposed before use, so
# rows are presumably peaks and columns barcodes -- confirm against the TSV.
ATAC = pd.read_csv(
    join(data_root, 'Demo/Demo_atac_intersect_dc.tsv'),
    sep='\t',
)
ATAC.head()

# Transposed view: one row per original column, one column per original row.
ATAC_T = ATAC.T
X = ATAC_T.values

# Per-row annotations: index named 'barcode' plus a constant batch id.
obs = pd.DataFrame(
    {'batch_id': 1},
    index=pd.Index(ATAC_T.index.tolist(), name='barcode'),
)

# Per-column annotations: feature label as index and as 'ID', tagged 'Peaks'.
var = pd.DataFrame(
    {'ID': ATAC_T.columns.tolist(), 'modality': 'Peaks'},
    index=ATAC.index.tolist(),
)

atac_adata = ad.AnnData(X, obs=obs, var=var)
atac_adata.var_names_make_unique()

# Stack the two raw tables row-wise: RNA rows first, ATAC rows after them.
multi = pd.concat([RNA, ATAC])

# Modality tag of every feature, collected from the two single-modality
# AnnData objects and kept as a one-column frame.
m_m = pd.concat(
    [rna_adata.var['modality'], atac_adata.var['modality']]
).to_frame()

# Transposed view of the stacked table, used as the joint data matrix.
multi_T = multi.T
X = multi_T.values

# Per-row annotations: every row gets the same batch id and is marked
# 'paired'.
obs = pd.DataFrame(
    {'batch_id': 1, 'modality': 'paired'},
    index=pd.Index(multi_T.index.tolist(), name='barcode'),
)

# Per-column annotations. The modality column is assigned from a Series, so
# pandas matches it to `var` by index label rather than by position.
var = pd.DataFrame(
    {'ID': multi_T.columns.tolist()},
    index=multi.index.tolist(),
)
var['modality'] = m_m['modality']

multi_adata = ad.AnnData(X, obs=obs, var=var)
multi_adata.var_names_make_unique()

# Drop features detected in fewer than 1% of the rows of the joint object
# (the threshold is truncated to an integer).
min_cells_required = int(multi_adata.shape[0] * 0.01)
sc.pp.filter_genes(multi_adata, min_cells=min_cells_required)

# Register the object with MultiVI, using the obs 'modality' column as the
# batch key.
scvi.model.MULTIVI.setup_anndata(multi_adata, batch_key="modality")

# Count the features of each modality that survived the filter above.
# NOTE(review): the model only receives these two counts, so it presumably
# relies on the genes-then-peaks column order produced by the concatenation
# -- confirm against the MULTIVI documentation.
feature_modality = multi_adata.var['modality']
gene_feature_count = (feature_modality == 'Gene Expression').sum()
peak_feature_count = (feature_modality == 'Peaks').sum()

mvi = scvi.model.MULTIVI(
    multi_adata,
    n_genes=gene_feature_count,
    n_regions=peak_feature_count,
)

# Presumably meant to hide all CUDA devices so that training runs on CPU.
# NOTE(review): this is set after the libraries were imported and the model
# was built -- confirm it still takes effect in the environment used.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
mvi.train()

# Store the trained model's latent representation on the AnnData object.
latent = mvi.get_latent_representation()
multi_adata.obsm["X_MultiVI"] = latent

# Neighbour graph on the MultiVI embedding, then Leiden clustering on it.
sc.pp.neighbors(multi_adata, use_rep="X_MultiVI")
sc.tl.leiden(multi_adata, key_added="cluster_mvi", resolution=0.02)

# Predicted cluster assignment as a one-column frame named 'label'.
names = ['label']
MVI_label = multi_adata.obs['cluster_mvi'].to_frame()
MVI_label.columns = names

# Reference labels read from disk.
# NOTE(review): they are compared to the predictions as-is, so the CSV rows
# presumably follow the same order as multi_adata.obs -- confirm.
label = pd.read_csv(
    '/home/user/JupyterNotebook/cluster_omit.csv',
    sep=',',
    header=0,
)
Y = np.array(label['Ident_ID'])
len(set(Y))

# Score the predicted clusters against the reference labels with the
# project-local `metrics` module.
metrics.clustering_evaluate(Y, MVI_label['label'])

你可能感兴趣的:(Python,python,机器学习)