https://github.com/theislab/single-cell-tutorial/blob/master/supplementary_scripts/Splatter-marker-genes-random-data.ipynb
import scanpy as sc
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import seaborn as sb
import warnings
from rpy2.rinterface import RRuntimeWarning
from rpy2.robjects import pandas2ri
%load_ext rpy2.ipython
# ignore R warning messages
warnings.filterwarnings("ignore", category=RRuntimeWarning)
# Automatically convert rpy2 outputs to pandas dataframes
pandas2ri.activate()
plt.rcParams['figure.figsize']=(8,8) #rescale figures
sc.settings.verbosity = 3
#sc.set_figure_params(dpi=200, dpi_save=300)
#sc.logging.print_versions()
%%R -o data
#suppressMessages(library(splatter))
suppressMessages(library(splatter))
n_cells <- 1000
n_genes <- 10000
params <- newSplatParams(seed=12345,
nGenes = n_genes,
batchCells = n_cells,
dropout.shape = 0.95,
de.prob = 0)
sim <- splatSimulate(params)
data <- counts(sim)
import pandas as pd
data_df = pd.DataFrame(data)
adata = sc.AnnData(data_df.transpose())
adata.obs_names = adata.obs_names.astype(str)
adata.var_names = adata.var_names.astype(str)
#Basic QC filtering
sc.pp.filter_cells(adata, min_counts=1000)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=20)
#Normalize the data
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
sc.pp.log1p(adata)
#Detect highly variable genes
sc.pp.filter_genes_dispersion(adata, flavor='cell_ranger', n_top_genes=4000, log=False, subset=False)
# Pre-process and visualize
sc.pp.pca(adata, svd_solver='arpack', use_highly_variable=True, n_comps=10)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.pl.pca_variance_ratio(adata)
#Plot visualizations
sc.pl.pca(adata)
sc.pl.umap(adata)
#Cluster the data
sc.tl.louvain(adata, resolution=0.5, key_added='louvain')
sc.pl.pca(adata, color='louvain')#, save='_splatter_pca_louvain.pdf')
sc.pl.umap(adata, color='louvain')
sc.tl.rank_genes_groups(adata, groupby='louvain', use_raw=False, method='t-test', key_added='rank_louvain', n_genes=9000)
sc.pl.rank_genes_groups(adata, key='rank_louvain', groups=['0','1'], fontsize=12)
#Extract p-value data
pval_clust0 = [i[0] for i in adata.uns['rank_louvain']['pvals']]
pval_clust1 = [i[1] for i in adata.uns['rank_louvain']['pvals']]
#Plot expected P-value distributions
lab_size = 25
tick_size = 15
p1 = sb.distplot(pval_clust0, kde=False, bins=100)
p1.set_xlabel("P-values",fontsize=lab_size)
p1.set_ylabel("Frequency",fontsize=lab_size)
p1.tick_params(labelsize=tick_size)
f1 = p1.get_figure()
#f1.savefig('figures/hist_pvals_clust0.pdf')
plt.show()
p2 = sb.distplot(pval_clust1, kde=False, bins=100)
p2.set_xlabel("P-values",fontsize=lab_size)
p2.set_ylabel("Frequency",fontsize=lab_size)
p2.tick_params(labelsize=tick_size)
f2 = p2.get_figure()
#f2.savefig('figures/hist_pvals_clust1.pdf')
plt.show()
#Number of 'significant' marker gene after multiple testing correcting
signif_thresh = 0.05
n_signif_markers0 = np.sum([i[0] < signif_thresh for i in adata.uns['rank_louvain']['pvals_adj']])
n_signif_markers1 = np.sum([i[1] < signif_thresh for i in adata.uns['rank_louvain']['pvals_adj']])
print(n_signif_markers0)
print(n_signif_markers1)