单细胞之Seurat包实战
我们演示的数据集是E-MTAB-7704,可以从github中下载。
1. 特征描述
setwd("D:\\R\\ll\\E-MTAB-7704")
library(limma)
library(Seurat)
library(dplyr)
library(magrittr)
ikura_sap<-read.table("rawcounts_ikura_sap.txt",header = T,sep = "\t",row.names = 1,stringsAsFactors = F)
ikura_wt01<-read.table("rawcounts_ikura_wt01.txt",header = T,sep = "\t",row.names = 1,stringsAsFactors = F)
ikura_wt02<-read.table("rawcounts_ikura_wt02.txt",header = T,sep = "\t",row.names = 1,stringsAsFactors = F)
ko <- ikura_sap
wt01 <- ikura_wt01
wt02 <- ikura_wt02
colnames(ko)<-paste("ko",colnames(ko),sep = "_")
colnames(wt01)<-paste("wt01",colnames(wt01),sep = "_")
colnames(wt02)<-paste("wt02",colnames(wt02),sep = "_")
dim(ko)
dim(wt01)
dim(wt02)
data1<-cbind(ko,wt01)
data<-cbind(data1,wt02)
pbmc <- CreateSeuratObject(counts = data,project = "seurat", min.cells = 3, min.features = 50, names.delim = "_")
pbmc[["percent.mt"]] <- PercentageFeatureSet(object = pbmc, pattern = "^mt-")
VlnPlot(object = pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) #保存基因特征小提琴图
pbmc <- subset(x = pbmc, subset = nFeature_RNA > 50 & percent.mt < 5)
tiff("04.featureCor.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")#保存基因特征相关性图
plot1 <- FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt",pt.size=1.5)
plot2 <- FeatureScatter(object = pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",,pt.size=1.5)
CombinePlots(plots = list(plot1, plot2))
dev.off()
pbmc <- NormalizeData(object = pbmc, normalization.method = "LogNormalize", scale.factor = 10000) #对数据进行标准化
pbmc <- FindVariableFeatures(object = pbmc, selection.method = "vst", nfeatures = 1500) #提取那些在细胞间变异系数较大的基因
top10 <- head(x = VariableFeatures(object = pbmc), 10)
tiff("04.featureVar.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw") #保存基因特征方差图
plot1 <- VariableFeaturePlot(object = pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
CombinePlots(plots = list(plot1, plot2)) #输出特征方差图
dev.off()
2. PCA主成分分析
PCA分析
pbmc=ScaleData(pbmc) #PCA降维之前的标准预处理步骤
pbmc=RunPCA(object= pbmc,npcs = 20,pc.genes=VariableFeatures(object = pbmc))
绘制每个PCA成分的相关基因
tiff("05.pcaGene.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
VizDimLoadings(object = pbmc, dims = 1:4, reduction = "pca",nfeatures = 20)
dev.off()
主成分分析图形
tiff("05.PCA.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
DimPlot(object = pbmc, reduction = "pca")
dev.off()
主成分分析热图
tiff("05.pcaHeatmap.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
DimHeatmap(object = pbmc, dims = 1:4, cells = 500, balanced = TRUE,nfeatures = 30,ncol=2)
dev.off()
每个PC的p值分布和均匀分布
pbmc <- JackStraw(object = pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(object = pbmc, dims = 1:20)
tiff("05.pcaJackStraw.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
JackStrawPlot(object = pbmc, dims = 1:20)
dev.off()
3. TSNE聚类分析和marker基因
TSNE聚类分析
pcSelect=20
pbmc <- FindNeighbors(object = pbmc, dims = 1:pcSelect) #计算邻接距离
pbmc <- FindClusters(object = pbmc, resolution = 0.5) #对细胞分组,优化标准模块化
pbmc <- RunTSNE(object = pbmc, dims = 1:pcSelect) #TSNE聚类
tiff("06.TSNE.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
TSNEPlot(object = pbmc, do.label = TRUE, pt.size = 2, label = TRUE) #TSNE可视化
dev.off()
write.table(pbmc$seurat_clusters,file="06.tsneCluster.txt",quote=F,sep="\t",col.names=F)
寻找差异表达的特征
logFCfilter=0.5
adjPvalFilter=0.05
pbmc.markers <- FindAllMarkers(object = pbmc,
only.pos = FALSE,
min.pct = 0.25,
logfc.threshold = logFCfilter)
sig.markers=pbmc.markers[(abs(as.numeric(as.vector(pbmc.markers$avg_logFC)))>logFCfilter & as.numeric(as.vector(pbmc.markers$p_val_adj))
绘制marker在各个cluster的热图
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
tiff("06.tsneHeatmap.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
DoHeatmap(object = pbmc, features = top10$gene) + NoLegend()
dev.off()
绘制marker的小提琴图
tiff("06.markerViolin.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
VlnPlot(object = pbmc, features = c("Icos", "Il2rb", "Rorc", "Tbx21", "Ifng", "Il17a"))
dev.off()
绘制marker在各个cluster的散点图
tiff("06.markerScatter.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
FeaturePlot(object = pbmc, features = c("Icos", "Il2rb", "Rorc", "Tbx21", "Ifng", "Il17a"),cols = c("green", "red"))
dev.off()
绘制marker在各个cluster的气泡图
tiff("06.markerBubble.tiff",units = "in",width = 10,height = 8,res = 600,compression = "lzw")
cluster10Marker=head(top10$gene,10)
DotPlot(object = pbmc, features = cluster10Marker)
dev.off()
4. 生成cluster_fp.csv文件
.row_stats_by_factor = function (data, fact, rowFunction = rowMeans) {
u = as.character(sort(unique(fact)))
fact[is.na(fact)] = F
n=length(u)
centers = matrix(NA,dim(data)[1], n, dimnames = list(rownames(data), u))
for (i in u) {
if(sum(fact==i, na.rm=T)>1) {
centers[,i] = rowFunction(data[,fact==i,drop=F])
} else {
centers[,i] = data[,fact==i]
}
} # much faster than tapply
return(centers)
}
us = as.matrix(pbmc@assays$RNA@data)
f_g_cov = rowSums(us) > 10
clust_geomean = .row_stats_by_factor(us[f_g_cov,], [email protected], function(y) {exp(rowMeans(log(1+y)))-1})
clust_meansize = tapply(colSums(us), [email protected], mean)
ideal_cell_size = pmin(1000, median(clust_meansize))
g_fp = t(ideal_cell_size*t(clust_geomean)/as.vector(clust_meansize))
fp_reg = 0.1
g_fp_n = (fp_reg+g_fp)/apply(fp_reg+g_fp, 1, median)
write.csv(g_fp_n,file = "cluster_fp.csv",quote = F)