R语言~高级PCA分析

主成分分析,不只需要找到几个虚拟但不知具体为何物的主成分因子,还需要知道我们提供的变量在各个主成分因子上的载荷(有正有负)和贡献度,以及具体保留多少个主成分因子才能既尽量保留数据的原始变异又能避免过度冗余。这是一个值得思考的问题。甚至能发nature,science。

加载程序包

packages <- c("ggplot2", "reshape2",  'ggpmisc', 'stringr',
              'vegan', "FactoMineR", "factoextra", "ade4", "ggpubr",'scales')

# Install packages not yet installed
installed_packages <- packages %in% rownames(installed.packages())
if (any(installed_packages == FALSE)) {
  install.packages(packages[!installed_packages])
}

invisible(lapply(packages, library, character.only = TRUE))
library('dplyr')

source('utils.R')

读入并整理变量数据

dat <- read.table("../Data/GlobalAtlasv2_scores_v5.txt", header = T, sep = "\t")
head(dat)
dat$AridityIndex_wc2 = NULL
names(dat) = gsub('_18s_richness','', names(dat))
names(dat) = gsub('Richness_bacteria','Bacteria', names(dat))
names(dat) = gsub('_wc2','',names(dat))
names(dat) = gsub('WaterHoldingCapacity','WHC',names(dat))
names(dat) = gsub('Proportion_decomposers','Decomposers',names(dat))
names(dat) = gsub('Proportion_symbiotic_fungi','Sym_fungi', names(dat))
names(dat) = gsub('Pathogen_control','Pathogen_ctrl',names(dat))
names(dat) = gsub('Soil_respiration','Soil_resp',names(dat))
names(dat) = gsub('Soil_salinity','Salinity',names(dat))
names(dat) = gsub('Soil_pH','pH', names(dat))
names(dat) = gsub('Clay_silt_c','Clay_silt',names(dat))
names(dat) = gsub('Soil_ORC_g_kg','SOC', names(dat))
names(dat) = gsub("PO4_c", "PO4", names(dat))
names(dat) = gsub('Aridity_Index_v3','AridityIndex', names(dat))
dat$Salinity = log(dat$Salinity)

构建向量存储变量组合,以便后文直接引用

div_var = c("Oomycota","Bacteria", "Fungi", "Nematoda","Ciliophora", 
            "Amoebozoa","Excavata", "Apicomplexa","Platyhelminthes", "Annelida", 
            "Rotifera","Cercozoa", "Ochrophyta", "Dinoflagellata", "Arachnida", 
            "Collembola",  "Neoptera","Myriapoda", "Tardigrada",   "Chlorophyta")

func_var = c("Porosity","WHC","NH4","NO3","PO4","NAG","Soil_resp",
             "NPP","Pathogen_ctrl" ,"Sym_fungi","Decomposers")

env_var = c("pH", "Salinity", "Clay_silt", "SOC", "Plant_cover", "MAT", "PSEA", "MDR", "TSEA",'AridityIndex')

执行PCA分析

pca = PCA(scale(subset(dat, select = func_var)), graph = F, ncp = 5)
ind = get_pca_ind(pca)
dat$PC1 <- ind$coord[,1]
dat$PC2 <- ind$coord[,2]
dat$PC3 <- ind$coord[,3]
dat$PC4 <- ind$coord[,4]
dat$PC5 <- ind$coord[,5]

绘制PCA排序图

Biplot <-fviz_pca_biplot(pca, fill.ind = dat$Eco_corrected, palette = 'lancet',
                         col.ind = "white", geom.ind = "point",
                         label ="none", labelsize = 2,
                         col.var = "black", arrowsize = 0.3,
                         repel=TRUE, pointshape = 21, pointsize = 5, alpha.ci = 0.5) + 
  # scale_fill_manual(breaks = c('Forest', 'Grassland'), values = c('bisque4','chartreuse4',))+
  geom_hline(yintercept = 0, lty = 2, lwd = 0.1)+
  geom_vline(xintercept = 0, lty = 2, lwd = 0.1) +
  labs(fill = "Vegetation",
       title = NULL,
       x = sprintf("PC1  (%.2f %s)", round(pca$eig[1,2],2), '%'), 
       y = sprintf("PC2  (%.2f %s)", round(pca$eig[2,2],2), '%')) + 
  theme_classic()+
  theme(axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'),
        panel.grid = element_blank(),
        legend.background = element_blank(),
        legend.box.background = element_blank(),
        legend.position = c(0.15,0.2),
        legend.key.height = unit(0.3,'cm'),
        legend.title = element_text(size = 6,color = 'black'),
        legend.text = element_text(size = 6, color = 'black',margin = margin(l = -8)))

判定最优的主成分因子个数

pca1 <- dudi.pca(subset(dat, select = func_var), center = TRUE, scale = TRUE, scannf=FALSE, nf = 5)
test1 <- testdim(pca1, nrepet = 999)
test1

print(paste("Number of statistical significant components according to Dray et al., 2006:", test1$nb.cor))

提取各变量在主成分因子上的载荷并进行显著性判定

var <- pca$var
# (1) bootstrapped egigenvector, p value < 0.05
boot5 <- netoboot(subset(dat, select = func_var), scannf = F, nf = 5, permutations = 1000)

pvalmatrix <- computePval_peres(pca1$c1, boot5)

# (2) fixed threshold
coordPCthr_val <- rbind(abs(var$coord[,1]), abs(var$coord[,2]), abs(var$coord[,3]),
                        abs(var$coord[,4]), abs(var$coord[,5]))#Richman 1988

coordPCthr <- rbind(ifelse(abs(var$coord[,1])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,2])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,3])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,4])>0.3, TRUE, FALSE),
                    ifelse(abs(var$coord[,5])>0.3, TRUE, FALSE))#Richman 1988

# (3) threshold method, their contribution is above 100/ndim
contrPCthr <- rbind(ifelse(var$contrib[,1]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,2]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,3]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,4]>(100/length(func_var)), TRUE, FALSE),
                    ifelse(var$contrib[,5]>(100/length(func_var)), TRUE, FALSE))

contrPCthr_val <- rbind(var$contrib[,1], var$contrib[,2], var$contrib[,3],
                        var$contrib[,4], var$contrib[,5])

df.PCA.sign <- data.frame(rbind(pvalmatrix, contrPCthr_val, coordPCthr_val))

df.PCA.sign$PC <- rep(c("PC1", "PC2", "PC3", "PC4", "PC5"), 3)
df.PCA.sign$Method <- rep(c("Peres-Neto et al., 2003", "Threshold", "Threshold Richmann 1988"),each = 5)
write.table(df.PCA.sign, file = 'PCA_signtest.csv', row.names = F)

## -- define the relevant loadings for the final plotting  
pvalmatrix_TF <- ifelse(pvalmatrix < 0.05, TRUE, FALSE)

三种检验方法均指示显著差才是最终显著

pc1Relevance<-coordPCthr[1,]*pvalmatrix_TF[1,]*contrPCthr[1,]
pc2Relevance<-coordPCthr[2,]*pvalmatrix_TF[2,]*contrPCthr[2,]
pc3Relevance<-coordPCthr[3,]*pvalmatrix_TF[3,]*contrPCthr[3,]
pc4Relevance<-coordPCthr[4,]*pvalmatrix_TF[4,]*contrPCthr[4,]
pc5Relevance<-coordPCthr[5,]*pvalmatrix_TF[5,]*contrPCthr[5,]

整理结果并绘图

df.loadings <- data.frame(EFP = rep(row.names(var$coord),5),
                          val = c(var$coord[,1],var$coord[,2], var$coord[,3], 
                                  var$coord[,4], var$coord[,5]),
                          contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
                                    ifelse(pc2Relevance == 1, 'High', 'Low'),
                                    ifelse(pc3Relevance == 1, 'High', 'Low'),
                                    ifelse(pc4Relevance == 1, 'High', 'Low'),
                                    ifelse(pc5Relevance == 1, 'High', 'Low')),
                          PC = c(rep("PC1",length(row.names(var$coord))),
                                 rep("PC2",length(row.names(var$coord))),
                                 rep("PC3",length(row.names(var$coord))),
                                 rep("PC4",length(row.names(var$coord))),
                                 rep("PC5",length(row.names(var$coord)))))

df.loadings$EFP = factor(df.loadings$EFP, 
                         levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
                                    'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
                         labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',
                                    bquote("NO"[3]),bquote('NH'[4]),bquote('PO'[4]),
                                    'NAG',bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'), 
                                    bquote('Sym-fungi'~'(%)')))
                                           

loadings_p <- ggplot(data= subset(df.loadings, !PC %in% c("PC4",'PC5')), 
                     aes(x=EFP, y=val, fill = contr)) + 
  facet_grid(. ~ PC, scales = "free_y") +
  scale_y_continuous(breaks = c(-0.4,0,0.4,0.8), labels = dropLeadingZero)+
  scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
  geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
  scale_fill_manual(values=c("#E69F00", "#999999")) + theme_classic() + 
  labs(y="Loadings", x = NULL) + 
  theme(legend.position='none',
        strip.text = element_text(size = 6,color = 'black'),
        strip.background = element_blank(),
        axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'))




df.contrib <- data.frame(EFP = rep(row.names(var$contrib),5),
                         val = c(var$contrib[,1],var$contrib[,2],var$contrib[,3],
                                 var$contrib[,4],var$contrib[,5]),
                         contr = c(ifelse(pc1Relevance == 1, 'High', 'Low'),
                                   ifelse(pc2Relevance == 1, 'High', 'Low'),
                                   ifelse(pc3Relevance == 1, 'High', 'Low'),
                                   ifelse(pc4Relevance == 1, 'High', 'Low'),
                                   ifelse(pc5Relevance == 1, 'High', 'Low')),
                         PC = c(rep("PC1",length(row.names(var$contrib))),
                                rep("PC2",length(row.names(var$contrib))),
                                rep("PC3",length(row.names(var$contrib))),
                                rep("PC4",length(row.names(var$contrib))),
                                rep("PC5",length(row.names(var$contrib)))))

df.contrib$EFP = factor(df.contrib$EFP, 
                        levels = c('Porosity','WHC','Soil_resp','NPP','NO3','NH4','PO4',
                                   'NAG','Pathogen_ctrl','Decomposers','Sym_fungi'),
                        labels = c('Porosity','WHC',bquote('Soil'~'Resp'),'NPP',bquote("NO"[3]),
                                   bquote('NH'[4]),bquote('PO'[4]),'NAG',
                                   bquote('Pathogen'~ 'Ctrl'), bquote('Decomposers'~'(%)'), 
                                   bquote('Sym-fungi'~'(%)')))

contribs_p <- ggplot(data=subset(df.contrib, !PC %in% c("PC4",'PC5')), 
                     aes(x=EFP, y=val, fill = contr)) + 
  facet_grid(. ~ PC, scales = "free_y") +
  geom_bar(stat="identity", position=position_dodge()) + coord_flip() +
  theme_classic() + 
  scale_fill_manual(values=c("#E69F00", "#999999")) +
  scale_x_discrete(labels = parse(text = levels(df.contrib$EFP)))+
  labs(y="Contribution [%]", x = NULL) + 
  theme(legend.key.size = unit(0.3, 'cm'),
        legend.title = element_text(size=6,color = 'black'),
        legend.text = element_text(size=6,color = 'black'),
        legend.background = element_blank(),
        legend.position = c(0.9, 0.5),
        strip.text = element_text(size = 6,color = 'black'),
        strip.background = element_blank(),
        axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6,color = 'black'))

exp_var_p <- fviz_eig(pca, addlabels=F, 
               barfill="white", barcolor ="darkblue",
               ncp = 5, labelsize = .1,
               linecolor ="red") + 
  theme_classic() + 
  scale_y_continuous(expand = c(0,0))+
  geom_label(aes(x=1, y=33, label="Stretch it"), vjust= -1)+
  labs(title = NULL, x = "Principal Components (PC)", y = "Explained variance (%)") + 
  theme(axis.title = element_text(size = 6,color = 'black'),
        axis.text = element_text(size = 6, color = 'black'),
        plot.margin = margin(l = 20, b = 6, t = 6, r = 6))


figure2 <- ggarrange(Biplot, exp_var_p, contribs_p, loadings_p,
                     labels = c("a", "b","c","d"), 
                     font.label=list(face = "bold", color="black",size= 10,
                                     axis.text.y = element_text(face="bold")),
                     common.legend = FALSE,
                     nrow = 2, ncol = 2)
Figure2.jpg

你可能感兴趣的:(R语言~高级PCA分析)