R语言绘制PCA双标图、碎石图、变量载荷图和变量贡献图

1、原论文数据双标图

代码:

# Set the working directory to the folder that holds the input data
# (machine-specific path: change it to your own).
setwd("D:/Desktop/0000/R")

# Import the data: comma-separated file whose first row is the header.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
df <- read.table("Input data.csv", header = TRUE, sep = ",")

# -----------------------------------
# Required packages. FactoMineR is listed explicitly because the PCA step of
# this script calls FactoMineR::PCA() directly.
packages <- c(
  "ggplot2", "tidyr", "dplyr", "readr", "ggrepel", "cowplot", "factoextra",
  "FactoMineR"
)
# Install the packages that are not installed yet.
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Attach every package; invisible() hides the list returned by lapply().
invisible(lapply(packages, library, character.only = TRUE))

# -----------------------------------
# Basic colour and text settings shared by the figures
# Colours: named vector, later passed as `palette` for the IGBP groups.
CatCol <- c(
  CSH = "#586158",
  DBF = "#C46B39",
  EBF = "#4DD8C0",
  ENF = "#3885AB",
  GRA = "#9C4DC4",
  MF = "#C4AA4D",
  OSH = "#443396",
  SAV = "#CC99CC",
  WET = "#88C44D",
  WSA = "#AB3232"
)
# Three-colour set; alternative: c("#809844", "#4f85b0", "#b07495")
Three_colorblind <- c("#A8AD6F", "#AD6FA8", "#6FA8AD")
graph_elements_dark <- "black"
plot_elements_light <- "gray75"
plot_elements_dark <- "gray25"

# Transparency:
boot_alpha_main <- 0.9
boot_alpha_small <- 0.05

# Text:
# Option to rotate the x-axis text when many PCs are shown:
# if (n_pcs > 3) {x_angle <- 270; x_adjust <- 0.25} else {x_angle <- 0; x_adjust <- 0}
x_angle <- 0
x_adjust <- 0
# Nature Communications: max 7 pt; cowplot multiplier: 1/1.618;
# 7 pt : 1/1.618 = x pt : 1; x = 7 / 1/1.618; x = 11.326 (round up to integer)
title_text <- 9
subtitle_text <- 9
# Nature Communications: min 5 pt; cowplot multiplier: 1/1.618;
# 5 pt : 1/1.618 = x pt : 1; x = 5 / 1/1.618; x = 8.09 (round up to integer)
normal_text <- 9

# Element dimensions:
plot_linewidth <- 0.33
point_shape <- 18
point_size <- 1.5

# Initialize figure lists (all start as empty lists):
p_biplot <- p_r2 <- p_load <- p_contr <- col_ii <- list()

# Labels:
veg_sub_labels <- c("All Sites", "All Forests", "Evergreen Needle-Forests")

# -----------------------------------
# Select the data needed for the PCA
codes_4_PCA <- c("SITE_ID", "IGBP", "GPPsat", "wLL", "wNmass", "wLMA", "RECOmax") # columns to keep
# Apply the selection
df_subset <- df %>%
  dplyr::select(all_of(codes_4_PCA))
# Run the PCA. SITE_ID and IGBP are dropped first so that only the remaining
# selected columns enter the PCA; IGBP is kept in df_subset for colouring.
pca_result <- FactoMineR::PCA(
  df_subset %>% dplyr::select(-SITE_ID, -IGBP),
  scale.unit = TRUE, # spelled out: T/F are reassignable variables
  ncp = 10,
  graph = FALSE
)

# -----------------------------------
# Plotting
# Axis/legend labels and theme tweaks, kept as separate pieces and added to
# the factoextra biplot below (the biplot is a ggplot2 object).
biplot_labels <- labs(
  title = "",
  x = "PC1",
  y = "PC2",
  fill = "IGBP"
)
biplot_theme <- theme(
  title = element_blank(),
  text = element_text(size = normal_text),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  axis.title = element_text(size = title_text, face = "bold"),
  axis.text = element_text(size = normal_text),
  # plot.margin = unit(c(0, 0, 0, 0), "cm"),
  # legend.position = "none"
  legend.text = element_text(size = subtitle_text),
  legend.key.height = unit(5, "mm"),
  legend.key.width = unit(2, "mm")
)

# Biplot of PC1 vs PC2: individuals drawn as points coloured by IGBP class,
# variables drawn as labelled arrows.
p1 <- fviz_pca_biplot(
  pca_result,
  axes = c(1, 2),
  col.ind = df_subset$IGBP, # alternatives tried: "grey50", NA, plot_elements_light, "white"
  geom.ind = "point",
  palette = CatCol, # alternative: 'futurama'
  label = "var",
  col.var = plot_elements_dark,
  labelsize = 3,
  repel = TRUE,
  pointshape = 16,
  pointsize = 2,
  alpha.ind = 0.67,
  arrowsize = 0.5
) +
  biplot_labels +
  guides(fill = guide_legend(title = "")) +
  biplot_theme
p1

R语言绘制PCA双标图、碎石图、变量载荷图和变量贡献图_第1张图片

参考:Leaf-level coordination principles propagate to the ecosystem scale (https://doi.org/10.1038/s41467-023-39572-5)、主成分分析图。

2、选用iris数据重新绘制双标图进行测试

代码:

# Set the working directory to the folder that holds the input data
# (machine-specific path: change it to your own).
setwd("D:/Desktop/0000/R")

# Import the data: comma-separated file whose first row is the header.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
df <- read.table("iris1.csv", header = TRUE, sep = ",")

# -----------------------------------
# Required packages. FactoMineR is listed explicitly because the PCA step of
# this script calls FactoMineR::PCA() directly.
packages <- c(
  "ggplot2", "tidyr", "dplyr", "readr", "ggrepel", "cowplot", "factoextra",
  "FactoMineR"
)
# Install the packages that are not installed yet.
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Attach every package; invisible() hides the list returned by lapply().
invisible(lapply(packages, library, character.only = TRUE))

# -----------------------------------
# Basic colour and text settings shared by the figures
# Colours: one named entry per category, later passed as `palette` for the
# species groups.
CatCol <- c(
  setosa = "#586158",
  versicolor = "#C46B39",
  virginica = "#4DD8C0"
)
# Three-colour set; alternative: c("#809844", "#4f85b0", "#b07495")
Three_colorblind <- c("#A8AD6F", "#AD6FA8", "#6FA8AD")
graph_elements_dark <- "black"
plot_elements_light <- "gray75"
plot_elements_dark <- "gray25"

# Transparency:
boot_alpha_main <- 0.9
boot_alpha_small <- 0.05

# Text:
# Option to rotate the x-axis text when many PCs are shown:
# if (n_pcs > 3) {x_angle <- 270; x_adjust <- 0.25} else {x_angle <- 0; x_adjust <- 0}
x_angle <- 0
x_adjust <- 0
# Nature Communications: max 7 pt; cowplot multiplier: 1/1.618;
# 7 pt : 1/1.618 = x pt : 1; x = 7 / 1/1.618; x = 11.326 (round up to integer)
title_text <- 9
subtitle_text <- 9
# Nature Communications: min 5 pt; cowplot multiplier: 1/1.618;
# 5 pt : 1/1.618 = x pt : 1; x = 5 / 1/1.618; x = 8.09 (round up to integer)
normal_text <- 9

# Element dimensions:
plot_linewidth <- 0.33
point_shape <- 18
point_size <- 1.5

# Initialize figure lists (all start as empty lists):
p_biplot <- p_r2 <- p_load <- p_contr <- col_ii <- list()

# Labels:
veg_sub_labels <- c("All Sites", "All Forests", "Evergreen Needle-Forests")

# -----------------------------------
# Select the data needed for the PCA
codes_4_PCA <- c("sepal_length", "sepal_width", "petal_length", "petal_width", "species") # columns to keep
# Apply the selection
df_subset <- df %>%
  dplyr::select(all_of(codes_4_PCA))
# Run the PCA. dplyr::select(-species) removes the class column, which is not
# wanted in the PCA; it stays in df_subset for colouring the points.
pca_result <- FactoMineR::PCA(
  df_subset %>% dplyr::select(-species),
  scale.unit = TRUE, # spelled out: T/F are reassignable variables
  ncp = 10,
  graph = FALSE
)

# -----------------------------------
# Plotting
# Biplot of PC1 vs PC2: individuals drawn as points coloured by species,
# variables drawn as labelled arrows.
p1 <- fviz_pca_biplot(pca_result,
                      axes = c(1, 2),
                      col.ind = df_subset$species, # alternative: "grey50"
                      # col.ind = NA, # or plot_elements_light, "white"
                      geom.ind = "point",
                      palette = CatCol, # alternative: 'futurama'
                      label = "var",
                      col.var = plot_elements_dark,
                      labelsize = 3,
                      repel = TRUE,
                      pointshape = 16,
                      pointsize = 2,
                      alpha.ind = 0.67,
                      arrowsize = 0.5)

# -----------------------------------
# The result is a ggplot2 object, so labels and theme can be adjusted on top.
p1 <- p1 +
  labs(title = "",
       x = "PC1",
       y = "PC2",
       fill = "species") + # was "IGBP", a leftover from the original-paper script
  guides(fill = guide_legend(title = "")) + # the fill legend title is set to an empty string
  theme(title = element_blank(),
        text = element_text(size = normal_text),
        axis.line = element_blank(),
        axis.ticks = element_blank(),
        axis.title = element_text(size = title_text, face = "bold"),
        axis.text = element_text(size = normal_text),
        # plot.margin = unit(c(0, 0, 0, 0), "cm"),
        # legend.position = "none"
        legend.text = element_text(size = subtitle_text),
        legend.key.height = unit(5, "mm"),
        legend.key.width = unit(2, "mm")
  )
p1

R语言绘制PCA双标图、碎石图、变量载荷图和变量贡献图_第2张图片

3、使用iris数据绘制碎石图、变量载荷图和变量贡献图

代码:

# Packages used in this section:
#   dplyr, tidyr       - data wrangling
#   stringr            - string handling
#   modelr             - bootstrap resampling
#   FactoMineR, ade4   - PCA
#   factoextra         - extracting and plotting PCA results
#   ggplot2, patchwork - plotting and combining the figures
# The vector keeps the order in which the packages were attached before, so
# function masking between packages is unchanged.
packages <- c(
  "dplyr", "tidyr", "stringr", "modelr", "FactoMineR", "ade4", "factoextra",
  "ggplot2", "readr", "ggrepel", "cowplot", "patchwork"
)
# Install missing packages BEFORE attaching anything, so that a missing
# package is installed instead of stopping the script with a library() error.
installed_packages <- packages %in% rownames(installed.packages())
if (!all(installed_packages)) {
  install.packages(packages[!installed_packages])
}
# Attach every package; invisible() hides the list returned by lapply().
invisible(lapply(packages, library, character.only = TRUE))

# Plot settings used by the figures of this section. They are repeated here
# (same values as in the biplot scripts) so that this section also runs in a
# fresh R session.
Three_colorblind <- c("#A8AD6F", "#AD6FA8", "#6FA8AD")
graph_elements_dark <- "black"
plot_elements_dark <- "gray25"
boot_alpha_main <- 0.9
boot_alpha_small <- 0.05
x_angle <- 0
x_adjust <- 0
title_text <- 9
subtitle_text <- 9
normal_text <- 9
plot_linewidth <- 0.33
point_shape <- 18
point_size <- 1.5

# Set the working directory (machine-specific path: change it to your own).
setwd("D:/Desktop/0000/R")
# Load the data. row.names = 1: the first column is used as row labels, so it
# is not a regular column of df.
# NOTE(review): the rest of this section works on R's built-in `iris` data set
# (iris[, -5]); `df` is not referenced again here. Confirm which is intended.
df <- read.csv("iris.csv", header = TRUE, row.names = 1)

# Bootstrap resampling
set.seed(123) # fix the random seed so the resamples are reproducible
tt <- 99 # number of bootstrap resamples
# iris[, -5] drops the 5th column because it is the class label.
# The result has one row per resample (tt rows); the resampled data sets are
# stored in its `strap` column.
df_boot <- iris[, -5] %>% modelr::bootstrap(n = tt)

# Accumulators for the per-resample PCA results (initialised once, as empty
# tibbles, so all three can be extended with bind_rows()):
N_PCS <- tibble() # number of PCs retained by the dimensionality test
pca_stats <- tibble() # variable contributions and loadings
R2 <- tibble() # percentage of explained variance

# Run a PCA on every bootstrap resample.
# Per-resample results are stored in pre-allocated lists and bound together
# once after the loop, instead of growing three tibbles on every iteration.
n_pcs_runs <- vector("list", tt)
pca_stats_runs <- vector("list", tt)
r2_runs <- vector("list", tt)

for (j in seq_len(tt)) { # seq_len() is also safe when tt is 0
  ## Extract the data of the j-th bootstrap resample
  dat <- df_boot %>%
    slice(j) %>% # j-th row
    pull(strap) %>% # the resample stored in the list column
    as.data.frame() # materialise it as a data frame

  # PCA with FactoMineR (used for contributions, loadings, explained variance).
  # ncp = 4: number of components kept; the maximum keeps all of them.
  pca_result <- FactoMineR::PCA(dat, scale.unit = TRUE, ncp = 4, graph = FALSE)

  # PCA with ade4 (input for the dimensionality test).
  # center: centre each variable (subtract its mean).
  # scale:  scale each variable (divide by its standard deviation).
  # scannf: whether the screeplot is displayed to choose the number of axes;
  #         FALSE keeps the run non-interactive and uses nf instead.
  # nf = 4: number of axes kept; the maximum keeps all of them.
  pca1 <- ade4::dudi.pca(dat, center = TRUE, scale = TRUE, scannf = FALSE, nf = 4)

  # Dimensionality test with 999 repetitions, to assess how many components
  # are significant.
  pc_tested <- ade4::testdim(pca1, nrepet = 999)

  ## Collect the PCA results of this resample
  # Number of retained PCs reported by the test (its nb.cor element).
  n_pcs_runs[[j]] <- tibble(strap = j, n_pcs = pc_tested$nb.cor)

  # Loadings in long format: one row per variable and PC.
  loadings_long <- pca_result$var$coord %>%
    as_tibble(rownames = "var") %>%
    pivot_longer(cols = !var, names_to = "PC", values_to = "loading")

  # Contributions in long format, joined with the loadings.
  pca_stats_runs[[j]] <- pca_result$var$contrib %>%
    as_tibble(rownames = "var") %>%
    pivot_longer(cols = !var, names_to = "PC", values_to = "contrib") %>%
    left_join(loadings_long, by = c("var", "PC")) %>%
    mutate(
      PC = str_sub(PC, start = 5), # keep the PC number (assumes names like "Dim.1")
      strap = j # bootstrap run number
    )

  # Explained variance (second column of the eigenvalue table) per PC.
  r2_runs[[j]] <- tibble(
    PC = rownames(pca_result[["eig"]]),
    exp_var = pca_result[["eig"]][, 2],
    strap = j
  ) %>%
    mutate(PC = str_sub(PC, start = 6)) # keep the PC number (assumes names like "comp 1")
}

# Bind the per-resample results into one table each.
N_PCS <- bind_rows(n_pcs_runs)
pca_stats <- bind_rows(pca_stats_runs)
R2 <- bind_rows(r2_runs)

# Number of retained PCs: count how many bootstrap runs produced each value
# and express the count as a percentage of all runs.
N_PCS <- N_PCS %>%
  group_by(n_pcs) %>%
  summarise(n_rep = n()) %>%
  mutate(retained = n_rep / tt * 100)

# Most frequent outcome(s) of the dimensionality test.
pc_ret <- N_PCS %>%
  filter(retained == max(retained))

# Print a one-line summary (first row only, in case of ties).
print(paste0(
  "Number of statistical significant components according to Dray method (Dray et al., 2008) was ",
  pc_ret$n_pcs[1],
  " in ",
  round(pc_ret$retained[1], digits = 1),
  "% of runs."
))


# Number of PCs to keep. NA means "decide automatically"; to set it by hand,
# replace NA with a number, e.g. n_pcs <- 2.
n_pcs <- NA
if (is.na(n_pcs)) {
  # Take the most frequently retained number of PCs (Dray et al. method).
  # Inside pull(), n_pcs refers to the column of N_PCS.
  n_pcs <- N_PCS %>%
    filter(retained == max(retained)) %>%
    pull(n_pcs)
}

## Variable contributions and loadings:
## add the bootstrap mean, median and standard deviation for every
## PC x variable combination, keeping one row per bootstrap run.
pca_stats <- pca_stats %>%
  group_by(PC, var) %>%
  mutate(
    contrib_mean = mean(contrib),
    contrib_median = median(contrib),
    contrib_std = sd(contrib),
    loading_mean = mean(loading),
    loading_median = median(loading),
    loading_std = sd(loading)
  ) %>%
  ungroup() %>%
  # The per-run values get a *_boot suffix so they cannot be confused with the
  # columns of the same name joined in later.
  dplyr::rename(contrib_boot = contrib, loading_boot = loading) %>%
  # Readable PC label, e.g. "PC1".
  mutate(PC_name = paste0("PC", PC))

## Explained variance: the same bootstrap summaries, per PC.
R2 <- R2 %>%
  group_by(PC) %>%
  mutate(
    R2_mean = mean(exp_var),
    R2_median = median(exp_var),
    R2_std = sd(exp_var)
  ) %>%
  ungroup() %>%
  dplyr::rename(R2_boot = exp_var) # per-run value, renamed for the same reason

## Attach the explained-variance columns to pca_stats (matched by PC and run).
pca_stats <- left_join(pca_stats, R2, by = c("PC", "strap"))

# PCA on the original (non-resampled) data. This overwrites the pca_result
# left over from the last bootstrap run.
pca_result <- FactoMineR::PCA(iris[, -5], scale.unit = TRUE, ncp = 4, graph = FALSE)

# Values computed from the original data (not bootstrap summaries), each
# reshaped so it can be joined to pca_stats.
# Explained variance: second column of the eigenvalue table.
true_r2 <- tibble(
  PC = rownames(pca_result[["eig"]]),
  R2 = pca_result[["eig"]][, 2]
) %>%
  mutate(PC = str_sub(PC, start = 6)) # keep the PC number (assumes names like "comp 1")

# Variable contributions, one row per variable and PC.
true_contrib <- pca_result$var$contrib %>%
  as_tibble(rownames = "var") %>%
  pivot_longer(cols = !var, names_to = "PC", values_to = "contrib") %>%
  mutate(PC = str_sub(PC, start = 5)) # keep the PC number (assumes names like "Dim.1")

# Variable loadings, one row per variable and PC.
true_loading <- pca_result$var$coord %>%
  as_tibble(rownames = "var") %>%
  pivot_longer(cols = !var, names_to = "PC", values_to = "loading") %>%
  mutate(PC = str_sub(PC, start = 5)) # keep the PC number

# Add the actual values to the bootstrap table.
pca_stats <- pca_stats %>%
  dplyr::left_join(true_r2, by = "PC") %>%
  dplyr::left_join(true_contrib, by = c("PC", "var")) %>%
  dplyr::left_join(true_loading, by = c("PC", "var"))

## Add the retention percentage from the bootstrap runs.
## Inside mutate(), n_pcs is the column of N_PCS (the number of retained PCs
## per outcome); each percentage is attached to the PC whose number equals it.
pca_stats <- pca_stats %>%
  dplyr::left_join(
    N_PCS %>%
      dplyr::mutate(PC = as.character(n_pcs)) %>%
      dplyr::select(PC, retained),
    by = "PC"
  )

# -----------------------------------
# Scree plot: explained variance per PC, with the bootstrap spread.
# PC labels in numeric order. As a factor with these levels, PC_name is drawn
# as PC1, PC2, ..., PC10 instead of alphabetically (PC10 before PC2).
pc_levels <- paste0("PC", sort(unique(as.integer(pca_stats$PC))))

# One row per PC and bootstrap run.
dat_boot <- pca_stats %>%
  dplyr::select(PC_name, PC, R2_boot) %>%
  unique() %>% # drop duplicated rows
  dplyr::mutate(
    PC = as.character(PC),
    PC_name = factor(PC_name, levels = pc_levels)
  )

# One row per PC: actual value plus bootstrap median and standard deviation.
dat_true <- pca_stats %>%
  dplyr::select(PC_name, PC, R2, R2_median, R2_std) %>%
  unique() %>% # drop duplicated rows
  dplyr::mutate(
    PC = as.character(PC),
    PC_name = factor(PC_name, levels = pc_levels)
  )

# group = 1 makes geom_line() connect the points across the discrete x axis.
p2 <- ggplot(data = dat_true, aes(x = PC_name, y = R2, group = 1)) +
  # Bootstrap standard deviation around the actual value
  geom_errorbar(aes(ymin = R2 - R2_std, ymax = R2 + R2_std),
                color = Three_colorblind[1], linewidth = plot_linewidth, width = 0.4) +
  geom_line(color = Three_colorblind[1]) +
  # Actual values
  geom_point(color = Three_colorblind[1], size = point_size) +
  # Value of every single bootstrap run
  geom_jitter(data = dat_boot, aes(x = PC_name, y = R2_boot, group = 1), alpha = 0.1,
              color = "black", shape = point_shape, size = 0.5, width = 0.1) +
  # Bootstrap median
  geom_point(aes(x = PC_name, y = R2_median), color = plot_elements_dark,
             alpha = boot_alpha_main, shape = point_shape, size = point_size) +
  # Numeric label slightly above the error bar
  geom_text(aes(x = PC_name, y = R2 + R2_std + 2, label = paste0(R2 %>% round(digits = 1), "%")),
            nudge_x = 0.33, size = 2) +
  labs(title = "", x = "", y = "Explained variance") +
  theme_classic() +
  theme(title = element_blank(),
        text = element_text(size = normal_text),
        axis.line = element_line(color = graph_elements_dark),
        axis.ticks.x = element_line(color = graph_elements_dark),
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = title_text, face = "bold"),
        # axis.title.x = element_blank(), # already handled in labs()
        axis.text = element_text(size = normal_text),
        axis.text.y = element_blank(),
        plot.margin = unit(c(0, 1, 0, 1), "cm"),
        legend.position = "none"
  )
p2

# -----------------------------------
# Variable loadings plot (one panel per retained PC).
# PC is stored as text, so it is converted with as.integer() before being
# compared with the number of retained PCs; comparing the text directly would
# sort "10" before "2".
# One row per PC, variable and bootstrap run.
dat_boot <- pca_stats %>%
  dplyr::filter(as.integer(PC) <= n_pcs[1]) %>% # drop the PCs that are not retained
  dplyr::select(PC_name, var, loading_boot) %>%
  unique() # drop duplicated rows

# One row per PC and variable: actual value plus bootstrap median and sd.
dat_true <- pca_stats %>%
  dplyr::filter(as.integer(PC) <= n_pcs[1]) %>% # drop the PCs that are not retained
  dplyr::select(PC_name, var, loading, loading_median, loading_std) %>%
  unique() # drop duplicated rows

p3 <- ggplot(data = dat_true, aes(x = var, y = loading)) +
  facet_grid(. ~ PC_name, scales = "free_y") +
  # Bootstrap standard deviation around the actual value
  geom_errorbar(aes(ymin = loading - loading_std, ymax = loading + loading_std),
                color = Three_colorblind[2], linewidth = plot_linewidth, width = 0.9) +
  # Actual values as bars
  geom_bar(stat = "identity", position = position_dodge(), fill = Three_colorblind[2]) +
  geom_hline(yintercept = 0, color = graph_elements_dark) +
  # Value of every single bootstrap run
  geom_jitter(data = dat_boot, aes(x = var, y = loading_boot), alpha = boot_alpha_small, color = plot_elements_dark,
              shape = point_shape, size = 0.2, width = 0.1) +
  # Bootstrap median
  geom_point(aes(x = var, y = loading_median), alpha = boot_alpha_main, shape = point_shape,
             size = point_size, color = plot_elements_dark) +
  coord_flip() + # swap the axes so the variable names are easier to read
  scale_y_continuous(breaks = waiver(), n.breaks = 4) + # value axis (horizontal after the flip)
  labs(y = "Loadings", x = "", title = "") +
  theme_classic() +
  theme(title = element_text(size = normal_text, face = "bold"),
        text = element_text(size = normal_text),
        axis.line.x = element_line(color = graph_elements_dark),
        axis.line.y = element_blank(),
        axis.ticks.x = element_line(color = graph_elements_dark),
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = title_text),
        axis.text = element_text(size = normal_text),
        axis.text.x = element_text(angle = x_angle, vjust = x_adjust),
        legend.position = "none",
        legend.title = element_text(size = title_text),
        legend.text = element_text(size = subtitle_text),
        legend.key.height = unit(1.0, "mm"),
        legend.key.width = unit(1.0, "mm"),
        plot.margin = unit(c(0, 0, 0, 0), "cm"),
        strip.text = element_text(face = "bold", size = title_text),
        strip.background = element_blank()
  )
p3

# -----------------------------------
# Variable contributions plot (one panel per retained PC).
# PC is stored as text, so it is converted with as.integer() before being
# compared with the number of retained PCs; comparing the text directly would
# sort "10" before "2".
# One row per PC, variable and bootstrap run.
dat_boot <- pca_stats %>%
  dplyr::filter(as.integer(PC) <= n_pcs[1]) %>% # drop the PCs that are not retained
  dplyr::select(PC_name, var, contrib_boot) %>%
  unique() # drop duplicated rows

# One row per PC and variable: actual value plus bootstrap median and sd.
dat_true <- pca_stats %>%
  dplyr::filter(as.integer(PC) <= n_pcs[1]) %>% # drop the PCs that are not retained
  dplyr::select(PC_name, var, contrib, contrib_median, contrib_std) %>%
  unique() # drop duplicated rows

p4 <- ggplot(data = dat_true, aes(x = var, y = contrib)) +
  facet_grid(. ~ PC_name, scales = "free_y") +
  # Bootstrap standard deviation.
  # NOTE(review): these bars are centred on the bootstrap median, whereas the
  # scree and loadings plots centre them on the actual value - confirm that
  # this difference is intended.
  geom_errorbar(aes(ymin = contrib_median - contrib_std, ymax = contrib_median + contrib_std),
                color = Three_colorblind[3], linewidth = plot_linewidth, width = 0.9) +
  # Actual values as bars
  geom_bar(stat = "identity", position = position_dodge(), fill = Three_colorblind[3]) +
  geom_hline(yintercept = 0, color = graph_elements_dark) +
  # Value of every single bootstrap run
  geom_jitter(data = dat_boot, aes(x = var, y = contrib_boot), alpha = boot_alpha_small, color = plot_elements_dark,
              shape = point_shape, size = 0.2, width = 0.1) +
  # Bootstrap median
  geom_point(aes(x = var, y = contrib_median), alpha = boot_alpha_main, shape = point_shape,
             size = point_size, color = plot_elements_dark) +
  coord_flip() + # swap the axes so the variable names are easier to read
  scale_y_continuous(breaks = waiver(), n.breaks = 4) + # value axis (horizontal after the flip)
  labs(y = "Contribution [%]", x = "", title = "") +
  theme_classic() +
  theme(title = element_text(size = normal_text, face = "bold"),
        text = element_text(size = normal_text),
        axis.line.x = element_line(color = graph_elements_dark),
        axis.line.y = element_blank(),
        axis.ticks.x = element_line(color = graph_elements_dark),
        axis.ticks.y = element_blank(),
        axis.title = element_text(size = title_text),
        axis.text = element_text(size = normal_text),
        axis.text.x = element_text(angle = x_angle, vjust = x_adjust),
        legend.position = "none",
        legend.title = element_text(size = title_text),
        legend.text = element_text(size = subtitle_text),
        legend.key.height = unit(1.0, "mm"),
        legend.key.width = unit(1.0, "mm"),
        # plot.margin = unit(c(0, 0, 0, 0), "cm"),
        strip.text = element_text(face = "bold", size = title_text),
        strip.background = element_blank()
  )
p4

# -----------------------------------
# Combine the figures with patchwork: scree plot on the left, loadings plot
# stacked above the contributions plot on the right.
library(patchwork)
# `/` binds tighter than `+`; the parentheses only make that grouping explicit.
p2 + (p3 / p4)

R语言绘制PCA双标图、碎石图、变量载荷图和变量贡献图_第3张图片

文中用到的数据代码:R语言绘制 PCA 双标图、碎石图、变量载荷图和变量贡献图(self).zip - 蓝奏云

你可能感兴趣的:(绘图,r语言,python,开发语言)