Introduction

This document compares the output of lefse when it is run through different applications:

XMAS2 (R package)
lefse-conda (command line)
lefse-galaxy (from the galaxy platform)

In all cases, the same datasets are used: amplicon_ps and Zeybel_Gut, which are included in the XMAS2 package.

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)

library(XMAS2)
library(dplyr)
library(ggplot2)
library(devtools)
library(tibble)
library(tidyr)
library(magrittr)
library(readr)
library(VennDiagram)
library(purrr)

# rm(list = ls())
# Use the literal FALSE: `F` is an ordinary variable that can be reassigned.
options(stringsAsFactors = FALSE)
# Size limit for globals used by the `future` framework: 1000 * 1024^2 bytes.
options(future.globals.maxSize = 1000 * 1024^2)

Dataset

16s genus

data("amplicon_ps")
amplicon_ps_genus <- summarize_taxa(amplicon_ps, taxa_level = "Genus")
amplicon_ps_genus

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 406 taxa and 34 samples ]
## sample_data() Sample Data:       [ 34 samples by 8 sample variables ]
## tax_table()   Taxonomy Table:    [ 406 taxa by 6 taxonomic ranks ]

metagenomics species

data("Zeybel_Gut")
Zeybel_ps_species <- summarize_taxa(Zeybel_Gut, taxa_level = "Species")
Zeybel_ps_species

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 372 taxa and 42 samples ]
## sample_data() Sample Data:       [ 42 samples by 51 sample variables ]
## tax_table()   Taxonomy Table:    [ 372 taxa by 7 taxonomic ranks ]

Preparing for lefse galaxy and conda

1st row: class (required)
2nd row: subclass (optional)
3rd row: sampleID (required)
rownames: taxon
data format: tab-separated ("\t")

#' Build lefse-conda / lefse-galaxy input tables from a phyloseq object
#'
#' Keeps the samples whose `Class` value is in `Class_names`, drops taxa and
#' samples whose total abundance is not above `cutoff`, and returns the
#' filtered phyloseq object together with two transposed tables (features in
#' rows, samples in columns) ready to be written with `write.table()`.
#'
#' @param ps A phyloseq object (e.g. `amplicon_ps_genus`).
#' @param Class Name of the sample_data column holding the class
#'   (e.g. "SampleType").
#' @param Class_names Class levels to keep (e.g. `c("gut", "tongue")`).
#' @param Subclass Optional name(s) of sample_data column(s) holding the
#'   subclass; `NULL` (default) means no subclass row.
#' @param cutoff Taxa (rows) and samples (columns) are kept only if their
#'   total abundance is strictly greater than this value.
#' @return A list with `ps` (filtered phyloseq object), `lefse` (rows: class,
#'   "Sample", subclass, then taxa) and `lefse_nosub` (same without the
#'   "Sample" row).
prepare_lefse <- function(ps,
                          Class,
                          Class_names,
                          Subclass = NULL,
                          cutoff = 10) {

    sam_tab <- phyloseq::sample_data(ps) %>%
        data.frame()

    # Fail early with a readable message instead of an obscure select() error
    # further down when a requested column does not exist.
    missing_cols <- setdiff(c(Class, Subclass), colnames(sam_tab))
    if (length(missing_cols) > 0) {
        stop("Column(s) not found in sample_data(ps): ",
             paste(missing_cols, collapse = ", "), call. = FALSE)
    }

    # Temporarily rename the class column so it can be referred to by a fixed
    # name inside the dplyr verbs below.
    colnames(sam_tab)[which(colnames(sam_tab) == Class)] <- "CompClass"

    # c("CompClass", NULL) is just "CompClass", so a single pipeline covers
    # both the with- and without-subclass cases.
    sam_tab_final <- sam_tab %>%
        dplyr::select(dplyr::all_of(c("CompClass", Subclass))) %>%
        tibble::rownames_to_column("TempRowNames") %>%
        dplyr::filter(CompClass %in% Class_names) %>%
        tibble::column_to_rownames("TempRowNames")

    if (nrow(sam_tab_final) == 0) {
        stop("No samples with ", Class, " in: ",
             paste(Class_names, collapse = ", "), call. = FALSE)
    }

    colnames(sam_tab_final)[which(colnames(sam_tab_final) == "CompClass")] <- Class

    phyloseq::sample_data(ps) <- phyloseq::sample_data(sam_tab_final)
    # NOTE(review): whether the table extracted here is already restricted to
    # the selected samples depends on how phyloseq's `sample_data<-` prunes the
    # object -- confirm. If it is not, the thresholds below are computed over
    # all samples (the join further down still keeps only selected samples).
    # NOTE(review): data.frame() uses check.names = TRUE by default, so sample
    # or taxon names that are not syntactic R names may be altered -- confirm
    # this is acceptable for the datasets in use.
    otu_tab <- phyloseq::otu_table(ps) %>%
        data.frame()
    # Both sums are taken on the unfiltered table; `drop = FALSE` keeps a
    # data.frame even if a single row or column survives.
    otu_tab_final <- otu_tab[rowSums(otu_tab) > cutoff,
                             colSums(otu_tab) > cutoff,
                             drop = FALSE]
    phyloseq::otu_table(ps) <- phyloseq::otu_table(as.matrix(otu_tab_final), taxa_are_rows = TRUE)

    # Samples in rows: metadata columns followed by one column per taxon.
    # Built once and reused for both output tables.
    merged <- sam_tab_final %>%
        tibble::rownames_to_column("Sample") %>%
        dplyr::inner_join(otu_tab_final %>%
                              t() %>% data.frame() %>%
                              tibble::rownames_to_column("Sample"),
                          by = "Sample")

    # Row order after transposing: class, Sample, subclass, taxa.
    # NOTE(review): the accompanying text lists subclass before sampleID;
    # confirm which order run_lefse.sh expects when Subclass is supplied.
    lefse_data <- merged %>%
        dplyr::select(dplyr::all_of(Class), Sample,
                      dplyr::all_of(Subclass), dplyr::everything()) %>%
        t() %>% data.frame()

    # Same table without the sample-ID row.
    lefse_data_nosub <- merged %>%
        dplyr::select(-Sample) %>%
        dplyr::select(dplyr::all_of(Class), dplyr::all_of(Subclass),
                      dplyr::everything()) %>%
        t() %>% data.frame()

    list(ps = ps,
         lefse = lefse_data,
         lefse_nosub = lefse_data_nosub)
}

# 16S genus-level input: gut vs tongue, abundance cutoff of 10
amplicon_ps_genus_lefse <- prepare_lefse(
                          ps = amplicon_ps_genus,
                          Class = "SampleType",
                          Class_names = c("gut", "tongue"),
                          cutoff = 10)

# Tab-separated, no header line; the first column holds the row labels.
# FALSE is spelled out because `F` is a reassignable variable.
write.table(amplicon_ps_genus_lefse$lefse, "amplicon_ps_genus_lefse.tsv", quote = FALSE, sep = "\t", col.names = FALSE)
write.table(amplicon_ps_genus_lefse$lefse_nosub, "amplicon_ps_genus_lefse_nosub.tsv", quote = FALSE, sep = "\t", col.names = FALSE)

# Metagenomic species-level input: Mild vs Moderate, abundance cutoff of 1e-4
Zeybel_ps_species_lefse <- prepare_lefse(
                          ps = Zeybel_ps_species,
                          Class = "LiverFatClass",
                          Class_names = c("Mild", "Moderate"),
                          cutoff = 1e-4)

write.table(Zeybel_ps_species_lefse$lefse, "Zeybel_ps_species_lefse.tsv", quote = FALSE, sep = "\t", col.names = FALSE)
write.table(Zeybel_ps_species_lefse$lefse_nosub, "Zeybel_ps_species_lefse_nosub.tsv", quote = FALSE, sep = "\t", col.names = FALSE)

Run lefse independently with the three applications (R, conda, galaxy)

Running lefse in R (XMAS2)

Perform the analysis with the run_lefse and run_lefse2 functions:

amplicon_ps_genus

# run_lefse: label the rows with the application name, then sort by LDA score
amplicon_xmas2_output <- dplyr::arrange(
    dplyr::mutate(
        run_lefse(ps = amplicon_ps_genus_lefse$ps,
                  group = "SampleType",
                  group_names = c("gut", "tongue"),
                  norm = "CPM"),
        app_name = "xmas_lefse"),
    LDA_Score)
head(amplicon_xmas2_output)


# run_lefse2: same treatment, labelled separately
amplicon_xmas2_output2 <- dplyr::arrange(
    dplyr::mutate(
        run_lefse2(ps = amplicon_ps_genus_lefse$ps,
                   group = "SampleType",
                   group_names = c("gut", "tongue"),
                   norm = "CPM"),
        app_name = "xmas_lefse2"),
    LDA_Score)
head(amplicon_xmas2_output2)

##                            TaxaID             Block Enrichment LDA_Score
## 1                  g__Bacteroides 8_gut vs 9_tongue        gut -5.464661
## 2 g__Lachnospiraceae_unclassified 8_gut vs 9_tongue        gut -4.451749
## 3                  g__Lachnospira 8_gut vs 9_tongue        gut -4.404415
## 4 g__Ruminococcaceae_unclassified 8_gut vs 9_tongue        gut -4.394309
## 5             g__Faecalibacterium 8_gut vs 9_tongue        gut -4.206251
## 6        g__Phascolarctobacterium 8_gut vs 9_tongue        gut -4.183912
##   EffectSize Log2FoldChange (Median)\ngut_vs_tongue Median Abundance\n(All)
## 1   5.734982                               8.706188               4552.3520
## 2   2.296951                               3.048798              11431.9387
## 3   6.503718                                     NA                  0.0000
## 4   5.962548                               8.833464                945.6265
## 5   5.492180                                     NA                  0.0000
## 6   6.206589                                     NA                  0.0000
##   Median Abundance\ngut Median Abundance\ntongue
## 1             604210.78                 1446.655
## 2              64739.42                 7823.286
## 3              50368.43                    0.000
## 4              53763.30                  117.855
## 5              19194.92                    0.000
## 6              18238.84                    0.000
##   Log2FoldChange (Mean)\ngut_vs_tongue Mean Abundance\n(All)
## 1                             7.995845             277146.67
## 2                             3.186478              34604.22
## 3                                   NA              22767.22
## 4                             7.701566              26586.43
## 5                                   NA              13991.79
## 6                                   NA              13444.82
##   Mean Abundance\ngut Mean Abundance\ntongue Occurrence (100%)\n(All)
## 1           586352.50              2297.0454                   100.00
## 2            65446.48              7188.8734                   100.00
## 3            48380.35                 0.0000                    47.06
## 4            56192.47               269.9452                    76.47
## 5            29732.55                 0.0000                    47.06
## 6            28570.25                 0.0000                    47.06
##   Occurrence (100%)\ngut Occurrence (100%)\ntongue Odds Ratio (95% CI)
## 1                    100                    100.00    4.7e-14 (-60;60)
## 2                    100                    100.00  4.4e-24 (-110;110)
## 3                    100                      0.00                
## 4                    100                     55.56  7.2e-41 (-180;180)
## 5                    100                      0.00                
## 6                    100                      0.00                
##     app_name
## 1 xmas_lefse
## 2 xmas_lefse
## 3 xmas_lefse
## 4 xmas_lefse
## 5 xmas_lefse
## 6 xmas_lefse

Zeybel_ps_species

# run_lefse: label the rows with the application name, then sort by LDA score
MGS_xmas2_output <- dplyr::arrange(
    dplyr::mutate(
        run_lefse(ps = Zeybel_ps_species_lefse$ps,
                  group = "LiverFatClass",
                  group_names = c("Mild", "Moderate"),
                  norm = "CPM"),
        app_name = "xmas_lefse"),
    LDA_Score)
head(MGS_xmas2_output)


# run_lefse2: same treatment, labelled separately
MGS_xmas2_output2 <- dplyr::arrange(
    dplyr::mutate(
        run_lefse2(ps = Zeybel_ps_species_lefse$ps,
                   group = "LiverFatClass",
                   group_names = c("Mild", "Moderate"),
                   norm = "CPM"),
        app_name = "xmas_lefse2"),
    LDA_Score)
head(MGS_xmas2_output2)

##                            TaxaID                  Block Enrichment LDA_Score
## 1         s__Butyricimonas_virosa 12_Mild vs 12_Moderate   Moderate  2.462833
## 2       s__Bacteroides_salyersiae 12_Mild vs 12_Moderate   Moderate  2.628574
## 3 s__Bacteroides_thetaiotaomicron 12_Mild vs 12_Moderate   Moderate  3.177126
## 4           s__Bacteroides_clarus 12_Mild vs 12_Moderate   Moderate  3.221333
## 5        s__Bacteroides_coprocola 12_Mild vs 12_Moderate   Moderate  3.310338
## 6 s__Bacteroides_cellulosilyticus 12_Mild vs 12_Moderate   Moderate  3.332193
##   EffectSize Log2FoldChange (Median)\nMild_vs_Moderate Median Abundance\n(All)
## 1   1.668501                                        NA               0.8999998
## 2   1.573816                                        NA               0.0000000
## 3   2.775490                                        NA             223.8648101
## 4   2.331714                                        NA               0.0000000
## 5   1.367262                                        NA               0.0000000
## 6   3.145457                                        NA              82.9748375
##   Median Abundance\nMild Median Abundance\nModerate
## 1                      0                   262.9238
## 2                      0                     0.0000
## 3                      0                  1123.0082
## 4                      0                     0.0000
## 5                      0                     0.0000
## 6                      0                  2707.7169
##   Log2FoldChange (Mean)\nMild_vs_Moderate Mean Abundance\n(All)
## 1                               -2.329681              353.2093
## 2                                      NA              262.0187
## 3                               -2.241224             2413.3440
## 4                                      NA             1283.6658
## 5                                      NA             1704.3673
## 6                               -3.325019             3170.2327
##   Mean Abundance\nMild Mean Abundance\nModerate Occurrence (100%)\n(All)
## 1             117.2101                 589.2084                    50.00
## 2               0.0000                 524.0375                    16.67
## 3             842.6510                3984.0371                    66.67
## 4               0.0000                2567.3315                    20.83
## 5               0.0000                3408.7346                    16.67
## 6             575.2844                5765.1810                    62.50
##   Occurrence (100%)\nMild Occurrence (100%)\nModerate Odds Ratio (95% CI)
## 1                   25.00                       75.00       3.9 (6.6;1.2)
## 2                    0.00                       33.33                
## 3                   41.67                       91.67      3.1 (5.3;0.87)
## 4                    0.00                       41.67                
## 5                    0.00                       33.33                
## 6                   41.67                       83.33          17 (23;11)
##     app_name
## 1 xmas_lefse
## 2 xmas_lefse
## 3 xmas_lefse
## 4 xmas_lefse
## 5 xmas_lefse
## 6 xmas_lefse

Running lefse-conda (command line)

lefse-conda installation and version

Note: I installed lefse following the instructions from this site
after installing conda.

## Add channels
conda config --add channels defaults
conda config --add channels bioconda
conda config --add channels conda-forge
conda config --add channels biobakery

## Install lefse
conda create -n lefse -c biobakery lefse -y

Conda and lefse versions:

conda --version
#> conda 4.12.0

conda list | grep -e "lefse"
# packages in environment at /home/samuel/miniconda3/envs/lefse:
#> lefse                     1.1.2              pyhdfd78af_0    bioconda

Run lefse-conda

Generate a tabular dataset (amplicon_ps_genus_lefse or Zeybel_ps_species_lefse) compatible with lefse-conda and
lefse-galaxy using the get_dataset.R script.
Run the script run_lefse.sh (linux) with the following parameters:

# In general
# ./run_lefse.sh <path/to/conda/activate> <conda_env_name> <path/to/R> <dataset_prefix>

# in my case (Hua Zou)

## amplicon_ps_genus_lefse 
./run_lefse.sh /Users/zouhua/opt/anaconda3/bin/activate lefse /usr/local/bin/R amplicon_ps_genus_lefse

## Zeybel_ps_species_lefse 
./run_lefse.sh /Users/zouhua/opt/anaconda3/bin/activate lefse /usr/local/bin/R Zeybel_ps_species_lefse

Note: All script files, get_dataset.R and run_lefse.sh, and this rmarkdown
document must be in the same directory.

Import output from lefse-conda into R

# Read a lefse `.res` file (tab-separated, no header) and return the features
# whose absolute LDA score reaches `LDA_cutoff`.
#   datres      path to the .res file, e.g. "amplicon_ps_genus_lefse.res"
#   Class_names class labels; scores of features enriched in Class_names[1]
#               are negated so the two classes fall on opposite sides of zero
#   name        value stored in the `app_name` column
#   LDA_names   final name of the LDA score column
#   LDA_cutoff  minimum absolute LDA score to keep
get_lefse_python <- function(datres, 
                            Class_names,
                            name = "lefse_conda",
                            LDA_names = "lefse_conda_LDA",
                            LDA_cutoff = 2) {

    res_columns <- c(
        "TaxaID", "log_hi_class_avg", "Enrichment", "lefse_conda_LDA", "pval")

    raw_res <- readr::read_tsv(datres, show_col_types = FALSE, col_names = FALSE)
    raw_res <- magrittr::set_colnames(raw_res, res_columns)

    # Rows without an LDA score are dropped before the sign is assigned.
    scored <- dplyr::filter(raw_res, !is.na(lefse_conda_LDA))
    signed <- dplyr::mutate(
        scored,
        lefse_conda_LDA = ifelse(
            Enrichment == Class_names[1], -lefse_conda_LDA, lefse_conda_LDA),
        app_name = name)

    kept <- dplyr::filter(signed, abs(lefse_conda_LDA) >= LDA_cutoff)
    ordered <- dplyr::arrange(kept, lefse_conda_LDA)

    # Rename the score column last so the steps above can use a fixed name.
    names(ordered)[names(ordered) == "lefse_conda_LDA"] <- LDA_names

    ordered
}

amplicon_ps_genus_lefse_conda <- get_lefse_python(
                    datres = "amplicon_ps_genus_lefse.res", 
                    Class_names = c("gut", "tongue"),
                    LDA_names = "lefse_conda_LDA")
head(amplicon_ps_genus_lefse_conda)

Zeybel_ps_species_lefse_conda <- get_lefse_python(
                    datres = "Zeybel_ps_species_lefse.res", 
                    Class_names = c("Mild", "Moderate"),
                    LDA_names = "lefse_conda_LDA")
head(Zeybel_ps_species_lefse_conda)

## # A tibble: 6 × 6
##   TaxaID              log_hi_class_avg Enrichment lefse_conda_LDA pval  app_name
##                                                   
## 1 s__Butyricimonas_v…             2.77 Moderate              2.46 0.02… lefse_c…
## 2 s__Bacteroides_sal…             2.72 Moderate              2.47 0.03… lefse_c…
## 3 s__Bacteroides_cla…             3.41 Moderate              3.07 0.01… lefse_c…
## 4 s__Bacteroides_the…             3.60 Moderate              3.26 0.01… lefse_c…
## 5 s__Bacteroides_cop…             3.53 Moderate              3.28 0.03… lefse_c…
## 6 s__Bacteroides_int…             3.54 Moderate              3.32 0.03… lefse_c…

Running lefse from galaxy

The amplicon_ps_genus_lefse_nosub.tsv or Zeybel_ps_species_lefse_nosub.tsv file (no subjects included) is used as input for lefse on the galaxy platform of the Huttenhower lab at galaxy.

The conditions were as follows:

alpha was 0.05 for both the Kruskal-Wallis (KW) and Wilcoxon tests,
the LDA score threshold was 2.0,
and TSS normalization was applied as well.

Converting the output into a comparable format:

amplicon_ps_genus_lefse_nosub.res
Zeybel_ps_species_lefse_nosub.res

# Import the galaxy results for the 16S genus-level dataset
amplicon_ps_genus_lefse_galaxy <- get_lefse_python(
                        datres = "amplicon_ps_genus_lefse_nosub.res",
                        name = "lefse_galaxy",
                        Class_names = c("gut", "tongue"),
                        LDA_names = "lefse_galaxy_LDA")
head(amplicon_ps_genus_lefse_galaxy)

# Import the galaxy results for the metagenomic species-level dataset
Zeybel_ps_species_lefse_galaxy <- get_lefse_python(
                        datres = "Zeybel_ps_species_lefse_nosub.res",
                        name = "lefse_galaxy",
                        Class_names = c("Mild", "Moderate"),
                        LDA_names = "lefse_galaxy_LDA")
# Preview the table that was just created (previously the amplicon table was
# printed a second time here).
head(Zeybel_ps_species_lefse_galaxy)

## # A tibble: 6 × 6
##   TaxaID             log_hi_class_avg Enrichment lefse_galaxy_LDA pval  app_name
##                                                   
## 1 g__Bacteroides                 5.77 gut                   -5.46 0.00… lefse_g…
## 2 g__Lachnospiracea…             4.82 gut                   -4.45 0.00… lefse_g…
## 3 g__Ruminococcacea…             4.75 gut                   -4.41 0.00… lefse_g…
## 4 g__Lachnospira                 4.68 gut                   -4.38 0.00… lefse_g…
## 5 g__Phascolarctoba…             4.46 gut                   -4.18 0.00… lefse_g…
## 6 g__Faecalibacteri…             4.47 gut                   -4.18 0.00… lefse_g…

Extracting results from XMAS2 results

run_lefse (lefser R package)
run_lefse2 (microbiomeMarker R package)

# Reduce a run_lefse()/run_lefse2() result to the columns used in the
# comparison and keep the features whose absolute LDA score reaches
# `LDA_cutoff`.
#   datres      result table, e.g. amplicon_xmas2_output
#   name        value stored in the `app_name` column
#   LDA_names   final name of the LDA score column
#   LDA_cutoff  minimum absolute LDA score to keep
get_lefse_R <- function(datres,
                        name = "Rrun_lefse",
                        LDA_names = "lefse_R_LDA",
                        LDA_cutoff = 2) {

    wanted <- c(
        "TaxaID", "Block", "Enrichment", "LDA_Score", "EffectSize")

    picked <- dplyr::select(datres, all_of(wanted))
    tagged <- dplyr::mutate(picked, app_name = name)
    kept <- dplyr::filter(tagged, abs(LDA_Score) >= LDA_cutoff)
    ordered <- dplyr::arrange(kept, LDA_Score)

    # Rename the score column last so the steps above can use a fixed name.
    names(ordered)[names(ordered) == "LDA_Score"] <- LDA_names

    ordered
}

amplicon_ps_genus_lefse_R <- get_lefse_R(
                        datres = amplicon_xmas2_output, 
                        name = "Rrun_lefse",
                        LDA_names = "lefse_R_LDA")
head(amplicon_ps_genus_lefse_R)

amplicon_ps_genus_lefse_R2 <- get_lefse_R(
                        datres = amplicon_xmas2_output2, 
                        name = "Rrun_lefse2",
                        LDA_names = "lefse_R2_LDA")
head(amplicon_ps_genus_lefse_R2)


Zeybel_ps_species_lefse_R <- get_lefse_R(
                        datres = MGS_xmas2_output, 
                        name = "Rrun_lefse",
                        LDA_names = "lefse_R_LDA")
head(Zeybel_ps_species_lefse_R)

Zeybel_ps_species_lefse_R2 <- get_lefse_R(
                        datres = MGS_xmas2_output2, 
                        name = "Rrun_lefse2",
                        LDA_names = "lefse_R2_LDA")
head(Zeybel_ps_species_lefse_R2)

##                            TaxaID                  Block Enrichment
## 1         s__Butyricimonas_virosa 12_Mild vs 12_Moderate   Moderate
## 2       s__Bacteroides_salyersiae 12_Mild vs 12_Moderate   Moderate
## 3 s__Bacteroides_thetaiotaomicron 12_Mild vs 12_Moderate   Moderate
## 4     s__Bacteroides_intestinalis 12_Mild vs 12_Moderate   Moderate
## 5        s__Bacteroides_coprocola 12_Mild vs 12_Moderate   Moderate
## 6           s__Bacteroides_clarus 12_Mild vs 12_Moderate   Moderate
##   lefse_R2_LDA EffectSize    app_name
## 1     2.735285   1.668501 Rrun_lefse2
## 2     2.776971   1.573816 Rrun_lefse2
## 3     3.437617   2.775490 Rrun_lefse2
## 4     3.449263   1.184353 Rrun_lefse2
## 5     3.457030   1.367262 Rrun_lefse2
## 6     3.474700   2.331714 Rrun_lefse2

Comparison of lefse-conda with XMAS2

Number of features reported as significant

amplicon_ps_genus_lefse

# Bar chart of the number of significant features reported by each
# application.
#   dat1..dat4  result tables, each with an `app_name` column
#               (e.g. amplicon_ps_genus_lefse_conda, ..._galaxy, ..._R, ..._R2)
# Returns a ggplot object.
plot_signif_taxa_num <- function(dat1, dat2, dat3, dat4) {

    # Only `app_name` is needed for the counts. The former coalesced `LDA`
    # column was never used and made the function fail unless all four
    # specifically named LDA columns were present.
    feature_counts <- dplyr::bind_rows(dat1, dat2, dat3, dat4) %>%
        dplyr::count(app_name)

    feature_counts %>%
        ggplot(aes(app_name, n)) +
        geom_col() +
        geom_label(aes(label = n)) +
        ggtitle('Number of significant features identified by the different applications using lefse')
}

plot_signif_taxa_num(dat1 = amplicon_ps_genus_lefse_conda, 
                     dat2 = amplicon_ps_genus_lefse_galaxy, 
                     dat3 = amplicon_ps_genus_lefse_R, 
                     dat4 = amplicon_ps_genus_lefse_R2)

Zeybel_ps_species_lefse

plot_signif_taxa_num(dat1 = Zeybel_ps_species_lefse_conda, 
                     dat2 = Zeybel_ps_species_lefse_galaxy, 
                     dat3 = Zeybel_ps_species_lefse_R, 
                     dat4 = Zeybel_ps_species_lefse_R2)

Overlap of features reported as significant

amplicon_ps_genus_lefse

# Draw a four-set Venn diagram of the TaxaID values reported by each
# application on a fresh grid page.
#   dat1..dat4  result tables with a `TaxaID` column, in the order
#               lefse-conda, lefse-galaxy, run_lefse, run_lefse2
plot_signif_taxa_venn <- function(dat1, dat2, dat3, dat4) {

    taxa_sets <- list(dat1$TaxaID, dat2$TaxaID, dat3$TaxaID, dat4$TaxaID)
    set_labels <- c("lefse-conda", "lefse-galaxy", 
                    "run_lefse(lefser)", "run_lefse2(microbiomeMarker)")

    grid.newpage()
    # filename = NULL: the diagram is returned as grid objects, not saved.
    venn_grobs <- venn.diagram(
        x = taxa_sets,
        category.names = set_labels,
        filename = NULL
    )
    grid.draw(venn_grobs)
}

plot_signif_taxa_venn(dat1 = amplicon_ps_genus_lefse_conda, 
                      dat2 = amplicon_ps_genus_lefse_galaxy, 
                      dat3 = amplicon_ps_genus_lefse_R, 
                      dat4 = amplicon_ps_genus_lefse_R2)

Zeybel_ps_species_lefse

plot_signif_taxa_venn(dat1 = Zeybel_ps_species_lefse_conda, 
                      dat2 = Zeybel_ps_species_lefse_galaxy, 
                      dat3 = Zeybel_ps_species_lefse_R, 
                      dat4 = Zeybel_ps_species_lefse_R2)

LDA scores' comparison

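LDA scores of the features reported by all four applications (57 genera for amplicon_ps_genus_lefse and 11 species for Zeybel_ps_species_lefse in the tables below) are similar.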

amplicon_ps_genus_lefse

# Features reported by all four applications, one LDA column per application.
# Each table is trimmed to TaxaID + its LDA column before joining, so no
# suffixed duplicate columns are created along the way.
amplicon_joint_output <- purrr::reduce(
    .x = list(
        dplyr::select(amplicon_ps_genus_lefse_conda, TaxaID, lefse_conda_LDA),
        dplyr::select(amplicon_ps_genus_lefse_galaxy, TaxaID, lefse_galaxy_LDA),
        dplyr::select(amplicon_ps_genus_lefse_R, TaxaID, lefse_R_LDA),
        dplyr::select(amplicon_ps_genus_lefse_R2, TaxaID, lefse_R2_LDA)),
    .f = function(left, right) inner_join(left, right, by = "TaxaID"))
amplicon_joint_output

## # A tibble: 57 × 5
##    TaxaID              lefse_conda_LDA lefse_galaxy_LDA lefse_R_LDA lefse_R2_LDA
##                                                        
##  1 g__Bacteroides                -5.47            -5.46       -5.46        -5.77
##  2 g__Lachnospiraceae…           -4.47            -4.45       -4.45        -4.77
##  3 g__Ruminococcaceae…           -4.46            -4.41       -4.39        -4.74
##  4 g__Lachnospira                -4.36            -4.38       -4.40        -4.69
##  5 g__Faecalibacterium           -4.20            -4.18       -4.21        -4.48
##  6 g__Phascolarctobac…           -4.15            -4.18       -4.18        -4.47
##  7 g__Clostridiales_u…           -4.03            -4.06       -4.04        -4.38
##  8 g__Akkermansia                -3.91            -4.04       -4.11        -4.23
##  9 g__Oscillospira               -3.87            -3.89       -3.86        -4.19
## 10 g__Rikenellaceae_u…           -3.87            -3.91       -3.86        -4.23
## # … with 47 more rows

Zeybel_ps_species_lefse

# Features reported by all four applications, one LDA column per application.
# Each table is trimmed to TaxaID + its LDA column before joining, so no
# suffixed duplicate columns are created along the way.
MGS_joint_output <- purrr::reduce(
    .x = list(
        dplyr::select(Zeybel_ps_species_lefse_conda, TaxaID, lefse_conda_LDA),
        dplyr::select(Zeybel_ps_species_lefse_galaxy, TaxaID, lefse_galaxy_LDA),
        dplyr::select(Zeybel_ps_species_lefse_R, TaxaID, lefse_R_LDA),
        dplyr::select(Zeybel_ps_species_lefse_R2, TaxaID, lefse_R2_LDA)),
    .f = function(left, right) inner_join(left, right, by = "TaxaID"))
MGS_joint_output

## # A tibble: 11 × 5
##    TaxaID              lefse_conda_LDA lefse_galaxy_LDA lefse_R_LDA lefse_R2_LDA
##                                                        
##  1 s__Butyricimonas_v…            2.46             2.82        2.46         2.74
##  2 s__Bacteroides_sal…            2.47             3.16        2.63         2.78
##  3 s__Bacteroides_cla…            3.07             3.29        3.22         3.47
##  4 s__Bacteroides_the…            3.26             3.28        3.18         3.44
##  5 s__Bacteroides_cop…            3.28             3.24        3.31         3.46
##  6 s__Bacteroides_int…            3.32             3.31        3.49         3.45
##  7 s__Bacteroides_cel…            3.41             3.36        3.33         3.65
##  8 s__Bacteroides_egg…            3.52             3.59        3.41         3.82
##  9 s__Parabacteroides…            3.59             3.68        3.55         3.96
## 10 s__Barnesiella_int…            3.68             3.72        3.63         3.96
## 11 s__Prevotella_sp_C…            4.04             4.26        4.32         4.74

XMAS2 LDA scores vs lefse-conda LDA scores

amplicon_ps_genus_lefse

run_lefse (lefser R package) vs lefse-conda

# lefse-conda (x) against run_lefse (y); dashed lines mark zero on each axis
ggplot(data = amplicon_joint_output,
       mapping = aes(x = lefse_conda_LDA, y = lefse_R_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-conda and run_lefse")

run_lefse2 (microbiomeMarker R package) vs lefse-conda

# lefse-conda (x) against run_lefse2 (y); dashed lines mark zero on each axis
ggplot(data = amplicon_joint_output,
       mapping = aes(x = lefse_conda_LDA, y = lefse_R2_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-conda and run_lefse2")

Zeybel_ps_species_lefse

run_lefse (lefser R package) vs lefse-conda

# lefse-conda (x) against run_lefse (y); dashed lines mark zero on each axis
ggplot(data = MGS_joint_output,
       mapping = aes(x = lefse_conda_LDA, y = lefse_R_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-conda and run_lefse")

run_lefse2 (microbiomeMarker R package) vs lefse-conda

# lefse-conda (x) against run_lefse2 (y); dashed lines mark zero on each axis
ggplot(data = MGS_joint_output,
       mapping = aes(x = lefse_conda_LDA, y = lefse_R2_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-conda and run_lefse2")

XMAS2 LDA scores vs lefse-galaxy LDA scores

amplicon_ps_genus_lefse

run_lefse (lefser R package) vs lefse-galaxy

# lefse-galaxy (x) against run_lefse (y); dashed lines mark zero on each axis
ggplot(data = amplicon_joint_output,
       mapping = aes(x = lefse_galaxy_LDA, y = lefse_R_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-galaxy and run_lefse")

run_lefse2 (microbiomeMarker R package) vs lefse-galaxy

# lefse-galaxy (x) against run_lefse2 (y); dashed lines mark zero on each axis
ggplot(data = amplicon_joint_output,
       mapping = aes(x = lefse_galaxy_LDA, y = lefse_R2_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-galaxy and run_lefse2")

Zeybel_ps_species_lefse

run_lefse (lefser R package) vs lefse-galaxy

# lefse-galaxy (x) against run_lefse (y); dashed lines mark zero on each axis
ggplot(data = MGS_joint_output,
       mapping = aes(x = lefse_galaxy_LDA, y = lefse_R_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-galaxy and run_lefse")

run_lefse2 (microbiomeMarker R package) vs lefse-galaxy

# lefse-galaxy (x) against run_lefse2 (y); dashed lines mark zero on each axis
ggplot(data = MGS_joint_output,
       mapping = aes(x = lefse_galaxy_LDA, y = lefse_R2_LDA)) +
    geom_point(size = 3, shape = 1) +
    geom_hline(linetype = "dashed", yintercept = 0) +
    geom_vline(linetype = "dashed", xintercept = 0) +
    ggtitle("Comparison of LDA scores of features reported as significant 
    by both lefse-galaxy and run_lefse2")

Results:

The features shared between run_lefse2 (microbiomeMarker R package) and lefse-conda or lefse-galaxy have similar LDA scores. However, the features shared between run_lefse (lefser R package) and lefse-conda or lefse-galaxy seem to have slightly different LDA scores.

Differences between XMAS2 LDA scores and lefse-conda

setdiff(amplicon_ps_genus_lefse_conda$TaxaID, amplicon_ps_genus_lefse_R$TaxaID)

## [1] "g__Holdemania"                  "g__Alcaligenaceae_unclassified"

setdiff(amplicon_ps_genus_lefse_conda$TaxaID, amplicon_ps_genus_lefse_R2$TaxaID)

## character(0)

Data and code availability

lefse_comparison

Session info

devtools::session_info()

## ─ Session info ───────────────────────────────────────────────────────────────
##  setting  value
##  version  R version 4.1.2 (2021-11-01)
##  os       macOS Big Sur 10.16
##  system   x86_64, darwin17.0
##  ui       X11
##  language (EN)
##  collate  en_US.UTF-8
##  ctype    en_US.UTF-8
##  tz       Asia/Shanghai
##  date     2022-07-19
##  pandoc   2.17.1.1 @ /Applications/RStudio.app/Contents/MacOS/quarto/bin/ (via rmarkdown)
## 
## ─ Packages ───────────────────────────────────────────────────────────────────
##  package              * version  date (UTC) lib source
##  ade4                   1.7-18   2021-09-16 [1] CRAN (R 4.1.0)
##  annotate               1.72.0   2021-10-26 [1] Bioconductor
##  AnnotationDbi          1.56.2   2021-11-09 [1] Bioconductor
##  ape                    5.6-2    2022-03-02 [1] CRAN (R 4.1.2)
##  assertthat             0.2.1    2019-03-21 [1] CRAN (R 4.1.0)
##  Biobase                2.54.0   2021-10-26 [1] Bioconductor
##  BiocGenerics           0.40.0   2021-10-26 [1] Bioconductor
##  BiocParallel           1.28.3   2021-12-09 [1] Bioconductor
##  biomformat             1.22.0   2021-10-26 [1] Bioconductor
##  Biostrings             2.62.0   2021-10-26 [1] Bioconductor
##  bit                    4.0.4    2020-08-04 [1] CRAN (R 4.1.0)
##  bit64                  4.0.5    2020-08-30 [1] CRAN (R 4.1.0)
##  bitops                 1.0-7    2021-04-24 [1] CRAN (R 4.1.0)
##  blob                   1.2.2    2021-07-23 [1] CRAN (R 4.1.0)
##  brio                   1.1.3    2021-11-30 [1] CRAN (R 4.1.0)
##  bslib                  0.3.1    2021-10-06 [1] CRAN (R 4.1.0)
##  cachem                 1.0.6    2021-08-19 [1] CRAN (R 4.1.0)
##  callr                  3.7.0    2021-04-20 [1] CRAN (R 4.1.0)
##  caTools                1.18.2   2021-03-28 [1] CRAN (R 4.1.0)
##  cli                    3.3.0    2022-04-25 [1] CRAN (R 4.1.2)
##  cluster                2.1.2    2021-04-17 [1] CRAN (R 4.1.2)
##  codetools              0.2-18   2020-11-04 [1] CRAN (R 4.1.2)
##  coin                   1.4-2    2021-10-08 [1] CRAN (R 4.1.0)
##  colorspace             2.0-3    2022-02-21 [1] CRAN (R 4.1.2)
##  crayon                 1.5.0    2022-02-14 [1] CRAN (R 4.1.2)
##  data.table             1.14.2   2021-09-27 [1] CRAN (R 4.1.0)
##  DBI                    1.1.2    2021-12-20 [1] CRAN (R 4.1.0)
##  DelayedArray           0.20.0   2021-10-26 [1] Bioconductor
##  desc                   1.4.1    2022-03-06 [1] CRAN (R 4.1.2)
##  DESeq2                 1.34.0   2021-10-26 [1] Bioconductor
##  devtools             * 2.4.3    2021-11-30 [1] CRAN (R 4.1.0)
##  digest                 0.6.29   2021-12-01 [1] CRAN (R 4.1.0)
##  dplyr                * 1.0.8    2022-02-08 [1] CRAN (R 4.1.2)
##  edgeR                  3.36.0   2021-10-26 [1] Bioconductor
##  ellipsis               0.3.2    2021-04-29 [1] CRAN (R 4.1.0)
##  evaluate               0.15     2022-02-18 [1] CRAN (R 4.1.2)
##  fansi                  1.0.2    2022-01-14 [1] CRAN (R 4.1.2)
##  farver                 2.1.0    2021-02-28 [1] CRAN (R 4.1.0)
##  fastmap                1.1.0    2021-01-25 [1] CRAN (R 4.1.0)
##  foreach                1.5.2    2022-02-02 [1] CRAN (R 4.1.2)
##  formatR                1.11     2021-06-01 [1] CRAN (R 4.1.0)
##  fs                     1.5.2    2021-12-08 [1] CRAN (R 4.1.0)
##  futile.logger        * 1.4.3    2016-07-10 [1] CRAN (R 4.1.0)
##  futile.options         1.0.1    2018-04-20 [1] CRAN (R 4.1.0)
##  genefilter             1.76.0   2021-10-26 [1] Bioconductor
##  geneplotter            1.72.0   2021-10-26 [1] Bioconductor
##  generics               0.1.2    2022-01-31 [1] CRAN (R 4.1.2)
##  GenomeInfoDb           1.30.1   2022-01-30 [1] Bioconductor
##  GenomeInfoDbData       1.2.7    2022-03-09 [1] Bioconductor
##  GenomicRanges          1.46.1   2021-11-18 [1] Bioconductor
##  ggplot2              * 3.3.5    2021-06-25 [1] CRAN (R 4.1.0)
##  glmnet                 4.1-3    2021-11-02 [1] CRAN (R 4.1.0)
##  glue                   1.6.2    2022-02-24 [1] CRAN (R 4.1.2)
##  gplots                 3.1.1    2020-11-28 [1] CRAN (R 4.1.0)
##  gtable                 0.3.0    2019-03-25 [1] CRAN (R 4.1.0)
##  gtools                 3.9.2    2021-06-06 [1] CRAN (R 4.1.0)
##  highr                  0.9      2021-04-16 [1] CRAN (R 4.1.0)
##  hms                    1.1.1    2021-09-26 [1] CRAN (R 4.1.0)
##  htmltools              0.5.2    2021-08-25 [1] CRAN (R 4.1.0)
##  httr                   1.4.2    2020-07-20 [1] CRAN (R 4.1.0)
##  igraph                 1.2.11   2022-01-04 [1] CRAN (R 4.1.2)
##  IRanges                2.28.0   2021-10-26 [1] Bioconductor
##  iterators              1.0.14   2022-02-05 [1] CRAN (R 4.1.2)
##  jquerylib              0.1.4    2021-04-26 [1] CRAN (R 4.1.0)
##  jsonlite               1.8.0    2022-02-22 [1] CRAN (R 4.1.2)
##  KEGGREST               1.34.0   2021-10-26 [1] Bioconductor
##  KernSmooth             2.23-20  2021-05-03 [1] CRAN (R 4.1.2)
##  knitr                  1.37     2021-12-16 [1] CRAN (R 4.1.0)
##  labeling               0.4.2    2020-10-20 [1] CRAN (R 4.1.0)
##  lambda.r               1.2.4    2019-09-18 [1] CRAN (R 4.1.0)
##  lattice                0.20-45  2021-09-22 [1] CRAN (R 4.1.2)
##  libcoin                1.0-9    2021-09-27 [1] CRAN (R 4.1.0)
##  lifecycle              1.0.1    2021-09-24 [1] CRAN (R 4.1.0)
##  limma                  3.50.1   2022-02-17 [1] Bioconductor
##  locfit                 1.5-9.5  2022-03-03 [1] CRAN (R 4.1.2)
##  magrittr             * 2.0.2    2022-01-26 [1] CRAN (R 4.1.2)
##  MASS                   7.3-55   2022-01-13 [1] CRAN (R 4.1.2)
##  Matrix                 1.4-0    2021-12-08 [1] CRAN (R 4.1.0)
##  MatrixGenerics         1.6.0    2021-10-26 [1] Bioconductor
##  matrixStats            0.61.0   2021-09-17 [1] CRAN (R 4.1.0)
##  memoise                2.0.1    2021-11-26 [1] CRAN (R 4.1.0)
##  metagenomeSeq          1.36.0   2021-10-26 [1] Bioconductor
##  mgcv                   1.8-39   2022-02-24 [1] CRAN (R 4.1.2)
##  modeltools             0.2-23   2020-03-05 [1] CRAN (R 4.1.0)
##  multcomp               1.4-18   2022-01-04 [1] CRAN (R 4.1.2)
##  multtest               2.50.0   2021-10-26 [1] Bioconductor
##  munsell                0.5.0    2018-06-12 [1] CRAN (R 4.1.0)
##  mvtnorm                1.1-3    2021-10-08 [1] CRAN (R 4.1.0)
##  nlme                   3.1-155  2022-01-13 [1] CRAN (R 4.1.2)
##  permute                0.9-7    2022-01-27 [1] CRAN (R 4.1.2)
##  phyloseq               1.38.0   2021-10-26 [1] Bioconductor
##  pillar                 1.7.0    2022-02-01 [1] CRAN (R 4.1.2)
##  pkgbuild               1.3.1    2021-12-20 [1] CRAN (R 4.1.0)
##  pkgconfig              2.0.3    2019-09-22 [1] CRAN (R 4.1.0)
##  pkgload                1.2.4    2021-11-30 [1] CRAN (R 4.1.0)
##  plyr                   1.8.6    2020-03-03 [1] CRAN (R 4.1.0)
##  png                    0.1-7    2013-12-03 [1] CRAN (R 4.1.0)
##  prettyunits            1.1.1    2020-01-24 [1] CRAN (R 4.1.0)
##  processx               3.5.2    2021-04-30 [1] CRAN (R 4.1.0)
##  ps                     1.6.0    2021-02-28 [1] CRAN (R 4.1.0)
##  purrr                * 0.3.4    2020-04-17 [1] CRAN (R 4.1.0)
##  R6                     2.5.1    2021-08-19 [1] CRAN (R 4.1.0)
##  RColorBrewer           1.1-2    2014-12-07 [1] CRAN (R 4.1.0)
##  Rcpp                   1.0.8.2  2022-03-11 [1] CRAN (R 4.1.2)
##  RCurl                  1.98-1.6 2022-02-08 [1] CRAN (R 4.1.2)
##  readr                * 2.1.2    2022-01-30 [1] CRAN (R 4.1.2)
##  remotes                2.4.2    2021-11-30 [1] CRAN (R 4.1.0)
##  reshape2               1.4.4    2020-04-09 [1] CRAN (R 4.1.0)
##  rhdf5                  2.38.1   2022-03-10 [1] Bioconductor
##  rhdf5filters           1.6.0    2021-10-26 [1] Bioconductor
##  Rhdf5lib               1.16.0   2021-10-26 [1] Bioconductor
##  rlang                  1.0.2    2022-03-04 [1] CRAN (R 4.1.2)
##  rmarkdown              2.13     2022-03-10 [1] CRAN (R 4.1.2)
##  rprojroot              2.0.2    2020-11-15 [1] CRAN (R 4.1.0)
##  RSQLite                2.2.10   2022-02-17 [1] CRAN (R 4.1.2)
##  rstudioapi             0.13     2020-11-12 [1] CRAN (R 4.1.0)
##  S4Vectors              0.32.3   2021-11-21 [1] Bioconductor
##  sandwich               3.0-1    2021-05-18 [1] CRAN (R 4.1.0)
##  sass                   0.4.0    2021-05-12 [1] CRAN (R 4.1.0)
##  scales                 1.1.1    2020-05-11 [1] CRAN (R 4.1.0)
##  sessioninfo            1.2.2    2021-12-06 [1] CRAN (R 4.1.0)
##  shape                  1.4.6    2021-05-19 [1] CRAN (R 4.1.0)
##  stringi                1.7.6    2021-11-29 [1] CRAN (R 4.1.0)
##  stringr                1.4.0    2019-02-10 [1] CRAN (R 4.1.0)
##  SummarizedExperiment   1.24.0   2021-10-26 [1] Bioconductor
##  survival               3.3-1    2022-03-03 [1] CRAN (R 4.1.2)
##  testthat               3.1.2    2022-01-20 [1] CRAN (R 4.1.2)
##  TH.data                1.1-0    2021-09-27 [1] CRAN (R 4.1.0)
##  tibble               * 3.1.6    2021-11-07 [1] CRAN (R 4.1.0)
##  tidyr                * 1.2.0    2022-02-01 [1] CRAN (R 4.1.2)
##  tidyselect             1.1.2    2022-02-21 [1] CRAN (R 4.1.2)
##  tzdb                   0.2.0    2021-10-27 [1] CRAN (R 4.1.0)
##  usethis              * 2.1.5    2021-12-09 [1] CRAN (R 4.1.0)
##  utf8                   1.2.2    2021-07-24 [1] CRAN (R 4.1.0)
##  vctrs                  0.3.8    2021-04-29 [1] CRAN (R 4.1.0)
##  vegan                  2.5-7    2020-11-28 [1] CRAN (R 4.1.0)
##  VennDiagram          * 1.7.3    2022-04-12 [1] CRAN (R 4.1.2)
##  vroom                  1.5.7    2021-11-30 [1] CRAN (R 4.1.0)
##  withr                  2.5.0    2022-03-03 [1] CRAN (R 4.1.2)
##  Wrench                 1.12.0   2021-10-26 [1] Bioconductor
##  xfun                   0.30     2022-03-02 [1] CRAN (R 4.1.2)
##  XMAS2                * 2.1.6    2022-07-19 [1] local
##  XML                    3.99-0.9 2022-02-24 [1] CRAN (R 4.1.2)
##  xtable                 1.8-4    2019-04-21 [1] CRAN (R 4.1.0)
##  XVector                0.34.0   2021-10-26 [1] Bioconductor
##  yaml                   2.3.5    2022-02-21 [1] CRAN (R 4.1.2)
##  zlibbioc               1.40.0   2021-10-26 [1] Bioconductor
##  zoo                    1.8-9    2021-03-09 [1] CRAN (R 4.1.0)
## 
##  [1] /Library/Frameworks/R.framework/Versions/4.1/Resources/library
## 
## ──────────────────────────────────────────────────────────────────────────────

Reference

lefse_comparison

数据分析：Comparing outputs from XMAS2, lefse-conda, and lefse-galaxy using the same in-house datasets...

Introduction

Dataset

16s genus

metagenomics species

Preparing for lefse galaxy and conda

Run lefse independently with the three applications (R, conda, galaxy)

Running lefse in R (XMAS2)

Running lefse-conda (command line)

lefse-conda installation and version

Run lefse-conda

Import output from lefse-conda into R

Running lefse from galaxy

Extracting results from XMAS2 results

Comparison of lefse-conda with XMAS2

Number of features reported as significant

Overlap of features reported as significant

LDA scores' comparison

XMAS2 LDA scores vs lefse-conda LDA scores

amplicon_ps_genus_lefse

Zeybel_ps_species_lefse

XMAS2 LDA scores vs lefse-galaxy LDA scores

amplicon_ps_genus_lefse

Zeybel_ps_species_lefse

Differences between XMAS2 LDA scores and lefse-conda

Data and code availability

Session info

Reference

你可能感兴趣的:(数据分析：Comparing outputs from XMAS2, lefse-conda, and lefse-galaxy using the same in-house datasets...)