WDL学习笔记

官网是用来学习的最好的地方(然而我也没怎么看...)!

WDL是什么?

  • WDL是由Broad Institute开发的一种流程开发语言,全称workflow description language

WDL编辑器

使用Sublime Txet就可以,安装WDL插件就可以漂亮地显示WDL代码了。

一个WDL代码示例

这是一个我老板用来跑RRBS的pipeline,以此来学习。

workflow bismarkRRBS {

    String file_basename
    String masterFolder
    File Fastq_R1
    File? Fastq_R2
    Boolean paired
    Boolean MspI
    Int clip_R1
    Int? clip_R2
    Int three_prime_clip_R1
    Int? three_prime_clip_R2
    String genome_folder
    Int threadN

    String targetFolder = masterFolder + "/" + file_basename + "/"

    call makeDir as md1 {
        input:
            targetFolder = targetFolder
    }

    call makeDir as md2 {
        input:
            targetFolder = md1.outFolder + "report"
    }

    call trim_galore {
        input:
            Fastq_R1 = Fastq_R1,
            Fastq_R2 = Fastq_R2,
            clip_R1  = clip_R1,
            clip_R2  = clip_R2,
            three_prime_clip_R1 = three_prime_clip_R1,
            three_prime_clip_R2 = three_prime_clip_R2,
            paired   = paired,
            MspI     = MspI,
            targetFolder = md1.outFolder,
            reportFolder = md2.outFolder,
            file_basename = file_basename
    }

    call bismark {
        input:
            R1_trimmed = trim_galore.R1_trimmed,
            R2_trimmed = trim_galore.R2_trimmed,
            paired  = paired,
            threadN = threadN,
            file_basename = file_basename,
            genome_folder = genome_folder,
            targetFolder  = md1.outFolder,
            reportFolder  = md2.outFolder
    }

    call processBam {
        input:
            file_basename = file_basename,
            alignedBam    = bismark.alignedBam,
            genome_folder = genome_folder,
            alignedReport = bismark.alignedReport,
            paired  = paired,
            threadN = threadN,
            targetFolder = md1.outFolder,
            reportFolder = md2.outFolder
    }

    output {

        File DedupBam = processBam.DedupBam
        File DedupBam_index = processBam.DedupBam_index
    }


}


task trim_galore {

    File Fastq_R1
    File? Fastq_R2
    Boolean paired
    Boolean MspI
    Int clip_R1
    Int? clip_R2
    Int three_prime_clip_R1
    Int? three_prime_clip_R2
    String targetFolder
    String reportFolder
    String file_basename

    String s1 = if paired then "--paired" else ""
    String s2 = s1 + if (clip_R1 > 0) then " --clip_R1 " + clip_R1 else ""
    String s3 = s2 + if (three_prime_clip_R1 > 0) then " --three_prime_clip_R1 " + three_prime_clip_R1 else ""
    String s4 = s3 + if(paired && clip_R2 > 0) then " --clip_R2 " + clip_R2 else ""
    String s5 = s4 + if(MspI) then " --rrbs " else ""
    String commandText = s5 + if(paired && three_prime_clip_R2 > 0) then " --three_prime_clip_R2 " + three_prime_clip_R2 else ""

    command {

        mkdir trimmed 
        trim_galore ${commandText} -o trimmed --basename ${file_basename} ${Fastq_R1} ${Fastq_R2}

        cp trimmed/*report.txt ${reportFolder}
    }

    output{
        File R1_trimmed = if paired then "trimmed/${file_basename}_R1_val_1.fq.gz" else "trimmed/${file_basename}_trimmed.fq.gz"
        File R2_trimmed = if paired then "trimmed/${file_basename}_R2_val_2.fq.gz" else "/dev/null"
    }
}

task bismark {

    File R1_trimmed
    File? R2_trimmed
    Boolean paired
    Int threadN
    String file_basename
    String genome_folder
    String targetFolder
    String reportFolder

    String commandText = if paired then  " -1 " + R1_trimmed + " -2 " + R2_trimmed else R1_trimmed

    command {

        bismark --genome_folder ${genome_folder} \
            ${commandText} \
            -p ${threadN} \
            --rg_id ${file_basename} \
            --rg_sample ${file_basename} \
            --basename ${file_basename} \
            --temp_dir temp_dir \
            -o aligned

        cp aligned/*report.txt ${reportFolder}
    }

    output {

        File alignedBam = glob("aligned/*.bam")[0]
        File alignedReport = glob("aligned/*report.txt")[0]
    }
}

task processBam {

    File alignedBam
    File alignedReport
    File genome_folder
    String file_basename
    Boolean paired
    Int threadN
    String targetFolder
    String reportFolder

    String commandText  = if paired then "-p" else "-s"

    command {

        mkdir Dedup
        mkdir Call

        sambamba sort --natural-sort -m 8GB --tmpdir temp_dir -t ${threadN} -o ${file_basename}_nSorted.bam ${alignedBam}
        
        bismark_methylation_extractor ${commandText} --comprehensive --bedGraph --gzip --genome_folder ${genome_folder} \
            --multicore ${threadN} \
            -o Call ${file_basename}_nSorted.bam

        mv Call/*splitting_report.txt ${file_basename}_splitting_report.txt
        mv Call/*M-bias.txt ${file_basename}_M-bias.txt

        bismark2report --alignment_report ${alignedReport} \
            --mbias_report ${file_basename}_M-bias.txt \
            --splitting_report ${file_basename}_splitting_report.txt \
            -o ${file_basename}_report.html

        sambamba sort -m 8GB --tmpdir temp_dir -t ${threadN} -o ${file_basename}.bam ${file_basename}_nSorted.bam

        cp ${file_basename}.bam ${targetFolder}
        cp ${file_basename}.bam.bai ${targetFolder}
        cp ${file_basename}*.txt ${reportFolder}
        cp ${file_basename}_report.html ${targetFolder}
        cp -R Call ${targetFolder}
    }

    output {

        File DedupBam = "${file_basename}.bam"
        File DedupBam_index = "${file_basename}.bam.bai"
    }
}

task makeDir {

    String targetFolder

    command {
        
        if [ ! -d $targetFolder ] ; then
            mkdir ${targetFolder}
        else
            rm -rf ${targetFolder}
            mkdir ${targetFolder}
        fi
    }

    output {
        String outFolder = "${targetFolder}"
    }
}

WDL使用流程

WDL 脚本核心结构包括 5 个基本组件:workflow, task, call, command 以及 outputworkflow 即是描述整个流程的框架,其中调用( call )了不同的 task 。每一个 task 中又定义了各自的命令 command 和输出结果 output,位于workflow 外部。通过json文件传入参数。

结构概览:

# workflow部分
workflow your_workflow_name {
    String var1
    Int var2
    File var3
    ...
    
    call task_name1 {
        input:
            var1 = ...
            ...
    }
    
    ...
    
    output {
        ...
    }
}

# task部分
task task_name1 {
    String var1
    Int var2
    File var3
    ...
    
    command {
        ...
    }
    
    output{
    ...
    {
{

...

编写workflow部分

这一部分就是编写你的整个流程,包括声明变量、调用(call)任务(task),定义输出等部分。

语法结构为:

workflow workflow_name {
    var_type1 var1
    var_type2 var2
    ...
    
    call task1 {
        input:
            ...
    }
    
    call task1 {
        input:
            ...
    }
    
    output {
        File ...
    # 其实不太理解workflow部分的output有什么作用
}

在 WDL 中共有五种基本变量类型:

  • String 表示字符串
  • Float 为浮点数
  • Int 为整数
  • Boolean 为布尔型
  • File 表示文件

创建变量时,必须声明变量类型:String str1 = "hello"

另外,WDL 还支持复合类型:

  • Array 为数组
  • Map 为字典
  • Object 为对象

有些声明加上了一个 ? ,这表示该变量为可选变量。

使用变量:${var_name}

字符可以使用+直接相加,合并后的字符之间无空格,相对应R语言的paste0()函数

if语句:if xxx then xxx else,可读性很高

按照示例代码解释一遍:

# 创建名为bismarkRRBS的workflow
workflow bismarkRRBS {

# 声明所有要使用的变量
    String file_basename
    String masterFolder
    File Fastq_R1
    File? Fastq_R2
    Boolean paired
    Boolean MspI
    Int clip_R1
    Int? clip_R2
    Int three_prime_clip_R1
    Int? three_prime_clip_R2
    String genome_folder
    Int threadN

    String targetFolder = masterFolder + "/" + file_basename + "/"

# 调用任务。要读懂call部分,还是要先看定义的task部分
    call makeDir as md1 {
        input:
            targetFolder = targetFolder
    }

    call makeDir as md2 {
        input:
            targetFolder = md1.outFolder + "report"
    }

    call trim_galore {
        input:
            Fastq_R1 = Fastq_R1,
            Fastq_R2 = Fastq_R2,
            clip_R1  = clip_R1,
            clip_R2  = clip_R2,
            three_prime_clip_R1 = three_prime_clip_R1,
            three_prime_clip_R2 = three_prime_clip_R2,
            paired   = paired,
            MspI     = MspI,
            targetFolder = md1.outFolder,
            reportFolder = md2.outFolder,
            file_basename = file_basename
    }

    call bismark {
        input:
            R1_trimmed = trim_galore.R1_trimmed,
            R2_trimmed = trim_galore.R2_trimmed,
            paired  = paired,
            threadN = threadN,
            file_basename = file_basename,
            genome_folder = genome_folder,
            targetFolder  = md1.outFolder,
            reportFolder  = md2.outFolder
    }

    call processBam {
        input:
            file_basename = file_basename,
            alignedBam    = bismark.alignedBam,
            genome_folder = genome_folder,
            alignedReport = bismark.alignedReport,
            paired  = paired,
            threadN = threadN,
            targetFolder = md1.outFolder,
            reportFolder = md2.outFolder
    }

    output {

        File DedupBam = processBam.DedupBam
        File DedupBam_index = processBam.DedupBam_index
    }

}

编写task部分

task部分语法:

task task_name {
    var_type1 var1
    ...
    
    command {
        ...
    }
    output {
        ...
    }
}       

task的output部分可以被后面的task所调用,不用output声明则不能被调用。

以结构最简单的创建目录task为例:

# 创建名为makeDir的task
task makeDir {

# 声明该task中要使用的变量
    String targetFolder
# 编写该task的命令
    command {
        
        if [ ! -d $targetFolder ] ; then
            mkdir ${targetFolder}
        else
            rm -rf ${targetFolder}
            mkdir ${targetFolder}
        fi
    }
    
# 该task的定义了名为outFolder内容为"${targetFolder}"的一个变量
    output {
        String outFolder = "${targetFolder}"
    }
}

创建json文件

编写好的wdl脚本只是一个pipeline,没有具体(specific)的参数。所用使用json文件进行传参。json文件以.json为后缀,其格式为:

{
    “workflow_name.var1":"specific_var1",
    “workflow_name.var1":"specific_var2",
    ...
}

需要注意的是,除了最后一行不需要加逗号,其余行必须加逗号。

创建json文件的代码:

# 创建包含配置信息的list
xList = list(var1 = ..., 
             var2 = ...,
             var3 = ...,
             ...)

# 编写writeJSON函数
writeJSON <- function(xList, mainTag, fileOut){

    names(xList) = paste0(mainTag, ".", names(xList))
    NM  = names(xList)
    OUT = file(fileOut, "w")
    cat("{\n", file = OUT, append = FALSE, sep = "")

    for(i in 1:length(xList)){

        value = xList[[i]]
        if(i < length(xList))
            cat("\t\"", NM[i], "\":\"", value, "\",\n", file=OUT, append = TRUE, sep = "")
        else
            cat("\t\"", NM[i], "\":\"", value, "\"\n", file=OUT, append = TRUE, sep = "")
    }
    cat("}\n", file = OUT, append = TRUE, sep = "")

    close(OUT)
}

JSON = paste0("JSON/", Tag, ".json")
writeJSON(xList, "workflow_name", JSON)

Tag指的是数据样本的Tag。

writeJSON函数可以写到另外一个文件中,然后再source调用比较方便。

运行WDL脚本

不需要安装,只需要用调用java包即可。

wget https://github.com/broadinstitute/cromwell/releases/download/47/cromwell-47.jar
java -jar cromwell-47.jar run test.wdl --inputs test.json 

递交任务

通过使用qsub命令来对资源使用限量进行设置,并重定向stderr和stdout。

cmd = "java -jar cromwell-47.jar run test.wdl --inputs test.json"
cmd = paste0("echo ", "\"", cmd, "\"")
cmd = paste(cmd, "| qsub -N", jobName)
cmd = paste(cmd, "-l h_cpu=720:00:00 -V -m n -cwd -pe smp", Ncpu)
cmd = paste0(cmd, " -l h_vmem=", Memory, "G")
cmd = paste(cmd, "-o", outFile, "-e", errFile)
systerm(cmd)

同样可以写成一个函数,source调用。

多样本处理

以上讲的是单个或单组样本的workflow,但一般情况下,会得到多个raw_data,这样就需要循环上面的workflow。写个for循环即可,从1样本的数量循环,提取每个样本的Tag,并以此作为jobName和要保存的文件的文件夹名。

你可能感兴趣的:(WDL学习笔记)