2021-11-26

# -*- coding: utf-8 -*-

# callCodeML

# Verson: 2.0

# Date: 2021.11.15

# This is used to call Codeml to calculate the positive selection of gene in Branch-Sites Model.(Require codon sequence)

# Usage: python3 callCodeml.py seqDir treeFile

# Refactoring code and running logic to support multiprocess

import re

import sys

import os

from scipy.stats import chi2

import time

def help_info():

    print("This script is used to call codeML.\nUsage: python3 callCodeml.py Dir treeFile\n")

def getArgs():

    global workDir

    global tree_file

    global dir_path


    start_time = time.strftime('%Y%m%d-%H%M%S') # to keep the working dircitory name

    workDir = f"{os.getcwd()}/WorkingDir_{start_time}"

    args = sys.argv[1:]

    if len(args) == 0:

        help_info()

    if os.path.isdir(args[0]) is True:

        dir_path = os.path.abspath(args[0])  # path of seqences

        seqs = os.listdir(dir_path)

        seqs = [x for x in seqs if not str(x).startswith('.') ] # remove hided files

        tree_file = os.path.abspath(args[1])

        print(f"The drictory is: {dir_path}\nThe treeFile is: {tree_file}")

        #os.chdir(os.path.abspath(args[0]))

        return (dir_path, seqs, tree_file)

    else:

        help_info()

def create_dir(name):

    try:

        os.mkdir(f'{workDir}/')

    except:pass

    try:

        os.mkdir(f'{workDir}/{name}')

    except:pass

    try:

        os.mkdir(f'{workDir}/{name}/null')

        os.mkdir(f'{workDir}/{name}/alte')

    except:pass

def creat_ctl(name, tree_file):


    ctl_null = f'''

    seqfile = {dir_path}/{name}

    treefile = {tree_file}

    outfile = {workDir}/{name}/null/{name}_null.res

    noisy = 9 

    verbose = 0 

    runmode = 0 

    seqtype = 1 

    CodonFreq = 2 

    clock = 0 

    aaDist = 0

    model = 0

    NSsites = 0 

    icode = 0 

    Mgene = 0

    fix_kappa = 0 

    kappa = 2 

    fix_omega = 0 

    omega = 1 

    fix_alpha = 1 

    alpha = .0 

    Malpha = 0 

    ncatG = 3 

    getSE = 0 

    RateAncestor = 0 

    fix_blength = 0 

    method = 0 

    Small_Diff = .45e-6

    cleandata = 1

    '''

    ctl_alte = ctl_null.replace(f'{workDir}/{name}/null/{name}_null.res',  f'{workDir}/{name}/alte/{name}_alte.res')\

        .replace('model = 0', 'model = 2')


    with open(f'{workDir}/{name}/null/null_profile.ctl', 'w') as f:

        f.write(ctl_null)


    with open(f'{workDir}/{name}/alte/alte_profile.ctl', 'w') as f:

        f.write(ctl_alte)

def run_create(dir_path, seqs, tree_file):


    for i in (seqs):


        name= i.split('/')[-1]

        create_dir(name)

        creat_ctl(name, tree_file)


    with open(f'{workDir}/codeml.sh', 'w') as f:

        f.write('''

#!/bin/bash

list=`ls -1 |grep OG`

count=`ls -1 |grep OG|wc|awk '{print $1}'`

echo "Total number: $count"

dir=`pwd`

echo "Start Now..."

for i in $list

do

    {

    cd $dir/$i/null

    codeml ./null_profile.ctl  >log.txt

    cd $dir/$i/alte

    codeml ./alte_profile.ctl  >log.txt

    #echo "Finished $i..."

    }&

done

wait

''')


    print(f"{time.strftime('%Y-%m-%d %H:%M:%S')} Files created.\n")



def run_codeml():

    os.chdir(workDir)

    os.system('bash codeml.sh')

    print(f"\n{time.strftime('%Y-%m-%d %H:%M:%S')} Codeml Done...\n")

def run_stat(name):

    def get_res(path):

        with open(path, "r") as f:

            t =  f.read()

            kappa = re.findall(r"kappa.+", t)[0].split(' ')[-1].replace('\n', '')

            ls = re.findall(r'lnL.+', t)[0].split(' ')

            ls=[x for x in ls if x!='']

            lnL = ls[4]

            np = ls[3].replace("):","")

            return([lnL, np, kappa])

    path1 = f'{workDir}/{name}/alte/{name}_alte.res'

    path2 = f'{workDir}/{name}/null/{name}_null.res'


    try:

        res_alte = get_res(path1)

        res_null = get_res(path2)


        lnL0 = float(res_null[0])

        lnL1 = float(res_alte[0])

        np0 = int(res_null[1])

        np1 = int(res_alte[1])

        kappa = float(res_alte[2])

        lnl = abs(lnL0 - lnL1 )*2

        np = abs(np0 - np1)

        pvalue = 1 - chi2.cdf(lnl, np)

        #if pvalue <= 0.05:

        with open(f'{workDir}/result.tsv', 'a') as f:

            f.write(f"{name}\t{lnL0}\t{lnL1}\t{np0}\t{np1}\t{kappa}\t{pvalue}\n")

    except:

        print(f"{time.strftime('%Y-%m-%d %H:%M:%S')}  {name} failed to stats.")

def main(): 


    arges = getArgs()

    run_create(arges[0], arges[1], arges[2])

    run_codeml()

    for i in  os.listdir(workDir):

        run_stat(i)

if __name__ == "__main__":

    t0 = time.time()


    main()


    print(f'Total time used: {time.time() - t0} S\n')

你可能感兴趣的:(2021-11-26)