【多线程优化】fastq文件MD5批量校验、批量解压、压缩处理脚本设计

目的:

  1. 检查下机fastq.gz文件的完整性(MD5校验)
  2. 可以只生成MD5文件,但不做检查
  3. 可以多线程批量解压fastq.gz
  4. 可以多线程批量压缩fastq
# -*- coding:utf-8 -*-
# Use for MD5check and compress|decompress 
# Author:Robin; Created in 20200316
import getopt
import glob
import hashlib
import os
import re
import subprocess
import sys
import time
#import filecmp

def parameter_check():
    """Parse the command line and run the requested actions.

    Value options (-m, -f, -n, -t) are collected first; the action flags
    (-c, -s, -d, -p) are executed afterwards, in the order they appeared on
    the command line.  This makes the result independent of where the value
    options are placed relative to the action flags.

    Returns:
        tuple: ``(raw_md5, file_type)``.  An element is ``None`` when the
        corresponding option was not supplied.

    Exits through ``sys.exit`` when help is requested, when no option is
    given, when an option is unknown or malformed, or when an action is
    missing one of the options it needs.
    """
    try:
        # 'threads=' needs the trailing '=' so that --threads accepts a
        # value, matching the short form 't:'.
        opts, _ = getopt.getopt(
            sys.argv[1:],
            'hm:f:t:n:cdps',
            ['help', 'md5_file=', 'file_type=', 'threads=', 'new_md5_file=',
             'check', 'decompress', 'press', 'sum'])
    except getopt.GetoptError as err:
        # Print something like "option -a not recognized", then the help.
        print(str(err))
        usage()
        sys.exit(2)

    if not opts:
        usage()
        sys.exit()

    raw_md5 = file_type = new_md5_file = cores = None
    actions = []  # action names, kept in command-line order

    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            sys.exit()
        elif opt in ('-m', '--md5_file'):
            raw_md5 = arg
            print('raw_md5:{}'.format(raw_md5))
        elif opt in ('-f', '--file_type'):
            file_type = arg
            print('file_type:{}'.format(file_type))
        elif opt in ('-n', '--new_md5_file'):
            new_md5_file = arg
            print('new_md5_file:{}'.format(new_md5_file))
        elif opt in ('-t', '--threads'):
            cores = arg
        elif opt in ('-c', '--check'):
            actions.append('check')
        elif opt in ('-s', '--sum'):
            actions.append('sum')
        elif opt in ('-d', '--decompress'):
            actions.append('decompress')
        elif opt in ('-p', '--press'):
            actions.append('press')

    if cores is None:
        # No -t given: fall back to the number of CPUs reported by the OS.
        cores = str(os.cpu_count() or 1)
    elif not cores.isdigit() or int(cores) < 1:
        print('线程数必须为正整数: {}'.format(cores))
        usage()
        sys.exit(2)

    # Options each action cannot run without.
    required = {
        'check': (('-m', raw_md5), ('-f', file_type), ('-n', new_md5_file)),
        'sum': (('-f', file_type), ('-n', new_md5_file)),
        'decompress': (('-f', file_type),),
        'press': (('-f', file_type),),
    }
    # Validate every action before running any of them, so a bad command
    # line never leaves the work half done.
    for action in actions:
        missing = [flag for flag, value in required[action] if value is None]
        if missing:
            print('--{} 缺少必需参数: {}'.format(action, ' '.join(missing)))
            usage()
            sys.exit(2)

    for action in actions:
        if action == 'check':
            print('Running M5Dcheck function')
            M5Dcheck(raw_md5, file_type, new_md5_file)
        elif action == 'sum':
            MD5sum(file_type, new_md5_file)
        elif action == 'decompress':
            decompress(cores, file_type)
        elif action == 'press':
            compress(cores, file_type)

    return (raw_md5, file_type)



def _file_md5(path, chunk_size=1024 * 1024):
    """Return the hexadecimal MD5 digest of the file at *path*, read in chunks."""
    digest = hashlib.md5()
    with open(path, 'rb') as handle:
        for chunk in iter(lambda: handle.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()


def MD5sum(file_type, new_md5_file):
    """Write the MD5 digests of all files matching ``'*' + file_type``.

    The output uses the ``md5sum`` text layout (``<digest>  <name>``), one
    line per file, sorted by name.  *new_md5_file* is overwritten on every
    call, so running the function twice does not duplicate entries.

    Args:
        file_type: Glob pattern (or suffix) of the files to hash, relative
            to the current working directory.
        new_md5_file: Path of the digest file to (re)create.

    Returns:
        None.  A success or failure message is printed.
    """
    print('正在运行MD5SUM...')
    output_path = os.path.abspath(new_md5_file)
    # Regular files only, and never the digest file itself, which could
    # otherwise match a broad pattern while it is being written.
    targets = sorted(
        path for path in glob.glob('*{}'.format(file_type))
        if os.path.isfile(path) and os.path.abspath(path) != output_path)

    failed = not targets  # nothing matched: there is nothing to verify
    try:
        # Mode 'w' truncates the digest file, so the output always reflects
        # exactly this run.
        with open(new_md5_file, 'w') as out:
            for path in targets:
                out.write('{}  {}\n'.format(_file_md5(path), path))
    except (IOError, OSError) as err:
        print(err)
        failed = True

    if failed:
        print('生成MD5文件有问题,请检查!')
    else:
        print('MD5文件已生成:./{}'.format(new_md5_file))


def _read_md5_table(path):
    """Parse an ``md5sum``-style file into ``{basename: lowercase digest}``."""
    table = dict()
    with open(path, 'r') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            # Split on the first run of whitespace only, so file names that
            # contain spaces stay intact.
            parts = line.split(None, 1)
            if len(parts) != 2:
                print('无法解析的MD5行: {}'.format(line))
                continue
            digest, name = parts
            if name.startswith('*'):
                name = name[1:]  # md5sum binary-mode marker
            # Compare by base name so entries recorded with a directory
            # prefix still match files hashed in the current directory.
            table[os.path.basename(name)] = digest.lower()
    return table


def M5Dcheck(raw_md5, file_type, new_md5_file):
    """Regenerate MD5 digests and compare them with a reference list.

    ``MD5sum(file_type, new_md5_file)`` is run first; every file named in
    *raw_md5* is then compared with the freshly written digest.  A file that
    is missing from the new list counts as a failure.

    Args:
        raw_md5: Path of the reference digest file (``md5sum`` layout).
        file_type: Pattern of the files to hash, forwarded to ``MD5sum``.
        new_md5_file: Path of the digest file to generate and read back.

    Returns:
        None.  Per-file results and a summary line are printed.
    """
    MD5sum(file_type, new_md5_file)
    raw_dict = _read_md5_table(raw_md5)
    new_dict = _read_md5_table(new_md5_file)

    i = j = 0  # i: failed checks, j: passed checks
    for check in raw_dict:
        if new_dict.get(check) != raw_dict[check]:
            print("文件{}的MD5值校验失败!".format(check))
            i += 1
        else:
            print('恭喜,文件{}校验成功!\n'.format(check))
            j += 1

    print('总文件数:{}个\t验证成功:{}个\t验证失败:{}个'.format(i+j, j, i))



def usage():
    """Print the command-line help text.

    The decompress/compress examples show ``-t <threads>``; ``-c`` is the
    MD5 check flag and does not belong on those command lines.
    """
    print('''
    Usage:

          MD5文件校验:python ParallelMd5DeCompress.py -m <'文件的原本MD5文件'> -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -c
          只生成MD5检验文件:python ParallelMd5DeCompress.py -f <'*.文件后缀名'>  -n <'新生成MD5的文件名'> -s
          文件解压:python ParallelMd5DeCompress.py -t <线程数>  -f  <'*.文件后缀名'> -d
          文件压缩:python ParallelMd5DeCompress.py -t <线程数>  -f  <'*.文件后缀名'> -p
         ''')


def decompress(cores, file_type):
    """Decompress every file matching *file_type* with ``pigz -dk``.

    Args:
        cores: Number of pigz threads (passed to ``-p``).
        file_type: Glob pattern of the compressed files, e.g. ``'*.fq.gz'``.

    Returns:
        None.  A per-file success or failure message is printed.
    """
    fq_list = sorted(glob.glob(file_type))
    if not fq_list:
        print('未找到匹配{}的文件,请检查!'.format(file_type))
        return

    for fq in fq_list:
        print(fq)
        try:
            # Argument list rather than a shell string, so file names with
            # spaces or shell metacharacters are passed through unchanged.
            status = subprocess.call(['pigz', '-dk', '-p', str(cores), fq])
        except OSError as err:
            # pigz is not installed or cannot be executed.
            print(err)
            status = 127
        print(status)

        if status:
            print('{}文件解压失败,请检查!'.format(fq))
        else:
            print('{}文件解压成功!'.format(fq))
def compress(cores, file_type):
    """Compress every file matching *file_type* with ``pigz -k``.

    Args:
        cores: Number of pigz threads (passed to ``-p``).
        file_type: Glob pattern of the files to compress, e.g. ``'*.fq'``.

    Returns:
        None.  A per-file success or failure message is printed.
    """
    fq_list = sorted(glob.glob(file_type))
    if not fq_list:
        print('未找到匹配{}的文件,请检查!'.format(file_type))
        return

    for fq in fq_list:
        print(fq)
        try:
            # Argument list rather than a shell string, so file names with
            # spaces or shell metacharacters are passed through unchanged.
            status = subprocess.call(['pigz', '-k', '-p', str(cores), fq])
        except OSError as err:
            # pigz is not installed or cannot be executed.
            print(err)
            status = 127
        print(status)

        # A non-zero exit status means pigz failed.
        if status:
            print('{}文件压缩失败,请检查!'.format(fq))
        else:
            print('{}文件压缩成功!'.format(fq))


if __name__ == "__main__":
    # Time the whole run: option parsing plus whatever actions it triggers.
    began = time.time()
    parameter_check()
    elapsed = time.time() - began
    print("用时: {}s".format(elapsed))

运行结果展示:

  1. 测试:
$ls
CheckMD5.txt  compress_MD5check.py  ParallelMd5DeCompress.py  raw.md5  test1.fq.gz  test2.fq.gz
$python ParallelMd5DeCompress.py

    Usage:

          MD5文件校验:python ParallelMd5DeCompress.py -m <'文件的原本MD5文件'> -f <'*.文件后缀名'> -n <'新生成MD5的文件名'> -c
          只生成MD5检验文件:python ParallelMd5DeCompress.py -f <'*.文件后缀名'>  -n <'新生成MD5的文件名'> -s
          文件解压:python ParallelMd5DeCompress.py -c  -f  <'*.文件后缀名'> -d
          文件压缩:python ParallelMd5DeCompress.py -c  -f  <'*.文件后缀名'> -p
         
$python ParallelMd5DeCompress.py -m 'raw.md5' -f '*.fq.gz' -n 'new2.md5' -c
raw_md5:raw.md5
file_type:*.fq.gz
new_md5_file:new2.md5
Running M5Dcheck function
正在运行MD5SUM...
MD5文件已生成:./new2.md5
恭喜,文件test1.fq.gz校验成功!

恭喜,文件test2.fq.gz校验成功!

总文件数:2个 验证成功:2个 验证失败:0个
用时: 0.06068730354309082s

2、实际运行:

$jobs
[1]+  运行中               python /share/nas1/Data/Users/luohb/Pipline/MD5check/test_dir/ParallelMd5DeCompress.py -m 'MD5.txt' -f '*.fq.gz' -n 'review.md5' -c | tee log &

你可能感兴趣的:(【多线程优化】fastq文件MD5批量校验、批量解压、压缩处理脚本设计)