今天开始所有的工作脚本全都从perl转变到python,开发速度明显降低了不少,相信以后随着熟练度提升会好起来。贴一下今天一个工作代码,由于之前去一家小公司测序时,序列长度竟然都没有达到要求,为了之后的索赔事宜,写了个脚本统计所有序列的结果,主要包括总的reads数,bases数,和达到测序策略要求长度的reads数(双端),bases数,高质量(Q30)bases数,高质量reads数(双端)等等......多个文件的统计工作一般会写一个单独处理一个文件的脚本,然后再写一个脚本用来生成多个文件的处理的shell脚本,然后想办法并行处理这个shell就可以,效率会快很多。由于测序数据往往比较大,IO操作时,逐行读取是上策。
统计单个文件测序数据情况脚本:
1 from __future__ import division 2 from Bio import SeqIO as fq 3 import os 4 import sys 5 import re 6 read1_gzfile = sys.argv[1] 7 read2_gzfile = sys.argv[2] 8 gz_handle1 = os.popen( 'gunzip -cd %s' % read1_gzfile ) 9 gz_handle2 = os.popen( 'gunzip -cd %s' % read2_gzfile ) 10 basename1 = os.path.basename(read1_gzfile) 11 basename1 = re.match('(\S+)_R1_001\.fastq\.gz',basename1).group(1) 12 basename2 = os.path.basename(read2_gzfile) 13 basename2 = re.match('(\S+)_R2_001\.fastq\.gz',basename2).group(1) 14 if basename1 != basename2: 15 raise 'Two Read are not mapped!' 16 cwd = os.getcwd() 17 out_handle = open('%s/%s.stat'%(cwd,basename1),'w') 18 out_handle.write('AllReadsNum\tRead1_PE300_ReadsNum\tRead2_PE300_ReadsNum\tUseful_ReadsNum(Read1>=300 and Read2>=300)\tAll_Bases\tRead1_Q30_Bases(PE300)\tRead2_Q30_Bases(PE300)\tQ30_PE_Reads(Q30>50%)\tUseful_Bases(All)\tUseful_Ratio\n') 19 20 AllReadsNum = 0 21 AllBases = 0 22 Read1_PE300_ReadsNum = 0 23 Read2_PE300_ReadsNum = 0 24 Useful_ReadsNum = 0 25 Read1_Q30_Bases = 0 26 Read2_Q30_Bases = 0 27 Q30_PE_Reads = 0 28 Useful_Bases = 0 29 30 def PE300(seq): 31 if len(seq) >= 300: 32 return True 33 else: 34 return False 35 36 def Q30(qual_list): 37 num = 0 38 for qual in qual_list: 39 if qual >= 30: 40 num += 1 41 return num 42 43 reads2 = fq.parse(gz_handle2,'fastq') 44 for read1 in fq.parse(gz_handle1,'fastq'): 45 read2 = reads2.next() 46 seq1 = read1.seq 47 qual1 = read1.letter_annotations['phred_quality'] 48 seq2 = read2.seq 49 qual2 = read2.letter_annotations['phred_quality'] 50 AllReadsNum += 1 51 AllBases += len(seq1) 52 AllBases += len(seq2) 53 R1_300 = PE300(seq1) 54 R2_300 = PE300(seq2) 55 if R1_300 and R2_300: 56 Useful_ReadsNum +=1 57 R1_Q30 = Q30(qual1) 58 R2_Q30 = Q30(qual2) 59 Read1_Q30_Bases += R1_Q30 60 Read2_Q30_Bases += R2_Q30 61 if ( R1_Q30 / len(seq1) >= 0.5 ) and ( R2_Q30 / len(seq2) >= 0.5 ): 62 Q30_PE_Reads += 1 63 Useful_Bases += R1_Q30 64 Useful_Bases += R2_Q30 65 elif R1_300: 66 Read1_PE300_ReadsNum += 1 67 elif R2_300: 68 Read2_PE300_ReadsNum += 1 69 70 Useful_Ratio = Useful_Bases / AllBases 71 out_handle.write('%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%i\t%f\n'%(AllReadsNum,Read1_PE300_ReadsNum,Read2_PE300_ReadsNum,Useful_ReadsNum,AllBases,Read1_Q30_Bases,Read2_Q30_Bases,Q30_PE_Reads,Useful_Bases,Useful_Ratio))
生成脚本:
1 import os 2 out = open('summary.sh','w') 3 cwd = os.getcwd() 4 with open('templist') as gzfiles: 5 for gzfile1 in gzfiles: 6 gzfile2 = gzfiles.next() 7 out.write('python %s/summary.py %s %s\n'%(cwd,gzfile1.strip(),gzfile2.strip()))
使用qsub_sge方法,并行投递生成的summary.sh就完成了