我们在上一篇文章中看了最后的输出结果如下:
pos TAIR bur can ct edi hi kn ler mt no oy po rsch sf tsu wil ws wu zu 1000 G - A - - - - - - - - - - - - - - - - 10000003 C - - - - - G - - - - - - - - - - - - 10000013 A - - R - - - - - - - - - - - - - - - 10000021 G - - - - - - - - - - - C - - - - - - 10000034 C - - - - - - - - - - - - - - - - Y - 10000047 G R - - - - - - - - - - - - - - - R - 10000049 A R - - - - - - - - - - - - - - - R -
但是这一部分信息,也是我们需要的,所以我们要到原来的.maf文件中去回对,也就是把每个位置的每个样品的碱基信息进行统计。其实关键就是一个回对的过程。
所以我们的解决思路就是:
1.用合并后的信息,建立一个hash,方便后面的查找
2.打开.maf以后,我们对每一个块,(这里说的块,包括score部分,ref序列,sample序列),我们先看在ref中是否有这个点的信息,因为,我设置的score的起始位点是90000.所以会有很多无法匹配的位点。如果有,那么取sample中对应位置的碱基,保存到hash中。
3.重复回对每一个文件,最终得到最后的结果。
程序如下;
#!/usr/bin/perl use strict; use warnings; my %position; my @information1; my @mout; my @score; my @info1; my @info2; my @sequen1; my @sequen2; my $cout1; my $cout; my @samples; my $samples; my $key1; my $value; my $z; open (DNA,"Chr5_join.txt")||die("can not open");#这里有需要替换成不同的染色体信息 while(<DNA>) { @information1=split; $position{$information1[0]}{sample0}=$information1[1]; } open (DNA1,"TAIR_vs_bur.maf")||die("can not open"); $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $z=<DNA1>; $/="\n\n"; while(<DNA1>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless ($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample1}=$sequen2[$cout]; } else { next; } } } } open (DNA2,"TAIR_vs_can.maf")||die("can not open"); $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $z=<DNA2>; $/="\n\n"; while(<DNA2>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless ($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample2}=$sequen2[$cout]; } else { next; } } } } open (DNA3,"TAIR_vs_ct.maf")||die("can not open"); $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $z=<DNA3>; $/="\n\n"; while(<DNA3>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample3}=$sequen2[$cout]; } else { next; } } } } open (DNA4,"TAIR_vs_edi.maf")||die("can not open"); $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $z=<DNA4>; $/="\n\n"; while(<DNA4>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample4}=$sequen2[$cout]; } else { next; } } } } open (DNA5,"TAIR_vs_hi.maf")||die("can not open"); $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $z=<DNA5>; $/="\n\n"; while(<DNA5>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample5}=$sequen2[$cout]; } else { next; } } } } open (DNA6,"TAIR_vs_kn.maf")||die("can not open"); $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $z=<DNA6>; $/="\n\n"; while(<DNA6>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample6}=$sequen2[$cout]; } else { next; } } } } open (DNA7,"TAIR_vs_ler.maf")||die("can not open"); $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $z=<DNA7>; $/="\n\n"; while(<DNA7>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample7}=$sequen2[$cout]; } else { next; } } } } open (DNA8,"TAIR_vs_mt.maf")||die("can not open"); $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $z=<DNA8>; $/="\n\n"; while(<DNA8>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample8}=$sequen2[$cout]; } else { next; } } } } open (DNA9,"TAIR_vs_no.maf")||die("can not open"); $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $z=<DNA9>; $/="\n\n"; while(<DNA9>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample9}=$sequen2[$cout]; } else { next; } } } } open (DNA10,"TAIR_vs_oy.maf")||die("can not open"); $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $z=<DNA10>; $/="\n\n"; while(<DNA10>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample10}=$sequen2[$cout]; } else { next; } } } } open (DNA11,"TAIR_vs_po.maf")||die("can not open"); $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $z=<DNA11>; $/="\n\n"; while(<DNA11>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample11}=$sequen2[$cout]; } else { next; } } } } open (DNA12,"TAIR_vs_rsch.maf")||die("can not open"); $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $z=<DNA12>; $/="\n\n"; while(<DNA12>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample12}=$sequen2[$cout]; } else { next; } } } } open (DNA13,"TAIR_vs_sf.maf")||die("can not open"); $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $z=<DNA13>; $/="\n\n"; while(<DNA13>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample13}=$sequen2[$cout]; } else { next; } } } } open (DNA14,"TAIR_vs_tsu.maf")||die("can not open"); $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $z=<DNA14>; $/="\n\n"; while(<DNA14>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample14}=$sequen2[$cout]; } else { next; } } } } open (DNA15,"TAIR_vs_wil.maf")||die("can not open"); $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $z=<DNA15>; $/="\n\n"; while(<DNA15>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample15}=$sequen2[$cout]; } else { next; } } } } open (DNA16,"TAIR_vs_ws.maf")||die("can not open"); $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $z=<DNA16>; $/="\n\n"; while(<DNA16>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample16}=$sequen2[$cout]; } else { next; } } } } open (DNA17,"TAIR_vs_wu.maf")||die("can not open"); $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $z=<DNA17>; $/="\n\n"; while(<DNA17>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample17}=$sequen2[$cout]; } else { next; } } } } open (DNA18,"TAIR_vs_zu.maf")||die("can not open"); $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $z=<DNA18>; $/="\n\n"; while(<DNA18>) { @mout = split/\n/,$_; @score= split/=/, $mout[0]; unless($score[1]<90000) { @info1 = split/\s+/,$mout[1]; @info2 = split/\s+/,$mout[2]; @sequen1 = split// ,$info1[6]; @sequen2 = split// ,$info2[6]; for($cout1=$info1[2]+1;$cout1<$info1[3]+$info1[2]+1;$cout1++) { if(exists $position{$cout1}) { $cout=$cout1-$info1[2]-1; $position{$cout1}{sample18}=$sequen2[$cout]; } else { next; } } } } @samples=qw/sample0 sample1 sample2 sample3 sample4 sample5 sample6 sample7 sample8 sample9 sample10 sample11 sample12 sample13 sample14 sample15 sample16 sample17 sample18/; open(ALL,">all_information.txt")||die("can not open!"); foreach $key1(sort keys %position) { print ALL "$key1 "; foreach $samples(@samples) { if(exists $position{$key1}{$samples}) { $value = $position{$key1}{$samples}; print ALL "$value "; } else { print ALL "- " } } print ALL "\n"; }
10001656 G G - - G - G G G G - G A G G A G G G 10001667 G G - - G - G G G G - G G G G S G G G 10001670 G G - - G T G G G G - G G G G G G G G 10001672 T T - - T G T T T T - T T T T T T T T 10001673 G G - - G A G G G G - G G G G G G G G 10001696 G G - - G G G G G G - G A G G A G G G 10001713 C C - - C C C C C C - C C C C Y C C C 10001768 C C - - C C C C C C - C C C C Y C C C 10001791 A A - - G G G G A A - A G A G G A A G 10001812 G G - - G G G G A A - G G G G G G G G 10001816 A A - - A A A A A A - A G A A G A A A 10001820 C C - - C C C C C C - C C C C S C C C我们可以看到好多位置的信息已经补充完整了。
然后一个新的问题就是,我们知道基因在测序的过程中会有很多的重复区域,我们也要把相应的重复区域删除。
删除重复区域其实也是一个hash应用的过程。没有多少亮点。
我们首先来看一下这个repeat区域的格式:
5 0 4758 5 5248 5289 5 56677 56887 5 63698 63796 5 64015 64047 5 64771 71199 5 78471 78531 5 138655 138720 5 138941 139927 5 140759 140948 5 162806 162844 5 301999 302104
我们来看一下代码:
#!/usr/bin/perl # move repeat arear out use strict; use warnings; my %position; my @pos; my $cout; my $key; my $value; open (ALL,"all_information.txt")||die("can not open!"); while(<ALL>) { $position{$1}=$2 if $_=~/^(\d+)\s(.+)$/; } open (POS,"sepChr5.txt")||die("can not open!"); while(<POS>) { @pos=split; for($cout=$pos[1]+1;$cout<$pos[2]+1;$cout++) { if(exists $position{$cout}) { delete $position{$cout}; } else { next; } } } open (RESULT,">without_repeat_information.txt")||die ("can not open!"); foreach $key(sort keys %position) { print RESULT "$key "; $value=$position{$key}; print RESULT "$value \n"; }