一个关于位点的所在位置的程序

这还是那个妹子写的,博主提供了微量的帮忙和测试,觉得很不错,自己又懒得写,分享下:

#!perl -w
use strict;
die "Usage : perl $0   [gff]" unless (@ARGV >=2);
my ($in, $out, $gff) = @ARGV;
$gff ||= 'refGene.ann.gz';

open IN, $in || die $!;
my ($tmp, $line, $rs, $chr, $site , $start, $ori, $num, $end, @samps, $ref, $alt, %infos, $dep, @tmps);
my %refs = ('M' => 'AC', 'R' => 'AG', 'W' => 'AT', 'S' => 'CG', 'Y' => 'CT', 'K' => 'GT');
while()
{
	chomp;
	($chr, $start, $end) = (split)[1,2,2];
	$infos{$chr}{$start}{'end'} = $end;
	@{$infos{$chr}{$start}{'ann'}} = ();
	$infos{$chr}{$start}{'gene'} = '-';
}
close IN;


open GFF, "gzip -dc $gff |" || die $!;
my ($up, $down);
my ($name, $id, %ids, $info, $region);
 while()
{
	chomp;
	($chr, $region, $start, $end, $num, $ori, $info) = (split /\t/, $_)[0,2,3,4,5,6,8];
	if($region =~ /intergenic/){
		for $site (sort {$a <=> $b} keys %{$infos{$chr}}){
			next unless ($site >= $start and $infos{$chr}{$site}{'end'} <= $end and $infos{$chr}{$site}{'gene'} eq '-');
			$up = $site - $start;
			$down = $end - $site;
			my ($g1, $g2) = $info =~ /name=([\.\-\w]+);.+name=([\.\-\w]+);/;
			push @{$infos{$chr}{$site}{'ann'}}, "$g1(dist=$up),$g2(dist=$down):$region";
		}
	}elsif($region =~ /RNA/)
	{
		next unless ($info =~ /ID=([-\w]+); name=([-\.\w]+);/);
		($id, $name) = ($1, $2);
		$ids{$id} = $name;
	}else
	{
		next unless ($info =~ /Parent=([-\w]+);/);
		$id = $1;
		$name = $ids{$id};
		for $site (sort {$a <=> $b} keys %{$infos{$chr}})
		{
			next unless ($site >= $start and $infos{$chr}{$site}{'end'} <= $end);
			my $tmp = $region;
			if ($region !~ /UTR/){
					$tmp .= $num;
			}
			push @{$infos{$chr}{$site}{'ann'}}, "$name($id):$tmp";
			if($infos{$chr}{$site}{'gene'} eq '-' or $infos{$chr}{$site}{'gene'} =~ /$name/){
				$infos{$chr}{$site}{'gene'} = $name;
			}else{
				$infos{$chr}{$site}{'gene'} .= ",$name";
			}
		}
	}
}
close GFF;


if($out =~ /\.gz$/){open OUT, "| gzip > $out" || die $!;}
else{open OUT, "> $out" || die $!;}
print OUT "#Chromosome\tStart\tEnd\tGene\tRegion\n";
for $chr (sort keys %infos)
{
	for $site (sort {$a <=> $b} keys %{$infos{$chr}})
	{
		print OUT "$chr\t$site\t$infos{$chr}{$site}{'end'}\t$infos{$chr}{$site}{'gene'}\t";
		my $i = 1;
		for my $tmp ( @{$infos{$chr}{$site}{'ann'}}){
			if ($i == 1){print OUT "$tmp";}
			else {print OUT ";$tmp";}
			$i ++;
		}
		print OUT "\n";
	}
}
close OUT;

承接之前的加强版的gff格式。

你可能感兴趣的:(生物信息学)