linux下perl处理文本---使用hash处理

 
 
# Usage: perl gather_family_protein_gene.pl -a name_of_animal -e ../Extract_result -n GATHER


use strict;
use warnings;


use Getopt::Long;
use File::Basename;


# Command-line options.
#   $animalName  (-a)  file listing one species file name per line
#   $extractDir  (-e)  directory holding one sub-directory per species
#   $saveName    (-n)  name of the gathered result file written per species
#   $errorLog    (-r)  file that receives STDERR output
#   $help        (-h)  print the usage text
# All values use ':s' (optional string), so an option given without a value
# is stored as the empty string rather than left undefined.
my ($animalName,$extractDir,$saveName,$help,$errorLog);
GetOptions(
  'help|h' => \$help,
  'a:s' => \$animalName,
  'e:s' => \$extractDir,
  'n:s' => \$saveName,
  'r:s' => \$errorLog
) or do {
  # GetOptions returns false after reporting an unknown or malformed option
  # on STDERR; stop here instead of running with a half-parsed command line.
  usage();
  exit 1;
};
# Print the command-line help text to STDOUT.
# Takes no arguments.  $0 (the script name) and the escaped '@' are
# interpolated by the here-document.
sub usage{
  print <<USAGE;
usage:
#version:       perl $0 [options]
#author:        Oshyn Song <dualyangsong\@gmail.com>
#history:       2013-12-17
#desc:          Gather the extract result to a file by species
options:
-h  --help:print the info
-a  :input the animal species filename
-e  :input the extract result directory
-n  :the save result filename
-r  :the error log file name
#perl $0
USAGE
}


# ---------------------------------------------------------------------------
# Main script.
#
# For every species named in the -a file, scan the files in
# <extractDir>/<species>/ and gather, per protein ID:
#   * every "<family>=><e-value>" hit, where <family> is the name of the file
#     the hit was read from, up to its first '.';
#   * the gene ID from any "<protein>\t<gene>" mapping line.
# One line per protein is appended to <extractDir>/<species>/<saveName>:
#   <protein>=>[<gene>\t]<family>=><evalue>[\t|\t<family>=><evalue>...]
# ---------------------------------------------------------------------------

# Return the part of a name that precedes its first '.'.
# A name without any '.' is returned unchanged (substr/index would otherwise
# get -1 from index and silently chop off the last character).
sub _strip_from_first_dot {
	my ($name) = @_;
	my $dot_at = index($name, ".");
	return $dot_at < 0 ? $name : substr($name, 0, $dot_at);
}

#Test if given the necessary options
# This runs before STDERR is redirected so that asking for help, or a bad
# command line, does not create an error log as a side effect.
if (defined $help){
	usage();
	exit 0;
}
# Options are declared with ':s', so a flag given without a value arrives as
# the empty string; treat that the same as a missing option.
if (grep { !defined $_ || $_ eq '' } ($animalName, $extractDir, $saveName)){
	usage();
	exit 1;
}

#Change the STDERR to errorlog file
if (!defined $errorLog || $errorLog eq ''){
	$errorLog = "errorlog";
}
# Open the log on its own handle first: if this fails, the original STDERR is
# still intact and the message below actually reaches the user.
open(my $log_fh, '>>', $errorLog)
	or die "Can not open errorlog $!";
open(STDERR, '>&', $log_fh)
	or die "Can not redirect STDERR to errorlog $!";
close($log_fh);

print "Start process...\n";
#Open the names of all animal species file
open(my $species_fh, '<', $animalName)
	or die "Can not open file of ${animalName} $!";
print "Open the file of animal species name successfully.\n\n";

#Read the animal species name every line
while (defined(my $species_line = <$species_fh>)){
	# Drop the line ending (LF or CRLF) and any trailing blanks.
	$species_line =~ s/\s+\z//;
	next if $species_line eq '';

	my $animal_name = _strip_from_first_dot($species_line);
	# An empty name would make the script process $extractDir itself.
	next if $animal_name eq '';

	my $species_dir = "${extractDir}/${animal_name}";
	opendir(my $species_dh, $species_dir)
		or die "Can not open directory of ${species_dir}. $!";
	# Sorted so that the order of hits in the output does not depend on the
	# file system's directory order.
	my @entries = sort readdir($species_dh);
	closedir($species_dh);
	print "process ${animal_name}...\n";

	#Open the result file
	my $out_path = "${species_dir}/${saveName}";
	open(my $out_fh, '>>', $out_path)
		or die "Can not open ${out_path}. $!";

	#Read every filename and open it
	my %hits_of;	# protein ID => list of "<family>=><evalue>" strings
	my %gene_of;	# protein ID => gene ID (first mapping seen wins)
	for my $filename (@entries){
		next if $filename =~ /^\./;
		# The result file lives in the directory being scanned; never
		# feed it back in as input.
		next if $filename eq $saveName;
		my $filepath = "${species_dir}/${filename}";
		next unless -f $filepath and -r $filepath;

		open(my $in_fh, '<', $filepath)
			or die "Can not open the file : ${filepath} $!";
		my $tfname = _strip_from_first_dot($filename);
		while (defined(my $line = <$in_fh>)){
			chomp($line);
			$line =~ s/\r\z//;	# tolerate CRLF input
			if ($line =~ /^(ENS[\w]+?[\d]{11})[\t]([0-9e\-\.]+)$/){
				# "<protein>\t<evalue>" hit line
				my ($protein, $evalue) = ($1, $2);
				push @{ $hits_of{$protein} }, "${tfname}=>${evalue}";
			}
			elsif ($line =~ /^(ENS[\w]+?[\d]{11})[\t](ENS[\w]+?[\d]{11})/){
				# "<protein>\t<gene>" mapping line.  Kept in its own
				# hash so the gene is attached no matter whether the
				# mapping is read before or after the protein's hits.
				my ($protein, $gene) = ($1, $2);
				$gene_of{$protein} = $gene unless exists $gene_of{$protein};
			}
		}
		close($in_fh);
	}

	# Only proteins with at least one hit are reported; the gene ID, when
	# known, is placed in front of the hit list.
	for my $protein (sort keys %hits_of){
		my $value = join("\t|\t", @{ $hits_of{$protein} });
		if (exists $gene_of{$protein}){
			$value = "$gene_of{$protein}\t${value}";
		}
		print {$out_fh} "${protein}=>${value}\n";
	}
	# Buffered write errors (e.g. disk full) only surface at close.
	close($out_fh)
		or die "Can not close ${out_path}. $!";
	print "${animal_name} process finished!\n\n";
}
close($species_fh);


处理后的结果如下:


你可能感兴趣的:(linux下perl处理文本---使用hash处理)