access NCBI database programitically-a snp case
有时候,我们需要大批量的查询ncbi数据库,这时,我们很可能需要用到ncbi为我门提供的程序化接口:Eutils。
Eutils有五大部分组成,具体的doc参见ncbi的online book。这里提供一个检索snp信息的例子:
上面的过程将读入名为snps.txt所存储的refsnp号,然后利用efetch检索数据库信息,最后从xml结果中抽提有用的信息。
Eutils有五大部分组成,具体的doc参见ncbi的online book。这里提供一个检索snp信息的例子:
use LWP::Simple;
use List::MoreUtils qw/ uniq /;
sub xmlparse{
# parameter: 1,xmlstring 2,out file descriptor
$snpfetchrst = @_[ 0];
$outpfile = @_[ 1];
# get rsID, validation info, allele info, gene info, maf allele info
my $rsid;
my $taxid;
my $content;
my @geneid;
my $alleinfo;
my $filtr = false;
# print table head
#print $outpfile "refSNP","\t","alleles","\t","minoralle","\t","entrezGeneID","\n";
while ( $snpfetchrst =~ /<Rs rsId=\ " (\d+)\ ".*?taxId=\ " (\d+)\ ">(.*?)<\/Rs>/sg) {
@geneid = ();
$filtr = false;
$rsid = $ 1;
$taxid = $ 2;
$content = $ 3;
if( $taxid != 9606){
next;
}
else{
if (( $content =~ /<Validation (.*?)\/>/sg)|( $content =~ /<Validation .*?>(.*?)<\/Validation>/sg)) {
$valid_state = $ 1;
# print $valid_state,"\n";
if( $valid_state){ # validation state
$filtr = true;
if ( $content =~ /<Sequence.*?>(.*?)<\/Sequence>/sg) {
$seqinfo = $ 1;
$alleinfo = $ 1 if ( $seqinfo =~ /<Observed>(.*?)<\/Observed>/);
}
if ( $content =~ /<Assembly.*?reference= " true ">(.*?)<\/Assembly>/sg){
$assemContent = $ 1;
if( $assemContent =~ /<Component.*?>(.*?)<\/Component>/sg){
$compcontent = $ 1;
if( $compcontent =~ /<MapLoc.*?>(.*?)<\/MapLoc>/sg){
$funcinfo = $ 1;
# print $funcinfo,"\n";
while( $funcinfo =~ /<FxnSet geneId=\ " (\d+)\ ".*?\/>/sg){
# print $1,"\n";
push( @geneid,$ 1);
}
@geneid = uniq( @geneid); # require moreUtils module
}
}
}
if ( $content =~ /<Frequency.*?allele=\ " (.*?)\ ".*?\/>/sg){
$malle = $ 1;
}
}
else{
next;
}
}
else{
$filtr = false;
next;
}
}
if( $filtr) { print $outpfile ( $rsid, " \t ", $alleinfo, " \t ", $malle, " \t ", join( ' , ', @geneid), " \n ");}
}
} # end of xml_parse
my $utils = " http://www.ncbi.nlm.nih.gov/entrez/eutils ";
open( $refsnp, " < ", " snps.txt ");
while (< $refsnp>) {
chomp;
$_ =~ /(\d+)/;
$rsid = $ 1;
push( @rss, $rsid);
}
close( $refsnp);
my $id_list = join( ' , ', @rss);
# print $id_list,"\n";
#assemble the epost URL as an HTTP POST call
$url = $utils . " /epost.fcgi ";
$url_params = " db=snp&id=$id_list ";
# create HTTP user agent
$ua = new LWP::UserAgent;
$ua->agent( " elink/1.0 " . $ua->agent);
# create HTTP request object
$req = new HTTP::Request POST => " $url ";
$req->content_type( ' application/x-www-form-urlencoded ');
$req->content( " $url_params ");
# post the HTTP request
$response = $ua->request( $req);
$epostrst = $response->content;
my $querykey;
my $webEnv;
if( $epostrst =~ /<QueryKey>(.*?)<\/QueryKey>/sg){
$querykey = $ 1;
# print $querykey,"\n";
}
if( $epostrst =~ /<WebEnv>(.*?)<\/WebEnv>/sg){
$webEnv = $ 1;
# print $webEnv,"\n";
}
my $count = @rss;
my $retmax = 600;
my $efetch = $utils. " /efetch.fcgi? ";
open( my $outpf, " >> ", " snp_queryinfo.txt ");
if( defined( $querykey)&& defined( $webEnv)){
for( $retstart = 0; $retstart < $count; $retstart += $retmax){
my $progress = $retstart/ $count;
my $prog = sprintf( " %.2f ", $progress);
print " progression: $prog\n ";
$efetch_url = $efetch. " db=snp&WebEnv=$webEnv&query_key=$querykey ";
$efetch_url .= " &retstart=$retstart&retmax=$retmax&retmode=xml ";
# print $efetch_url,"\n";
$efetch_out = get( $efetch_url);
# print $efetch_out,"\n";
xmlparse( $efetch_out, $outpf);
}
}
close( $outpf);
use List::MoreUtils qw/ uniq /;
sub xmlparse{
# parameter: 1,xmlstring 2,out file descriptor
$snpfetchrst = @_[ 0];
$outpfile = @_[ 1];
# get rsID, validation info, allele info, gene info, maf allele info
my $rsid;
my $taxid;
my $content;
my @geneid;
my $alleinfo;
my $filtr = false;
# print table head
#print $outpfile "refSNP","\t","alleles","\t","minoralle","\t","entrezGeneID","\n";
while ( $snpfetchrst =~ /<Rs rsId=\ " (\d+)\ ".*?taxId=\ " (\d+)\ ">(.*?)<\/Rs>/sg) {
@geneid = ();
$filtr = false;
$rsid = $ 1;
$taxid = $ 2;
$content = $ 3;
if( $taxid != 9606){
next;
}
else{
if (( $content =~ /<Validation (.*?)\/>/sg)|( $content =~ /<Validation .*?>(.*?)<\/Validation>/sg)) {
$valid_state = $ 1;
# print $valid_state,"\n";
if( $valid_state){ # validation state
$filtr = true;
if ( $content =~ /<Sequence.*?>(.*?)<\/Sequence>/sg) {
$seqinfo = $ 1;
$alleinfo = $ 1 if ( $seqinfo =~ /<Observed>(.*?)<\/Observed>/);
}
if ( $content =~ /<Assembly.*?reference= " true ">(.*?)<\/Assembly>/sg){
$assemContent = $ 1;
if( $assemContent =~ /<Component.*?>(.*?)<\/Component>/sg){
$compcontent = $ 1;
if( $compcontent =~ /<MapLoc.*?>(.*?)<\/MapLoc>/sg){
$funcinfo = $ 1;
# print $funcinfo,"\n";
while( $funcinfo =~ /<FxnSet geneId=\ " (\d+)\ ".*?\/>/sg){
# print $1,"\n";
push( @geneid,$ 1);
}
@geneid = uniq( @geneid); # require moreUtils module
}
}
}
if ( $content =~ /<Frequency.*?allele=\ " (.*?)\ ".*?\/>/sg){
$malle = $ 1;
}
}
else{
next;
}
}
else{
$filtr = false;
next;
}
}
if( $filtr) { print $outpfile ( $rsid, " \t ", $alleinfo, " \t ", $malle, " \t ", join( ' , ', @geneid), " \n ");}
}
} # end of xml_parse
my $utils = " http://www.ncbi.nlm.nih.gov/entrez/eutils ";
open( $refsnp, " < ", " snps.txt ");
while (< $refsnp>) {
chomp;
$_ =~ /(\d+)/;
$rsid = $ 1;
push( @rss, $rsid);
}
close( $refsnp);
my $id_list = join( ' , ', @rss);
# print $id_list,"\n";
#assemble the epost URL as an HTTP POST call
$url = $utils . " /epost.fcgi ";
$url_params = " db=snp&id=$id_list ";
# create HTTP user agent
$ua = new LWP::UserAgent;
$ua->agent( " elink/1.0 " . $ua->agent);
# create HTTP request object
$req = new HTTP::Request POST => " $url ";
$req->content_type( ' application/x-www-form-urlencoded ');
$req->content( " $url_params ");
# post the HTTP request
$response = $ua->request( $req);
$epostrst = $response->content;
my $querykey;
my $webEnv;
if( $epostrst =~ /<QueryKey>(.*?)<\/QueryKey>/sg){
$querykey = $ 1;
# print $querykey,"\n";
}
if( $epostrst =~ /<WebEnv>(.*?)<\/WebEnv>/sg){
$webEnv = $ 1;
# print $webEnv,"\n";
}
my $count = @rss;
my $retmax = 600;
my $efetch = $utils. " /efetch.fcgi? ";
open( my $outpf, " >> ", " snp_queryinfo.txt ");
if( defined( $querykey)&& defined( $webEnv)){
for( $retstart = 0; $retstart < $count; $retstart += $retmax){
my $progress = $retstart/ $count;
my $prog = sprintf( " %.2f ", $progress);
print " progression: $prog\n ";
$efetch_url = $efetch. " db=snp&WebEnv=$webEnv&query_key=$querykey ";
$efetch_url .= " &retstart=$retstart&retmax=$retmax&retmode=xml ";
# print $efetch_url,"\n";
$efetch_out = get( $efetch_url);
# print $efetch_out,"\n";
xmlparse( $efetch_out, $outpf);
}
}
close( $outpf);
上面的过程将读入名为snps.txt所存储的refsnp号,然后利用efetch检索数据库信息,最后从xml结果中抽提有用的信息。