access NCBI database programitically-a snp case

access NCBI database programitically-a snp case

有时候,我们需要大批量的查询ncbi数据库,这时,我们很可能需要用到ncbi为我门提供的程序化接口:Eutils。
Eutils有五大部分组成,具体的doc参见ncbi的online book。这里提供一个检索snp信息的例子:



use LWP::Simple;
use List::MoreUtils qw/ uniq /;

sub xmlparse{
     # parameter: 1,xmlstring 2,out file descriptor
     $snpfetchrst =  @_[ 0];
     $outpfile =  @_[ 1];
     # get rsID, validation info, allele info, gene info, maf allele info
     my  $rsid;
     my  $taxid;
     my  $content;
     my  @geneid;
     my  $alleinfo;
     my  $filtr = false;
     # print table head
    #print $outpfile "refSNP","\t","alleles","\t","minoralle","\t","entrezGeneID","\n";

     while ( $snpfetchrst =~ /<Rs rsId=\ " (\d+)\ ".*?taxId=\ " (\d+)\ ">(.*?)<\/Rs>/sg) {
         @geneid = ();
         $filtr = false;
         $rsid = $ 1;
         $taxid = $ 2;
         $content = $ 3;
         if( $taxid !=  9606){
             next;
        }
            else{
             if (( $content =~ /<Validation (.*?)\/>/sg)|( $content =~ /<Validation .*?>(.*?)<\/Validation>/sg)) {
                 $valid_state = $ 1;
                 # print $valid_state,"\n";
                 if( $valid_state){     # validation state
                     $filtr = true;
                     if ( $content =~ /<Sequence.*?>(.*?)<\/Sequence>/sg) {
                     $seqinfo = $ 1;
                     $alleinfo = $ 1  if ( $seqinfo =~ /<Observed>(.*?)<\/Observed>/);
                    }
                           if ( $content =~ /<Assembly.*?reference= " true ">(.*?)<\/Assembly>/sg){
                         $assemContent = $ 1;
                         if( $assemContent =~ /<Component.*?>(.*?)<\/Component>/sg){
                             $compcontent = $ 1;
                             if( $compcontent =~ /<MapLoc.*?>(.*?)<\/MapLoc>/sg){
                                 $funcinfo = $ 1;
                                 # print $funcinfo,"\n";
                                 while( $funcinfo =~ /<FxnSet geneId=\ " (\d+)\ ".*?\/>/sg){
                                     # print $1,"\n";
                                     push( @geneid,$ 1);
                                }
                                 @geneid = uniq( @geneid);     # require moreUtils module        
                            }
                        }    
                    }
                     if ( $content =~ /<Frequency.*?allele=\ " (.*?)\ ".*?\/>/sg){
                         $malle = $ 1;        
                    }
                }
                       else{
                     next;
                }
               }
             else{
                 $filtr = false;
                 next;
            }
        }
         if( $filtr) { print  $outpfile ( $rsid, " \t ", $alleinfo, " \t ", $malle, " \t ", join( ' , ', @geneid), " \n ");}
    }
}     # end of xml_parse

my  $utils =  " http://www.ncbi.nlm.nih.gov/entrez/eutils ";

open( $refsnp, " < ", " snps.txt ");

while (< $refsnp>) {
     chomp;
     $_ =~ /(\d+)/;
     $rsid = $ 1;
     push( @rss$rsid);
}
close( $refsnp);

my  $id_list =  join( ' , ', @rss);
# print $id_list,"\n";

#assemble  the epost URL as an HTTP POST call

$url =  $utils .  " /epost.fcgi ";

$url_params =  " db=snp&id=$id_list ";

# create HTTP user agent
$ua = new LWP::UserAgent;
$ua->agent( " elink/1.0  " .  $ua->agent);


# create HTTP request object
$req = new HTTP::Request POST =>  " $url ";
$req->content_type( ' application/x-www-form-urlencoded ');
$req->content( " $url_params ");


# post the HTTP request
$response =  $ua->request( $req); 
$epostrst =  $response->content;

my  $querykey;
my  $webEnv;

if( $epostrst =~ /<QueryKey>(.*?)<\/QueryKey>/sg){
     $querykey = $ 1;
     # print $querykey,"\n";
}
if( $epostrst =~ /<WebEnv>(.*?)<\/WebEnv>/sg){
     $webEnv = $ 1;
     # print $webEnv,"\n";
}
my  $count =  @rss;
my  $retmax =  600;
my  $efetch =  $utils. " /efetch.fcgi? ";
open( my  $outpf, " >> ", " snp_queryinfo.txt ");
if( defined( $querykey)&& defined( $webEnv)){
     for( $retstart =  0; $retstart <  $count; $retstart +=  $retmax){
         my  $progress =  $retstart/ $count;
         my  $prog =  sprintf" %.2f  ", $progress);
         print  " progression: $prog\n ";
         $efetch_url =  $efetch. " db=snp&WebEnv=$webEnv&query_key=$querykey ";
         $efetch_url .=  " &retstart=$retstart&retmax=$retmax&retmode=xml ";
         # print $efetch_url,"\n";
         $efetch_out = get( $efetch_url);
         # print $efetch_out,"\n";
        xmlparse( $efetch_out, $outpf);
    }
}
close( $outpf);


上面的过程将读入名为snps.txt所存储的refsnp号,然后利用efetch检索数据库信息,最后从xml结果中抽提有用的信息。




你可能感兴趣的:(access NCBI database programitically-a snp case)