#! /usr/local/bin/perl -w

# get_web_data.pl
# Use LWP to retrieve a series of GenBank reports (or other data) from NCBI
#
# WI Biocomputing course - Unix and Programming Skills for Biologists - March 2003
#
# Usage: run the script and, when prompted, enter the name of a text file
# holding one GI/UI per line.  Each successfully retrieved report is written
# to "<id>_gb.txt" in the current directory; failures are reported on STDOUT
# and the script moves on to the next identifier.
#
# Note that this allows you to get lots of GenBank reports at once, but so does
# Batch Entrez at http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# and http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Protein
#
# For more info about URLs for Entrez queries, see
# http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
#
# NOTE(review): the query URLs below date from 2003 -- confirm that NCBI still
# serves them before relying on this script.

use strict;
use warnings;

use LWP;

# Flush STDOUT after every print so the "Requesting page for ..." progress
# text is visible while the (blocking) HTTP request is still in flight.
$| = 1;

my $browser = LWP::UserAgent->new();

# Set URL and query terms for this series of queries
my $nt_url = "http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=n&form=6&dopt=g&html=no&uid=";
# NOTE(review): the next URL is identical to $nt_url (db=n); a protein query
# presumably needs a different db value -- confirm before enabling.
# my $pro_url = "http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=n&form=6&dopt=g&html=no&uid=";
# my $abstract_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&dopt=Abstract&uid=";
# NOTE(review): the next URL is identical to $abstract_url (dopt=Abstract); a
# MEDLINE-format query presumably needs a different dopt value -- confirm.
# my $medline_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&dopt=Abstract&uid=";

# Select which type of query you want to generate (see above)
my $url = $nt_url;

print "File name with list of GIs/UIs (one per line): ";
my $file = <STDIN>;
# <STDIN> returns undef at end-of-input (e.g. Ctrl-D or an empty pipe).
defined $file or die "no file name was given on standard input\n";
chomp $file;

# Three-argument open: the user-supplied name is always treated as a plain
# path, never as a mode prefix or a pipe.
open(my $list_fh, '<', $file) or die "cannot open $file for reading: $!";
my @gi = <$list_fh>;
close($list_fh);

ID:
for my $id (@gi) {

    # Drop the line ending (including a CR from DOS-style files) and any
    # surrounding blanks, then ignore lines that turn out to be empty.
    $id =~ s/\A\s+|\s+\z//g;
    next ID if $id eq '';

    # The identifier becomes part of both the URL and the output file name,
    # so refuse anything that could change the query or escape the current
    # directory (slashes, "&", spaces, ...).
    if ($id !~ /\A[\w.-]+\z/) {
        warn "Skipping unexpected identifier '$id'\n";
        next ID;
    }

    my $this_url = "$url$id";
    print "Requesting page for $id. . . ";

    # The request method is kept as POST, exactly as in the original script,
    # with the whole query carried in the URL.
    my $webdoc = $browser->request(HTTP::Request->new(POST => $this_url));

    if ($webdoc->is_success) {
        print " got it\n";
        my $out = $webdoc->content;

        # Split report into array of lines
        # my @lines = split(/\n/, $out);

        # Print each page as a separate file
        my $out_file = "${id}_gb.txt";
        open(my $out_fh, '>', $out_file)
            or die "cannot open $out_file for writing: $!";
        print {$out_fh} $out;

        # Buffered write errors (full disk, quota, ...) only surface at close.
        close($out_fh) or die "cannot close $out_file: $!";
    }
    else {
        print "\nProblems doing query for GI:$id\n";
    }
}