# get_web_data.pl

#! /usr/local/bin/perl -w

# get_web_data.pl
# Use LWP to retrieve a series of GenBank reports (or other data) from NCBI 
#
# WI Biocomputing course - Unix and Programming Skills for Biologists - March 2003
#
# Note that this allows you to get lots of GenBank reports at once, but so does
# Batch Entrez at http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Nucleotide
# and http://www.ncbi.nlm.nih.gov/entrez/batchentrez.cgi?db=Protein
# 
# For more info about URLs for Entrez queries, see
# http://www.ncbi.nlm.nih.gov/entrez/query/static/linking.html
# 

use LWP;
# A single user agent is created here and reused for every request.
$browser = LWP::UserAgent->new;

# Base URL for nucleotide queries. The ID of each record is appended
# directly after the trailing "uid=" parameter.
$nt_url = 'http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=n&form=6&dopt=g&html=no&uid=';

# Alternative base URLs (disabled):
# $pro_url = "http://www.ncbi.nlm.nih.gov/htbin-post/Entrez/query?db=n&form=6&dopt=g&html=no&uid=";
# $abstract_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&dopt=Abstract&uid=";
# $medline_url = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=PubMed&dopt=Abstract&uid=";
#
# NOTE(review): $pro_url is byte-identical to $nt_url, and $medline_url is
# byte-identical to $abstract_url. This looks like a copy-paste slip;
# presumably the protein and MEDLINE variants need different db/dopt
# values -- confirm against the Entrez linking documentation before
# enabling either of them.

# Choose the kind of query to run by assigning one of the URLs above.
$url = $nt_url;

# Ask for the name of the ID list file and read it into @gi, one raw line
# (newline still attached) per element.
#
# Sets:  $file (the name as typed, trimmed) and @gi.
# Dies:  if no name can be read from STDIN or the file cannot be opened.
print "File name with list of GIs/UIs (one per line): ";
$file = <STDIN>;
die "no file name given\n" unless defined $file;   # STDIN was at end-of-file
chomp($file);

# Three-argument open takes the name literally, so trim it explicitly;
# the two-argument form used to ignore surrounding whitespace, but it also
# treated a leading ">" or a trailing "|" as a mode or command.
$file =~ s/^\s+|\s+$//g;

open(my $list_fh, '<', $file) or die "cannot open $file for reading: $!";
@gi = <$list_fh>;
close($list_fh);

# Request one report per ID and save each one to its own "<ID>_gb.txt" file
# in the current directory.
#
# Reads:  @gi (lines of the ID list), $url (base query URL) and
#         $browser (the LWP user agent).
# Prints: progress and failures on STDOUT.
# Dies:   if an output file cannot be opened or completely written.
foreach my $entry (@gi)
{
   # Strip the newline in place so @gi ends up chomped, as before.
   chomp($entry);

   # Trim on a copy: chomp alone leaves the "\r" of DOS-style line endings
   # (and any stray blanks) inside both the URL and the output file name.
   (my $id = $entry) =~ s/^\s+|\s+$//g;

   # A blank line would otherwise trigger a request with an empty uid and
   # produce a file called "_gb.txt".
   next if $id eq '';

   my $this_url = "$url$id";
   print "Requesting page for $id. . . ";

   # NOTE(review): this is sent as a POST without a body; all parameters
   # travel in the URL's query string. Left unchanged on purpose.
   my $webdoc = $browser->request(HTTP::Request->new(POST => $this_url));

   if ($webdoc->is_success)
   {
      print " got it\n";
      my $report = $webdoc->content;

      # Split report into array of lines
      # @lines = split(/\n/, $report);

      # Print each page as a separate file. Three-argument open keeps any
      # odd characters in the ID from being read as an open mode.
      my $outFile = "${id}_gb.txt";
      open(my $out_fh, '>', $outFile) or die "cannot open $outFile for writing: $!";

      # Buffered write errors (for example a full disk) may only surface
      # at close, so check both calls.
      print {$out_fh} $report or die "cannot write to $outFile: $!";
      close($out_fh) or die "cannot finish writing $outFile: $!";
   }
   else
   {
      print "\nProblems doing query for GI:$id\n";
      # Show the HTTP status so the cause of the failure is visible.
      print "   (", $webdoc->status_line, ")\n";
   }
}