iterate_seqs.pl

#!/usr/local/bin/perl -w

# Split all the sequences in a big sequence file into separate files
# Sequence format may be modified in the process
# WI Biocomputing course - Unix and Programming Skills for Biologists - Feb 2004

# Use BioPerl
use Bio::SeqIO;

# Settings shared with getArgsAndPrompt(), which fills them in from @ARGV
# or by prompting the user
our ($inFile, $inFormat, $outFormat);

# Get arguments or prompt the user for them (See below for subroutine)
getArgsAndPrompt();

# Open the input file for reading in the requested format
my $in = Bio::SeqIO->new('-file' => "$inFile", '-format' => $inFormat);

# Go through all the sequences in the file, writing each one to its own file
while (my $seqobj = $in->next_seq())
{
   # Get the seq ID
   my $seqID = $seqobj->display_id();

   # Name the output file after the sequence ID, using the output format
   # as the file extension
   my $seqFile = $seqID . ".$outFormat";

   # Write this single sequence in the requested output format
   my $out = Bio::SeqIO->new('-format' => $outFormat, '-file' => ">$seqFile");
   $out->write_seq($seqobj);

   # Close the writer right away so that $seqFile is completely written
   # before anything below (such as an external command) tries to read it
   $out->close();

   print "Creating $seqFile\n";

   #
   # An EMBOSS command could be inserted here, to analyze every sequence as it's created.
   # The file name of each single sequence is $seqFile
   #
}

print "\nAll done\n";

######################  Subroutines below  ############################################

# getArgsAndPrompt()
#
# Fill in the three script-wide settings from the command line, prompting on
# STDIN for any that were not supplied:
#    $inFile    - $ARGV[0], the sequence file to read
#    $inFormat  - $ARGV[1], the format of the input file
#    $outFormat - $ARGV[2], the format of the files to create
# Takes no parameters. The results are left in the package variables above;
# nothing useful is returned.
sub getArgsAndPrompt
{
   # Package variables read by the main part of the script
   our ($inFile, $inFormat, $outFormat);

   # The usage banner is only shown when no input file was given
   $inFile = _argOrPrompt(0,
      "\nSplit a file of multiple sequences into separate files\n",
      "\nUSAGE: $0 fastaInput inputFormat outputFormat\n",
      "What is the fasta file to process? ");

   $inFormat = _argOrPrompt(1,
      "\nWhat is the format of your input sequence file?\n",
      "Options: fasta, embl, genbank, pir, swiss, gcg\n");

   $outFormat = _argOrPrompt(2,
      "\nWhat format files should be created?\n",
      "Options: fasta, embl, genbank, pir, swiss, gcg, raw\n");

   return;
}

# _argOrPrompt($index, @promptLines)
#
# Return command-line argument number $index if it was supplied and is not
# the empty string (so a literal "0" counts as supplied). Otherwise print
# @promptLines and return one line read from STDIN with its trailing newline
# removed. Returns undef if STDIN is already at end of input.
sub _argOrPrompt
{
   my ($index, @promptLines) = @_;

   my $arg = $ARGV[$index];
   return $arg if defined $arg && length $arg;

   print @promptLines;

   # Get input and remove the newline character at the end
   my $answer = <STDIN>;
   chomp $answer if defined $answer;

   return $answer;
}