align_SOLUTION.pl

#!/usr/local/bin/perl -w

# Analyze a list of human-mouse homology pairs

# Input data file - pairs of human and mouse RefSeq accession numbers
$hs_mm_pairs = "human_mouse_pairs.txt";

# Files used in data processing
$humanSeq = "mouse.fa";
$mouseSeq = "human.fa";
$seqPairFile = "seqPair.fa";
$bigOutputFile = "alignment-all.txt";

# Remove $bigOutputFile from any previous script outputs
if (-e "$bigOutputFile")
{ unlink($bigOutputFile); }


# Open the homology data file
open (HOMOLOGY, "$hs_mm_pairs") || die "Major problem: cannot open $hs_mm_pairs for reading: $!";

while (<HOMOLOGY>)      # Read one line at a time
{
   chomp($_);         # Get rid of new line at end of each line
   @data = split(/\t/, $_);   # Split the line in to tab-delimited fields

   # Get human and mouse accessions 
   $humanAcc = $data[0];
   $mouseAcc = $data[1];

   print "$humanAcc vs. $mouseAcc\n";

   # Get a sequence from a BLAST-formatted database using the "fastacmd" syntax
   # fastacmd -d database -s accession               and append (>>) to a file

   # Print human sequence to file (overwriting if existing) 
   `fastacmd -d nr -s $humanAcc > $humanSeq`;

   # Change format of sequence headers to make alignments clearer 
   `sed 's/>gi|.*ref|/>/' $humanSeq > hs.tmp; mv hs.tmp $humanSeq`;

   # Print mouse sequence to file (overwriting if existing) 
   `fastacmd -d nr -s $mouseAcc > $mouseSeq`;

   # Change format of sequence headers to make alignments clearer 
   `sed 's/>gi|.*ref|/>/' $mouseSeq > mm.tmp; mv mm.tmp $mouseSeq`;

   # Concatenate both sequences to same file (if doing clustal alignment)
   `cat $humanSeq $mouseSeq > $seqPairFile`;

   # Do alignments (perform subroutines below)
   doClustal();
   doGlobal();
   doLocal();
}

# Close file handles and delete temporary files
close (HOMOLOGY);
unlink ($humanSeq, $mouseSeq, $seqPairFile);

print "\nSee $bigOutputFile for output\n\n";

##########  subroutines  ##########

sub doClustal
{
   $clustalOut = "clustal.aln";

   # Add sequence headers to big alignment file 
   `grep ">" $seqPairFile >> $bigOutputFile`;

   `clustalw -INFILE=$seqPairFile -TYPE=DNA -OUTFILE=$clustalOut`;

   # Append alignment to big file and delete the small files
   `cat $clustalOut >> $bigOutputFile`;
   unlink("$clustalOut", "seqPair.dnd");
}

sub doGlobal
{
   $globalOut = "global.aln";

   # Add sequence headers to big alignment file 
   `grep ">" $seqPairFile >> $bigOutputFile`;

   `needle $humanSeq $mouseSeq -outfile $globalOut -auto`;

   # Append alignment to big file and delete the small file
   `cat $globalOut >> $bigOutputFile`;
   unlink("$globalOut");
}

sub doLocal
{
   $localOut = "local.aln";

   # Add sequence headers to big alignment file 
   `grep ">" $seqPairFile >> $bigOutputFile`;

   `water $humanSeq $mouseSeq -outfile $localOut -auto`;

   # Append alignment to big file and delete the small file
   `cat $localOut >> $bigOutputFile`;
   unlink("$localOut");
}