align.pl

#!/usr/local/bin/perl -w
# Unix and Programming Skills for Biologists - class 2
# Analyze a list of homologous gene pairs

# Input data file - pairs of human and mouse RefSeq accession numbers
$hs_mm_pairs = "human_mouse_pairs.txt";

# Files used in data processing
$humanSeq = "human.fa";
$mouseSeq = "mouse.fa";
$seqPairFile = "seqPair.fa";
$bigOutputFile = "alignment-all.txt";

# Remove $bigOutputFile from any previous script outputs
if (-e "$bigOutputFile")            # if the file exists
    { unlink($bigOutputFile); }         # delete it [Perl's unlink = Unix's rm]

# Open the homology data file
open (HOMOLOGY, "$hs_mm_pairs") || die "Major problem: cannot open $hs_mm_pairs for reading: $!";

while (<HOMOLOGY>)   # Read one line at a time
{
   chomp($_);               # Delete newline at end of each line
   @data = split(/\t/, $_);   # Split the line into tab-delimited fields

   # Get human ($humanAcc) and mouse ($mouseAcc) accessions from @data
   # 0
   


      
   print "$humanAcc vs. $mouseAcc\n";      # for debugging
   # Get a human sequence using the "fastacmd" syntax and redirect output to $humanSeq            
   # 1
   

 
   # Change format of sequence headers to make alignments clearer 
   `sed 's/>gi|.*ref|/>/' $humanSeq > hs.tmp; mv hs.tmp $humanSeq`;

   # Get a mouse sequence and do the same thing
   # 2
   
   
   
   # Change format of sequence headers to make alignments clearer 
   `sed 's/>gi|.*ref|/>/' $mouseSeq > mm.tmp; mv mm.tmp $mouseSeq`;
   
   # Concatenate both sequences to $seqPairFile (if doing clustal alignment)
   # 3
   
   
   
   # Do alignment for each sequence, choosing an informative filename for output
   # 4
   
   

   # Append sequence headers and alignment to $bigOutputFile
   # 5
   


}

print "\nSee $bigOutputFile for output\n\n";