#!/usr/local/bin/perl -w
# Unix, Perl, and Python - class 2
# Align a list of human-mouse homology pairs
# Input data file - pairs of human and mouse RefSeq accessions
$hs_mm_pairs = "human_mouse_pairs.txt";
# Files used in data processing
$humanSeq = "mouse.fa";
$mouseSeq = "human.fa";
$seqPairFile = "seqPair.fa";
$bigOutputFile = "alignment-all.txt";
# Remove $bigOutputFile from any previous script outputs
if (-e "$bigOutputFile") # if the file exists
{ unlink($bigOutputFile); } # delete it [Perl's unlink = Unix's rm]
# Open the homology data file
open (HOMOLOGY, "$hs_mm_pairs") || die "Major problem: cannot open $hs_mm_pairs for reading: $!";
while (<HOMOLOGY>) # Read one line at a time
{
chomp($_); # Delete newline at end of each line
@data = split(/\t/, $_); # Split the line into tab-delimited fields
# Get human ($humanAcc) and mouse ($mouseAcc) accessions from @data
# 0
$humanAcc = $data[0];
$mouseAcc = $data[1];
print STDERR "$humanAcc vs. $mouseAcc\n";
# Get a sequence from a BLAST-formatted database using the "fastacmd" syntax
# fastacmd -d database -s accession and append (>>) to a file
# Get a human sequence using the "fastacmd" syntax and redirect output to $humanSeq
# 1
`fastacmd -d nr -s $humanAcc > $humanSeq`;
# Change format of sequence headers to make alignments clearer
`perl -i -pe 's/>gi.+ref\\|/>/' $humanSeq`;
# Get a mouse sequence using the "fastacmd" syntax and redirect output to $mouseSeq
# 2
`fastacmd -d nr -s $mouseAcc > $mouseSeq`;
# Change format of sequence headers to make alignments clearer
`perl -i -pe 's/>gi.+ref\\|/>/' $mouseSeq`;
# Concatenate both sequences ($humanSeq and $mouseSeq) to $seqPairFile (if doing clustal alignment)
# 3
`cat $humanSeq $mouseSeq > $seqPairFile`;
# Do alignments (perform subroutines below)
doClustal();
doGlobal();
doLocal();
}
# Close file handles and delete temporary files
close (HOMOLOGY);
unlink ($humanSeq, $mouseSeq, $seqPairFile);
print "\nSee $bigOutputFile for output\n\n";
########## subroutines ##########
sub doClustal
{
$clustalOut = "clustal.aln";
# Do alignment (#4)
`clustalw -INFILE=$seqPairFile -TYPE=DNA -OUTFILE=$clustalOut`;
# Add sequence headers to big alignment file (#5)
`grep ">" $seqPairFile >> $bigOutputFile`;
# Append alignment to big file and delete the small files (#5)
`cat $clustalOut >> $bigOutputFile`;
unlink("$clustalOut", "seqPair.dnd");
}
sub doGlobal
{
$globalOut = "global.aln";
# Do alignment (#4)
`needle $humanSeq $mouseSeq -outfile $globalOut -auto`;
# Add sequence headers to big alignment file (#5)
`grep ">" $seqPairFile >> $bigOutputFile`;
# Append alignment to big file and delete the small file (#5)
`cat $globalOut >> $bigOutputFile`;
unlink("$globalOut");
}
sub doLocal
{
$localOut = "local.aln";
# Do alignment (#4)
`water $humanSeq $mouseSeq -outfile $localOut -auto`;
# Add sequence headers to big alignment file (#5)
`grep ">" $seqPairFile >> $bigOutputFile`;
# Append alignment to big file and delete the small file (#5)
`cat $localOut >> $bigOutputFile`;
unlink("$localOut");
}
syntax highlighted by Code2HTML, v. 0.9.1