#!/usr/local/bin/perl -w # Unix and Programming Skills for Biologists - class 2 # Analyze a list of homologous gene pairs # Input data file - pairs of human and mouse RefSeq accession numbers $hs_mm_pairs = "human_mouse_pairs.txt"; # Files used in data processing $humanSeq = "human.fa"; $mouseSeq = "mouse.fa"; $seqPairFile = "seqPair.fa"; $bigOutputFile = "alignment-all.txt"; # Remove $bigOutputFile from any previous script outputs if (-e "$bigOutputFile") # if the file exists { unlink($bigOutputFile); } # delete it [Perl's unlink = Unix's rm] # Open the homology data file open (HOMOLOGY, "$hs_mm_pairs") || die "Major problem: cannot open $hs_mm_pairs for reading: $!"; while (<HOMOLOGY>) # Read one line at a time { chomp($_); # Delete newline at end of each line @data = split(/\t/, $_); # Split the line into tab-delimited fields # Get human ($humanAcc) and mouse ($mouseAcc) accessions from @data # 0 print "$humanAcc vs. $mouseAcc\n"; # for debugging # Get a human sequence using the "fastacmd" syntax and redirect output to $humanSeq # 1 # Change format of sequence headers to make alignments clearer `sed 's/>gi|.*ref|/>/' $humanSeq > hs.tmp; mv hs.tmp $humanSeq`; # Get a mouse sequence and do the same thing # 2 # Change format of sequence headers to make alignments clearer `sed 's/>gi|.*ref|/>/' $mouseSeq > mm.tmp; mv mm.tmp $mouseSeq`; # Concatenate both sequences to $seqPairFile (if doing clustal alignment) # 3 # Do alignment for each sequence, choosing an informative filename for output # 4 # Append sequence headers and alignment to $bigOutputFile # 5 } print "\nSee $bigOutputFile for output\n\n";