#!/usr/local/bin/perl -w # Use BioPerl to convert sequence formats # Use GD to generate a PNG figure # WIBR - Unix and Programming Skills for Biologists - class 3 # Input file in GenBank format $inFile = "seqs.gb"; use Bio::SeqIO; # for sequence handling use GD; # for graphics # Set input format (fasta, genbank, embl, pir, gcg, etc.) $in = Bio::SeqIO->new('-file' => "$inFile", '-format' => 'Genbank'); # Set width of each nt block on image $ntWidth = 15; # Set number of nts per line in figure $ntPerLine = 50; # Iterate through the sequences while ($seqobj = $in->next_seq()) { readSequence(); setupImage(); # Split sequence ($rawseq) into an array (@nt) of nts @nt = split(//, $rawseq); for ($i = 0; $i <= $#nt; $i++) { $thisNT = $nt[$i]; # print "$nt[$i] "; # Create coordinates for boxes and letters $x1 += $ntWidth; $x2 = $x1 + $ntWidth; $y2 = $y1 + $ntWidth; # Print boxes only for nts in the coding region (CDS) # See parseSeqFeatures subroutine for variable to use. # using the GD command: # $img->filledRectangle($x1, $y1, $x2, $y2, $ntToColor{$thisNT}); # 1 - uncomment the following line # $img->filledRectangle($x1, $y1, $x2, $y2, $ntToColor{$thisNT}); # Print nucleotide in center of box # 2 - uncomment the following line # $img->string(gdGiantFont, $x1 + 3, $y1, $thisNT, $black); # Handle end of row => start new row if (($i + 1) % $ntPerLine == 0) { $y1 += 2 * $ntWidth; $x1 = 25; # Print position of nucleotide on left $img->string(gdTinyFont, 5, $y1 + 4, $i + 2, $black); } } # Print this image to a file as a PNG graphic open(OUT_IMG, ">$imagefile") || die "Cannot write to $imagefile: $!\n"; print OUT_IMG $img->png; close OUT_IMG; # Make a file of alternating DNA and protein sequences (in fasta format) # 5 - uncomment the following line # printSeqs(); } # Print final message $imagefileList = join (" ", @imagefiles); print "\nFor images, see\n$imagefileList\n"; ######################### SUBROUTINES ######################### sub readSequence { # Get raw sequence for this sequence $rawseq = $seqobj->seq(); # Get length (regular Perl command) $seqLength = length($rawseq); # Get sequence ID $id = $seqobj->display_id(); # Get sequence description # $desc = $seqobj->desc(); # Parse sequence features (CDS, etc.) parseSeqFeatures(); # Print sequence in fasta format (for debugging, at least) # $out->write_seq($seqobj); # Write title of figure $title = "$id ($seqLength nt, from $inFile)"; # If the CDS was found, add the nt for the CDS start and end to the title # See the parseSeqFeatures subroutine to find the variable names # 3 } sub setupImage { # Make name of output file and add to array of all images $imagefile = $id . ".png"; push (@imagefiles, $imagefile); # Calculate size of image to create $imageHeight = $seqLength / $ntPerLine * $ntWidth * 2 + 80; $imageWidth = $ntPerLine * $ntWidth + 100; # Create a new image $img = new GD::Image($imageWidth, $imageHeight); defineColors(); defineNtColors(); # Initialize location of first nt box $x1 = 25; $y1 = 40; # Write a title across the top - see documentation for choices of font sizes $img->string(gdGiantFont, 45, 5, $title, $black); # Print 1 next to 1st nt $img->string(gdTinyFont, 5, $y1 + 4, "1", $black); } sub defineColors { # Describe colors you want to use: each integer is 0-255 for Red, Green, and Blue # Change the nt boxes to your 4 favorite colors # To do this change, the numbers below (and adjust variable names if desired). # If you do this adjust the hash in the defineNtColors subroutine # 4 $white = $img->colorAllocate(255,255,255); $black = $img->colorAllocate(0,0,0); $green = $img->colorAllocate(0, 255, 0); $blue = $img->colorAllocate(0, 0, 255); $red = $img->colorAllocate(255, 0, 0); $yellow = $img->colorAllocate(255, 255, 0); } sub defineNtColors { # Make hash to convert nts to colors $ntToColor{"A"} = $red; $ntToColor{"G"} = $green; $ntToColor{"C"} = $blue; $ntToColor{"T"} = $yellow; } sub parseSeqFeatures { # Look at sequence features (but just save CDS data) foreach $feat ($seqobj->all_SeqFeatures()) { $featName = $feat->primary_tag; if ($featName eq "CDS") { $cdsStart = $feat->start; $cdsEnd = $feat->end; } foreach $tag ($feat->all_tags()) { if ($tag eq "translation") { # Get protein sequence @protein = $feat->each_tag_value("translation"); $protein = join ("", @protein); $protId = "$id"; # Make a sequence object of the protein $protSeqObj = Bio::PrimarySeq->new('-seq' => $protein, '-format' => 'raw', -id => $protId, -type=> 'protein'); } } } } sub printSeqs { $outDna = Bio::SeqIO->new('-format' => 'Fasta'); $outDna->write_seq($seqobj); $outProt = Bio::SeqIO->new('-format' => 'Fasta'); $outProt->write_seq($protSeqObj); }