#!/usr/local/bin/perl -w # oligos.pl # Create and analyze an overlapping series of oligos # WI Biocomputing course - Unix and Programming Skills for Biologists - March 2003 # Example of input taken as multiple arguments # Check input and give info if arguments are missing if (! $ARGV[3]) { print " This program creates a series of oligos from 'sequence' of length 'oligoLength' starting at startPosition nt and ending at endPosition nt USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n"; exit(0); } else { $seq = $ARGV[0]; # sequence name $mer = $ARGV[1]; # length of oligos to extract $start = $ARGV[2]; # nt number for beginning of first oligo $end = $ARGV[3]; # nt number for beginning of last oligo } # Get continuous sequence from sequence file open (SEQ, $seq) || die "cannot open $seq for reading: $!"; @seq = <SEQ>; # make array of lines $defline = $seq[0]; # get defline # Split definition line into fields separated by spaces @defline_fields = split(/\s/, $defline); $seq[0] = ""; # delete defline $seq = join ("", @seq); # join($glue, @list) to get seq without defline $seq =~ s/\s*//g; # delete spaces and end of lines $seqLength = length ($seq); # get length of sequence (optional) # $defline_fields[0] is the first part of defline (up to first space) print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n"; # Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off # This "off-by-one bug" is a common problem, so always check your code # Adjust counting since computers start at 0 but people think of sequences starting at 1 for ($i = $start - 1; $i < $end - 1; $i++) { # Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length); $oligo = substr($seq, $i, $mer); # Adjust to conventional way of counting (e.g., first nt = 1, not 0) $nt = $i + 1; # Count the number of Gs and Cs in the oligo by counting # the number of times G or C are "translated" into the same G or C within $oligo $numGC = $oligo =~ tr/GC/GC/; # Calculate per cent GC $pcGC = 100 * $numGC / $mer; # Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places) $pcGCrounded = sprintf("%0.1f", $pcGC); # Print results delimited by tabs (for easy importing into spreadsheets, etc) print "$nt\t$oligo\t$pcGCrounded\n"; } print "\n";