oligos.pl

#!/usr/bin/perl -w

# oligos.pl
# Create and analyze an overlapping series of oligos
# WI Bioinformatics course - Feb 2002 - Lecture 5
# WI Bioinformatics course - Revised - Sep 2003 
# Example of input taken as multiple arguments 

# Check input and give info if arguments are missing
if (! $ARGV[3])
{
   print "
This program creates a series of oligos from 'sequence'
of length 'oligoLength' starting at startPosition nt and
ending at endPosition nt
USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n";

   exit(0);
}
else
{
   $seq = $ARGV[0];   # sequence name
   $mer = $ARGV[1];   # length of oligos to extract
   $start = $ARGV[2];   # nt number for beginning of first oligo
   $end = $ARGV[3];   # nt number for beginning of last oligo
}

# Get continuous sequence from sequence file 
open (SEQ, $seq) || die "cannot open $seq for reading: $!";
@seq = <SEQ>;            # make array of lines
$defline = $seq[0];         # get defline

# Split definition line into fields separated by spaces
@defline_fields = split(/\s/, $defline);   

$seq[0] = "";            # delete defline
$seq = join ("", @seq);         # join($glue, @list) to get seq without defline
$seq =~ s/\s*//g;         # delete spaces and end of lines
$seqLength = length ($seq);       # get length of sequence (optional) 

# $defline_fields[0] is the first part of defline (up to first space)
print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n";

# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off
# This "off-by-one bug" is a common problem, so always check your code

# Adjust counting since computers start at 0 but people think of sequences starting at 1
for ($i = $start - 1; $i < $end - 1; $i++)
{
   # Extract substring of sequence to make oligo ==> $oligo = substr(seq, start, length);
   $oligo = substr($seq, $i, $mer); 
   
   # Adjust to conventional way of counting (e.g., first nt = 1, not 0)
   $nt = $i + 1;
   
   # Count the number of Gs and Cs in the oligo by counting 
   # the number of times G or C are "translated" into the same G or C within $oligo 
   $numGC = $oligo =~ tr/GC/GC/;
   
   # Calculate per cent GC 
   $pcGC = 100 * $numGC / $mer;
   
   # Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places)
   $pcGCrounded = sprintf("%0.1f", $pcGC);
   
   # Print results delimited by tabs (for easy importing into spreadsheets, etc)
   print "$nt\t$oligo\t$pcGCrounded\n";
}
print "\n";