oligos.pl

1	#!/usr/local/bin/perl -w
2
3	# oligos.pl
4	# Create and analyze an overlapping series of oligos
5	# WI Biocomputing course - Bioinformatics for Biologists - October 2003
6	# Example of input taken as multiple arguments 
7
8	# Check input and give info if arguments are missing
9	if (! $ARGV[3])
10	{
11	   print "
12	This program creates a series of oligos from 'sequence'
13	of length 'oligoLength' starting at startPosition nt and
14	ending at endPosition nt
15	USAGE: oligos.pl sequence oligoLength startPosition endPosition\n\n";
16
17	   exit(0);
18	}
19	else
20	{
21	   $seq = $ARGV[0];   # sequence name
22	   $mer = $ARGV[1];   # length of oligos to extract
23	   $start = $ARGV[2];   # nt number for beginning of first oligo
24	   $end = $ARGV[3];   # nt number for beginning of last oligo
25	}
26
27	# Get continuous sequence from sequence file 
28	open (SEQ, $seq) || die "cannot open $seq for reading: $!";
29	# make array of lines
30	@seq = <SEQ>;
31
32	# get header (defline) of sequence (first line of fasta file)
33	$defline = $seq[0];
34
35	# Split definition line into fields separated by spaces
36	@defline_fields = split(/\s/, $defline);
37
38	$seq[0] = "";            # delete defline
39	$seq = join ("", @seq);         # join($glue, @list) to get seq without defline
40	$seq =~ s/\s*//g;         # delete spaces and end of lines
41	$seqLength = length ($seq);       # get length of sequence (optional) 
42
43	# $defline_fields[0] is the first part of defline (up to first space)
44	print "Oligos ($mer mers) for $defline_fields[0] ($seqLength nt) and % GC content\n";
45
46	# Beware of OBOB ==> for ($i = $start; $i < $end; $i++) would be one nucleotide off
47	# This "off-by-one bug" is a common problem, so always check your code
48
49	# Adjust counting since computers start at 0 but people think of sequences starting at 1
50	for ($i = $start - 1; $i < $end - 1; $i++)
51	{
52	   # Extract substring of sequence to make oligo ==> oligo = substr(seq, start, length);
53	   # Use $_ variable to make tr/// transliteration (letter counting) easier
54	   $_ = substr($seq, $i, $mer);
55
56	   # Adjust to conventional way of counting (e.g., first nt = 1, not 0)
57	   $nt = $i + 1;
58
59	   # Count the number of Gs and Cs in the oligo by counting 
60	   # the number of times G or C are "translated" into the same G or C within $_ (the oligo) 
61	   $numGC = tr/GC//;
62
63	   # Calculate per cent GC 
64	   $pcGC = 100 * $numGC / $mer;
65
66	   # Round off percent GC to 1 decimal places (%0.2f ==> 2 decimal places)
67	   $pcGCrounded = sprintf("%0.1f", $pcGC);
68
69	   # Print results delimited by tabs (for easy importing into spreadsheets, etc)
70	   print "$nt\t$_\t$pcGCrounded\n";
71	}
72	print "\n";
73