#!/usr/local/bin/perl -w

# Unix, Perl, and Python - class 2
# Parse a SAM file into a BED file that can be visualized in a genome browser

# Get the SAM file as the first argument
#   ex: ./SAMtoBEDcounts.pl myFile.SAM

$sam = $ARGV[0];

if (! $ARGV[0])	# No argument is given
{
	# Print an error message if the user forgets the name of the sam file
	# 0
	
	print STDERR "\nConvert a SAM file to a BED file\n";
	print STDERR "USAGE: $0 in.sam > out.bed\n\n";
	exit;
}

# Open the SAM file
open (SAM, "$sam") || die "Major problem: cannot open $sam for reading: $!";

while (<SAM>)		# Read one line at a time
{
	# 1
	if (! /^@/)		
	{
		chomp($_);					# Delete newline at end of each line
		
		# Use the tabs "\t" to split the line into fields, and place these fields into an array called @data
		# Example command:  @arrayOfFields = split /\t/, $lineOfTabDelimitedFile;
		# 2
	
		@data = split /\t/, $_;

		# Get the position of the start of the read sequence from field 4 ($data[3]),
		# but SAM files start counting at 1, while BED files start counting as 0, so we need to subtract 1.
		# 3
		
		$start = $data[3] - 1;

		
		# Get the length of the read sequence in field 10 ($data[9])
		# Example command:  $myLength = length "TGCGTGCCCCGGT"; 
		# 4
		
		$length = length $data[9];


		# Given the "start" (calculated above) and the read length ($length), calculate the end ($end)
 		# 5
 		
 		$end = $start + $length;
		
		
		# Convert this row into a BED-style file with the fields chr TAB start TAB end 
 		# 6
	
		$bedRow = "$data[2]\t$start\t$end";
		# print "$bedRow\n";
		
		# Add a name, score, and strand of the read to the BED file
		# The strand is encoded in field 2 ($data[1]): If field 2 is 0, it's +; if field 2 is 16, it's -.
		# 7 [Optional]
		
		if ($data[1] == 0)
		{
			$strand = "+";
		}
		elsif ($data[1] == 16)
		{
			$strand = "-";
		}
		else
		{
			print STDERR "Strand field ($data[1]) is not recognized\n";
		}
		
		# Get the number of mismatches (0 - 2)
		$numMismatches = substr ($data[13], -1, 1);
		# Create a score that's 2 minus the number of mismatches
		$score = 2 - $numMismatches;
		
		$bedRow .= "\t$.\t$score\t$strand";
		
		print "$bedRow\n";
	}
}

# Close file handles
close (SAM);

##########


syntax highlighted by Code2HTML, v. 0.9.1