#!/usr/local/bin/perl -w
# Unix, Perl, and Python - class 2
# Parse a SAM file into a BED file that can be visualized in a genome browser
# Get the SAM file as the first argument
# ex: ./SAMtoBEDcounts.pl myFile.SAM
$sam = $ARGV[0];
if (! $ARGV[0]) # No argument is given
{
# Print an error message if the user forgets the name of the sam file
# 0
print STDERR "\nConvert a SAM file to a BED file\n";
print STDERR "USAGE: $0 in.sam > out.bed\n\n";
exit;
}
# Open the SAM file
open (SAM, "$sam") || die "Major problem: cannot open $sam for reading: $!";
while (<SAM>) # Read one line at a time
{
# 1
if (! /^@/)
{
chomp($_); # Delete newline at end of each line
# Use the tabs "\t" to split the line into fields, and place these fields into an array called @data
# Example command: @arrayOfFields = split /\t/, $lineOfTabDelimitedFile;
# 2
@data = split /\t/, $_;
# Get the position of the start of the read sequence from field 4 ($data[3]),
# but SAM files start counting at 1, while BED files start counting as 0, so we need to subtract 1.
# 3
$start = $data[3] - 1;
# Get the length of the read sequence in field 10 ($data[9])
# Example command: $myLength = length "TGCGTGCCCCGGT";
# 4
$length = length $data[9];
# Given the "start" (calculated above) and the read length ($length), calculate the end ($end)
# 5
$end = $start + $length;
# Convert this row into a BED-style file with the fields chr TAB start TAB end
# 6
$bedRow = "$data[2]\t$start\t$end";
# print "$bedRow\n";
# Add a name, score, and strand of the read to the BED file
# The strand is encoded in field 2 ($data[1]): If field 2 is 0, it's +; if field 2 is 16, it's -.
# 7 [Optional]
if ($data[1] == 0)
{
$strand = "+";
}
elsif ($data[1] == 16)
{
$strand = "-";
}
else
{
print STDERR "Strand field ($data[1]) is not recognized\n";
}
# Get the number of mismatches (0 - 2)
$numMismatches = substr ($data[13], -1, 1);
# Create a score that's 2 minus the number of mismatches
$score = 2 - $numMismatches;
$bedRow .= "\t$.\t$score\t$strand";
print "$bedRow\n";
}
}
# Close file handles
close (SAM);
##########
syntax highlighted by Code2HTML, v. 0.9.1