#!/usr/local/bin/perl -w
# Unix, Perl, and Python - class 2
# Parse a SAM file into a BED file that can be visualized in a genome browser
# Get the SAM file as the first argument
# ex: ./SAMtoBEDcounts.pl myFile.SAM
use strict;
use warnings;

# Get the SAM file as the first argument
# ex: ./SAMtoBEDcounts.pl myFile.SAM
my $sam = $ARGV[0];

# Step 0: complain (on STDERR, with a non-zero exit status) if the user
# forgets the name of the SAM file. Tested with defined/eq rather than plain
# truthiness so that a file literally named "0" is still accepted.
if (!defined $sam || $sam eq '') {
    die "Usage: $0 myFile.SAM\n";
}

# Open the SAM file. Three-argument open with an explicit read mode, so a
# file name beginning with '>' or '|' cannot be interpreted as a mode/command.
open(my $sam_fh, '<', $sam)
    or die "Major problem: cannot open $sam for reading: $!";

# Read one line at a time
LINE:
while (my $line = <$sam_fh>) {

    # Step 1: skip SAM header lines, which start with '@'
    next LINE if $line =~ /^@/;

    chomp $line;    # Delete newline at end of each line

    # Step 2: use the tabs "\t" to split the line into fields
    my @data = split /\t/, $line;

    # We read up to field 10 ($data[9]); skip blank or truncated lines
    next LINE if @data < 10;

    # Skip records without a usable alignment: a non-numeric or zero start
    # position, or no reference name ('*'). These cannot be drawn as a BED
    # interval.
    next LINE if $data[3] !~ /^\d+$/ || $data[3] < 1 || $data[2] eq '*';

    # Skip records whose sequence is not stored ('*'); the read length is
    # unknown, so the end coordinate cannot be computed from it.
    next LINE if $data[9] eq '*';

    # Step 3: get the position of the start of the read from field 4
    # ($data[3]). SAM files start counting at 1, while BED files start
    # counting at 0, so we need to subtract 1.
    my $start = $data[3] - 1;

    # Step 4: get the length of the read sequence in field 10 ($data[9])
    my $length = length $data[9];

    # Step 5: given the start and the read length, calculate the end.
    # This equals (field 4 + length - 1), i.e. the last SAM base of the read;
    # because a BED end coordinate is exclusive, that is also start + length.
    # NOTE: this uses the read length only; alignments containing
    # insertions, deletions or clipping cover a different span on the
    # reference, which would require parsing the CIGAR string (field 6).
    my $end = $start + $length;

    # Step 7 [optional]: name, score, and strand of the read.
    # The strand is encoded in field 2 ($data[1]): if field 2 is 0, it's +;
    # if field 2 is 16, it's -. Testing the 16 bit gives the same answer for
    # those two values and also works when other flag bits are set.
    my $flag   = $data[1] =~ /^\d+$/ ? $data[1] : 0;
    my $strand = ($flag & 16) ? '-' : '+';
    my $name   = $data[0];
    # The mapping quality (field 5) is used as the BED score; the choice of
    # score is not dictated by the format, so adjust if another is wanted.
    my $score  = $data[4];

    # Step 6: convert this row into a BED-style line:
    # chr TAB start TAB end TAB name TAB score TAB strand
    print join("\t", $data[2], $start, $end, $name, $score, $strand), "\n";
}

# Close file handles
close($sam_fh);
##########
# syntax highlighted by Code2HTML, v. 0.9.1