analyzeGFs.pl

#!/usr/local/bin/perl -w

# Unix and Programming Skills for Biologists - class 2
# Analyze a list of growth factors, starting by extracting their promoters
# To do: Complete commands 0-5

# Input data file
$gfData = "mapped_growth_factors.txt";

# Output files (cDNA and promoters)
$cdnaSeqFile = "factors_cdna.fa";
$promoterFile = "factors_prom.fa";

# Temporary promoter file: to be appended to $promoterFile
$tempFile = "prom.tmp";

# Length of promoter to extract
$prom_length = 1000;

# Full path to human chromosome files
$chrPath = "/cluster/db0/Data/human_gp_jul_03/";

# If $promoterFile or $cdnaSeqFile already exists, delete it.
if (-e "$promoterFile")
    { unlink($promoterFile); }
if (-e "$cdnaSeqFile")
   { unlink($cdnaSeqFile); }

# Input file contains the following fields:
# 0: accession
# 1: description of gene
# 2: date of assembly used for mapping data
# 3: chromosome (ex: chr10)
# 4: strand (+ or -)
# 5: transcript end 1 (start if strand = +; end otherwise)
# 6: transcript end 2 (end if strand = +; end otherwise)

# This file was generated from Ensembl Mart data (using an SQL query)

open (GF_DATA, $gfData) || die "Major problem: cannot open $gfData for reading: $!";
while (<GF_DATA>)
{
   # Read one line at a time

   # Get rid of new line at end of each line
   chomp($_);

   # Split the line in to tab-delimited fields
   @data = split(/\t/, $_);

   $acc = $data[0];
   $desc = $data[1];
   $chr = $data[3];
   $strand = $data[4];
   $end1 = $data[5];
   $end2 = $data[6];

   print "$acc $desc\n";      # for debugging

   # Get a sequence from a BLAST-formatted database using the "fastacmd" syntax
   # fastacmd -d database -s accession               and append (>>) to $cdnaSeqFile

   # 0

   # Get the sequence 1 kb upstream of the transcription start
   getUpstreamSeq();
}
# Using the cDNA or promoter files, choose and implement at least one analysis from the
# EMBOSS groups page: http://www.hgmp.mrc.ac.uk/Software/EMBOSS/Apps/groups.html

# 4

# Delete any temporary files
# 5

close (GF_DATA);

print "\nSee output in $cdnaSeqFile (cDNA) and $promoterFile (promoters)\n\n";

sub getUpstreamSeq
{
   # Get full path to chromosome file
   $chrFile = $chrPath . $chr . ".nib";

   if ($strand eq "+")      # get sequence "before" cDNA - end1 is 5' start
   {
        $toExtractStart = $end1 - $prom_length;
        $toExtractEnd = $end1;
   }
   elsif ($strand eq "-")      # get sequence "after" cDNA - end2 is 5' start
   {
        $toExtractStart = $end2;
        $toExtractEnd = $end2 + $prom_length;
   }
   else   # problems
   {
        print ">Strand couldn't be determined for $acc\n";
   }
   if ($strand eq "+" || $strand eq "-")
   {
        # Extract sequence using nibFrag from nib files
        # Syntax: nibFrag chrFile toExtractStart toExtractEnd strand outputFile

      # 1

      # Header to use for promoter file
      $newHeader = "$acc promoter $chr($strand):$toExtractStart-$toExtractEnd $desc";

      # Change header of $tempFile with EMBOSS's descseq command with the syntax
      # descseq -seq seqFile -out newSeqFile -name newHeader

      # 2

      # Add this promoter to the big promoter file ($promoterFile)

      # 3
   }
}