#!/usr/local/bin/perl -w # Unix and Programming Skills for Biologists - class 2 # Analyze a list of growth factors, starting by extracting their promoters # To do: Complete commands 0-5 # Input data file $gfData = "mapped_growth_factors.txt"; # Output files (cDNA and promoters) $cdnaSeqFile = "factors_cdna.fa"; $promoterFile = "factors_prom.fa"; # Temporary promoter file: to be appended to $promoterFile $tempFile = "prom.tmp"; # Length of promoter to extract $prom_length = 1000; # Full path to human chromosome files $chrPath = "/cluster/db0/Data/human_gp_jul_03/"; # If $promoterFile or $cdnaSeqFile already exists, delete it. if (-e "$promoterFile") { unlink($promoterFile); } if (-e "$cdnaSeqFile") { unlink($cdnaSeqFile); } # Input file contains the following fields: # 0: accession # 1: description of gene # 2: date of assembly used for mapping data # 3: chromosome (ex: chr10) # 4: strand (+ or -) # 5: transcript end 1 (start if strand = +; end otherwise) # 6: transcript end 2 (end if strand = +; end otherwise) # This file was generated from Ensembl Mart data (using an SQL query) open (GF_DATA, $gfData) || die "Major problem: cannot open $gfData for reading: $!"; while (<GF_DATA>) { # Read one line at a time # Get rid of new line at end of each line chomp($_); # Split the line in to tab-delimited fields @data = split(/\t/, $_); $acc = $data[0]; $desc = $data[1]; $chr = $data[3]; $strand = $data[4]; $end1 = $data[5]; $end2 = $data[6]; print "$acc $desc\n"; # for debugging # Get a sequence from a BLAST-formatted database using the "fastacmd" syntax # fastacmd -d database -s accession and append (>>) to $cdnaSeqFile # 0 # Get the sequence 1 kb upstream of the transcription start getUpstreamSeq(); } # Using the cDNA or promoter files, choose and implement at least one analysis from the # EMBOSS groups page: http://www.hgmp.mrc.ac.uk/Software/EMBOSS/Apps/groups.html # 4 # Delete any temporary files # 5 close (GF_DATA); print "\nSee output in $cdnaSeqFile (cDNA) and $promoterFile (promoters)\n\n"; sub getUpstreamSeq { # Get full path to chromosome file $chrFile = $chrPath . $chr . ".nib"; if ($strand eq "+") # get sequence "before" cDNA - end1 is 5' start { $toExtractStart = $end1 - $prom_length; $toExtractEnd = $end1; } elsif ($strand eq "-") # get sequence "after" cDNA - end2 is 5' start { $toExtractStart = $end2; $toExtractEnd = $end2 + $prom_length; } else # problems { print ">Strand couldn't be determined for $acc\n"; } if ($strand eq "+" || $strand eq "-") { # Extract sequence using nibFrag from nib files # Syntax: nibFrag chrFile toExtractStart toExtractEnd strand outputFile # 1 # Header to use for promoter file $newHeader = "$acc promoter $chr($strand):$toExtractStart-$toExtractEnd $desc"; # Change header of $tempFile with EMBOSS's descseq command with the syntax # descseq -seq seqFile -out newSeqFile -name newHeader # 2 # Add this promoter to the big promoter file ($promoterFile) # 3 } }