blast_parse_2.pl

1	#!/usr/local/bin/perl -w
2	
3	# Parsing BLAST reports with the NHGRI::Blastall module
4	# see http://genome.nhgri.nih.gov/blastall/Blastall.html for documentation
5	# WI Biocomputing course - Bioinformatics for Biologists - October 2003
6	
7	# Show Perl where the module is (since it's in my own directory)
8	use lib '/home/gbell/perl_modules/nhgri/';
9	# Use the NHGRI::Blastall module
10	use NHGRI::Blastall;
11	use GD;
12	
13	if (! $ARGV[0])
14	{
15	   print "What is the BLAST file to parse? ";
16	
17	   # Get input and remove the newline character at the end
18	   chomp ($inFile = <STDIN>);
19	}
20	else
21	{
22	   $inFile = $ARGV[0];
23	}
24	
25	# Make a new BLAST report "object"
26	my $b = new NHGRI::Blastall;
27	
28	# Read BLAST report
29	$b->read_report("$inFile");
30	
31	# Parse all results
32	@results = $b->result();
33	
34	# Matching sequence IDs (of hit seqs)
35	@ids = $b->result('id');
36	
37	# Get defline of query sequence
38	@query_defline = $b->result('query');
39	
40	print "HIT_NUM\tQUERY\tQUERY_ID\tSUBJECT_LENGTH\tSCORE\tEXPECT\t";
41	print "MATCH_LENGTH\tIDENTITY\tQUERY_START\tQUERY_END\tSUBJECT_START\tSUBJECT_END\n";
42	print "\n**** DATA FOR QUERY $query_defline[0] ****\n\n";
43	
44	# For each matching sequence in the report 
45	for ($i = 0; $i <= $#ids; $i++)
46	{
47	   # Number of hit (1...)
48	   $hit_num = $i + 1;
49	   # Get seq ID for this hit seq
50	   $this_id = $ids[$i];
51	   # Get the definition line of the matching sequence
52	   @deflines = $b->result('defline', $ids[$i]);
53	
54	   for ($j = 0; $j <= $#deflines; $j++)
55	   {
56	      # Get rid of extra spaces at end of lines comprising @deflines
57	      # (This is a bug in this implementation)
58	      $deflines[$j] =~ s/(\s){5,}/ /g;
59	   }
60	
61	   # Score of hit
62	   @scores = $b->result('scores', $ids[$i]);
63	   # Expect value
64	   @expects = $b->result('expects', $ids[$i]);
65	   # Length of matching sequence (entire seq, not just matching region)
66	   @subject_length = $b->result('subject_length', $ids[$i]);
67	   # Length of match
68	   @match_lengths = $b->result('match_lengths', $ids[$i]);
69	   # Percent identities for length of match
70	   @identities = $b->result('identities', $ids[$i]);
71	
72	   # nt number for each end of query and subject alignment
73	   @query_starts = $b->result('query_starts', $ids[$i]);
74	   @query_ends = $b->result('query_ends', $ids[$i]);
75	   @subject_starts = $b->result('subject_starts', $ids[$i]);
76	   @subject_ends = $b->result('subject_ends', $ids[$i]);
77	
78	   # Go through every alignment between query and hit seq
79	   # !!! Note that there's often more than one alignment
80	   # between pair of sequences !!!
81	
82	   # If you only care about the best alignment region between sequences,
83	   # only print out results where $k = 0
84	
85	   for ($k = 0; $k <= $#scores; $k++)
86	   {
87	      print "$hit_num\t$deflines[0]\t$this_id\t";
88	      print "$subject_length[0]\t$scores[$k]\t$expects[$k]\t$match_lengths[$k]\t";
89	      printf "%0.2f", $identities[$k];
90	      print "\t$query_starts[$k]\t$query_ends[$k]\t$subject_starts[$k]\t$subject_ends[$k]\n";
91	   }
92	}
93	
94	
95	
96	# To print out all results (using dump_results() subroutine below)
97	# print "ALL RESULTS:\n";
98	# dump_results();
99	
100	sub dump_results
101	{
102	   foreach $rh_r (@results)
103	   {
104	      while (($key,$value) = each %$rh_r)
105	      {
106	         if (ref($value) eq "ARRAY")
107	         {
108	            print "$key: ";
109	            foreach $v (@$value)
110	            {
111	               print "$v ";
112	            }
113	            print "\n";
114	         }
115	         else
116	         {
117	            print "$key: $value\n";
118	         }
119	      }
120	      print "\n";
121	   }
122	}
123	
124