parse_genbank.pl

#! /usr/local/bin/perl -w

# Homemade Genbank report parser using regular expressions.
# Once desired data is captured, it can be printed in any format.
# WI Biocomputing course - Bioinformatics for Biologists - October 2003
#
# Usage:  parse_genbank.pl GenBankReport
#
# Reads the GenBank flat-file report named on the command line and prints,
# in the order the lines are encountered:
#    Locus:          name taken from the LOCUS line
#    GI:             number following "GI:" on the VERSION line
#    Sequence name:  text of the first DEFINITION line (trailing period removed)
#    Organism:       text of the ORGANISM line
#    Gene length:    end - start + 1 for each "gene  start..end" feature
#    CDS:            start and end of each "CDS  start..end" feature
#    Translation:    protein from each /translation="..." qualifier
# Only simple "start..end" locations are recognised; join(), complement()
# and partial (< or >) locations are skipped.
# Dies if no file name is given or if the file cannot be opened.

use strict;
use warnings;

my $gb_report = $ARGV[0];
# Test defined/length rather than truth so a file literally named "0" works.
(defined $gb_report && length $gb_report)
   or die "USAGE: $0 GenBankReport\n";

# Three-argument open: the file name can never be taken as a mode or a pipe.
open (my $gb, '<', $gb_report)
   or die "cannot open $gb_report for reading: $!";

# Flag for multiline translation; 1 means translation "in progress"
my $in_translation = 0;

# Protein residues collected so far for the translation in progress
my $protein = '';

while (my $line = <$gb>)
{
   # Remove the line ending (Unix or DOS) so "\r" never leaks into output.
   $line =~ s/\r?\n\z//;

   if ($in_translation)   # translation still going on
   {
      # Checked before any keyword pattern so that a line of residues can
      # never be mistaken for a LOCUS, gene, CDS, ... line.
      if ($line =~ /^([^"]*)"/)   # terminal quote; end of translation
      {
         $protein .= $1;
         print_translation($protein);
         $in_translation = 0;
      }
      else   # no terminal quote; translation continues
      {
         $protein .= $line;
      }
   }
   elsif ($line =~ /^\s*LOCUS\s+(\w+)/)
   {
      print "Locus: $1\n";
   }
   elsif ($line =~ /^\s*VERSION\b.*GI:(\d+)/)
   {
      print "GI: $1\n";
   }
   elsif ($line =~ /^\s*DEFINITION\s+(.*?)\.?\s*$/)
   {
      # The period is optional: a long definition wraps onto further lines,
      # and then the first line does not end with one.
      print "Sequence name: $1\n";
   }
   elsif ($line =~ /^\s*ORGANISM\b\s*(.*?)\s*$/)
   {
      print "Organism: $1\n";
   }
   elsif ($line =~ /^\s*gene\s+(\d+)\.\.(\d+)/)
   # ex: gene            1..1760
   {
      # Coordinates are inclusive, hence the + 1.
      my $gene_length = $2 - $1 + 1;
      print "Gene length: $gene_length\n";
   }
   elsif ($line =~ /^\s*CDS\s+(\d+)\.\.(\d+)/)
   # ex: CDS             357..1541
   {
      my $cds_start = $1;
      my $cds_end = $2;
      print "CDS: $cds_start - $cds_end\n";
   }
   elsif ($line =~ m{/translation="(.*)})   # protein product begins
   {
      $protein = $1;
      if ($protein =~ /^([^"]*)"/)
      {
         # A short protein opens and closes on the same line.
         print_translation($1);
      }
      else
      {
         $in_translation = 1;
      }
   }
   else
   {
      # Skip this data
   }
}

close ($gb);

# A translation still open at end of file never received its closing quote.
if ($in_translation)
{
   warn "Problems: end of translation product not found.\n";
}

# Print one protein translation after removing all embedded whitespace
# (the indentation of the continuation lines).
sub print_translation
{
   my ($residues) = @_;
   $residues =~ s/\s+//g;
   print "Translation: $residues\n";
}