#!/usr/bin/perl
# parse_genbank.pl
#
# Print selected fields from a GenBank flat-file report: the locus name,
# the GI number (when the VERSION line carries one), the definition line,
# the source organism, the gene length, the CDS coordinates and the
# translated protein sequence.
#
# USAGE: parse_genbank.pl GenBankReport
#
# Only simple "start..end" locations (optionally marked partial with < or >)
# are recognised for gene and CDS features; other location forms such as
# complement(...) or join(...) are skipped.

use strict;
use warnings;

# Test for defined/non-empty rather than truth so that a report file that is
# literally named "0" is still accepted.
my $gb_report = shift @ARGV;
die "USAGE: $0 GenBankReport\n"
    unless defined $gb_report && length $gb_report;

# Three-argument open with a lexical handle: characters such as '>' or '|'
# in the file name can no longer change the open mode.
open(my $gb, '<', $gb_report)
    or die "cannot open $gb_report for reading: $!";

my $definition;            # defined only while a DEFINITION field is being collected
my $protein        = '';   # translation text collected so far
my $in_translation = 0;    # true while inside a multi-line /translation="..."

while (my $line = <$gb>) {
    # Drop the line ending (LF or CRLF) and any trailing blanks.
    $line =~ s/\s+\z//;

    # A DEFINITION may wrap onto indented continuation lines; keep collecting
    # until a line that is not an indented continuation shows up.
    if (defined $definition) {
        if ($line =~ /^\s+(\S.*)/) {
            $definition .= " $1";
            next;
        }
        print_definition($definition);
        undef $definition;
        # Fall through: the current line still has to be examined.
    }

    # Inside a translation every line is sequence data, so this state is
    # checked before any keyword pattern gets a chance to match.
    if ($in_translation) {
        if ($line =~ /^([^"]*)"/) {    # closing quote: translation ends
            $protein .= $1;
            print_translation($protein);
            $in_translation = 0;
        }
        else {                         # no quote yet: translation continues
            $protein .= $line;
        }
        next;
    }

    if ($line =~ /^LOCUS\s+(\S+)/) {
        print "Locus: $1\n";
    }
    elsif ($line =~ /^VERSION.*GI:(\d+)/) {
        print "GI: $1\n";
    }
    elsif ($line =~ /^DEFINITION\s*(.*)/) {
        # Printed later, once any continuation lines have been appended.
        $definition = $1;
    }
    elsif ($line =~ /^\s*ORGANISM\s+(.*)/) {
        print "Organism: $1\n";
    }
    elsif ($line =~ /^\s*gene\s+<?(\d+)\.\.>?(\d+)/) {
        # Length is end - start + 1; the end coordinate alone is only the
        # length when the gene happens to start at position 1.
        my $gene_length = $2 - $1 + 1;
        print "Gene length: $gene_length\n";
    }
    elsif ($line =~ /^\s*CDS\s+<?(\d+)\.\.>?(\d+)/) {
        print "CDS: $1 - $2\n";
    }
    elsif ($line =~ m{/translation="([^"]*)("?)}) {
        $protein = $1;
        if ($2) {
            # Opening and closing quote on the same line: already complete.
            print_translation($protein);
        }
        else {
            $in_translation = 1;
        }
    }
}

close($gb) or die "cannot close $gb_report: $!";

# A DEFINITION that was still being collected when the input ended.
print_definition($definition) if defined $definition;

# The input ended before the closing quote of a translation was seen.
if ($in_translation) {
    print "Problems: end of translation product not found.\n";
}

# Print the definition text without leading blanks or the terminating period.
sub print_definition {
    my ($text) = @_;
    $text =~ s/^\s+//;
    $text =~ s/\.\z//;
    print "Sequence name: $text\n";
    return;
}

# Print the protein sequence with all embedded whitespace removed.
sub print_translation {
    my ($sequence) = @_;
    $sequence =~ s/\s+//g;
    print "Translation: $sequence\n";
    return;
}