#! /usr/bin/env perl
use warnings;

# purpose:
# find the shared symbols (1st field) between two tab-delimited files,
# IDs such as gene symbols should be in the first column of both files
#          
# input file format: 
# 2 tab-delimited files
# Output:
# symbol order is the same as gene symbol (IDs) in the first file
# if one symbol in the first file has >1 matches in the second file:
# all the records in the 2nd file corresponding with this symbol will be printed
# if a symbol in the first file does not exist in the 2nd file,
# 'UNMATCHED' will be added for this symbol
#
# author: Bingbing Yuan
#



use strict;

# Print the usage text and stop unless both input files were named on the
# command line. The argument count is tested (rather than the truth of
# $ARGV[1]) so that a second file literally named "0" is still accepted.
if (@ARGV < 2) {
  print "Usage:\n
$0 file_1.txt file_2.txt\n
Note: 	Assume input files are tab-delimited.
	The program will split each record by tab.
	Matched id should be in the FIRST column of both files, and are case sensitive.
      	Results will be in the same order of the ids of the first file.
 
      	If an id in the first file has >1 matches in the second file,
      	all the records in the 2nd file with this id will be printed.

      	If an id in a record of the first file does not exist in the 2nd file,
      	'UNMATCHED' will be added for this record.\n\n";
  exit;
}

# Label printed in place of the second file's columns when an id from the
# first file has no match in the second file.
my $UNKNOWN = "UNMATCHED";

# Index the second file: id => array ref of its records (remaining columns
# joined by tabs), plus the value used to size the padding for unmatched ids.
my ($db2_href, $Size_2) = fileToHashOfArrayBySep($ARGV[1]);

# A second file without any record yields no size; treat that as 0 so that no
# padding is added and no "uninitialized value" warning is raised.
$Size_2 = 0 unless defined $Size_2;

# Placeholder printed for unmatched ids: the label followed by $Size_2 tabs.
# It is identical for every record, so it is built once, outside the loop.
# A non-positive size adds no tabs.
my $unmatched = $UNKNOWN . ( $Size_2 > 0 ? "\t" x $Size_2 : '' );

# Three-argument open with a lexical handle, so characters such as '>' or '|'
# in the file name are never interpreted as an open mode.
open(my $first_fh, '<', $ARGV[0]) or die "can not read $ARGV[0] $!\n";
while (my $line = <$first_fh>) {
  chomp($line);
  # drop the carriage return left behind by DOS/Windows line endings
  $line =~ s/\r\z//;
  # ignore comments and empty lines
  next if ($line =~ /^\#/);
  next if ($line =~ /^\s*$/);
  # split record by tab; the -1 limit keeps trailing empty columns
  my @arr = split(/\t/, $line, -1);
  my $id  = shift(@arr);

  # Left-hand part of every output line for this record: the id followed by
  # the remaining columns of the first file.
  my $left = "$id\t" . join("\t", @arr);

  if ( exists $db2_href->{$id} )
  {
    # one output line per record of the second file sharing this id
    for my $record ( @{ $db2_href->{$id} } )
    {
      print $left, "\t", $record, "\n";
    }
  }
  else
  {
    # no match: print the placeholder where the second file's columns would be
    print $left, "\t", $unmatched, "\n";
  }
}
close($first_fh);


# fileToHashOfArrayBySep($file)
#
# Read a tab-delimited file and index its records by their first column.
# Lines starting with '#' and lines that are empty or whitespace-only are
# skipped.
#
# Parameter:
#   $file - path of the tab-delimited file to read
#
# Returns a two-element list:
#   1. hash ref: id (first column) => array ref holding, for every record with
#      that id and in file order, the remaining columns joined by tabs
#   2. the last index of a record's remaining columns (number of non-id
#      columns minus one); 0 when the file contains no record
#
# Dies if the file cannot be opened or closed.
sub fileToHashOfArrayBySep {
  my $file = shift;
  my %hash = ();
  my $size;
  # Three-argument open with a lexical handle, so characters such as '>' or
  # '|' in the file name are never interpreted as an open mode.
  open(my $fh, '<', $file) or die "Can not open $file: $!\n";
  # A named lexical is used for the line so the caller's $_ is left untouched.
  while (my $line = <$fh>) {
    chomp($line);
    # drop the carriage return left behind by DOS/Windows line endings
    $line =~ s/\r\z//;
    # ignore comments and empty lines
    next if ($line =~ /^\#/);
    next if ($line =~ /^\s*$/);

    # example record:
    #0610006A03Rik   2.1000

    # split each record by tab; the -1 limit keeps trailing empty columns
    my @arr = split(/\t/, $line, -1);
    my $id  = shift(@arr);

    # push autovivifies the array for an id seen for the first time
    push @{ $hash{$id} }, join("\t", @arr);

    # Take the size from the current record while no non-zero size has been
    # recorded yet (undef and 0 both count as "not yet set").
    if (! $size)
    {
      $size = $#arr;
    }
  }
  close($fh) or die "Can not close $file: $!\n";
  # A file without any record leaves $size unset; report 0 instead of undef.
  return (\%hash, defined $size ? $size : 0);
}
# syntax highlighted by Code2HTML, v. 0.9.1