#!/usr/bin/perl -w
#
#--------------------------------------- 
use strict;
use warnings;

# Read seed definition from a formatted file
sub read_subseed_alphabet($){
    my (
	$filename
	) = @_;
    my @seedalphabettable   = ();
    my $i = 0;
    open(IN, $filename) or die("Cant open \"$filename\"");
    while (<IN>){
	if ($_ =~ /^[CFYWMLIVGPATSNHQEDRK \t]+$/ ){ 
 	    chomp($_);
	    my @seedalphabetline = split(/[ \t]+/,$_);
	    for (my $j = 0 ; $j <= $#seedalphabetline ; $j++){
		$seedalphabettable[$i][$j] = $seedalphabetline[$j];
	    }
	    $i++;
	}
    }
    return \@seedalphabettable;
}

# read the nth Subset seeds of sensitivity at least ...
sub read_subseeds($$$){
    my ($filename,$r_alphabet,$minselectivity) = @_;
    my @seedtable  = ();
    open(IN, $filename) or die("Cant open \"$filename\"");
    # search for a seed of selectivity at least ...
    while (<IN>){
	chomp($_);
	my @splitline = split (/[ \t]+/,$_); 
	if ($splitline[1] > $minselectivity){	    
	    #read seeds (first column)
	    my @seeds = split (/,/,$splitline[0]);
	    for (my $i=0 ; $i <= $#seeds ; $i++) {
		my $seed = $seeds[$i];
		my @seedletters = split (/#/,$seed);
		if ($#seedletters >= 1) {
		    for (my $j=1; $j<=$#seedletters; $j++) { 
			# j starts from 1 : there is no letter before the first sharp
			my $r_col = $$r_alphabet[($seedletters[$j])];
			$seedtable[$i][$j] = $r_col;			
		    }
		} else {
		    for (my $j=0 ; $j<length($seed); $j++) {
			my $numberletter = substr($seed,$j,1);
			if ( $numberletter =~ /^[A-Z]/) {
			    $numberletter = ord($numberletter) - ord("A")  + 10;
			} elsif (  $numberletter =~ /^[a-z]/) {
			    $numberletter = ord($numberletter) - ord("a")  + 10 + 26;
			}
			my $r_col = $$r_alphabet[$numberletter];
			$seedtable[$i][$j] = $r_col;
		    }
		}
	    }
	    return \@seedtable;
	}
    }
}



# Print 2d table
sub print_table($){
    my ($r_T) = @_;
    for (my $i = 0 ; $i <= $#$r_T; $i++){
	my $r_L  = $$r_T[$i];
	for (my $j = 0 ; $j <= $#$r_L; $j++){
	    print $$r_L[$j].",";
	}
	print "\n";
    }
}

# Print 3d table
sub print_3table($){
    my ($r_T) = @_;
    for (my $i = 0 ; $i <= $#$r_T; $i++){
	my $r_X  = $$r_T[$i];
	print "".(($#$r_X)+1)."\n";
	for (my $j = 0 ; $j <= $#$r_X; $j++){
	    my $r_Y = $$r_X[$j];
	    for (my $k = 0 ; $k <= $#$r_Y; $k++){
		print $$r_Y[$k].",";	
	    }
	    print "\n";
	}
	print "--\n";
    }
}







#------#
# MAIN #
#------#


my $usage = "PlastP_seeds.pl Alphabet_file Seed_file Selectivity\n  - Alphabet_file : protein alphabet file (see http://bioinfo.lifl.fr/yass/iedera_proteins/Results-alphabets/ )\n  - Seed_file : seed definition file (see http://bioinfo.lifl.fr/yass/iedera_proteins/Results-models/ )\n  - Selectivity : value between 0.0 and 1.0 (eg 0.997909 on L16 for blastp default parameters)\n";

my $ALPHABET_FILENAME  = shift or die $usage;
my $SEEDS_FILENAME     = shift or die $usage;
my $SELECTIVITY        = shift or die $usage;


my $ALPHABET = &read_subseed_alphabet($ALPHABET_FILENAME);
# &print_table($ALPHABET);
my $SUBSEEDS = &read_subseeds($SEEDS_FILENAME,$ALPHABET,$SELECTIVITY);
&print_3table($SUBSEEDS);


