#!/usr/bin/perl -w

use strict;
use read_fasta;


# pts  analysis of protein sequences

########
my $serine_threonine_cutoff = 0.4;
my $proline_cutoff = 0.05;
my $wid = 50;
my $step = 5;
#####################
my  $id = '';
my $seq = '';
my $no_of_seqs = 0;


my $in = shift ; # input file with sequences in fasta format

unless ($in) {die "Usage *.pl [infile]\n";}

my ($array_ref) = read_fasta($in);

my @array = @$array_ref;

foreach my $s (@array) {

        chomp($s);
        my @s = split(/___/,$s);
        my $id = $s[0];
        $id =~ s/ .*//;
        my $seq = $s[1];
	my ($coll,$seqs) = pts ($id, $seq);
	my @coll = @$coll;
	my @seqs = @$seqs;

	#foreach my $tmp(@coll) {print "tmp $tmp \n";}
	#foreach my $tmp(@seqs) {print "tmp $tmp \n";}


	# check overlaps
	my  @check = ();
	my @new =();
	my @newseq = ();
	my $d = 0;
	for (my $p = 0; $p< scalar(@coll); $p++) {
		for (my $r = $p+1; $r < scalar(@coll); $r++) { 
			my @s = split (' ',$coll[$p]);

			my @t = split (' ',$coll[$r]);

			if ($s[0] eq $t[0]) {

				if ($t[1] < $s[2]) { # if overlap merge the two
					$coll[$p] = "$s[0] $s[1] $t[2] $s[3]";
					$seqs[$p] =  $seqs[$r];
					push @check, $r;
	
				}
	
			} # if s=t
	
		} # for r
		
		my $match = 0;
		foreach my $ch (@check) {
			if ($ch eq $p ) {$match = 1;}
			}
		if ($match == 0) {
			$new[$d] = $coll[$p]; 
			$newseq[$d] = $seqs[$p]; 
			$d++;
		
		}
		
	} # for  $p

	# print the whole thing

	for ($a=0; $a < scalar(@new); $a++) {
		print '>';
		print "$new[$a]\n";
		my @s = split (' ',$new[$a]);
		my $be = $s[1]-1;
		my $en = $s[2]-$s[1]+1;
		my $se = $newseq[$a];
		my $subseq = substr($se ,$be, $en);
		my $c=-1;
		        while ($c < length($subseq)) {
 
		   	     $c++; 
			     if ((($c % 60) ==0) && ($c != 0)){print "\n";}
			     print substr($subseq,$c,1);
		        }
		print "\n";
		
	}	





} # foreach my @array




sub pts {
	my ($id, $seq) =  @_;
	my $save = '';
	my  $begin;
	my  $end;
	my  @coll = ();
	my @seqs = ();
	my  $counter = 0;
	my @t = split(/ /,$id);
	$id = $t[0];
	$id =~ s/>//;

	$seq =~ tr/a-z/A-Z/;
	my $len = length($seq) - $wid;
	for (my $a=0; $a < $len; $a=$a+$step) {

		my $test = substr($seq,$a,$wid);

		my $count_p = $test =~ tr/P/P/;
		my $count_s = $test =~ tr/S/S/;
		my $count_t = $test =~ tr/T/T/;


		if ((($count_p / $wid) > $proline_cutoff) && ((($count_s+$count_t)/ $wid) > $serine_threonine_cutoff)) {

			if ( $save eq '') {
				$save = $test;
				$begin = $a +1; $end = $a+$wid;
				}
		
			else {
				my $s = $a + $wid - $step;
				$save .= substr($seq,$s,$step); $end += $step;
				}

		} # if count_p

		else {if ($save ne '' ) {
			#push @coll, $begin, $end;
			$coll[$counter] ="$id $begin $end $len";
 			$seqs[$counter] = $seq;
			$counter ++;
			$save = ''; 
			$begin= 0; 
			$end = 0; 
		} # if $save
	} # else


} # for a

if ( $save ne '') {

			$coll[$counter] ="$id $begin $end $len";
 			$seqs[$counter] = $seq;
			$counter ++;
			$save = ''; 
			$begin= 0; 
			$end = 0; 
}

return (\@coll,\@seqs);

} # sub pts

