#!/usr/bin/perl -w

use strict;
my %code = (
    'UUU', 'F', 'UUC', 'F', 'UUA', 'L', 'UUG', 'L', 'CUU', 'L', 'CUC', 'L',
    'CUA', 'L', 'CUG', 'L', 'AUU', 'I', 'AUC', 'I', 'AUA', 'I', 'AUG', 'M',
    'GUU', 'V', 'GUC', 'V', 'GUA', 'V', 'GUG', 'V', 'UCU', 'S', 'UCC', 'S',
    'UCA', 'S', 'UCG', 'S', 'CCU', 'P', 'CCC', 'P', 'CCA', 'P', 'CCG', 'P',
    'ACU', 'T', 'ACC', 'T', 'ACA', 'T', 'ACG', 'T', 'GCU', 'A', 'GCC', 'A',
    'GCA', 'A', 'GCG', 'A', 'UAU', 'Y', 'UAC', 'Y', 'UAA', 'X', 'UAG', 'X',
    'CAU', 'H', 'CAC', 'H', 'CAA', 'Q', 'CAG', 'Q', 'AAU', 'N', 'AAC', 'N',
    'AAA', 'K', 'AAG', 'K', 'GAU', 'D', 'GAC', 'D', 'GAA', 'E', 'GAG', 'E',
    'UGU', 'C', 'UGC', 'C', 'UGA', 'X', 'UGG', 'W', 'CGU', 'R', 'CGC', 'R',
    'CGA', 'R', 'CGG', 'R', 'AGU', 'S', 'AGC', 'S', 'AGA', 'R', 'AGG', 'R',
    'GGU', 'G', 'GGC', 'G', 'GGA', 'G', 'GGG', 'G'
);

########
my $serine_threonine_cutoff = 0.4;
my $proline_cutoff          = 0.05;
my $wid                     = 50;
my $step                    = 5;
#####################

# pts  analysis of protein sequences starting from nt seqs

my $count   = 0;
my $ref     = 0;
my $counter = 1;

my $infile = shift;
unless ($infile) { die "Usage: pts_scan.pl [nucleotide sequence file\n";}
open( IN, $infile ) or die "Oops, could not open sequence file\n";
my $id  = '';
my $seq = '';
while (<IN>) {
    chomp;
    if (/^>/) {
        if ( $id ne '' ) {
            analyze_nt( $id, $seq );
        }
        $id  = $_;
        $seq = '';
    }
    else {
        $seq .= $_;
    }
}
close IN;

analyze_nt( $id, $seq );

#############################################3

sub analyze_nt {
    my $frame = undef;
    my ( $id, $seq ) = @_;
    my $strand = 'F';
    $seq =~ s/[^A-Za-z]//g;
    $seq =~ tr/a-z/A-Z/;
    $seq =~ s/T/U/g;

    for ($frame = 0 ; $frame < 3 ; $frame++ ) {
        # print "frame $frame\n";
        my $pep = '';

        for ( my $a = $frame ; $a < length($seq) - 3 ; $a = $a + 3 ) {
            my $codon = substr( $seq, $a, 3 );
            
            if ( ( $codon =~ /N/ ) || ( length($codon) < 3 ) ) { $pep .= 'X'; }



            else {
		#unless ($code{$codon}) {die "codon $codon\n";} 

		$pep .= $code{$codon}; }

        }
       # my $len = length($pep);
       # print "len $len\n";
        ptsfind( $pep, $frame, $strand, $id );
        #print "frame $frame\n";
    }
###############
    # reverse comp
###############

    $seq = reverse($seq);
    $seq =~ tr/AUCG/UAGC/;
    $strand = 'R';

    for ( $frame = 0 ; $frame < 3 ; $frame++ ) {
	#print "frame $frame\n";
        my $pep = '';

        for ( my $a = $frame ; $a < length($seq) - 3 ; $a = $a + 3 ) {
            my $codon = substr( $seq, $a, 3 );

            if ( ( $codon =~ /N/ ) || ( length($codon) < 3 ) ) { $pep .= 'X'; }

            else { $pep .= $code{$codon}; }

        }
        #my $len = length($pep);
        #print "len $len\n";
        ptsfind( $pep, $frame, $strand ,$id);

    }

}    # end analyze_nt

###########################################

sub ptsfind {

    my ( $pep, $frame, $strand ,$id) = @_;

    my $save = '';

    my $begin = undef;
    my $end   = undef;

    my $len = length($pep) - $wid;
    #print "len $len\n";
    #if ( $pep =~ /TVTPSTTTTEISSTTLTTTTVTTTTPTTPTTTIPPSTTPEL/ ) {

       # print "pep";
       # exit(1);
    #}
    for ( my $a = 0 ; $a < $len ; $a += $step ) {

        my $test = substr( $pep, $a, $wid );

        my $count_p = $test =~ tr/P/P/;
        my $count_s = $test =~ tr/S/S/;
        my $count_t = $test =~ tr/T/T/;

        if (   ( ( $count_p / $wid ) > $proline_cutoff )
            && ( ( ( $count_s + $count_t ) / $wid ) > $serine_threonine_cutoff )
          )
        {

            if ( $save eq '' ) {
                $save  = $test;
                $begin = $a + 1;
                $end   = $a + $wid;
            }

            else {
                my $s = $a + $wid - $step;

                $save .= substr( $pep, $s, $step );
                $end += $step;
            }

        }    # if count_p

        else {
            if ( $save ne '' ) {

                my $begnt;
                my $endnt;
                if ( $strand eq 'F' ) {
                    $begnt =  $begin * 3 ;
                    $endnt =  $end * 3 ;
                }
                if ( $strand eq 'R' ) {
                    my $seqlen = length($seq);
                    $begnt = $seqlen - ( $begin * 3 );
                    $endnt = $seqlen - ( $end * 3 );

                }
                if ( length($save) > 60 ) {

                    print ">";
                    print $counter;
                    print " $infile";
                    print " $ref ";
                    print '-';
                    print " $count ";
                    print "ID $id FRAME $frame STRAND $strand BEG $begnt END $endnt";
                    print "\n";

                    $counter++;
                    print "$save\n";
                }

                $save  = '';
                $begin = 0;
                $end   = 0;

            }    # if $save
        }    # else

    }    # for a

}    # END PTSFIND

