#! /usr/bin/perl -w
# get_proteins - given a list of SWISS-PROT files, extract data
# from them in preparation for importation into a database system.
#
# Note that the results produced are TAB-delimited.

use strict;
# $table_line - holds the tab-delimited line
# $code       - holds the protein code
# $species    - holds the species value

# Every field starts out as a defined (empty) string so that a malformed or
# incomplete record can never trigger "uninitialized value" warnings under -w.
my ( $table_line, $code, $species ) = ( '', '', '' );

# Read the input (the files named on the command line, or standard input) one
# line at a time; each line read is assigned to Perl's default variable $_.
#
# The "if" tests below are deliberately NOT chained with "elsif".  Each inner
# "while" loop reads ahead until it meets the first line that does not belong
# to the group being collected and leaves that line in $_, so that the tests
# further down can examine it during the same pass of the outer loop.
#
# When an inner loop runs out of input, $_ is undefined and the record is
# truncated.  In that case the outer loop is left at once: testing an undefined
# $_ would produce warnings, and calling <> again after it has reported
# end-of-file would make Perl start reading from standard input.
LINE:
while ( <> ) {
	# ID line: the entry name has the form CODE_SPECIES.
	# recall: ? overrides the behaviour of a repetition quantifier so that it
	#         matches as few characters as possible
	# recall: parentheses capture parts of the matched text
	if ( /^ID   (.+)_(.+?) / ) {
		( $code, $species ) = ( $1, $2 );
	}

	# AC line: the first accession number starts a new output line.
	if ( /^AC   (.+?);/ ) {
		$table_line = $1 . "\t" . $code . "\t" . $species . "\t";
		# only the first AC line is of interest, so any further AC lines
		# are simply discarded
		while ( <> ) {
			last unless /^AC/;
		}
		last LINE unless defined $_;
	}

	# DT lines: only the LAST one of the group is used.
	if ( /^DT/ ) {
		my $date_line = $_;
		# discard all the DT lines except the last one
		while ( <> ) {
			last unless /^DT/;
			$date_line = $_;
		}
		# A list-context match yields the capture directly, so a DT line that
		# does not fit the pattern gives an empty date instead of silently
		# reusing $1 from some earlier, unrelated match.
		my ( $raw_date ) = $date_line =~ /^DT   (.+?) /;
		# use the subroutine "biodb2mysql" to convert the SWISS-PROT date
		# format into that of MySQL
		my $mysql_date = defined $raw_date ? biodb2mysql( $raw_date ) : '';
		$table_line .= $mysql_date . "\t";
		last LINE unless defined $_;
	}

	# DE lines: ALL of them are joined, separated by single spaces.
	if ( /^DE   (.+)/ ) {
		my $descr_lines = $1;
		# find and concatenate all the DE lines
		while ( <> ) {
			last unless /^DE   (.+)/;
			$descr_lines .= ' ' . $1;
		}
		$table_line .= $descr_lines . "\t";
		last LINE unless defined $_;
	}

	# SQ line: the header is stored, followed by the sequence length.
	if ( /^SQ   (.+)/ ) {
		my $header = $1;
		# the sequence length is the first number in the header; if the
		# header contains no number the length field is left empty
		my ( $length ) = $header =~ /(\d+)/;
		$length = '' unless defined $length;
		$table_line .= $header . "\t" . $length . "\t";
	}

	# Sequence data: ALL lines up to the "//" terminator are concatenated.
	if ( /^     (.+)/ ) {
		my $sequence_lines = $1;
		while ( <> ) {
			# square brackets are used as delimiters around the "//" pattern,
			# since the forward slash is the default pattern-matching delimiter
			last if m[^//];
			# a line that is not sequence data is skipped rather than causing
			# the previously captured line to be appended a second time
			$sequence_lines .= $1 if /^     (.+)/;
		}
		$table_line .= $sequence_lines;
		last LINE unless defined $_;
	}

	# End of record: write the collected line and start afresh, so that the
	# next record cannot inherit the code or species of this one.
	if ( m[^//] ) {
		print "$table_line\n";
		( $table_line, $code, $species ) = ( '', '', '' );
	}
}

sub biodb2mysql {
    #
    # Given:  a string containing a date in DD-MMM-YYYY format, for example
    #         "21-JUL-1986".  The month abbreviation may be in any letter
    #         case, and text surrounding the date is ignored.
    # Return: the date in YYYY-MM-DD format, or the empty string (after a
    #         warning on STDERR) when no valid date can be found.
    #
    # Notes:  the returned date format is supported by MySQL.
    #

    my $original = shift;

    # The table is built inside the subroutine on purpose: the subroutine is
    # defined after the code that calls it, so a file-level "my" variable
    # placed here would not yet have been initialised when it is needed.
    my %month_number_of = (
        JAN => '01', FEB => '02', MAR => '03', APR => '04',
        MAY => '05', JUN => '06', JUL => '07', AUG => '08',
        SEP => '09', OCT => '10', NOV => '11', DEC => '12',
    );

    if ( defined $original ) {
        # Matching in list context returns the captures themselves, so a
        # failed match can never pick up stale $1, $2 and $3 values left
        # behind by an earlier match in the caller.
        my ( $day, $month_name, $year ) =
            $original =~ /(\d\d)-(\w\w\w)-(\d\d\d\d)/;

        if ( defined $month_name ) {
            my $month = $month_number_of{ uc $month_name };
            return $year . '-' . $month . '-' . $day if defined $month;
        }
    }

    warn "biodb2mysql: cannot find a DD-MMM-YYYY date in '"
        . ( defined $original ? $original : '' ) . "'\n";
    return '';
}
