#!/usr/local/bin/perl -w
#
#                                        David MacKay  Dec 97
# huffman.p - extracted from frequency2.p - writes a huffman code
# 
# input: to standard input, or in a file, should be something like this:
#
# a 10
# b 7
# c 3
# d 2
#
# or
#
# # Comment lines are allowed
# X 0.3
# Y 0.4
# Z 0.3
#
# The corresponding output will be the codewords:
# 
# a 0
# b 10
# c 110
# d 111
#
# The input and output formats can be varied a little.
# input:
$dc = 0 ;   # how far from right column to go to get the counts
$offset = 0 ; # whether to add extra offset to each count.
#
# the first entry in each row is used as the name of the symbol
#
# formatting of output:
$latex=0 ; # change to 1 to change all these
$openbracket = ":\n" ;
$endline = "" ;
$realendline = "\n" ;
$comma = " " ;
$closebracket = "" ;
#
$verbose = 0 ;
#
# accept commandline modifications of the above variables.
#
eval "\$$1=\$2" while ($#ARGV >= 0) && $ARGV[0]=~ /^(\w+)=(.*)/ && shift ;
#
if ( $latex ) {
    $openbracket = ":\n" ;
    $endline = "\t".'\\\\' ;
    $realendline = "\n" ;
    $comma = " & " ;
    $closebracket = "" ;
}
#
$endline = $endline.$realendline ; 
#
#  the start.
#
$cols = 1 ;
$r = 0 ;    # number of rows
$Count = 0 ; 
READING: while ( <> ) 
{
    if ( /^\s*\#/ ) { next READING ; } # ignore comments
    @b = split ;
    if ( $r == 0 ) {
	$cols = $#b ;
    } elsif ( ($#b != $cols) ) {
	print STDERR "Warning, row $r odd\n" ;
    }
    
    $f = $b[0] ;
    $count = $b[$#b - $dc] ;
    $count += $offset ;
    if ( $verbose >= 1 ) {    print "$f : $count\n" ; }
    $pr{$f} =  $count ;
    $arr[$r] = $f ;
    $c[$r] = $count ;   # this is destroyed by the huffman algm
    $ccc[$r] = $count ; # this copppy of c is retained
    $r ++ ;
    $Count += $count ; 
}		
$M = $r ;		
if ($M <= 1) { print "too few symbols\n" ; exit(0) ; }

#
# make huffman code
#

@sorted = sort bypr @arr ;
print "sorted characters\n" ;
print join(' ',@sorted) , "\n" ;

sub bypr {
    $pr{$a} <=> $pr{$b} ;
}
sub bynumber { $a <=> $b; }

print "\n" ;
# make huffman tree
&iterate ( $M ) ;
# ff and ss now contain lists of branching instructions

# now deduce the codewords. Start from the root with the null codeword.
$codes[0] = "" ;
$lengths[0] = 0 ; 
for ( $mm = 2 ; $mm <= $M ; $mm ++ ) {
    $f = $ff[$mm] ;
    $s = $ss[$mm] ;
    $co = $codes[$f] ; # this is the one we will expand
    $le = $lengths[$f] ;
    # move down everyone below s
    for ( $m = $mm - 1 ; $m > $s ; $m -- ) {
	$codes[$m] = $codes[$m-1] ;
	$lengths[$m] = $lengths[$m-1] ;
    }
    $codes[$f] = $co."0" ;
    $codes[$s] = $co."1" ;
    $lengths[$f] = $le +  1 ;
    $lengths[$s] = $le +  1 ;
}
# write the answer
print " codewords$openbracket" ;
if ( $latex ) {
    print '\begin{tabular}{clrrl} \hline' ;
    print "\n" ; 
    print '$a_i$  & $p_i$  & \multicolumn{1}{c}{$\log_2 \frac{1}{p_i}$}  & $l_i$ & $c(a_i)$ \\\\[0.1in] \hline ' ; 
    print "\n" ; 

}
$L = 0.0 ;  $H = 0.0 ; 
for ( $mm = 0 ; $mm < $M ; $mm ++ ) {
    $prob[$mm] =  1.0 * $ccc[$mm] / $Count ;
    if ( $ccc[$mm] > 0.0 ) {
	$logp = log($prob[$mm])*1.0/log(2.0) ;
	$H +=  $ccc[$mm] * log( 1.0 * $Count / $ccc[$mm] ) ;
    } else {
	print STDERR "WARNING ccc[$mm] = $ccc[$mm]\n" ;
	$logp = 0.0 ;
    }
    if ( $latex ) {
	$arr[$mm] = '{\tt '.$arr[$mm].'}' ;
	$codes[$mm] = '{\tt '.$codes[$mm].'}' ;
    }
			   
    print $arr[$mm] , "\t$comma" ;
    printf "%-10.4g %s %-6.1f %s %2d" , $prob[$mm] , $comma , -$logp , $comma , $lengths[$mm] ; 
    print "$comma" , $codes[$mm] , "$endline" ;
    $L += 1.0 * $lengths[$mm] * $ccc[$mm] ;
}
if ( $latex ) {
    print ' \hline' . "\n" ;
    print '\end{tabular}'."\n" ; 
}
print "$closebracket" ;
$L = 1.0 * $L / $Count ;
$H = $H / ( $Count * log(2.0) ) ;
printf "total count %10.5g\n" , $Count ;
printf "expected length %10.5g\n" , $L ;
printf "entropy %10.5g\n" , $H ; 
printf "length / entropy %10.5g\n" , $L/ $H ; 

# end

sub iterate { #usage  ( $m ) 
    local ( $m ) = @_ ;
    if ( $verbose >= 1 ) {     print "iterating, m=$m\n" ; 
			       print join ( ' ' , @c ) , "\n" ;
			   }
    # this is not the most elegant huffman algorithm!
    # run through the array, finding the smallest two entries
    @sorted = sort bynumber @c[0..$m-1] ;
    $th = $sorted[ 1 ] ;
    $bot = $sorted[ 0 ] ;
				# find the first that is below $bot
    for ( $mm = 0 , $found = 0 ; $mm < $m ; $mm ++ ) {
	if ( $c[$mm] <= $bot ) {
	    $second = $mm ; $found ++ ;
	    last ;
	}
    }
				# find the first that is below th:
    for ( $mm = 0 ; $mm < $m ; $mm ++ ) {
	if ( ($second != $mm) && ($c[$mm] <= $th) ) {
	    $first = $mm ; $found ++ ;
	    last ;
	}
    }
    if ( $found != 2 ) {
	print STDERR "warning didn't find 2! threshold = !\n" ;
    }
    if ( $first > $second ) {
	$tmp = $first ;
	$first = $second ; 
	$second = $tmp ;
    }
    # throw the weight of the 2nd into the 1st
    $c[$first] += $c[$second] ;
    # move everyone else up the list
    for ( $mm = $second ; $mm < $m-1 ; $mm ++ ) {
	$c[$mm] = $c[$mm+1] ;
    }
    # destroy the last entry
    $c[$mm] = "" ;
    # record what has been done
    $ff[$m] = $first ;
    $ss[$m] = $second ;
    if ( $verbose >= 1 ) {     print "combining " , $first, " & " , $second , "\n" ; }
    if ( $m > 2 ) {
	&iterate ( $m-1 ) ;
    }
}


