#!/bin/sh

# Run the Brill Tagger through all the files of the corpus.
# Since it is rather slow, merge them all together in one file, then cut
# that out in chunks of about a tenth of the corpus.

# Require exactly two arguments (input directory and output directory).
# Anything else prints the usage message on stderr and exits with status 1.
# A single argument is rejected as well: the output directory ($2) is
# mandatory for the rest of the script.
if [ "$#" -ne 2 ]; then
    echo "Usage: $0 input_directory output_directory" >&2
    exit 1
fi

# First argument: directory holding the ???.b2b corpus files.
INPUT_DIR=$1

# Second argument: directory for the .tag files.
TAGGEDDIR=$2

# Path to the Brill Tagger
TAGGER_DIR=/u/cs401/Brill/Bin_and_Data

# Scratch file that receives the concatenated corpus.  It is created with
# mktemp (unpredictable name, fails if the file cannot be created) and kept
# under /tmp so the path stays absolute after the cd below.
TEMP=$(mktemp /tmp/catbrown.XXXXXX) || {
    echo "$0: cannot create temporary file" >&2
    exit 1
}
# Remove the scratch file on every exit path; turn the usual termination
# signals into a normal exit so that the EXIT trap runs for them too.
trap 'rm -f -- "$TEMP"' EXIT
trap 'exit 1' HUP INT TERM
echo "$TEMP"

# Concatenate the files, writing each file's name on its own line before
# its contents.
echo "Concatenating files ..."
found=0
for x in "$INPUT_DIR"/???.b2b; do
    # An unmatched glob is left as the literal pattern; skip it.
    [ -f "$x" ] || continue
    found=1
    printf '%s\n' "${x##*/}" >> "$TEMP"
    cat -- "$x" >> "$TEMP"
done
if [ "$found" -eq 0 ]; then
    echo "$0: no ???.b2b files found in $INPUT_DIR" >&2
    exit 1
fi

# Save the current directory: it is used to locate splitre and to make a
# relative output directory absolute before changing directory.
DIR=$(pwd)
case $TAGGEDDIR in
    /*) ;;
    *) TAGGEDDIR=$DIR/$TAGGEDDIR ;;
esac

cd "$TAGGER_DIR" || {
    echo "$0: cannot cd to $TAGGER_DIR" >&2
    exit 1
}

# Call the tagger.
# NOTE(review): `tagger` is looked up through PATH; presumably PATH contains
# "." or the tagger directory -- confirm.
tagger LEXICON "$TEMP" BIGRAMS LEXICALRULEFILE.BROWN CONTEXTUALRULEFILE.BROWN -s 6000 \
    | "$DIR/splitre" "$TAGGEDDIR" '[A-R]\d\d\.b2b\/[A-Z]{2,3}'

# output of tagger is split back into files by splitre
# separator is Xnn.b2b/CD (because the tagger tags the name too)

# The scratch file is removed by the EXIT trap; the script's exit status is
# that of the pipeline above.