#!/bin/sh
#
# Run the Brill Tagger over all the files of the corpus.
#
# Since the tagger is rather slow to start, all ???.b2b files of the input
# directory are merged into one temporary file (each file's contents preceded
# by a line holding its base name), the tagger is run once over that file,
# and its output is split back into per-file results by the `splitre` helper.
#
# Usage: $0 input_directory output_directory
#   input_directory   directory holding the ???.b2b files
#   output_directory  directory handed to splitre for the tagged output
#
# `splitre` is looked up in the directory the script is invoked from.

# Both directories are required; print usage otherwise.
if [ "$#" -ne 2 ]; then
  echo "Usage: $0 input_directory output_directory" >&2
  exit 1
fi

# First argument is the directory of the .b2b files.
BROWN_DIR=$1
# Second argument is the directory for the .tag files.
TAGGEDDIR=$2
# Path of the Brill Tagger.
TAGGER_DIR=/u/cs401/Brill/Bin_and_Data

# Remember the invocation directory: it locates splitre and anchors a
# relative output directory once we have changed into TAGGER_DIR.
DIR=$(pwd) || exit 1

# Make the output directory absolute, since the tagger runs from TAGGER_DIR.
case $TAGGEDDIR in
  /*) ;;
  *) TAGGEDDIR=${DIR}/${TAGGEDDIR} ;;
esac

# Fail early rather than after the (slow) tagging step.
if [ ! -x "$DIR/splitre" ]; then
  echo "$0: cannot execute $DIR/splitre" >&2
  exit 1
fi

# Unpredictable temp file, removed on every exit path (including signals).
TEMP=$(mktemp /tmp/catbrown.XXXXXX) || exit 1
trap 'rm -f -- "$TEMP"' EXIT
trap 'exit 1' HUP INT TERM
echo "$TEMP"

# Concatenate the files, writing each file's name before its contents.
echo "Concatenating files ..."
found=0
for x in "$BROWN_DIR"/???.b2b; do
  # An unmatched glob is left as the literal pattern; skip it.
  [ -e "$x" ] || continue
  found=1
  {
    printf '%s\n' "${x##*/}"
    cat -- "$x"
  } >> "$TEMP" || exit 1
done

if [ "$found" -eq 0 ]; then
  echo "$0: no ???.b2b files found in $BROWN_DIR" >&2
  exit 1
fi

# The tagger is given its data files by relative name, so run it from there.
cd "$TAGGER_DIR" || exit 1

# Call the tagger. Its output is split back into files by splitre; the
# separator is Xnn.b2b/CD (because the tagger tags the file name too).
# NOTE(review): `-s 6000` presumably sets the chunk size the tagger works on
# (the original note speaks of chunks of about a tenth of the corpus) --
# confirm against the tagger documentation.
tagger LEXICON "$TEMP" BIGRAMS LEXICALRULEFILE.BROWN CONTEXTUALRULEFILE.BROWN -s 6000 \
  | "$DIR/splitre" "$TAGGEDDIR" '[A-R]\d\d\.b2b\/[A-Z]{2,3}'