#!/local/bin/perl -w
# Split a file or stream coming in, saving each part in a separate file.
# Command line parameters:
#   output_directory: where to save the output
#   pattern:          regular expression for the file separator
#                     (must match the whole separator line)
# Saves each part using the three first characters of the separator,
# i.e. as "<output_directory>/<first three characters>.tag".
# This is intended to split out a Brown corpus that has been merged with a
# single line marking the name of the file at the beginning of each file
#
# Note: any text coming before the first file separator is dumped to
# STDERR

#use strict "vars";
use strict;
use warnings;

# Print a usage message on STDERR and terminate the script.
# Exits with a non-zero status so that callers can detect the misuse.
sub PrintUsageAndExit {
    print STDERR "Usage: $0 output_dir file_separator\n";
    exit 1;
}

# Exactly two arguments are required: the output directory and the pattern.
if ( @ARGV != 2 ) {
    PrintUsageAndExit();
}

# Remove both arguments from @ARGV so that the <> operator below reads
# STDIN instead of trying to open the arguments as input files.
my ( $outdir, $separator ) = splice @ARGV, 0, 2;

# Make STDERR the default output handle and turn on autoflush for it.
select STDERR;
$| = 1;

print STDERR "outdir $outdir\nseparator $separator\n";

# Compile the separator once; the pattern must cover the whole line,
# optionally followed by trailing whitespace (including the newline).
# An invalid pattern is reported here, before any input is consumed.
my $separator_re = qr/\A($separator)\s*\z/;

# Until the first separator is seen, output goes to a duplicate of STDERR.
my $out_name = 'STDERR';
open my $out, '>&', \*STDERR
    or die "Can't dup STDERR: $!\n";

while ( my $line = <> ) {
    if ( $line =~ $separator_re ) {

        # A separator line starts a new part: finish the current output
        # and open the file named after the separator's first three
        # characters.  The separator line itself is not copied.
        close $out
            or die "Can't close $out_name: $!\n";
        $out_name = "$outdir/" . substr( $1, 0, 3 ) . '.tag';

        # Three-argument open: the file name is never parsed for a mode.
        open $out, '>', $out_name
            or die "Can't open $out_name: $!\n";
        next;
    }
    print {$out} $line
        or die "Can't write to $out_name: $!\n";
}

# Buffered write errors only surface at close, so check the last one too.
close $out
    or die "Can't close $out_name: $!\n";