#!/usr/bin/env csh
#
# tsgpipe - filter that turns tokenised text into input for the TSG.
#
# Usage:
#   cat $MD/DATA/SAMPLE/small.tok \
#     | $MD/tsgpipe > $MD/DATA/SAMPLE/small.tsg
#
# Reads from standard input and writes to standard output.
# Requires the variable MD to name the directory that contains the
# bin/, GRAM/ and RES/ subdirectories used below.
#
# The input to this pipe is the files in DATA/TOK.
# The output files are in DATA/TSG/*.tsg.
# N.B. DATA/TSG also contains *.gr files, which are the output of
# running the TSG on the DATA/TSG/*.tsg files.
#
# Differences between this and casspipe:
#
# (1) In casspipe parentheticals are marked up before tagging, but here
#     they are marked up afterwards - it is not clear which would give
#     better tagging results. In both, the parentheticals are removed
#     by the final call to xmlperl.
# (2) Lemmatisation is done here but not in casspipe, because the TSG
#     needs lemmas.
# (3) Word-internal white space is changed to - or _ before tagging
#     here, but after tagging in casspipe. (This might have given bad
#     results for casspipe, except that all words with white space have
#     a K attribute which overrides the decision made by the tagger.)
# (4) Use of the K attribute to override the tagger is done here in the
#     final xmlperl call, instead of during tagging.
#
# NOTE(review): sgdelmarkup is run twice with the same query. Presumably
# the second pass strips W markup that is still nested after the first
# pass - confirm before removing either call.
#
# Every $MD path is double-quoted so that an installation directory whose
# name contains blanks or glob characters is still passed as one word.
# No comments are placed inside the pipeline itself, because it is a
# single command continued with backslashes.

"$MD/bin/fsgmatch" -q ".*/SENTENCE" "$MD/GRAM/smallnumbers.gr" \
  | "$MD/bin/fsgmatch" -q ".*/SENTENCE" "$MD/GRAM/hyph.gr" \
  | "$MD/bin/sgdelmarkup" -q ".*/W/W" \
  | "$MD/bin/sgdelmarkup" -q ".*/W/W" \
  | "$MD/bin/xmlperl" "$MD/RES/clawstagger.rule" \
  | "$MD/bin/xmlperl" "$MD/RES/lemma.rule" \
  | "$MD/bin/fsgmatch" -q ".*/SENTENCE" "$MD/GRAM/paren.gr" \
  | "$MD/bin/xmlperl" "$MD/RES/tsg.rule" \
  | "$MD/RES/tsgstops.perl"