/** * train the tagger using the DocumentCollection in file 'trainingCollection'. * 'trainingCollection' should consist of documents which have been explicitly tagged with * part-of-speech information. */ void train(String trainingCollection) { for (int i = 0; i < posTable.length; i++) tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]}; // build ergodic HMM with one state for each POS (plus start and end states) HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class); posh.addState(startState); for (int j = 0; j < posTable.length; j++) startState.addArc(new HMMarc(posTable[j], 0)); HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class); posh.addState(endState); for (int i = 0; i < posTable.length; i++) { String pos = posTable[i]; HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class); posh.addState(state); for (int j = 0; j < posTable.length; j++) state.addArc(new HMMarc(posTable[j], 0)); state.addArc(new HMMarc("end", 0)); } posh.resolveNames(); posh.resetForTraining(); annotator = new HMMannotator(posh); annotator.setTagTable(tagTable); annotator.setBItag(false); DocumentCollection col = new DocumentCollection(trainingCollection); col.open(); for (int i = 0; i < col.size(); i++) { ExternalDocument doc = col.get(i); doc.open(); System.out.println("Training from " + doc.fileName()); // divide at endmarks (constit cat="."), adding "S" marks int posn = 0; int start = posn; Vector anns; while ((anns = doc.annotationsAt(posn, "constit")) != null) { Annotation ann = (Annotation) anns.get(0); posn = ann.span().end(); String pos = (String) ann.get("cat"); if (pos.equals(".")) { doc.annotate("S", new Span(start, posn), new FeatureSet()); start = posn; } } annotator.train(doc); // free up space taken by annotations on document doc.clearAnnotations(); } posh.computeProbabilities(); }
/**
 * Processes the single document 'docName', generating a file with in-line XML.
 *
 * <p>The text is read from {@code ACEdir + docName + ".sgm"} and the APF data from
 * {@code ACEdir + docName + "." + apfExtension}. The annotated document is saved in 'outputDir'
 * under the name {@code docName + "." + outputExtension}. The shared counter 'docCount' is
 * incremented and a progress line is printed before any file is touched.
 *
 * @param docName name of the document, without directory or extension
 */
public static void processFile(String docName) {
    docCount++;
    System.out.println("\nProcessing document " + docCount + ": " + docName);

    String sgmPath = ACEdir + docName + ".sgm";
    ExternalDocument sgmlDoc = new ExternalDocument("sgml", sgmPath);
    sgmlDoc.setAllTags(true);
    // for the 2003 and 2004 data, TURN is registered as an empty tag before opening
    boolean turnIsEmptyTag = year.equals("2003") || year.equals("2004");
    if (turnIsEmptyTag) {
        sgmlDoc.setEmptyTags(new String[] {"TURN"});
    }
    sgmlDoc.open();

    String apfPath = ACEdir + docName + "." + apfExtension;
    AceDocument apfDoc = new AceDocument(sgmPath, apfPath);
    addAnnotations(sgmlDoc, apfDoc);

    sgmlDoc.setSGMLwrapMargin(0);
    String outputName = docName + "." + outputExtension;
    sgmlDoc.saveAs(outputDir, outputName);
}
/** * process a set of documents through Jet in accordance with a Jet parameter file. Invoked by <br> * ProcessDocuments propsFile docList inputDir inputSuffix outputDir outputSuffix * * @param propsFile Jet properties file * @param docList file containing list of documents to be processed, 1 per line * @param inputDir directory containing files to be processed * @param inputSuffix file extension to be added to document name to obtain name of input file * @param outputDir directory containing output files * @param outputSuffix file extension to be added to document name to obtain name of output file */ public static void main(String[] args) throws IOException { if (args.length != 6) { System.err.println("ProcessDocuments requires 6 arguments:"); System.err.println(" propsFile docList inputDir inputSuffix outputDir outputSuffix"); System.exit(1); } String propsFile = args[0]; String docList = args[1]; String inputDir = args[2]; String inputSuffix = args[3]; String outputDir = args[4]; String outputSuffix = args[5]; // initialize Jet System.out.println("Starting ACE Jet..."); JetTest.initializeFromConfig(propsFile); // load ACE type dictionary EDTtype.readTypeDict(); // turn off traces Pat.trace = false; Resolve.trace = false; // ACE mode (provides additional antecedents ...) Resolve.ACE = true; String docName; int docCount = 0; BufferedReader docListReader = new BufferedReader(new FileReader(docList)); while ((docName = docListReader.readLine()) != null) { docCount++; String inputFile = docName + "." + inputSuffix; ExternalDocument doc = new ExternalDocument("sgml", inputDir, inputFile); doc.setAllTags(true); doc.open(); String[] types = doc.getAnnotationTypes(); doc.setSGMLwrapMargin(0); String outputFile = docName + "." 
+ outputSuffix; BufferedWriter writer = new BufferedWriter(new FileWriter(new File(outputDir, outputFile))); // process document Ace.monocase = Ace.allLowerCase(doc); Control.processDocument(doc, writer, docCount == -1, docCount); writer.close(); } }
private static void processFileList(String fileList) throws IOException { // open list of files BufferedReader reader = new BufferedReader(new FileReader(fileList)); int docCount = 0; String currentDoc; while ((currentDoc = reader.readLine()) != null) { // process file 'currentDoc' docCount++; System.out.println("\nProcessing document " + docCount + ": " + currentDoc); String textFileName = ACEdir + currentDoc + ".sgm"; ExternalDocument doc = new ExternalDocument("sgml", textFileName); doc.setAllTags(true); doc.open(); String APFfileName = ACEdir + currentDoc + apfExtension; AceDocument aceDoc = new AceDocument(textFileName, APFfileName); addMentionTags(doc, aceDoc); doc.setSGMLwrapMargin(0); doc.saveAs(outputDir, currentDoc + ".co.txt"); } reader.close(); }