/** * Creates annotations for each node in parse tree <CODE>node</NODE>. * These annotations are added to the parse tree and to the document * <CODE>doc</CODE>. In constrast to <CODE>setAnnotations</CODE>, * the categories used for terminal nodes are Jet categories obtained by * Jet tokenization and lexical look-up. This means that hyphenated * items are split, and multi-word names are reduced to a single node. * * @param node the root of the parse tree * @param treeSpan the span of the document matching the parse tree * @param doc the document to which annotations will be added */ private void setJetAnnotations(ParseTreeNode node, Span treeSpan, Document doc) { StatParser.buildParserInput(doc, treeSpan.start(), treeSpan.end(), false); StatParser.fixHyphenatedItems(doc); int nameConstitEnd = -1; List<ParseTreeNode> terminals = getTerminalNodes(node); for (ParseTreeNode terminal : terminals) { int terminalEnd = terminal.end; // is there a 'name' constituent or 'hyphword' constituent here? Vector<Annotation> constits = doc.annotationsAt(terminal.start, "constit"); Annotation constit = null; Annotation nameConstit = null; Annotation hyphword = null; if (constits != null) { for (Annotation c : constits) { if (c.get("cat") == "name") { nameConstit = c; } else if (c.get("cat") == "hyphword") { hyphword = c; } if (constit == null) constit = c; } } if (hyphword != null) { nameConstit = null; constit = hyphword; } // if there is a name which is not part of a hyphword, associate the // name with this (first) terminal node, and mark any remaining terminal // nodes which match tokens in the name as empty if (nameConstit != null) { terminal.end = nameConstit.end(); terminal.ann = nameConstit; nameConstitEnd = nameConstit.end(); } else if (nameConstitEnd >= 0) { terminal.word = null; } else { Span span = new Span(terminal.start, terminal.end); String pennPOS = ((String) terminal.category).toUpperCase().intern(); String word = terminal.word; terminal.ann = StatParser.buildWordDefn(doc, word, span, constit, pennPOS); } if (nameConstitEnd == terminalEnd) nameConstitEnd = -1; } // prune parse tree: remove a node if it has no word or children pruneTree(node); determineNonTerminalSpans(node, treeSpan.start()); // add head links if (hr == null) hr = HeadRule.createDefaultRule(); hr.apply(node); // add annotations for non-terminals: Jet.Parser.ParseTreeNode.makeParseAnnotations(doc, node); }
/** * train the tagger using the DocumentCollection in file 'trainingCollection'. * 'trainingCollection' should consist of documents which have been explicitly tagged with * part-of-speech information. */ void train(String trainingCollection) { for (int i = 0; i < posTable.length; i++) tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]}; // build ergodic HMM with one state for each POS (plus start and end states) HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class); posh.addState(startState); for (int j = 0; j < posTable.length; j++) startState.addArc(new HMMarc(posTable[j], 0)); HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class); posh.addState(endState); for (int i = 0; i < posTable.length; i++) { String pos = posTable[i]; HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class); posh.addState(state); for (int j = 0; j < posTable.length; j++) state.addArc(new HMMarc(posTable[j], 0)); state.addArc(new HMMarc("end", 0)); } posh.resolveNames(); posh.resetForTraining(); annotator = new HMMannotator(posh); annotator.setTagTable(tagTable); annotator.setBItag(false); DocumentCollection col = new DocumentCollection(trainingCollection); col.open(); for (int i = 0; i < col.size(); i++) { ExternalDocument doc = col.get(i); doc.open(); System.out.println("Training from " + doc.fileName()); // divide at endmarks (constit cat="."), adding "S" marks int posn = 0; int start = posn; Vector anns; while ((anns = doc.annotationsAt(posn, "constit")) != null) { Annotation ann = (Annotation) anns.get(0); posn = ann.span().end(); String pos = (String) ann.get("cat"); if (pos.equals(".")) { doc.annotate("S", new Span(start, posn), new FeatureSet()); start = posn; } } annotator.train(doc); // free up space taken by annotations on document doc.clearAnnotations(); } posh.computeProbabilities(); }