/**
 * Provides some testing and opportunities for exploration of the probabilities of a BaseLexicon.
 * What's here currently probably only works for the English Penn Treebank, as it uses default
 * constructors. Of the words given to test on, the first is treated as sentence initial, and the
 * rest as not sentence initial.
 *
 * @param args The command line arguments:
 *     java BaseLexicon treebankPath fileRange unknownWordModel words*
 */
public static void main(String[] args) {
  if (args.length < 3) {
    System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
    return;
  }
  System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
  Treebank tb = new DiskTreebank();
  tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
  // TODO: change this interface so the lexicon creates its own indices?
  Index<String> wordIndex = new HashIndex<String>();
  Index<String> tagIndex = new HashIndex<String>();
  BaseLexicon lex = new BaseLexicon(wordIndex, tagIndex);
  lex.getUnknownWordModel().setUnknownLevel(Integer.parseInt(args[2]));
  lex.train(tb);
  System.out.println("done.");
  System.out.println();
  NumberFormat nf = NumberFormat.getNumberInstance();
  nf.setMaximumFractionDigits(4);
  List<String> impos = new ArrayList<String>();
  for (int i = 3; i < args.length; i++) {
    if (lex.isKnown(args[i])) {
      System.out.println(args[i] + " is a known word. Log probabilities [log P(w|t)] for its taggings are:");
      for (Iterator<IntTaggedWord> it = lex.ruleIteratorByWord(wordIndex.indexOf(args[i], true), i - 3, null); it.hasNext(); ) {
        IntTaggedWord iTW = it.next();
        System.out.println(StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word))));
      }
    } else {
      String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
      System.out.println(args[i] + " is an unknown word. Signature with uwm "
          + lex.getUnknownWordModel().getUnknownLevel()
          + ((i == 3) ? " init" : " non-init") + " is: " + sig);
      impos.clear();
      List<String> lis = new ArrayList<String>(tagIndex.objectsList());
      Collections.sort(lis);
      for (String tStr : lis) {
        IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
        double score = lex.score(iTW, 1, args[i]);
        if (score == Float.NEGATIVE_INFINITY) {
          impos.add(tStr);
        } else {
          System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
        }
      }
      if (impos.size() > 0) {
        System.out.println(args[i] + " impossible tags: " + impos);
      }
    }
    System.out.println();
  }
}
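An example invocation might look like the line below; the treebank path, file range, unknown-word level, and test words are all hypothetical, and the class is assumed to be run by its fully qualified name with the parser classes on the classpath:

// java edu.stanford.nlp.parser.lexparser.BaseLexicon /path/to/treebank 200-270 5 The stock market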
@Override
public void finishTraining() {
  lex.finishTraining();
  int numTags = tagIndex.size();
  POSes = new HashSet<String>(tagIndex.objectsList());
  initialPOSDist = Distribution.laplaceSmoothedDistribution(initial, numTags, 0.5);
  markovPOSDists = new HashMap<String, Distribution>();
  Set entries = ruleCounter.lowestLevelCounterEntrySet();
  for (Iterator iter = entries.iterator(); iter.hasNext(); ) {
    Map.Entry entry = (Map.Entry) iter.next();
    // Map.Entry<List<String>, Counter> entry = (Map.Entry<List<String>, Counter>) iter.next();
    Distribution d = Distribution.laplaceSmoothedDistribution((ClassicCounter) entry.getValue(), numTags, 0.5);
    markovPOSDists.put(((List<String>) entry.getKey()).get(0), d);
  }
}
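For reference, here is a minimal, standalone sketch of the Laplace smoothing used above. It assumes edu.stanford.nlp.stats is on the classpath; the tag counts and the tag-set size of 10 are made up.

import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Distribution;

public class SmoothingDemo {
  public static void main(String[] args) {
    // Hypothetical tag counts observed in training.
    ClassicCounter<String> tagCounts = new ClassicCounter<String>();
    tagCounts.incrementCount("NN", 7.0);
    tagCounts.incrementCount("VB", 3.0);
    // Smooth over a (hypothetical) tag set of 10 tags with lambda = 0.5,
    // mirroring laplaceSmoothedDistribution(counter, numTags, 0.5) above.
    Distribution<String> dist = Distribution.laplaceSmoothedDistribution(tagCounts, 10, 0.5);
    System.out.println("P(NN) = " + dist.probabilityOf("NN"));  // (7 + 0.5) / (10 + 10 * 0.5) = 0.5
    System.out.println("P(JJ) = " + dist.probabilityOf("JJ"));  // unseen tag gets a share of the reserved mass
  }
}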
@Override
public Collection<L> labels() {
  return labelIndex.objectsList();
}
public static <L, F> OneVsAllClassifier<L, F> train(
    ClassifierFactory<String, F, Classifier<String, F>> classifierFactory, GeneralDataset<L, F> dataset) {
  Index<L> labelIndex = dataset.labelIndex();
  return train(classifierFactory, dataset, labelIndex.objectsList());
}
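A minimal usage sketch of the factory method above, not part of this class: it assumes Stanford's Dataset, BasicDatum, and LinearClassifierFactory are available and that LinearClassifierFactory satisfies the ClassifierFactory<String, F, Classifier<String, F>> bound; the toy features and labels are invented.

import edu.stanford.nlp.classify.Dataset;
import edu.stanford.nlp.classify.LinearClassifierFactory;
import edu.stanford.nlp.classify.OneVsAllClassifier;
import edu.stanford.nlp.ling.BasicDatum;
import java.util.Arrays;

public class OneVsAllDemo {
  public static void main(String[] args) {
    // A tiny three-class dataset with string features and string labels.
    Dataset<String, String> dataset = new Dataset<String, String>();
    dataset.add(new BasicDatum<String, String>(Arrays.asList("furry", "meows"), "cat"));
    dataset.add(new BasicDatum<String, String>(Arrays.asList("furry", "barks"), "dog"));
    dataset.add(new BasicDatum<String, String>(Arrays.asList("scaly", "swims"), "fish"));

    // One binary classifier is trained per label (positive vs. negative examples).
    OneVsAllClassifier<String, String> classifier =
        OneVsAllClassifier.train(new LinearClassifierFactory<String, String>(), dataset);

    // Print the predicted label for an unlabeled datum.
    System.out.println(classifier.classOf(
        new BasicDatum<String, String>(Arrays.asList("furry", "meows"))));
  }
}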