예제 #1
0
 /**
  * Provides some testing and opportunities for exploration of the probabilities of a BaseLexicon.
  * What's here currently probably only works for the English Penn Treebank, as it uses default
  * constructors. Of the words given to test on, the first is treated as sentence initial, and the
  * rest as not sentence initial.
  *
  * <p>For every test word this prints either the scores of its taggings (known word) or its
  * unknown-word signature followed by the score of the word under every tag in the tag index;
  * tags whose score is negative infinity are collected on a separate "impossible tags" line.
  *
  * @param args The command line arguments: java BaseLexicon treebankPath fileRange
  *     unknownWordModel words*
  */
 public static void main(String[] args) {
   if (args.length < 3) {
     System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
     return;
   }
   System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
   Treebank tb = new DiskTreebank();
   tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
   // TODO: change this interface so the lexicon creates its own indices?
   Index<String> wordIndex = new HashIndex<String>();
   Index<String> tagIndex = new HashIndex<String>();
   BaseLexicon lex = new BaseLexicon(wordIndex, tagIndex);
   lex.getUnknownWordModel().setUnknownLevel(Integer.parseInt(args[2]));
   lex.train(tb);
   System.out.println("done.");
   System.out.println();
   NumberFormat nf = NumberFormat.getNumberInstance();
   nf.setMaximumFractionDigits(4);
   List<String> impos = new ArrayList<String>();
   for (int i = 3; i < args.length; i++) {
     String word = args[i];
     // Position of the word among the test words: 0 (sentence initial) for the first one.
     int loc = i - 3;
     if (lex.isKnown(word)) {
       System.out.println(
           word + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
       for (Iterator<IntTaggedWord> it =
               lex.ruleIteratorByWord(wordIndex.indexOf(word, true), loc, null);
           it.hasNext(); ) {
         IntTaggedWord iTW = it.next();
         System.out.println(
             StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, loc, wordIndex.get(iTW.word))));
       }
     } else {
       String sig = lex.getUnknownWordModel().getSignature(word, loc);
       System.out.println(
           word
               + " is an unknown word.  Signature with uwm "
               + lex.getUnknownWordModel().getUnknownLevel()
               + ((loc == 0) ? " init" : " non-init")
               + " is: "
               + sig);
       impos.clear();
       // Sort the tags so the output order is stable across runs.
       List<String> lis = new ArrayList<String>(tagIndex.objectsList());
       Collections.sort(lis);
       for (String tStr : lis) {
         IntTaggedWord iTW = new IntTaggedWord(word, tStr, wordIndex, tagIndex);
         // Score at the same position the signature above was computed for, so that the
         // printed scores correspond to the printed signature.
         double score = lex.score(iTW, loc, word);
         if (score == Float.NEGATIVE_INFINITY) {
           impos.add(tStr);
         } else {
           System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
         }
       }
       if (!impos.isEmpty()) {
         System.out.println(word + " impossible tags: " + impos);
       }
     }
     System.out.println();
   }
 }
  @Override
  public void finishTraining() {
    lex.finishTraining();

    int numTags = tagIndex.size();
    POSes = new HashSet<String>(tagIndex.objectsList());
    initialPOSDist = Distribution.laplaceSmoothedDistribution(initial, numTags, 0.5);
    markovPOSDists = new HashMap<String, Distribution>();
    Set entries = ruleCounter.lowestLevelCounterEntrySet();
    for (Iterator iter = entries.iterator(); iter.hasNext(); ) {
      Map.Entry entry = (Map.Entry) iter.next();
      //      Map.Entry<List<String>, Counter> entry = (Map.Entry<List<String>, Counter>)
      // iter.next();
      Distribution d =
          Distribution.laplaceSmoothedDistribution((ClassicCounter) entry.getValue(), numTags, 0.5);
      markovPOSDists.put(((List<String>) entry.getKey()).get(0), d);
    }
  }
예제 #3
0
 /**
  * Returns the labels held by this object's label index.
  *
  * @return the result of {@code labelIndex.objectsList()}, handed back as-is (no copy is made
  *     here)
  */
 @Override
 public Collection<L> labels() {
   final Collection<L> indexedLabels = labelIndex.objectsList();
   return indexedLabels;
 }
예제 #4
0
 /**
  * Trains a one-vs-all classifier over every label in the dataset's label index.
  *
  * <p>Convenience overload: forwards to the three-argument {@code train}, passing
  * {@code dataset.labelIndex().objectsList()} as the labels to train for.
  *
  * @param classifierFactory factory handed unchanged to the three-argument overload
  * @param dataset the training data; its label index supplies the labels
  * @return whatever the three-argument {@code train} overload returns
  */
 public static <L, F> OneVsAllClassifier<L, F> train(
     ClassifierFactory<String, F, Classifier<String, F>> classifierFactory,
     GeneralDataset<L, F> dataset) {
   return train(classifierFactory, dataset, dataset.labelIndex().objectsList());
 }