コード例 #1
0
 private NaiveBayesClassifier<L, F> trainClassifier(
     int[][] data,
     int[] labels,
     int numFeatures,
     int numClasses,
     Index<L> labelIndex,
     Index<F> featureIndex) {
   Set<L> labelSet = Generics.newHashSet();
   NBWeights nbWeights = trainWeights(data, labels, numFeatures, numClasses);
   Counter<L> priors = new ClassicCounter<L>();
   double[] pr = nbWeights.priors;
   for (int i = 0; i < pr.length; i++) {
     priors.incrementCount(labelIndex.get(i), pr[i]);
     labelSet.add(labelIndex.get(i));
   }
   Counter<Pair<Pair<L, F>, Number>> weightsCounter =
       new ClassicCounter<Pair<Pair<L, F>, Number>>();
   double[][][] wts = nbWeights.weights;
   for (int c = 0; c < numClasses; c++) {
     L label = labelIndex.get(c);
     for (int f = 0; f < numFeatures; f++) {
       F feature = featureIndex.get(f);
       Pair<L, F> p = new Pair<L, F>(label, feature);
       for (int val = 0; val < wts[c][f].length; val++) {
         Pair<Pair<L, F>, Number> key = new Pair<Pair<L, F>, Number>(p, Integer.valueOf(val));
         weightsCounter.incrementCount(key, wts[c][f][val]);
       }
     }
   }
   return new NaiveBayesClassifier<L, F>(weightsCounter, priors, labelSet);
 }
コード例 #2
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
  /**
   * Evaluates how many words (= terminals) in a collection of trees are covered by the lexicon.
   * First arg is the collection of trees; second through fourth args get the results. Currently
   * unused; this probably only works if train and test at same time so tags and words variables are
   * initialized.
   */
  public double evaluateCoverage(
      Collection<Tree> trees,
      Set<String> missingWords,
      Set<String> missingTags,
      Set<IntTaggedWord> missingTW) {

    List<IntTaggedWord> iTW1 = new ArrayList<IntTaggedWord>();
    for (Tree t : trees) {
      iTW1.addAll(treeToEvents(t));
    }

    int total = 0;
    int unseen = 0;

    for (IntTaggedWord itw : iTW1) {
      total++;
      if (!words.contains(new IntTaggedWord(itw.word(), nullTag))) {
        missingWords.add(wordIndex.get(itw.word()));
      }
      if (!tags.contains(new IntTaggedWord(nullWord, itw.tag()))) {
        missingTags.add(tagIndex.get(itw.tag()));
      }
      // if (!rules.contains(itw)) {
      if (seenCounter.getCount(itw) == 0.0) {
        unseen++;
        missingTW.add(itw);
      }
    }
    return (double) unseen / total;
  }
コード例 #3
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
 /**
  * Generate the possible taggings for a word at a sentence position. This may either be based on a
  * strict lexicon or an expanded generous set of possible taggings.
  *
  * <p><i>Implementation note:</i> Expanded sets of possible taggings are calculated dynamically at
  * runtime, so as to reduce the memory used by the lexicon (a space/time tradeoff).
  *
  * @param word The word (as an int)
  * @param loc Its index in the sentence (usually only relevant for unknown words)
  * @return A list of possible taggings
  */
 public Iterator<IntTaggedWord> ruleIteratorByWord(int word, int loc, String featureSpec) {
   // if (rulesWithWord == null) { // tested in isKnown already
   // initRulesWithWord();
   // }
   List<IntTaggedWord> wordTaggings;
   if (isKnown(word)) {
     if (!flexiTag) {
       // Strict lexical tagging for seen items
       wordTaggings = rulesWithWord[word];
     } else {
       /* Allow all tags with same basicCategory */
       /* Allow all scored taggings, unless very common */
       IntTaggedWord iW = new IntTaggedWord(word, nullTag);
       if (seenCounter.getCount(iW) > smoothInUnknownsThreshold) {
         return rulesWithWord[word].iterator();
       } else {
         // give it flexible tagging not just lexicon
         wordTaggings = new ArrayList<IntTaggedWord>(40);
         for (IntTaggedWord iTW2 : tags) {
           IntTaggedWord iTW = new IntTaggedWord(word, iTW2.tag);
           if (score(iTW, loc, wordIndex.get(word)) > Float.NEGATIVE_INFINITY) {
             wordTaggings.add(iTW);
           }
         }
       }
     }
   } else {
     // we copy list so we can insert correct word in each item
     wordTaggings = new ArrayList<IntTaggedWord>(40);
     for (IntTaggedWord iTW : rulesWithWord[wordIndex.indexOf(UNKNOWN_WORD)]) {
       wordTaggings.add(new IntTaggedWord(word, iTW.tag));
     }
   }
   if (DEBUG_LEXICON) {
     EncodingPrintWriter.err.println(
         "Lexicon: "
             + wordIndex.get(word)
             + " ("
             + (isKnown(word) ? "known" : "unknown")
             + ", loc="
             + loc
             + ", n="
             + (isKnown(word) ? word : wordIndex.indexOf(UNKNOWN_WORD))
             + ") "
             + (flexiTag ? "flexi" : "lexicon")
             + " taggings: "
             + wordTaggings,
         "UTF-8");
   }
   return wordTaggings.iterator();
 }
コード例 #4
0
ファイル: SimpleSequence.java プロジェクト: wentaouc/phrasal
 public SimpleSequence(int[] intElements, Index<T> index) {
   elements = new Object[intElements.length];
   for (int i = 0; i < intElements.length; i++) {
     elements[i] = index.get(intElements[i]);
   }
   start = 0;
   end = intElements.length;
 }
コード例 #5
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
 /**
  * Provides some testing and opportunities for exploration of the probabilities of a BaseLexicon.
  * What's here currently probably only works for the English Penn Treeebank, as it uses default
  * constructors. Of the words given to test on, the first is treated as sentence initial, and the
  * rest as not sentence initial.
  *
  * @param args The command line arguments: java BaseLexicon treebankPath fileRange
  *     unknownWordModel words*
  */
 public static void main(String[] args) {
   if (args.length < 3) {
     System.err.println("java BaseLexicon treebankPath fileRange unknownWordModel words*");
     return;
   }
   System.out.print("Training BaseLexicon from " + args[0] + ' ' + args[1] + " ... ");
   Treebank tb = new DiskTreebank();
   tb.loadPath(args[0], new NumberRangesFileFilter(args[1], true));
   // TODO: change this interface so the lexicon creates its own indices?
   Index<String> wordIndex = new HashIndex<String>();
   Index<String> tagIndex = new HashIndex<String>();
   BaseLexicon lex = new BaseLexicon(wordIndex, tagIndex);
   lex.getUnknownWordModel().setUnknownLevel(Integer.parseInt(args[2]));
   lex.train(tb);
   System.out.println("done.");
   System.out.println();
   NumberFormat nf = NumberFormat.getNumberInstance();
   nf.setMaximumFractionDigits(4);
   List<String> impos = new ArrayList<String>();
   for (int i = 3; i < args.length; i++) {
     if (lex.isKnown(args[i])) {
       System.out.println(
           args[i] + " is a known word.  Log probabilities [log P(w|t)] for its taggings are:");
       for (Iterator<IntTaggedWord> it =
               lex.ruleIteratorByWord(wordIndex.indexOf(args[i], true), i - 3, null);
           it.hasNext(); ) {
         IntTaggedWord iTW = it.next();
         System.out.println(
             StringUtils.pad(iTW, 24) + nf.format(lex.score(iTW, i - 3, wordIndex.get(iTW.word))));
       }
     } else {
       String sig = lex.getUnknownWordModel().getSignature(args[i], i - 3);
       System.out.println(
           args[i]
               + " is an unknown word.  Signature with uwm "
               + lex.getUnknownWordModel().getUnknownLevel()
               + ((i == 3) ? " init" : "non-init")
               + " is: "
               + sig);
       impos.clear();
       List<String> lis = new ArrayList<String>(tagIndex.objectsList());
       Collections.sort(lis);
       for (String tStr : lis) {
         IntTaggedWord iTW = new IntTaggedWord(args[i], tStr, wordIndex, tagIndex);
         double score = lex.score(iTW, 1, args[i]);
         if (score == Float.NEGATIVE_INFINITY) {
           impos.add(tStr);
         } else {
           System.out.println(StringUtils.pad(iTW, 24) + nf.format(score));
         }
       }
       if (impos.size() > 0) {
         System.out.println(args[i] + " impossible tags: " + impos);
       }
     }
     System.out.println();
   }
 }
コード例 #6
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
 private void populateTagsToBaseTags(TreebankLanguagePack tlp) {
   int total = tagIndex.size();
   tagsToBaseTags = new int[total];
   for (int i = 0; i < total; i++) {
     String tag = tagIndex.get(i);
     String baseTag = tlp.basicCategory(tag);
     int j = tagIndex.indexOf(baseTag, true);
     tagsToBaseTags[i] = j;
   }
 }
コード例 #7
0
 private short tagProject(short tag) {
   if (smoothTPIndex == null) {
     smoothTPIndex = new HashIndex<String>(tagIndex);
   }
   if (tag < 0) {
     return tag;
   } else {
     String tagStr = smoothTPIndex.get(tag);
     String binStr = TP_PREFIX + smoothTP.project(tagStr);
     return (short) smoothTPIndex.indexOf(binStr, true);
   }
 }
コード例 #8
0
  public <F> double score(Classifier<L, F> classifier, GeneralDataset<L, F> data) {

    List<L> guesses = new ArrayList<L>();
    List<L> labels = new ArrayList<L>();

    for (int i = 0; i < data.size(); i++) {
      Datum<L, F> d = data.getRVFDatum(i);
      L guess = classifier.classOf(d);
      guesses.add(guess);
    }

    int[] labelsArr = data.getLabelsArray();
    labelIndex = data.labelIndex;
    for (int i = 0; i < data.size(); i++) {
      labels.add(labelIndex.get(labelsArr[i]));
    }

    labelIndex = new HashIndex<L>();
    labelIndex.addAll(data.labelIndex().objectsList());
    labelIndex.addAll(classifier.labels());

    int numClasses = labelIndex.size();
    tpCount = new int[numClasses];
    fpCount = new int[numClasses];
    fnCount = new int[numClasses];

    negIndex = labelIndex.indexOf(negLabel);

    for (int i = 0; i < guesses.size(); ++i) {
      L guess = guesses.get(i);
      int guessIndex = labelIndex.indexOf(guess);
      L label = labels.get(i);
      int trueIndex = labelIndex.indexOf(label);

      if (guessIndex == trueIndex) {
        if (guessIndex != negIndex) {
          tpCount[guessIndex]++;
        }
      } else {
        if (guessIndex != negIndex) {
          fpCount[guessIndex]++;
        }
        if (trueIndex != negIndex) {
          fnCount[trueIndex]++;
        }
      }
    }

    return getFMeasure();
  }
コード例 #9
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
  /** Print some statistics about this lexicon. */
  public void printLexStats() {
    System.out.println("BaseLexicon statistics");
    System.out.println("unknownLevel is " + getUnknownWordModel().getUnknownLevel());
    // System.out.println("Rules size: " + rules.size());
    System.out.println("Sum of rulesWithWord: " + numRules());
    System.out.println("Tags size: " + tags.size());
    int wsize = words.size();
    System.out.println("Words size: " + wsize);
    // System.out.println("Unseen Sigs size: " + sigs.size() +
    // " [number of unknown equivalence classes]");
    System.out.println(
        "rulesWithWord length: "
            + rulesWithWord.length
            + " [should be sum of words + unknown sigs]");
    int[] lengths = new int[STATS_BINS];
    ArrayList<String>[] wArr = new ArrayList[STATS_BINS];
    for (int j = 0; j < STATS_BINS; j++) {
      wArr[j] = new ArrayList<String>();
    }
    for (int i = 0; i < rulesWithWord.length; i++) {
      int num = rulesWithWord[i].size();
      if (num > STATS_BINS - 1) {
        num = STATS_BINS - 1;
      }
      lengths[num]++;
      if (wsize <= 20 || num >= STATS_BINS / 2) {
        wArr[num].add(wordIndex.get(i));
      }
    }
    System.out.println("Stats on how many taggings for how many words");
    for (int j = 0; j < STATS_BINS; j++) {
      System.out.print(j + " taggings: " + lengths[j] + " words ");
      if (wsize <= 20 || j >= STATS_BINS / 2) {
        System.out.print(wArr[j]);
      }
      System.out.println();
    }
    NumberFormat nf = NumberFormat.getNumberInstance();
    nf.setMaximumFractionDigits(0);
    System.out.println("Unseen counter: " + Counters.toString(uwModel.unSeenCounter(), nf));

    if (wsize < 50 && tags.size() < 10) {
      nf.setMaximumFractionDigits(3);
      StringWriter sw = new StringWriter();
      PrintWriter pw = new PrintWriter(sw);
      pw.println("Tagging probabilities log P(word|tag)");
      for (int t = 0; t < tags.size(); t++) {
        pw.print('\t');
        pw.print(tagIndex.get(t));
      }
      pw.println();
      for (int w = 0; w < wsize; w++) {
        pw.print(wordIndex.get(w));
        pw.print('\t');
        for (int t = 0; t < tags.size(); t++) {
          IntTaggedWord iTW = new IntTaggedWord(w, t);
          pw.print(nf.format(score(iTW, 1, wordIndex.get(w))));
          if (t == tags.size() - 1) {
            pw.println();
          } else pw.print('\t');
        }
      }
      pw.close();
      System.out.println(sw.toString());
    }
  }
コード例 #10
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
  /**
   * Get the score of this word with this tag (as an IntTaggedWord) at this location. (Presumably an
   * estimate of P(word | tag).)
   *
   * <p><i>Implementation documentation:</i> Seen: c_W = count(W) c_TW = count(T,W) c_T = count(T)
   * c_Tunseen = count(T) among new words in 2nd half total = count(seen words) totalUnseen =
   * count("unseen" words) p_T_U = Pmle(T|"unseen") pb_T_W = P(T|W). If (c_W &gt;
   * smoothInUnknownsThreshold) = c_TW/c_W Else (if not smart mutation) pb_T_W = bayes prior
   * smooth[1] with p_T_U p_T= Pmle(T) p_W = Pmle(W) pb_W_T = log(pb_T_W * p_W / p_T) [Bayes rule]
   * Note that this doesn't really properly reserve mass to unknowns.
   *
   * <p>Unseen: c_TS = count(T,Sig|Unseen) c_S = count(Sig) c_T = count(T|Unseen) c_U = totalUnseen
   * above p_T_U = Pmle(T|Unseen) pb_T_S = Bayes smooth of Pmle(T|S) with P(T|Unseen) [smooth[0]]
   * pb_W_T = log(P(W|T)) inverted
   *
   * @param iTW An IntTaggedWord pairing a word and POS tag
   * @param loc The position in the sentence. <i>In the default implementation this is used only for
   *     unknown words to change their probability distribution when sentence initial</i>
   * @return A float score, usually, log P(word|tag)
   */
  public float score(IntTaggedWord iTW, int loc, String word) {
    // both actual
    double c_TW = seenCounter.getCount(iTW);
    // double x_TW = xferCounter.getCount(iTW);

    IntTaggedWord temp = new IntTaggedWord(iTW.word, nullTag);
    // word counts
    double c_W = seenCounter.getCount(temp);
    // double x_W = xferCounter.getCount(temp);

    // totals
    double total = seenCounter.getCount(NULL_ITW);
    double totalUnseen = uwModel.unSeenCounter().getCount(NULL_ITW);

    temp = new IntTaggedWord(nullWord, iTW.tag);
    // tag counts
    double c_T = seenCounter.getCount(temp);
    double c_Tunseen = uwModel.unSeenCounter().getCount(temp);

    double pb_W_T; // always set below

    if (DEBUG_LEXICON) {
      // dump info about last word
      if (iTW.word != debugLastWord) {
        if (debugLastWord >= 0 && debugPrefix != null) {
          // the 2nd conjunct in test above handles older serialized files
          EncodingPrintWriter.err.println(debugPrefix + debugProbs + debugNoProbs, "UTF-8");
        }
      }
    }

    boolean seen = (c_W > 0.0);

    if (seen) {

      // known word model for P(T|W)
      if (DEBUG_LEXICON_SCORE) {
        System.err.println(
            "Lexicon.score "
                + wordIndex.get(iTW.word)
                + "/"
                + tagIndex.get(iTW.tag)
                + " as known word.");
      }

      // c_TW = Math.sqrt(c_TW); [cdm: funny math scaling? dunno who played with this]
      // c_TW += 0.5;

      double p_T_U;
      if (useSignatureForKnownSmoothing) { // only works for English currently
        p_T_U = getUnknownWordModel().scoreProbTagGivenWordSignature(iTW, loc, smooth[0], word);
        if (DEBUG_LEXICON_SCORE)
          System.err.println(
              "With useSignatureForKnownSmoothing, P(T|U) is "
                  + p_T_U
                  + " rather than "
                  + (c_Tunseen / totalUnseen));
      } else {
        p_T_U = c_Tunseen / totalUnseen;
      }
      double pb_T_W; // always set below

      if (DEBUG_LEXICON_SCORE) {
        System.err.println(
            "c_W is "
                + c_W
                + " mle = "
                + (c_TW / c_W)
                + " smoothInUnknownsThresh is "
                + smoothInUnknownsThreshold
                + " base p_T_U is "
                + c_Tunseen
                + "/"
                + totalUnseen
                + " = "
                + p_T_U);
      }
      if (c_W > smoothInUnknownsThreshold && c_TW > 0.0 && c_W > 0.0) {
        // we've seen the word enough times to have confidence in its tagging
        pb_T_W = c_TW / c_W;
      } else {
        // we haven't seen the word enough times to have confidence in its
        // tagging
        if (smartMutation) {
          int numTags = tagIndex.size();
          if (m_TT == null || numTags != m_T.length) {
            buildPT_T();
          }
          p_T_U *= 0.1;
          // System.out.println("Checking "+iTW);
          for (int t = 0; t < numTags; t++) {
            IntTaggedWord iTW2 = new IntTaggedWord(iTW.word, t);
            double p_T_W2 = seenCounter.getCount(iTW2) / c_W;
            if (p_T_W2 > 0) {
              // System.out.println(" Observation of "+tagIndex.get(t)+"
              // ("+seenCounter.getCount(iTW2)+") mutated to
              // "+tagIndex.get(iTW.tag)+" at rate
              // "+(m_TT[tag][t]/m_T[t]));
              p_T_U += p_T_W2 * m_TT[iTW.tag][t] / m_T[t] * 0.9;
            }
          }
        }
        if (DEBUG_LEXICON_SCORE) {
          System.err.println("c_TW = " + c_TW + " c_W = " + c_W + " p_T_U = " + p_T_U);
        }
        // double pb_T_W = (c_TW+smooth[1]*x_TW)/(c_W+smooth[1]*x_W);
        pb_T_W = (c_TW + smooth[1] * p_T_U) / (c_W + smooth[1]);
      }
      double p_T = (c_T / total);
      double p_W = (c_W / total);
      pb_W_T = Math.log(pb_T_W * p_W / p_T);

      if (DEBUG_LEXICON) {
        if (iTW.word != debugLastWord) {
          debugLastWord = iTW.word;
          debugLoc = loc;
          debugProbs = new StringBuilder();
          debugNoProbs = new StringBuilder("impossible: ");
          debugPrefix = "Lexicon: " + wordIndex.get(debugLastWord) + " (known): ";
        }
        if (pb_W_T > Double.NEGATIVE_INFINITY) {
          NumberFormat nf = NumberFormat.getNumberInstance();
          nf.setMaximumFractionDigits(3);
          debugProbs.append(
              tagIndex.get(iTW.tag)
                  + ": cTW="
                  + c_TW
                  + " c_T="
                  + c_T
                  + " pb_T_W="
                  + nf.format(pb_T_W)
                  + " log pb_W_T="
                  + nf.format(pb_W_T)
                  + ", ");
          // debugProbs.append("\n" + "smartMutation=" + smartMutation + "
          // smoothInUnknownsThreshold=" + smoothInUnknownsThreshold + "
          // smooth0=" + smooth[0] + "smooth1=" + smooth[1] + " p_T_U=" + p_T_U
          // + " c_W=" + c_W);
        } else {
          debugNoProbs.append(tagIndex.get(iTW.tag)).append(' ');
        }
      } // end if (DEBUG_LEXICON)

    } else { // when unseen
      if (loc >= 0) {
        pb_W_T = getUnknownWordModel().score(iTW, loc, c_T, total, smooth[0], word);
      } else {
        // For negative we now do a weighted average for the dependency grammar :-)
        double pb_W0_T = getUnknownWordModel().score(iTW, 0, c_T, total, smooth[0], word);
        double pb_W1_T = getUnknownWordModel().score(iTW, 1, c_T, total, smooth[0], word);
        pb_W_T = Math.log((Math.exp(pb_W0_T) + 2 * Math.exp(pb_W1_T)) / 3);
      }
    }

    // Categorical cutoff if score is too low
    if (pb_W_T > -100.0) {
      return (float) pb_W_T;
    }
    return Float.NEGATIVE_INFINITY;
  } // end score()
コード例 #11
0
ファイル: BaseLexicon.java プロジェクト: rodolfopc/ufmg-nlp
  protected void initRulesWithWord() {
    if (testOptions.verbose || DEBUG_LEXICON) {
      System.err.print("\nInitializing lexicon scores ... ");
    }
    // int numWords = words.size()+sigs.size()+1;
    int unkWord = wordIndex.indexOf(UNKNOWN_WORD, true);
    int numWords = wordIndex.size();
    rulesWithWord = new List[numWords];
    for (int w = 0; w < numWords; w++) {
      rulesWithWord[w] = new ArrayList<IntTaggedWord>(1); // most have 1 or 2
      // items in them
    }
    // for (Iterator ruleI = rules.iterator(); ruleI.hasNext();) {
    tags = new HashSet<IntTaggedWord>();
    for (IntTaggedWord iTW : seenCounter.keySet()) {
      if (iTW.word() == nullWord && iTW.tag() != nullTag) {
        tags.add(iTW);
      }
    }

    // tags for unknown words
    if (DEBUG_LEXICON) {
      System.err.println(
          "Lexicon initializing tags for UNKNOWN WORD ("
              + Lexicon.UNKNOWN_WORD
              + ", "
              + unkWord
              + ')');
    }
    if (DEBUG_LEXICON) System.err.println("unSeenCounter is: " + uwModel.unSeenCounter());
    if (DEBUG_LEXICON)
      System.err.println(
          "Train.openClassTypesThreshold is " + trainOptions.openClassTypesThreshold);
    for (IntTaggedWord iT : tags) {
      if (DEBUG_LEXICON)
        System.err.println("Entry for " + iT + " is " + uwModel.unSeenCounter().getCount(iT));
      double types = uwModel.unSeenCounter().getCount(iT);
      if (types > trainOptions.openClassTypesThreshold) {
        // Number of types before it's treated as open class
        IntTaggedWord iTW = new IntTaggedWord(unkWord, iT.tag);
        rulesWithWord[iTW.word].add(iTW);
      }
    }
    if (testOptions.verbose || DEBUG_LEXICON) {
      System.err.print("The " + rulesWithWord[unkWord].size() + " open class tags are: [");
      for (IntTaggedWord item : rulesWithWord[unkWord]) {
        System.err.print(" " + tagIndex.get(item.tag()));
        if (DEBUG_LEXICON) {
          IntTaggedWord iTprint = new IntTaggedWord(nullWord, item.tag);
          System.err.print(
              " (tag "
                  + item.tag()
                  + ", type count is "
                  + uwModel.unSeenCounter().getCount(iTprint)
                  + ')');
        }
      }
      System.err.println(" ] ");
    }

    for (IntTaggedWord iTW : seenCounter.keySet()) {
      if (iTW.tag() != nullTag && iTW.word() != nullWord) {
        rulesWithWord[iTW.word].add(iTW);
      }
    }
  }
コード例 #12
0
ファイル: TTags.java プロジェクト: guokr/stan-cn-com
 public String getTag(int i) {
   return index.get(i);
 }
コード例 #13
0
  // CDM 2007: I wonder what this does differently from segmentWordsWithMarkov???
  private ArrayList<TaggedWord> basicSegmentWords(String s) {
    int length = s.length();
    //    Set<String> POSes = (Set<String>) POSDistribution.keySet();  // 1.5
    // best score of span
    double[][] scores = new double[length][length + 1];
    // best (last index of) first word for this span
    int[][] splitBacktrace = new int[length][length + 1];
    // best tag for word over this span
    int[][] POSbacktrace = new int[length][length + 1];
    for (int i = 0; i < length; i++) {
      Arrays.fill(scores[i], Double.NEGATIVE_INFINITY);
    }
    // first fill in word probabilities
    for (int diff = 1; diff <= 10; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        StringBuilder wordBuf = new StringBuilder();
        for (int pos = start; pos < end; pos++) {
          wordBuf.append(s.charAt(pos));
        }
        String word = wordBuf.toString();
        //        for (String tag : POSes) {  // 1.5
        for (Iterator<String> iter = POSes.iterator(); iter.hasNext(); ) {
          String tag = iter.next();
          IntTaggedWord itw = new IntTaggedWord(word, tag, wordIndex, tagIndex);
          double newScore =
              lex.score(itw, 0, word, null) + Math.log(lex.getPOSDistribution().probabilityOf(tag));
          if (newScore > scores[start][end]) {
            scores[start][end] = newScore;
            splitBacktrace[start][end] = end;
            POSbacktrace[start][end] = itw.tag();
          }
        }
      }
    }
    // now fill in word combination probabilities
    for (int diff = 2; diff <= length; diff++) {
      for (int start = 0; start + diff <= length; start++) {
        int end = start + diff;
        for (int split = start + 1; split < end && split - start <= 10; split++) {
          if (splitBacktrace[start][split] != split) {
            continue; // only consider words on left
          }
          double newScore = scores[start][split] + scores[split][end];
          if (newScore > scores[start][end]) {
            scores[start][end] = newScore;
            splitBacktrace[start][end] = split;
          }
        }
      }
    }

    List<TaggedWord> words = new ArrayList<TaggedWord>();
    int start = 0;
    while (start < length) {
      int end = splitBacktrace[start][length];
      StringBuilder wordBuf = new StringBuilder();
      for (int pos = start; pos < end; pos++) {
        wordBuf.append(s.charAt(pos));
      }
      String word = wordBuf.toString();
      String tag = tagIndex.get(POSbacktrace[start][end]);

      words.add(new TaggedWord(word, tag));
      start = end;
    }

    return new ArrayList<TaggedWord>(words);
  }