Пример #1
0
  /**
   * Builds a {@link SuffixTrie} from the distinct word forms found in {@code sentences}.
   *
   * <p>A word form is skipped when it is shorter than five characters or when it contains an
   * ASCII digit, one of the upper-case letters {@code A-Z}, {@code Ä}, {@code Ö}, {@code Ü}, or a
   * hyphen. Every remaining form is added to the trie exactly once, together with the tag of the
   * first token that carried it; later tokens with the same form are ignored regardless of their
   * tag.
   *
   * @param sentences the sentences whose tokens are scanned
   * @return the trie after {@code prune(0.5, 50)} and {@code clean()} have been called on it
   */
  public static SuffixTrie getSuffixes(Iterable<Sentence> sentences) {
    final int minFormLength = 5;

    // Compiled once per call instead of three String.matches() compilations per token.
    // Used with matches(), so the surrounding ".*" keeps the original whole-string semantics
    // of the three separate checks (digit, upper-case letter, hyphen).
    final java.util.regex.Pattern excluded =
        java.util.regex.Pattern.compile(".*[0-9A-ZÄÖÜ-].*");

    SuffixTrie trie = new SuffixTrie(null);
    Set<String> vocab = new HashSet<String>();

    for (Sentence sentence : sentences) {
      for (Token token : sentence) {
        String form = token.getWordForm();

        // Cheapest filter first; all filters are side-effect free, so order does not matter.
        if (form.length() < minFormLength) {
          continue;
        }

        if (excluded.matcher(form).matches()) {
          continue;
        }

        // Set.add returns false when the form was already recorded, so each distinct
        // form reaches the trie only once.
        if (!vocab.add(form)) {
          continue;
        }

        // NOTE(review): the second argument looks like the index at which the stored suffix
        // starts (at most the last five characters, and never before the middle of the
        // word) - confirm against SuffixTrie.add.
        int offset = Math.max(form.length() - 5, form.length() / 2);
        trie.add(form, offset, token.getTag());
      }
    }

    // NOTE(review): meaning of the prune thresholds is defined by SuffixTrie - not visible here.
    trie.prune(0.5, 50);
    trie.clean();
    return trie;
  }