/**
 * Builds a {@link SuffixTrie} from the distinct word forms found in the given sentences.
 *
 * <p>A word form is added to the trie only if all of the following hold:
 * <ul>
 *   <li>it is at least 5 characters long;</li>
 *   <li>it contains no ASCII digit, no upper-case letter from {@code A-Z}, {@code Ä},
 *       {@code Ö}, {@code Ü}, and no hyphen;</li>
 *   <li>it has not already been added by an earlier token.</li>
 * </ul>
 * Because each distinct form is added only once, the tag stored with a form is the tag of
 * the first token in iteration order that carries that form; tags of later tokens with the
 * same form are ignored.
 *
 * <p>After all forms have been added, the trie is pruned with {@code prune(0.5, 50)} and
 * then cleaned with {@code clean()} before being returned.
 *
 * @param sentences the sentences whose tokens supply word forms and tags
 * @return the pruned and cleaned suffix trie
 */
public static SuffixTrie getSuffixes(Iterable<Sentence> sentences) {
    // Compiled once up front instead of recompiling three regexes per token via
    // String.matches. The single character class is the union of the three original
    // exclusion classes (digit, upper-case letter, hyphen); matches() keeps the original
    // full-match semantics. Fully qualified so the block does not depend on an import.
    final java.util.regex.Pattern excludedForm =
            java.util.regex.Pattern.compile(".*[0-9A-ZÄÖÜ-].*");

    SuffixTrie trie = new SuffixTrie(null);
    Set<String> vocab = new HashSet<String>();

    for (Sentence sentence : sentences) {
        for (Token token : sentence) {
            String form = token.getWordForm();

            // Cheapest filter first: short forms are never added.
            if (form.length() < 5) {
                continue;
            }
            if (excludedForm.matcher(form).matches()) {
                continue;
            }
            // Set.add returns false when the form is already present, so every distinct
            // form reaches the trie exactly once.
            if (!vocab.add(form)) {
                continue;
            }

            // The larger of (length - 5) and (length / 2) is passed as the second argument.
            // NOTE(review): presumably the index from which suffixes are taken, i.e. at most
            // the last 5 characters and at most the second half of the form — confirm
            // against SuffixTrie.add.
            int start = Math.max(form.length() - 5, form.length() / 2);
            trie.add(form, start, token.getTag());
        }
    }

    // NOTE(review): the meaning of the prune thresholds (0.5, 50) is defined by
    // SuffixTrie.prune, which is not visible here.
    trie.prune(0.5, 50);
    trie.clean();
    return trie;
}