/**
 * Builds a {@link Dictionary} from the sentences contained in the given samples.
 * <p>
 * The stream is read until {@code read()} returns {@code null}. Each sample whose
 * sentence has at least one token is added to an {@link NGramModel} via
 * {@code add(new StringList(tokens), 1, 1)}; samples with an empty sentence are skipped.
 * The model is then pruned with {@code cutoff(cutoff, Integer.MAX_VALUE)} and
 * converted with {@code toDictionary(true)}.
 *
 * @param samples the samples to read; the stream is consumed but not closed here
 * @param cutoff the lower bound handed to {@code NGramModel.cutoff}
 * @return the result of {@code NGramModel.toDictionary(true)}
 * @throws IOException if reading from {@code samples} fails
 */
public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
    throws IOException {

  NGramModel model = new NGramModel();

  for (POSSample current = samples.read(); current != null; current = samples.read()) {
    String[] tokens = current.getSentence();

    // Nothing to record for an empty sentence.
    if (tokens.length == 0) {
      continue;
    }

    model.add(new StringList(tokens), 1, 1);
  }

  model.cutoff(cutoff, Integer.MAX_VALUE);

  return model.toDictionary(true);
}
public static void populatePOSDictionary( ObjectStream<POSSample> samples, MutableTagDictionary dict, int cutoff) throws IOException { System.out.println("Expanding POS Dictionary ..."); long start = System.nanoTime(); // the data structure will store the word, the tag, and the number of // occurrences Map<String, Map<String, AtomicInteger>> newEntries = new HashMap<String, Map<String, AtomicInteger>>(); POSSample sample; while ((sample = samples.read()) != null) { String[] words = sample.getSentence(); String[] tags = sample.getTags(); for (int i = 0; i < words.length; i++) { // only store words if (!StringPattern.recognize(words[i]).containsDigit()) { String word; if (dict.isCaseSensitive()) { word = words[i]; } else { word = StringUtil.toLowerCase(words[i]); } if (!newEntries.containsKey(word)) { newEntries.put(word, new HashMap<String, AtomicInteger>()); } String[] dictTags = dict.getTags(word); if (dictTags != null) { for (String tag : dictTags) { // for this tags we start with the cutoff Map<String, AtomicInteger> value = newEntries.get(word); if (!value.containsKey(tag)) { value.put(tag, new AtomicInteger(cutoff)); } } } if (!newEntries.get(word).containsKey(tags[i])) { newEntries.get(word).put(tags[i], new AtomicInteger(1)); } else { newEntries.get(word).get(tags[i]).incrementAndGet(); } } } } // now we check if the word + tag pairs have enough occurrences, if yes we // add it to the dictionary for (Entry<String, Map<String, AtomicInteger>> wordEntry : newEntries.entrySet()) { List<String> tagsForWord = new ArrayList<String>(); for (Entry<String, AtomicInteger> entry : wordEntry.getValue().entrySet()) { if (entry.getValue().get() >= cutoff) { tagsForWord.add(entry.getKey()); } } if (tagsForWord.size() > 0) { dict.put(wordEntry.getKey(), tagsForWord.toArray(new String[tagsForWord.size()])); } } System.out.println( "... finished expanding POS Dictionary. [" + (System.nanoTime() - start) / 1000000 + "ms]"); }