/** * This function generaes smoothed unigramprobabilities for given data * * @param f * @return * @throws IOException */ public List<Gram> generateSmoothedProbabilities(File f) throws IOException { Tokenizer tokenizer = new Tokenizer(); // Frequency map - this map contains frequency as the key and frequency // of frequency as the value. SmoothingInputBean bean = new SmoothingInputBean(); bean.setTokens(tokenizer.getTokens(f)); bean.setWordFrequencyMap(tokenizer.extractCorpusFrequency(bean.getTokens())); bean.setDenominiator(bean.getTokens().getTokens().size()); List<Gram> grams = generateSmoothedProbabilities(bean); return grams; }
/*
 * (non-Javadoc)
 *
 * Computes the relative frequency of every distinct token in the file:
 * each token's count divided by the sum of all counts, passed through
 * Utils.round(value, 5). The resulting grams are returned sorted by
 * Gram's natural ordering.
 *
 * @see hw1.service.NGramGenerator#generateProbabilities(java.io.File)
 */
public List<Gram> generateProbabilities(File f) throws IOException {
    Tokenizer corpusTokenizer = new Tokenizer();
    TokenList corpusTokens = corpusTokenizer.getTokens(f);
    Map<String, Integer> frequencies = corpusTokenizer.extractCorpusFrequency(corpusTokens);

    // Sum of all per-token counts; used as the common denominator below.
    int total = 0;
    for (Integer occurrences : frequencies.values()) {
        total += occurrences;
    }

    List<Gram> result = new LinkedList<Gram>();
    for (Map.Entry<String, Integer> wordCount : frequencies.entrySet()) {
        String word = wordCount.getKey();
        int occurrences = wordCount.getValue();
        // Cast before dividing so the ratio is not truncated by integer division.
        result.add(new Gram(word, Utils.round((double) occurrences / total, 5)));
    }

    Collections.sort(result);
    return result;
}