/**
 * Counts the n-grams found in every annotation of the given sentence type and hands the
 * resulting counts to {@code add(ConditionalFrequencyDistribution)}.
 * <p>
 * For each sentence annotation and each input path, the part of the path before the first
 * {@code '/'} is used as the name of the annotation type to look up. The annotations of that
 * type covered by the sentence are converted to strings via {@code createStringList}, and the
 * n-grams of every length from {@code minNgramLength} to {@code maxNgramLength} (inclusive)
 * are counted under their length.
 *
 * @param jcas
 *            the JCas whose CAS is searched for annotations
 * @param inputPaths
 *            the input paths; each is split at its first {@code '/'} into at most two segments
 * @param sentenceType
 *            the annotation type that delimits the units n-grams are built from
 * @throws IOException
 *             if {@code createStringList} fails with an
 *             {@code AnalysisEngineProcessException} (wrapped as the cause), or if it is
 *             raised by one of the called methods
 */
public void add(JCas jcas, Set<String> inputPaths, Type sentenceType) throws IOException {
    final ConditionalFrequencyDistribution<Integer, String> ngramCounts =
            new ConditionalFrequencyDistribution<Integer, String>();
    final CAS cas = jcas.getCas();

    for (AnnotationFS sentence : CasUtil.select(cas, sentenceType)) {
        for (String inputPath : inputPaths) {
            // Limit 2: everything after the first '/' stays together in the second segment.
            final String[] pathSegments = inputPath.split("/", 2);
            final Type inputType = getInputType(cas, pathSegments[0]);
            final List<AnnotationFS> covered = CasUtil.selectCovered(cas, inputType, sentence);

            final List<String> strings;
            try {
                strings = createStringList(covered, pathSegments);
            } catch (AnalysisEngineProcessException e) {
                // The method only declares IOException, so wrap and keep the cause.
                throw new IOException(e);
            }

            // One pass per n-gram length; the length doubles as the condition key.
            for (int n = minNgramLength; n <= maxNgramLength; n++) {
                cfd_incAll:
                ngramCounts.incAll(n, new NGramStringIterable(strings, n, n));
            }
        }
    }

    add(ngramCounts);
}
/**
 * Writes the frequency distribution of every condition (n-gram level) in {@code cfd} to the
 * n-gram file of that level.
 * <p>
 * Levels are processed in the iteration order of {@code cfd.getConditions()}. Processing
 * stops at the first level without a registered writer; levels handled before that point
 * have already been written.
 *
 * @param cfd
 *            the n-gram counts, keyed by n-gram level
 * @throws IOException
 *             if no writer is registered in {@code ngramWriters} for one of the levels, or
 *             if writing a level fails
 */
private void writeFrequencyDistributionsToNGramFiles(
        ConditionalFrequencyDistribution<Integer, String> cfd) throws IOException {
    for (int level : cfd.getConditions()) {
        if (ngramWriters.containsKey(level)) {
            writeNGramFile(cfd, level);
            continue;
        }
        throw new IOException("No writer for ngram level " + level + " initialized.");
    }
}
private void writeNGramFile(ConditionalFrequencyDistribution<Integer, String> cfd, int level) throws IOException { FrequencyDistribution<String> letterFD = letterFDs.get(level); BufferedWriter writer = ngramWriters.get(level); for (String key : cfd.getFrequencyDistribution(level).getKeys()) { // add starting letter to frequency distribution if (key.length() > 1) { String subsKey = key.substring(0, 2); String subsKeyLowered = subsKey.toLowerCase(); letterFD.addSample(subsKeyLowered, 1); } else { String subsKey = key.substring(0, 1); String subsKeyLowered = subsKey.toLowerCase(); letterFD.addSample(subsKeyLowered, 1); } writer.write(key); writer.write(TAB); writer.write(Long.toString(cfd.getCount(level, key))); writer.write(LF); } writer.flush(); }