/**
 * Scores every bigram that occurs at least {@code minCount} times and returns those whose score
 * is at least {@code minScore}.
 *
 * <p>Each candidate bigram is scored by passing {@code calculator} a 2x2 contingency table built
 * from the bigram's count, the count of its first unigram, the count of its second unigram, and
 * the sum of all unigram counts. The {@code minCount} threshold is applied to bigram counts only;
 * unigram counts are used unfiltered.
 *
 * <p>NOTE(review): {@code removeStopWords} and {@code toString} are accepted but never read by
 * this implementation; both n-gram counters are built from an {@code NGramSpec} on which only the
 * order is set. Confirm whether these arguments were meant to be applied to the spec.
 *
 * @param minCount the minimum count (inclusive) a bigram must have to be scored
 * @param calculator the calculator that turns a contingency table into a score
 * @param minScore the minimum score (inclusive) a bigram must reach to be returned
 * @param removeStopWords currently unused by this implementation
 * @param toString currently unused by this implementation
 * @return a counter mapping each qualifying bigram to its score
 */
default Counter<Tuple> significantBigrams(
    int minCount,
    @NonNull ContingencyTableCalculator calculator,
    double minScore,
    boolean removeStopWords,
    @NonNull SerializableFunction<? super Annotation, String> toString) {
  Counter<Tuple> unigrams = ngrams(NGramSpec.create().order(1));
  Counter<Tuple> bigrams = ngrams(NGramSpec.create().order(2)).filterByValue(v -> v >= minCount);

  // The unigram total is identical for every bigram, so compute it once instead of per bigram.
  final double unigramTotal = unigrams.sum();

  Counter<Tuple> filtered = new HashMapCounter<>();
  bigrams
      .items()
      .forEach(
          bigram -> {
            double score =
                calculator.calculate(
                    ContingencyTable.create2X2(
                        bigrams.get(bigram),
                        unigrams.get(bigram.slice(0, 1)), // count of the first word
                        unigrams.get(bigram.slice(1, 2)), // count of the second word
                        unigramTotal));
            if (score >= minScore) {
              filtered.set(bigram, score);
            }
          });
  return filtered;
}
/**
 * Counts the n-grams described by the given specification.
 *
 * <p>For every element of {@code stream()}, n-grams are requested using the spec's annotation
 * type, minimum and maximum, and then restricted by the spec's filter. Each surviving n-gram is
 * turned into a tuple by applying the spec's to-string function to the items it yields for the
 * spec's annotation type. The tuples are counted by value, and the resulting counter is passed
 * through the spec's value calculator before being returned.
 *
 * @param nGramSpec the specification supplying the annotation type, bounds, filter, to-string
 *     function and value calculator
 * @return the counter produced by the spec's value calculator from the raw tuple counts
 */
default Counter<Tuple> ngrams(@NonNull NGramSpec nGramSpec) {
  Counter<Tuple> rawCounts =
      new HashMapCounter<>(
          stream()
              .flatMap(
                  doc -> {
                    // Materialised as a list, exactly as the original pipeline did.
                    return doc
                        .ngrams(
                            nGramSpec.getAnnotationType(),
                            nGramSpec.getMin(),
                            nGramSpec.getMax())
                        .stream()
                        .filter(nGramSpec.getFilter())
                        .map(
                            hString -> {
                              return $(
                                  hString
                                      .get(nGramSpec.getAnnotationType())
                                      .stream()
                                      .map(nGramSpec.getToStringFunction())
                                      .collect(Collectors.toList()));
                            })
                        .collect(Collectors.toList());
                  })
              .countByValue());
  return nGramSpec.getValueCalculator().adjust(rawCounts);
}