Ejemplo n.º 1
0
 /**
  * Scores the bigrams of this collection with an association measure and returns those that pass
  * both a frequency threshold and a score threshold.
  *
  * <p>Unigram and bigram counts are obtained from {@code ngrams(...)} using default
  * {@code NGramSpec} instances of order 1 and order 2. The bigram counter is restricted through
  * {@code filterByValue} with the predicate {@code count >= minCount}. For every remaining bigram
  * a 2x2 contingency table is built from, in this order: the bigram count, the count of the
  * unigram at position 0, the count of the unigram at position 1, and the sum of the unigram
  * counter. The table is handed to {@code calculator}, and the bigram is kept when the resulting
  * score is at least {@code minScore}.
  *
  * <p>NOTE(review): {@code removeStopWords} and {@code toString} are accepted but never referenced
  * by this method body, so they currently have no effect on the result. They presumably were
  * meant to configure the {@code NGramSpec} instances; confirm against the {@code NGramSpec} API
  * before wiring them in.
  *
  * @param minCount lower bound (inclusive) applied to the bigram counts
  * @param calculator computes a score from the 2x2 contingency table of a bigram
  * @param minScore lower bound (inclusive) a bigram's score must reach to be returned
  * @param removeStopWords not referenced by the current implementation
  * @param toString not referenced by the current implementation
  * @return a counter whose keys are the retained bigrams and whose values are their scores
  */
 default Counter<Tuple> significantBigrams(
     int minCount,
     @NonNull ContingencyTableCalculator calculator,
     double minScore,
     boolean removeStopWords,
     @NonNull SerializableFunction<? super Annotation, String> toString) {
   Counter<Tuple> unigrams = ngrams(NGramSpec.create().order(1));
   Counter<Tuple> bigrams = ngrams(NGramSpec.create().order(2)).filterByValue(v -> v >= minCount);
   // The unigram total is the same for every bigram, so compute it once instead of once per
   // bigram inside the loop below.
   final double unigramTotal = unigrams.sum();
   Counter<Tuple> filtered = new HashMapCounter<>();
   bigrams
       .items()
       .forEach(
           bigram -> {
             double score =
                 calculator.calculate(
                     ContingencyTable.create2X2(
                         bigrams.get(bigram),
                         unigrams.get(bigram.slice(0, 1)),
                         unigrams.get(bigram.slice(1, 2)),
                         unigramTotal));
             // Only bigrams reaching the score threshold are reported; the value is the score.
             if (score >= minScore) {
               filtered.set(bigram, score);
             }
           });
   return filtered;
 }
Ejemplo n.º 2
0
 /**
  * Counts the n-grams described by the given specification over the elements of
  * {@code stream()}.
  *
  * <p>For each element, n-grams are requested with the spec's annotation type, minimum and
  * maximum, and then filtered with the spec's filter. Every surviving n-gram is converted into a
  * key by applying the spec's to-string function to its annotations of the spec's annotation
  * type. The keys are tallied with {@code countByValue()}, wrapped in a {@code HashMapCounter},
  * and finally passed through the spec's value calculator.
  *
  * @param nGramSpec supplies the annotation type, size bounds, filter, to-string function and
  *     value calculator used for counting
  * @return the counter returned by the spec's value calculator for the tallied n-grams
  */
 default Counter<Tuple> ngrams(@NonNull NGramSpec nGramSpec) {
   // Tally the string-tuple form of every n-gram that passes the spec's filter.
   Counter<Tuple> rawCounts =
       new HashMapCounter<>(
           stream()
               .flatMap(
                   document ->
                       document
                           .ngrams(
                               nGramSpec.getAnnotationType(),
                               nGramSpec.getMin(),
                               nGramSpec.getMax())
                           .stream()
                           .filter(nGramSpec.getFilter())
                           .map(
                               gram ->
                                   $(
                                       gram.get(nGramSpec.getAnnotationType())
                                           .stream()
                                           .map(nGramSpec.getToStringFunction())
                                           .collect(Collectors.toList())))
                           .collect(Collectors.toList()))
               .countByValue());
   // Let the spec's value calculator adjust the tallied counts before returning them.
   return nGramSpec.getValueCalculator().adjust(rawCounts);
 }