Пример #1
0
    /**
     * Walks through {@code text} token by token and tallies word trigrams in {@code trigram}.
     *
     * <p>Tokens are the substrings found between consecutive matches of the {@code pattern}
     * field (declared outside this method; presumably it matches word separators). Each
     * token is lower-cased before use. The rules applied to every token, including the one
     * that follows the last separator, are:
     * <ul>
     *   <li>a token of length 0 or 1 resets the current word window;</li>
     *   <li>a stop word is skipped and leaves the window untouched;</li>
     *   <li>any other token is offered to the window via {@code addWord}; when that call
     *       returns {@code true} the window's string form is counted and the window is
     *       shifted;</li>
     *   <li>inside the text, a separator other than a single space resets the window after
     *       the token has been handled.</li>
     * </ul>
     *
     * @param trigram       counter map updated through {@code countWord}
     * @param text          text to scan
     * @param previousWords window carried over from the previous call; it is mutated in place
     * @return the same window instance, holding the state to pass to the next call
     */
    private TrigramCollocation trigramParsing(
        Map<String, Integer> trigram, String text, TrigramCollocation previousWords) {
      Matcher matcher = pattern.matcher(text);
      int begIndex = 0;
      TrigramCollocation begWords = previousWords;
      String word;
      while (matcher.find()) {
        word = text.substring(begIndex, matcher.start()).toLowerCase();
        if (word.length() > 1) {
          if (!StopWordsHelper.INSTANCE.isStopWord(word)) {
            if (begWords.addWord(word)) {
              countWord(trigram, begWords.toString());
              begWords.shiftWords();
            }

            // Anything but a plain space between words breaks the collocation.
            if (!" ".equals(matcher.group())) {
              begWords.clear();
            }
          }
        } else {
          // Empty or one-character tokens break the collocation.
          begWords.clear();
        }
        begIndex = matcher.end();
      }
      if (begIndex != text.length()) {
        // Trailing token after the last separator: same length rule as inside the loop,
        // so a one-character last word no longer slips into a trigram.
        word = text.substring(begIndex).toLowerCase();
        if (word.length() > 1) {
          if (!StopWordsHelper.INSTANCE.isStopWord(word)) {
            if (begWords.addWord(word)) {
              countWord(trigram, begWords.toString());
              begWords.shiftWords();
            }
          }
        } else {
          begWords.clear();
        }
      }
      return begWords;
    }
Пример #2
0
 /**
  * Walks through {@code text} token by token and tallies word bigrams in {@code bigram}.
  *
  * <p>Tokens are the substrings found between consecutive matches of the {@code pattern}
  * field (declared outside this method; presumably it matches word separators). Each token
  * is lower-cased before use. The rules applied to every token, including the one that
  * follows the last separator, are:
  * <ul>
  *   <li>a token of length 0 or 1 drops the pending first word;</li>
  *   <li>a stop word is skipped and leaves the pending first word untouched;</li>
  *   <li>any other token is paired with the pending first word (if there is one), the pair
  *       {@code "first second"} is counted, and the token becomes the new pending word;</li>
  *   <li>inside the text, the token only stays pending when the separator after it is a
  *       single space.</li>
  * </ul>
  *
  * @param bigram       counter map updated through {@code countWord}
  * @param text         text to scan
  * @param previousWord pending first word carried over from the previous call, or
  *                     {@code null} if there is none
  * @return the pending first word to pass to the next call, or {@code null}
  */
 private String bigramParsing(Map<String, Integer> bigram, String text, String previousWord) {
   Matcher matcher = pattern.matcher(text);
   int begIndex = 0;
   String begWord = previousWord;
   String word;
   while (matcher.find()) {
     word = text.substring(begIndex, matcher.start()).toLowerCase();
     if (word.length() > 1) {
       if (!StopWordsHelper.INSTANCE.isStopWord(word)) {
         if (begWord != null) {
           countWord(bigram, begWord + " " + word);
         }
         // Anything but a plain space after the word breaks the collocation.
         begWord = " ".equals(matcher.group()) ? word : null;
       }
     } else {
       // Empty or one-character tokens break the collocation.
       begWord = null;
     }
     begIndex = matcher.end();
   }
   if (begIndex != text.length()) {
     // Trailing token after the last separator: same length rule as inside the loop,
     // so a one-character last word no longer slips into a bigram.
     word = text.substring(begIndex).toLowerCase();
     if (word.length() > 1) {
       if (!StopWordsHelper.INSTANCE.isStopWord(word)) {
         if (begWord != null) {
           countWord(bigram, begWord + " " + word);
         }
         begWord = word;
       }
     } else {
       begWord = null;
     }
   }
   return begWord;
 }
Пример #3
0
 /**
  * Adds one occurrence of {@code collocation} to {@code map}.
  *
  * <p>Nothing is recorded when {@code StopWordsHelper} reports the whole string as a stop
  * word. A key that is absent (or mapped to {@code null}) starts at 1.
  *
  * @param map         counter map, keyed by word or collocation
  * @param collocation key whose counter is incremented
  */
 private void countWord(Map<String, Integer> map, String collocation) {
   if (StopWordsHelper.INSTANCE.isStopWord(collocation)) {
     return;
   }
   Integer previous = map.get(collocation);
   int updated = (previous == null) ? 1 : previous + 1;
   map.put(collocation, updated);
 }
Пример #4
0
    /**
     * Splits {@code text} on {@code WORD_SPLITTING_REGEX} and tallies every resulting word
     * that is not a stop word in {@code unigram}.
     *
     * <p>Words are lower-cased before the stop-word check and before counting. Empty tokens
     * are ignored: {@link String#split(String)} yields a leading empty string when the text
     * begins with a delimiter, and that must not be counted as a word.
     *
     * @param unigram counter map updated through {@code countWord}
     * @param text    text to split into words
     */
    private void unigramParsing(Map<String, Integer> unigram, String text) {
      for (String word : text.split(WORD_SPLITTING_REGEX)) {
        if (word.isEmpty()) {
          // Artifact of split() when the text starts with a delimiter.
          continue;
        }
        String lowerCaseWord = word.toLowerCase();
        if (!StopWordsHelper.INSTANCE.isStopWord(lowerCaseWord)) {
          countWord(unigram, lowerCaseWord);
        }
      }
    }