/**
 * Scans {@code text} for three-word collocations and tallies them in {@code trigram}.
 *
 * <p>Every match of the {@code pattern} field is treated as a separator; the text between two
 * consecutive separators (lower-cased) is one token. Tokens are pushed into the rolling
 * {@link TrigramCollocation} window, and whenever {@code addWord} reports {@code true} the
 * window's string form is counted and the window is shifted (presumably {@code true} means the
 * window holds a complete trigram — confirm against {@code TrigramCollocation}).
 *
 * @param trigram       counter map updated in place
 * @param text          chunk of text to scan
 * @param previousWords window carried over from the previous chunk; it is mutated and returned
 * @return the same window instance, in the state left after the last token of {@code text}
 */
private TrigramCollocation trigramParsing(
        Map<String, Integer> trigram, String text, TrigramCollocation previousWords) {
    final TrigramCollocation window = previousWords;
    final Matcher separators = pattern.matcher(text);

    int tokenStart = 0;
    // The update clause also runs on `continue`, so the cursor always moves past the separator.
    for (; separators.find(); tokenStart = separators.end()) {
        final String token = text.substring(tokenStart, separators.start()).toLowerCase();

        // Empty or one-character tokens break the collocation.
        if (token.length() <= 1) {
            window.clear();
            continue;
        }
        // Stop words are skipped without touching the window.
        if (StopWordsHelper.INSTANCE.isStopWord(token)) {
            continue;
        }

        if (window.addWord(token)) {
            countWord(trigram, window.toString());
            window.shiftWords();
        }

        // Only a single space keeps the collocation going; any other separator resets it.
        final boolean followedBySpace = " ".equals(separators.group());
        if (!followedBySpace) {
            window.clear();
        }
    }

    // Text ended exactly on a separator (or was empty): nothing left to process.
    if (tokenStart == text.length()) {
        return window;
    }

    // NOTE(review): unlike tokens inside the loop, the trailing token is not length-checked.
    final String tail = text.substring(tokenStart).toLowerCase();
    if (!StopWordsHelper.INSTANCE.isStopWord(tail) && window.addWord(tail)) {
        countWord(trigram, window.toString());
        window.shiftWords();
    }
    return window;
}
/**
 * Scans {@code text} for two-word collocations and tallies them in {@code bigram}.
 *
 * <p>Every match of the {@code pattern} field is treated as a separator; the text between two
 * consecutive separators (lower-cased) is one token. A bigram {@code "<left> <token>"} is counted
 * whenever a left-hand word is pending and the current token is not a stop word.
 *
 * @param bigram       counter map updated in place
 * @param text         chunk of text to scan
 * @param previousWord left-hand word carried over from the previous chunk, or {@code null}
 * @return the word to carry into the next chunk as a left-hand word, or {@code null} if none
 */
private String bigramParsing(Map<String, Integer> bigram, String text, String previousWord) {
    final Matcher separators = pattern.matcher(text);
    String left = previousWord;

    int tokenStart = 0;
    // The update clause also runs on `continue`, so the cursor always moves past the separator.
    for (; separators.find(); tokenStart = separators.end()) {
        final String token = text.substring(tokenStart, separators.start()).toLowerCase();

        // Empty or one-character tokens break the collocation.
        if (token.length() <= 1) {
            left = null;
            continue;
        }
        // Stop words are skipped and leave the pending left-hand word untouched.
        if (StopWordsHelper.INSTANCE.isStopWord(token)) {
            continue;
        }

        if (left != null) {
            countWord(bigram, left + " " + token);
        }

        // Only a single space lets this token pair with the next one.
        if (" ".equals(separators.group())) {
            left = token;
        } else {
            left = null;
        }
    }

    // Text ended exactly on a separator (or was empty): nothing left to process.
    if (tokenStart == text.length()) {
        return left;
    }

    // NOTE(review): unlike tokens inside the loop, the trailing token is not length-checked.
    final String tail = text.substring(tokenStart).toLowerCase();
    if (StopWordsHelper.INSTANCE.isStopWord(tail)) {
        return left;
    }
    if (left != null) {
        countWord(bigram, left + " " + tail);
    }
    // The trailing token is always carried forward, since no separator follows it in this chunk.
    return tail;
}
/**
 * Increments the occurrence counter of {@code collocation} in {@code map}.
 *
 * <p>Nothing is recorded when {@code StopWordsHelper} reports the whole string as a stop word.
 * A key that is absent (or mapped to {@code null}) starts at 1.
 *
 * @param map         counter map updated in place
 * @param collocation word or space-joined word sequence to count
 */
private void countWord(Map<String, Integer> map, String collocation) {
    if (StopWordsHelper.INSTANCE.isStopWord(collocation)) {
        return;
    }
    final Integer previous = map.get(collocation);
    final int updated = (previous == null) ? 1 : previous + 1;
    map.put(collocation, updated);
}
/**
 * Splits {@code text} on {@code WORD_SPLITTING_REGEX} and tallies each lower-cased token in
 * {@code unigram}, skipping empty tokens and stop words.
 *
 * @param unigram counter map updated in place
 * @param text    chunk of text to scan
 */
private void unigramParsing(Map<String, Integer> unigram, String text) {
    for (String word : text.split(WORD_SPLITTING_REGEX)) {
        // String.split yields "" for empty input, a leading separator, or adjacent separators;
        // such tokens are not words and must not be counted.
        if (word.isEmpty()) {
            continue;
        }
        String lowerCaseWord = word.toLowerCase();
        if (!StopWordsHelper.INSTANCE.isStopWord(lowerCaseWord)) {
            countWord(unigram, lowerCaseWord);
        }
    }
}