private int validation( List<WordListCandidate> candiateList, String thisChar, int start, String inputText) { int newStart = -1; AnalysisOutput dividedOutput = null; boolean lastPos = true; for (int i = candiateList.size() - 1; i >= 0; i--) { WordListCandidate candidate = candiateList.get(i); AnalysisOutput output = candidate.getWordList().get(0).get(0); int tempStart = validWord(output, start, inputText, lastPos); lastPos = false; if (tempStart <= start) { newStart = tempStart; dividedOutput = output; break; } } // if here is a dividing point. if (newStart == start) removeInvalidCandidate(candiateList, dividedOutput); return newStart; }
/**
 * Checks whether the second word of the candidate is a correctly analyzed noun longer than one
 * character. Used when the leading fragment can be analyzed as a verb, to prevent divisions such
 * as "전복사고==>전^복사고".
 *
 * @param candidate the candidate whose second word is inspected
 * @param dividedOutput the dividing analysis output (not used by this method)
 * @param pos index of the candidate in its list (not used by this method)
 * @return {@code true} if the candidate has at least two words and the first analysis of the
 *     second word has a source longer than one character, pattern {@code PTN_N} and score {@code
 *     SCORE_CORRECT}; {@code false} otherwise
 */
private boolean includeNoun(WordListCandidate candidate, AnalysisOutput dividedOutput, int pos) {
  if (candidate.getWordList().size() <= 1) {
    return false;
  }

  AnalysisOutput following = candidate.getWordList().get(1).get(0);
  return following.getSource().length() > 1
      && following.getPatn() == PatternConstants.PTN_N
      && following.getScore() == AnalysisOutput.SCORE_CORRECT;
}
/**
 * Tells whether the candidate contains two adjacent words whose first analyses both have a
 * one-character stem.
 *
 * @param candidate the candidate to inspect
 * @return {@code true} as soon as such an adjacent pair is found, {@code false} otherwise
 */
private boolean hasConsecutiveOneWord(WordListCandidate candidate) {
  int wordCount = candidate.getWordList().size();
  int idx = 1;
  while (idx < wordCount) {
    List<AnalysisOutput> previous = candidate.getWordList().get(idx - 1);
    List<AnalysisOutput> current = candidate.getWordList().get(idx);

    boolean previousIsSingle = previous.get(0).getStem().length() == 1;
    if (previousIsSingle && current.get(0).getStem().length() == 1) {
      return true;
    }
    idx++;
  }
  return false;
}
/**
 * Drops every candidate that disagrees with the dividing output. A candidate is kept when the
 * source of its head analysis equals the source of {@code dividedOutput}, or when {@code
 * includeNoun} accepts it; all others are removed from the list.
 *
 * @param candiateList all candidate list; modified in place
 * @param dividedOutput the dividing analysis output
 */
private void removeInvalidCandidate(
    List<WordListCandidate> candiateList, AnalysisOutput dividedOutput) {
  List<WordListCandidate> invalid = new ArrayList<WordListCandidate>();

  int position = 0;
  for (WordListCandidate candidate : candiateList) {
    AnalysisOutput head = candidate.getWordList().get(0).get(0);
    boolean sameSource = head.getSource().equals(dividedOutput.getSource());
    boolean keep = sameSource || includeNoun(candidate, dividedOutput, position);
    if (!keep) {
      invalid.add(candidate);
    }
    position++;
  }

  // collect first, remove afterwards, so the list is not modified while iterating
  candiateList.removeAll(invalid);
}
/** * segment unsegmented sentence into words a input text which has less than 3 characters or more * than 10 character is not analyzed, because It seems to be no word segmentation error. * * @param inputText unsegmented sentence * @return segmented sentence into words * @throws MorphException */ public void analyze(String inputText, List<List<AnalysisOutput>> result, boolean containOneJosa) throws MorphException { List<WordListCandidate> candiateList = new ArrayList<WordListCandidate>(); List<AnalysisOutput> aoList = morphAnal.analyze(inputText); if (aoList.get(0).getScore() == AnalysisOutput.SCORE_CORRECT && !containOneJosa) { // valid morpheme result.add(aoList); return; } int length = inputText.length(); // add last character as the first candidate WordListCandidate listCandidate = new WordListCandidate(morphAnal.analyze(inputText.substring(length - 1, length))); candiateList.add(listCandidate); boolean divided = false; Map<String, List<AnalysisOutput>> analyzedSet = new HashMap<String, List<AnalysisOutput>>(); // from last position, check whether if each position can be a dividing point. for (int start = inputText.length() - 2; start >= 0; start--) { String thisChar = Character.toString(inputText.charAt(start)); List<WordListCandidate> newCandidates = null; // if(!divided) { // newly created candidates newCandidates = new ArrayList<WordListCandidate>(); for (WordListCandidate candidate : candiateList) { String fragment = thisChar + candidate.getFirstFragment(); // build the position key with the start position and the end position String posKey = new StringBuffer() .append(start) .append(",") .append(start + candidate.getFirstFragment().length()) .toString(); WordListCandidate newCandidate = candidate.newCopy(); List<AnalysisOutput> outputs = analyzedSet.get(posKey); // check whether if already analyzed. 
if (outputs == null) { outputs = morphAnal.analyze(fragment); newCandidate.replaceFirst(outputs); analyzedSet.put(posKey, outputs); } else { newCandidate.replaceFirst(outputs); } newCandidates.add(newCandidate); } // } List<AnalysisOutput> outputs = morphAnal.analyze(thisChar); String posKey = new StringBuffer().append(start).append(",").append(start + 1).toString(); analyzedSet.put(posKey, outputs); for (WordListCandidate candidate : candiateList) { candidate.addWord(outputs); } if (newCandidates != null) candiateList.addAll(newCandidates); if (candiateList.size() >= maxCandidate) { Collections.sort(candiateList, new WordListComparator()); removeLast(candiateList, adjustNoOfCandidate); } // int newStart = validation(candiateList, thisChar, start, inputText); // // divided = (newStart==start); // if(divided) start = newStart; } Collections.sort(candiateList, new WordListComparator()); // List<AnalysisOutput> result = new ArrayList<AnalysisOutput>(); for (WordListCandidate candidate : candiateList) { if (candiateList.indexOf(candidate) != candiateList.size() - 1 && hasConsecutiveOneWord(candidate)) continue; for (List<AnalysisOutput> outputs : candidate.getWordList()) { result.add(outputs); } break; } }