コード例 #1
0
  /**
   * Walks the candidate list from its last entry to its first and asks {@code validWord} about
   * the first analysis output of each candidate's first word. The scan stops at the first
   * candidate for which {@code validWord} returns a value not greater than {@code start}.
   *
   * <p>When the value found equals {@code start}, the candidate list is pruned through {@code
   * removeInvalidCandidate} using the output that produced it.
   *
   * @param candiateList candidates to examine; may be pruned in place
   * @param thisChar not used by this method
   * @param start position being tested as a dividing point
   * @param inputText the text being segmented, forwarded to {@code validWord}
   * @return the value returned by {@code validWord} for the matching candidate, or -1 when no
   *     candidate matched
   */
  private int validation(
      List<WordListCandidate> candiateList, String thisChar, int start, String inputText) {

    int foundStart = -1;
    AnalysisOutput divider = null;

    // only the very first probe (the last list entry) is flagged as the last position
    boolean firstProbe = true;
    int idx = candiateList.size() - 1;
    while (idx >= 0) {
      AnalysisOutput head = candiateList.get(idx).getWordList().get(0).get(0);

      int probed = validWord(head, start, inputText, firstProbe);
      firstProbe = false;

      if (probed <= start) {
        foundStart = probed;
        divider = head;
        break;
      }
      idx--;
    }

    // start itself is a dividing point: drop the candidates that disagree with it
    if (foundStart == start) {
      removeInvalidCandidate(candiateList, divider);
    }

    return foundStart;
  }
コード例 #2
0
  /**
   * Checks whether the second word of a candidate looks like a confirmed noun, so that the
   * candidate is kept even though its first word differs from the dividing output. The original
   * note gives the motivation: avoid splits such as "전복사고==>전^복사고".
   *
   * <p>The check passes when the first analysis output of the candidate's second word has a
   * source longer than one character, the pattern {@code PatternConstants.PTN_N} and the score
   * {@code AnalysisOutput.SCORE_CORRECT}.
   *
   * @param candidate the candidate whose second word is inspected
   * @param dividedOutput the dividing analysis output (not used by this method)
   * @param pos index of the candidate in its list (not used by this method)
   * @return true when the second word satisfies all three conditions, false otherwise (including
   *     when the candidate has fewer than two words)
   */
  private boolean includeNoun(WordListCandidate candidate, AnalysisOutput dividedOutput, int pos) {

    // nothing to inspect unless there is a second word
    if (candidate.getWordList().size() <= 1) {
      return false;
    }

    AnalysisOutput second = candidate.getWordList().get(1).get(0);
    return second.getSource().length() > 1
        && second.getPatn() == PatternConstants.PTN_N
        && second.getScore() == AnalysisOutput.SCORE_CORRECT;
  }
コード例 #3
0
  /**
   * Tells whether the candidate contains two neighbouring words whose first analysis outputs both
   * have a stem of exactly one character.
   *
   * @param candidate the candidate whose word list is scanned
   * @return true as soon as such a neighbouring pair is found, false if there is none
   */
  private boolean hasConsecutiveOneWord(WordListCandidate candidate) {

    int wordCount = candidate.getWordList().size();
    for (int right = 1; right < wordCount; right++) {
      List<AnalysisOutput> leftWord = candidate.getWordList().get(right - 1);
      List<AnalysisOutput> rightWord = candidate.getWordList().get(right);

      // the right-hand word only matters when the left-hand stem is a single character
      if (leftWord.get(0).getStem().length() != 1) {
        continue;
      }
      if (rightWord.get(0).getStem().length() == 1) {
        return true;
      }
    }

    return false;
  }
コード例 #4
0
  /**
   * Removes from the list every candidate that is incompatible with the dividing output. A
   * candidate is kept when the source of its first word's first analysis output equals the
   * source of {@code dividedOutput}, or when {@code includeNoun} accepts it; all others are
   * removed.
   *
   * @param candiateList all candidates; modified in place
   * @param dividedOutput the dividing analysis output
   */
  private void removeInvalidCandidate(
      List<WordListCandidate> candiateList, AnalysisOutput dividedOutput) {

    List<WordListCandidate> rejected = new ArrayList<WordListCandidate>();

    int position = 0;
    for (WordListCandidate candidate : candiateList) {
      int index = position++;
      AnalysisOutput head = candidate.getWordList().get(0).get(0);

      boolean sameSource = head.getSource().equals(dividedOutput.getSource());
      if (sameSource || includeNoun(candidate, dividedOutput, index)) {
        continue;
      }
      rejected.add(candidate);
    }

    // removal is deferred so the list is not modified while it is being iterated
    candiateList.removeAll(rejected);
  }
コード例 #5
0
  /**
   * Segments an unsegmented text into words and appends the selected segmentation to {@code
   * result}.
   *
   * <p>If the whole text analyzes with {@code AnalysisOutput.SCORE_CORRECT} and {@code
   * containOneJosa} is false, that analysis is appended as a single word and nothing else is
   * done. Otherwise candidates are grown from the last character towards the first: at every
   * position each existing candidate is both copied with the character prepended to its first
   * fragment and extended (via {@code addWord}) with the character as a word of its own. Once the
   * list reaches {@code maxCandidate} entries it is sorted with {@code WordListComparator} and
   * trimmed through {@code removeLast}. Finally the list is sorted again and the first candidate
   * without two consecutive one-character stems is emitted; the last candidate is emitted
   * unconditionally if all earlier ones were skipped.
   *
   * <p>NOTE(review): earlier documentation stated that inputs shorter than 3 or longer than 10
   * characters are not analyzed. No such check exists in this method; presumably the caller
   * enforces it - confirm.
   *
   * @param inputText unsegmented text; must be non-empty because its last character is taken as
   *     the initial candidate
   * @param result receives one entry per word of the selected segmentation
   * @param containOneJosa when true, segmentation is attempted even if the whole text already
   *     analyzes as correct
   * @throws MorphException if the underlying morphological analysis fails (presumably raised by
   *     {@code morphAnal.analyze})
   */
  public void analyze(String inputText, List<List<AnalysisOutput>> result, boolean containOneJosa)
      throws MorphException {

    List<AnalysisOutput> aoList = morphAnal.analyze(inputText);
    if (aoList.get(0).getScore() == AnalysisOutput.SCORE_CORRECT
        && !containOneJosa) { // valid morpheme
      result.add(aoList);
      return;
    }

    List<WordListCandidate> candiateList = new ArrayList<WordListCandidate>();

    int length = inputText.length();
    // the last character alone is the first candidate
    candiateList.add(
        new WordListCandidate(morphAnal.analyze(inputText.substring(length - 1, length))));

    // analysis results keyed by "begin,end" (end exclusive) so a span is analyzed only once
    Map<String, List<AnalysisOutput>> analyzedSet = new HashMap<String, List<AnalysisOutput>>();

    // from the last position, try every position as a dividing point
    // (early division through validation(...) is currently not applied here)
    for (int start = length - 2; start >= 0; start--) {

      String thisChar = Character.toString(inputText.charAt(start));

      // candidates in which this character is merged into the first fragment
      List<WordListCandidate> newCandidates = new ArrayList<WordListCandidate>();
      for (WordListCandidate candidate : candiateList) {

        String fragment = thisChar + candidate.getFirstFragment();
        // exclusive end keeps this key distinct from the single-character key below
        String posKey = start + "," + (start + fragment.length());

        List<AnalysisOutput> outputs = analyzedSet.get(posKey);
        if (outputs == null) {
          outputs = morphAnal.analyze(fragment);
          analyzedSet.put(posKey, outputs);
        }

        WordListCandidate newCandidate = candidate.newCopy();
        newCandidate.replaceFirst(outputs);
        newCandidates.add(newCandidate);
      }

      // existing candidates take this character as a separate word
      List<AnalysisOutput> outputs = morphAnal.analyze(thisChar);
      analyzedSet.put(start + "," + (start + 1), outputs);
      for (WordListCandidate candidate : candiateList) {
        candidate.addWord(outputs);
      }

      candiateList.addAll(newCandidates);
      if (candiateList.size() >= maxCandidate) {
        Collections.sort(candiateList, new WordListComparator());
        removeLast(candiateList, adjustNoOfCandidate);
      }
    }

    Collections.sort(candiateList, new WordListComparator());

    int lastIndex = candiateList.size() - 1;
    for (int i = 0; i <= lastIndex; i++) {
      WordListCandidate candidate = candiateList.get(i);

      // skip candidates with consecutive one-character words, except the final fallback
      if (i != lastIndex && hasConsecutiveOneWord(candidate)) {
        continue;
      }

      result.addAll(candidate.getWordList());
      break;
    }
  }