Java AnalysisOutput Exemples, org.apache.lucene.analysis.ko.morph.AnalysisOutput Java Exemples

Exemple #1

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

 /**
  * Computes the percentage of entries in the compound-noun list whose {@code isExist()} flag is
  * set.
  *
  * <p>The percentage is calculated with integer arithmetic, so the fractional part is truncated
  * before the value is widened to {@code double} (2 of 3 yields {@code 66.0}). This is kept
  * as-is so existing callers see the same values.
  *
  * @param o analysis output whose compound-noun list is inspected
  * @return the truncated percentage of existing entries, or {@code 0} when the list is empty
  */
 public static double countFoundNouns(AnalysisOutput o) {
   int total = o.getCNounList().size();
   // Guard: an empty list would otherwise trigger an integer division by zero.
   if (total == 0) {
     return 0;
   }

   int found = 0;
   for (int i = 0; i < total; i++) {
     if (o.getCNounList().get(i).isExist()) {
       found++;
     }
   }
   return (found * 100) / total;
 }

Exemple #2

0

Afficher le fichier

Fichier : KoreanFilter.java Projet : skyer9/arirang.lucene-analyzer-v4

 /**
  * Returns the start offset of a decompound entry, measured from the first entry of the
  * compound-noun list.
  *
  * <p>The offset is the sum of the word lengths of every entry that precedes {@code index}.
  *
  * @param output morphological analysis output
  * @param index the index of the current decompound entry
  * @return the summed word length of entries {@code 0 .. index - 1}; {@code 0} when no entry
  *     precedes {@code index}
  */
 private int getStartOffset(AnalysisOutput output, int index) {
   int offset = 0;
   int cursor = 0;
   while (cursor < index) {
     offset += output.getCNounList().get(cursor).getWord().length();
     cursor++;
   }
   return offset;
 }

Exemple #3

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Separates unit nouns from a compound noun. The word located last in the returned list is the
   * longest one.
   *
   * @param str the compound noun
   * @param pos the analysing start point
   * @param o the analysis result; when a two-character input is "suffix + DNouns entry" its stem
   *     loses its last character and its noun suffix is set
   * @return the list of unit nouns found as prefixes of {@code str} (possibly empty), or {@code
   *     null} when the input is rejected because it is, or ends in, a josa
   * @throws MorphException throw exception
   */
  private static List<WordEntry> findNouns(String str, int pos, AnalysisOutput o)
      throws MorphException {

    List<WordEntry> found = new ArrayList<WordEntry>();
    final int length = str.length();

    // Two-character input whose first character is a known suffix.
    if (length == 2 && DictionaryUtil.existSuffix(str.substring(0, 1))) {
      String head = str.substring(0, 1);
      String tail = str.substring(1);

      if (DNouns.contains(tail)) {
        // Move the trailing character from the stem into the noun suffix.
        String stem = o.getStem();
        o.setStem(stem.substring(0, stem.length() - 1));
        o.setNsfx(tail);
        found.add(new WordEntry(head));
        return found;
      }
      if (DictionaryUtil.existJosa(tail)) {
        return null;
      }
    }

    if (pos >= 2 && DictionaryUtil.existJosa(str)) {
      return null;
    }

    // Single-character input: accept it as-is when it is a suffix or a DNouns entry.
    if (length == 1 && (DictionaryUtil.existSuffix(str) || DNouns.contains(str))) {
      found.add(new WordEntry(str));
      return found;
    }

    // Try prefixes of growing length (2, 3, ...) until no dictionary word starts with the prefix.
    for (int end = 2; end <= length; end++) {
      String prefix = str.substring(0, end);
      if (!DictionaryUtil.findWithPrefix(prefix).hasNext()) {
        break;
      }
      WordEntry entry = DictionaryUtil.getAllNoun(prefix);
      if (entry != null) {
        found.add(entry);
      }
    }

    return found;
  }

Exemple #4

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Checks whether the last syllable of the stem is a noun-forming suffix (e.g. 등, 상) listed in
   * {@code DNouns}.
   *
   * <p>When it is, the suffix is moved out of the stem into the noun suffix of {@code output}. If
   * the remaining stem is found by {@code DictionaryUtil.getAllNoun}, the compound-noun list and
   * the score of {@code output} are updated as well.
   *
   * @param output the analysis output to inspect and, on success, to modify
   * @return {@code true} when the last syllable is in {@code DNouns}; {@code false} otherwise,
   *     including when the stem is empty
   * @throws MorphException if a dictionary lookup fails
   */
  public static boolean confirmDNoun(AnalysisOutput output) throws MorphException {

    String stem = output.getStem();
    int strlen = stem.length();
    // Guard: an empty stem has no last syllable, and substring(-1) would throw.
    if (strlen == 0) {
      return false;
    }

    String d = stem.substring(strlen - 1);
    if (!DNouns.contains(d)) {
      return false;
    }

    String s = stem.substring(0, strlen - 1);
    output.setNsfx(d);
    output.setStem(s);

    WordEntry cnoun = DictionaryUtil.getAllNoun(s);
    if (cnoun != null) {
      // NOTE(review): feature value '2' presumably marks a compound noun - confirm against
      // WordEntry.
      if (cnoun.getFeature(WordEntry.IDX_NOUN) == '2') {
        output.setCNoun(cnoun.getCompounds());
      } else {
        output.setCNoun(Collections.EMPTY_LIST);
      }
      output.setScore(AnalysisOutput.SCORE_CORRECT);
    }

    return true;
  }

Exemple #5

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Analyses the pattern verb + '음/기' + josa (PTN_VMJ).
   *
   * <p>The stem and the first ending are first passed through irregular-verb restoration; if that
   * yields a result, the stem and the first ending of {@code o} are replaced. When the resulting
   * stem is found in the verb dictionary, {@code o} is marked as a correctly analysed verb and
   * appended to {@code candidates}.
   *
   * @param o the analyzed output
   * @param candidates candidates
   * @return {@code true} when {@code o} was added to {@code candidates}, {@code false} otherwise
   * @throws MorphException throw exception
   */
  public static boolean analysisVMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
      throws MorphException {

    String[] restored = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getElist().get(0));
    if (restored != null) {
      o.setStem(restored[0]);
      o.setElist(restored[1], 0);
    }

    boolean knownVerb = DictionaryUtil.getVerb(o.getStem()) != null;
    if (!knownVerb) {
      return false;
    }

    o.setPatn(PatternConstants.PTN_VMJ);
    o.setPos(PatternConstants.POS_VERB);
    o.setScore(AnalysisOutput.SCORE_CORRECT);
    candidates.add(o);
    return true;
  }

Exemple #6

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Handles the case where the stem part ends with '음' or '기' (or with a syllable whose final
   * consonant is 'ㅁ').
   *
   * <p>The stem is split into stem / ending / pre-final ending and stored back into {@code o}.
   * Clones of {@code o} are then tried against the VMJ, NSMJ and VMXMJ patterns in that order; if
   * none matches, {@code o} itself is accepted when its stem is a dictionary verb.
   *
   * <p>Note that {@code o} is modified (stem, ending list, pre-final ending) once the split
   * succeeds, even if the method finally returns {@code false}.
   *
   * @param o the analyzed output
   * @param candidates candidates
   * @return {@code true} when a candidate was added, {@code false} otherwise
   * @throws MorphException throw exception, also when cloning {@code o} is not supported
   */
  public static boolean analysisMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
      throws MorphException {

    String stem = o.getStem();
    int strlen = stem.length();

    if (strlen < 2) {
      return false;
    }

    char last = stem.charAt(strlen - 1);
    char[] chrs = MorphUtil.decompose(last);

    // Only stems ending in '기' or in a syllable with final consonant 'ㅁ' qualify.
    if (last != '기' && !(chrs.length == 3 && chrs[2] == 'ㅁ')) {
      return false;
    }

    // For '기' / '음' the last syllable itself is the ending; otherwise the whole stem is passed
    // on with an empty ending.
    String start = stem;
    String end = "";
    if (last == '기') {
      start = stem.substring(0, strlen - 1);
      end = "기";
    } else if (last == '음') {
      start = stem.substring(0, strlen - 1);
      end = "음";
    }

    String[] eomis = EomiUtil.splitEomi(start, end);
    if (eomis[0] == null) {
      return false;
    }
    String[] pomis = EomiUtil.splitPomi(eomis[0]);
    o.setStem(pomis[0]);
    o.addElist(eomis[1]);
    o.setPomi(pomis[1]);

    try {
      // Each pattern gets its own clone so a failed attempt cannot disturb the next one.
      if (analysisVMJ(o.clone(), candidates)) {
        return true;
      }
      if (analysisNSMJ(o.clone(), candidates)) {
        return true;
      }
      if (analysisVMXMJ(o.clone(), candidates)) {
        return true;
      }
    } catch (CloneNotSupportedException e) {
      throw new MorphException(e.getMessage(), e);
    }

    if (DictionaryUtil.getVerb(o.getStem()) != null) {
      o.setPos(PatternConstants.POS_VERB);
      o.setPatn(PatternConstants.PTN_VMJ);
      o.setScore(AnalysisOutput.SCORE_CORRECT);
      candidates.add(o);
      return true;
    }

    return false;
  }

Exemple #7

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Analyses a stem that ends with a verb suffix as pattern PTN_NSMXMJ.
   *
   * <p>The verb suffix is split off the stem and stored in {@code o}, which is then marked as a
   * noun. If the remaining stem is found by {@code DictionaryUtil.getWordExceptVerb}, the entry's
   * features must be compatible with the suffix ('하', '되', '내'); otherwise the analysis is
   * rejected. Unknown stems are still accepted, but with the lower {@code SCORE_ANALYSIS}.
   *
   * <p>Note that {@code o} has already been modified when the method rejects the analysis after
   * the dictionary lookup.
   *
   * @param o the analyzed output
   * @param candidates candidates
   * @return {@code true} when {@code o} was added to {@code candidates}, {@code false} otherwise
   * @throws MorphException throw exception
   */
  public static boolean analysisNSMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
      throws MorphException {

    final int suffixStart = VerbUtil.endsWithVerbSuffix(o.getStem());
    if (suffixStart == -1) {
      return false;
    }

    o.setVsfx(o.getStem().substring(suffixStart));
    o.setStem(o.getStem().substring(0, suffixStart));
    o.setPatn(PatternConstants.PTN_NSMXMJ);
    o.setPos(PatternConstants.POS_NOUN);

    WordEntry noun = DictionaryUtil.getWordExceptVerb(o.getStem());

    if (noun == null) {
      // Original note: in the case of '입니다', unregistered words such as personal names occur
      // frequently, so the analysis is assumed to have succeeded.
      // NOTE(review): the same note was attached to both score assignments - confirm which one it
      // belongs to.
      o.setScore(AnalysisOutput.SCORE_ANALYSIS);
    } else {
      if (noun.getFeature(WordEntry.IDX_NOUN) == '0') {
        return false;
      }
      if (o.getVsfx().equals("하") && noun.getFeature(WordEntry.IDX_DOV) != '1') {
        return false;
      }
      if (o.getVsfx().equals("되") && noun.getFeature(WordEntry.IDX_BEV) != '1') {
        return false;
      }
      if (o.getVsfx().equals("내") && noun.getFeature(WordEntry.IDX_NE) != '1') {
        return false;
      }
      o.setScore(AnalysisOutput.SCORE_CORRECT);
    }

    candidates.add(o);
    return true;
  }

Exemple #8

0

Afficher le fichier

Fichier : NounUtil.java Projet : jong03/arirang.lucene-analyzer-5.0.0

  /**
   * Analyses the pattern verb + '아/어' + auxiliary verb + '음/기' + josa (PTN_VMXMJ).
   *
   * <p>The auxiliary verb at the end of the stem is stored in {@code o}; what precedes it is
   * split into stem and ending, with irregular-verb restoration applied when possible. If the
   * resulting stem is a dictionary verb, {@code o} is accepted as PTN_VMXMJ; otherwise the
   * decision is delegated to {@link #analysisNSMXMJ}.
   *
   * @param o the analyzed output
   * @param candidates candidates
   * @return {@code true} when {@code o} was added to {@code candidates}, {@code false} otherwise
   * @throws MorphException throw exception
   */
  public static boolean analysisVMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
      throws MorphException {

    final int auxStart = VerbUtil.endsWithXVerb(o.getStem());
    if (auxStart == -1) {
      return false;
    }

    // Example from the original source: 사랑받아보다
    String eogan = o.getStem().substring(0, auxStart);
    o.setXverb(o.getStem().substring(auxStart));

    // A trailing '아' / '어' is handed to splitEomi as the ending; otherwise the ending is empty.
    boolean endsWithLinker = eogan.endsWith("아") || eogan.endsWith("어");
    String[] split =
        endsWithLinker
            ? EomiUtil.splitEomi(
                eogan.substring(0, eogan.length() - 1), eogan.substring(eogan.length() - 1))
            : EomiUtil.splitEomi(eogan, "");
    if (split[0] == null) {
      return false;
    }

    // Prefer the restored irregular form when there is one.
    String[] restored = IrregularUtil.restoreIrregularVerb(split[0], split[1]);
    String[] chosen = (restored != null) ? restored : split;
    o.setStem(chosen[0]);
    o.addElist(chosen[1]);

    if (DictionaryUtil.getVerb(o.getStem()) != null) {
      o.setPatn(PatternConstants.PTN_VMXMJ);
      o.setPos(PatternConstants.POS_VERB);
      o.setScore(AnalysisOutput.SCORE_CORRECT);
      candidates.add(o);
      return true;
    }

    return analysisNSMXMJ(o, candidates);
  }

Exemple #9

0

Afficher le fichier

Fichier : KoreanFilter.java Projet : skyer9/arirang.lucene-analyzer-v4

  /**
   * Collects keyword tokens from the analysis candidates of one word into {@code map}.
   *
   * <p>The method works in two stages. First, the stem of every accepted (non-verb) candidate is
   * stored under the key {@code position + ":" + stem}. Then, if any accepted candidate has more
   * than one decompound entry, the decompound entries of all non-verb candidates are added (plus
   * bigrams for entries whose {@code isExist()} is false when {@code bigrammable} is set);
   * otherwise bigrams of the stems of low-scoring candidates are added when {@code bigrammable}
   * is set.
   *
   * <p>Behaviour depends on the instance flags {@code queryMode}, {@code hasOrigin}, {@code
   * originCNoun} and {@code bigrammable}.
   *
   * @param outputs the analysis candidates of the current word
   * @param startoffset offset given to stem tokens and added to each decompound entry's relative
   *     offset
   * @param map destination map, keyed by {@code "<position>:<word>"}
   * @param position position used as the key prefix
   */
  private void extractKeyword(
      List<AnalysisOutput> outputs, int startoffset, Map<String, KoreanToken> map, int position) {

    int maxDecompounds = 0;
    // NOTE(review): maxStem is updated below but never read within this method.
    int maxStem = 0;

    for (AnalysisOutput output : outputs) {
      // In query mode with the origin kept, stop at the first candidate that only has an
      // analysis-level score and fewer than two decompound entries.
      if (queryMode
          && hasOrigin
          && output.getScore() == AnalysisOutput.SCORE_ANALYSIS
          && output.getCNounList().size() < 2) break;
      if (output.getPos() == PatternConstants.POS_VERB) continue; // extract keywords only from nouns
      if (!originCNoun && output.getCNounList().size() > 0) continue; // skip compound nouns
      // The first token put into the map gets 1, every later one 0.
      // NOTE(review): the third KoreanToken argument is presumably the position increment - confirm.
      int inc = map.size() > 0 ? 0 : 1;
      map.put(
          position + ":" + output.getStem(), new KoreanToken(output.getStem(), startoffset, inc));

      if (output.getStem().length() > maxStem) maxStem = output.getStem().length();
      if (output.getCNounList().size() > maxDecompounds)
        maxDecompounds = output.getCNounList().size();

      // extract the first stem as the keyword for the query processing
      if (queryMode) break;
    }

    if (maxDecompounds > 1) {
      for (int i = 0; i < maxDecompounds; i++) {
        // NOTE(review): position grows cumulatively (+0, +1, +2, ...) rather than base + i -
        // confirm this is intended.
        position += i;

        int cPosition = position;
        for (AnalysisOutput output : outputs) {
          // Skip verbs and candidates that have no i-th decompound entry.
          if (output.getPos() == PatternConstants.POS_VERB || output.getCNounList().size() <= i)
            continue;

          CompoundEntry cEntry = output.getCNounList().get(i);
          int cStartoffset = getStartOffset(output, i) + startoffset;
          // The first decompound entry gets 0, every later one 1.
          int inc = i == 0 ? 0 : 1;
          map.put(
              (cPosition) + ":" + cEntry.getWord(),
              new KoreanToken(cEntry.getWord(), cStartoffset, inc));

          // The value returned by addBiagramToMap becomes the key position for the next candidate.
          if (bigrammable && !cEntry.isExist())
            cPosition = addBiagramToMap(cEntry.getWord(), cStartoffset, map, cPosition);

          // extract the words derived from the first stem as the keyword for the query processing
          if (queryMode) break;
        }
      }
    } else {
      for (AnalysisOutput output : outputs) {
        if (output.getPos() == PatternConstants.POS_VERB) continue;

        // Only candidates scoring below SCORE_COMPOUNDS get bigrams of their stem.
        if (bigrammable && output.getScore() < AnalysisOutput.SCORE_COMPOUNDS)
          addBiagramToMap(output.getStem(), startoffset, map, position);
      }
    }
  }