/**
 * Returns the percentage of entries in the compound-noun list of {@code o} for which
 * {@code isExist()} is true.
 *
 * <p>The percentage is computed with integer arithmetic, so the fractional part is truncated
 * (for example 2 of 3 yields {@code 66.0}); this is kept so existing callers see the same values
 * as before.
 *
 * @param o the analysis output whose compound-noun list is inspected
 * @return the truncated percentage in the range 0..100, or {@code 0} when the list is empty
 */
public static double countFoundNouns(AnalysisOutput o) {
  int total = o.getCNounList().size();
  if (total == 0) {
    // Nothing to count; without this guard the integer division below would throw
    // ArithmeticException (division by zero).
    return 0;
  }

  int count = 0;
  for (int i = 0; i < total; i++) {
    if (o.getCNounList().get(i).isExist()) {
      count++;
    }
  }
  return (count * 100) / total;
}
/**
 * Returns the start offset of a decompound entry, relative to the beginning of the compound.
 *
 * <p>The offset is the sum of the word lengths of all entries that precede {@code index} in the
 * compound-noun list of {@code output}.
 *
 * @param output morphological analysis output holding the compound-noun list
 * @param index the index of the decompound entry whose offset is wanted
 * @return the summed length of the words at positions {@code 0 .. index - 1}
 */
private int getStartOffset(AnalysisOutput output, int index) {
  int offset = 0;
  int entry = 0;
  while (entry < index) {
    String word = output.getCNounList().get(entry).getWord();
    offset += word.length();
    entry++;
  }
  return offset;
}
/**
 * Extracts candidate unit nouns that start at the beginning of a compound-noun fragment.
 *
 * <p>Candidates are collected from shorter to longer prefixes, so the last element of the
 * returned list is the longest word found.
 *
 * <p>Special cases handled before the prefix scan:
 *
 * <ul>
 *   <li>Two characters where the first is a known suffix and the second is in {@code DNouns}:
 *       the last character is removed from the stem of {@code o}, the second character is
 *       stored as its noun suffix, and a single entry for the first character is returned.
 *   <li>Two characters where the first is a known suffix and the second is a josa: returns
 *       {@code null}.
 *   <li>{@code pos >= 2} and the whole fragment is a josa: returns {@code null}.
 *   <li>One character that is a known suffix or is in {@code DNouns}: returned as the only
 *       entry.
 * </ul>
 *
 * @param str the compound-noun fragment to split
 * @param pos the analysing start point
 * @param o the analysis output; its stem and noun suffix may be modified (see above)
 * @return the list of unit nouns found (possibly empty), or {@code null} in the josa cases
 * @throws MorphException if a dictionary lookup fails
 */
private static List<WordEntry> findNouns(String str, int pos, AnalysisOutput o)
    throws MorphException {
  List<WordEntry> found = new ArrayList<WordEntry>();
  int len = str.length();

  if (len == 2 && DictionaryUtil.existSuffix(str.substring(0, 1))) {
    String head = str.substring(0, 1);
    String tail = str.substring(1);
    if (DNouns.contains(tail)) {
      String stem = o.getStem();
      o.setStem(stem.substring(0, stem.length() - 1));
      o.setNsfx(tail);
      found.add(new WordEntry(head));
      return found;
    }
    if (DictionaryUtil.existJosa(tail)) {
      return null;
    }
  }

  if (pos >= 2 && DictionaryUtil.existJosa(str)) {
    return null;
  }

  if (len == 1 && (DictionaryUtil.existSuffix(str) || DNouns.contains(str))) {
    found.add(new WordEntry(str));
    return found;
  }

  // Try every prefix of length 2..len; stop as soon as no dictionary word starts with it.
  for (int end = 2; end <= len; end++) {
    String prefix = str.substring(0, end);
    if (!DictionaryUtil.findWithPrefix(prefix).hasNext()) {
      break;
    }
    WordEntry entry = DictionaryUtil.getAllNoun(prefix);
    if (entry != null) {
      found.add(entry);
    }
  }
  return found;
}
/**
 * Checks whether the last syllable of the stem is one of the noun-forming suffixes held in
 * {@code DNouns} and, if so, splits it off.
 *
 * <p>When the last syllable matches, it is stored as the noun suffix of {@code output} and the
 * stem is shortened by one character. If the shortened stem is found by
 * {@code DictionaryUtil.getAllNoun}, the compound-noun list is set (to the entry's compounds when
 * its {@code IDX_NOUN} feature is {@code '2'}, otherwise to an empty list) and the score becomes
 * {@code SCORE_CORRECT}. The method returns {@code true} after a suffix match even when the
 * shortened stem is not in the dictionary.
 *
 * @param output the analysis output to inspect and update
 * @return {@code true} if the last syllable is in {@code DNouns}; {@code false} otherwise,
 *     including when the stem is empty
 * @throws MorphException if the dictionary lookup fails
 */
public static boolean confirmDNoun(AnalysisOutput output) throws MorphException {
  String stem = output.getStem();
  int strlen = stem.length();
  if (strlen == 0) {
    // An empty stem has no last syllable; substring(-1) would throw.
    return false;
  }

  String d = stem.substring(strlen - 1);
  if (!DNouns.contains(d)) {
    return false;
  }

  String s = stem.substring(0, strlen - 1);
  output.setNsfx(d);
  output.setStem(s);

  WordEntry cnoun = DictionaryUtil.getAllNoun(s);
  if (cnoun != null) {
    if (cnoun.getFeature(WordEntry.IDX_NOUN) == '2') {
      output.setCNoun(cnoun.getCompounds());
    } else {
      output.setCNoun(Collections.EMPTY_LIST);
    }
    output.setScore(AnalysisOutput.SCORE_CORRECT);
  }
  return true;
}
/**
 * Analyzes the pattern predicate + '음/기' + josa (PTN_VMJ).
 *
 * <p>The stem and the first ending of {@code o} are first passed to
 * {@code IrregularUtil.restoreIrregularVerb}; when that returns a result, the stem and the
 * first ending are replaced with it. If the resulting stem is found by
 * {@code DictionaryUtil.getVerb}, {@code o} is marked as a correctly analyzed verb and appended
 * to {@code candidates}.
 *
 * @param o the analyzed output; modified in place
 * @param candidates the list that receives {@code o} on success
 * @return {@code true} if {@code o} was added to {@code candidates}
 * @throws MorphException if a lookup fails
 */
public static boolean analysisVMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
    throws MorphException {
  String[] restored = IrregularUtil.restoreIrregularVerb(o.getStem(), o.getElist().get(0));
  if (restored != null) {
    o.setStem(restored[0]);
    o.setElist(restored[1], 0);
  }

  if (DictionaryUtil.getVerb(o.getStem()) == null) {
    return false;
  }

  o.setPatn(PatternConstants.PTN_VMJ);
  o.setPos(PatternConstants.POS_VERB);
  o.setScore(AnalysisOutput.SCORE_CORRECT);
  candidates.add(o);
  return true;
}
/**
 * Handles a stem that ends in '음' or '기'.
 *
 * <p>The stem qualifies when its last character is '기', or when the array returned by
 * {@code MorphUtil.decompose} for the last character has three elements and the third is 'ㅁ'
 * (presumably the final consonant). A trailing '기' or '음' is cut off and passed as the ending
 * to {@code EomiUtil.splitEomi}; any other qualifying stem is passed whole with an empty ending.
 * After the split, clones of {@code o} are tried against {@code analysisVMJ},
 * {@code analysisNSMJ} and {@code analysisVMXMJ} in that order; if none succeeds, {@code o}
 * itself is accepted when its stem is found by {@code DictionaryUtil.getVerb}.
 *
 * @param o the analyzed output; its stem, ending list and pre-final ending are modified
 * @param candidates the list that receives successful analyses
 * @return {@code true} if a candidate was added
 * @throws MorphException if a lookup fails or {@code o} cannot be cloned
 */
public static boolean analysisMJ(AnalysisOutput o, List<AnalysisOutput> candidates)
    throws MorphException {
  String stem = o.getStem();
  int len = stem.length();
  if (len < 2) {
    return false;
  }

  char last = stem.charAt(len - 1);
  char[] jamo = MorphUtil.decompose(last);
  boolean endsWithGi = last == '기';
  if (!(endsWithGi || (jamo.length == 3 && jamo[2] == 'ㅁ'))) {
    return false;
  }

  String start;
  String end;
  if (endsWithGi) {
    start = stem.substring(0, len - 1);
    end = "기";
  } else if (last == '음') {
    start = stem.substring(0, len - 1);
    end = "음";
  } else {
    // Last syllable merely carries ㅁ as its third element: split the whole stem.
    start = stem;
    end = "";
  }

  String[] eomis = EomiUtil.splitEomi(start, end);
  if (eomis[0] == null) {
    return false;
  }

  String[] pomis = EomiUtil.splitPomi(eomis[0]);
  o.setStem(pomis[0]);
  o.addElist(eomis[1]);
  o.setPomi(pomis[1]);

  try {
    // Each attempt works on its own clone so a failed attempt leaves o untouched.
    if (analysisVMJ(o.clone(), candidates)
        || analysisNSMJ(o.clone(), candidates)
        || analysisVMXMJ(o.clone(), candidates)) {
      return true;
    }
  } catch (CloneNotSupportedException e) {
    throw new MorphException(e.getMessage(), e);
  }

  if (DictionaryUtil.getVerb(o.getStem()) == null) {
    return false;
  }

  o.setPos(PatternConstants.POS_VERB);
  o.setPatn(PatternConstants.PTN_VMJ);
  o.setScore(AnalysisOutput.SCORE_CORRECT);
  candidates.add(o);
  return true;
}
public static boolean analysisNSMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int idxVbSfix = VerbUtil.endsWithVerbSuffix(o.getStem()); if (idxVbSfix == -1) return false; o.setVsfx(o.getStem().substring(idxVbSfix)); o.setStem(o.getStem().substring(0, idxVbSfix)); o.setPatn(PatternConstants.PTN_NSMXMJ); o.setPos(PatternConstants.POS_NOUN); WordEntry entry = DictionaryUtil.getWordExceptVerb(o.getStem()); if (entry != null) { if (entry.getFeature(WordEntry.IDX_NOUN) == '0') return false; else if (o.getVsfx().equals("하") && entry.getFeature(WordEntry.IDX_DOV) != '1') return false; else if (o.getVsfx().equals("되") && entry.getFeature(WordEntry.IDX_BEV) != '1') return false; else if (o.getVsfx().equals("내") && entry.getFeature(WordEntry.IDX_NE) != '1') return false; o.setScore(AnalysisOutput.SCORE_CORRECT); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } else { o.setScore(AnalysisOutput.SCORE_ANALYSIS); // '입니다'인 경우 인명 등 미등록어가 많이 발생되므로 분석성공으로 가정한다. } candidates.add(o); return true; }
/** * 용언 + '아/어' + 보조용언 + '음/기' + 조사(PTN_VMXMJ) * * @param o the analyzed output * @param candidates candidates * @throws MorphException throw exception */ public static boolean analysisVMXMJ(AnalysisOutput o, List<AnalysisOutput> candidates) throws MorphException { int idxXVerb = VerbUtil.endsWithXVerb(o.getStem()); if (idxXVerb != -1) { // 2. 사랑받아보다 String eogan = o.getStem().substring(0, idxXVerb); o.setXverb(o.getStem().substring(idxXVerb)); String[] stomis = null; if (eogan.endsWith("아") || eogan.endsWith("어")) stomis = EomiUtil.splitEomi( eogan.substring(0, eogan.length() - 1), eogan.substring(eogan.length() - 1)); else stomis = EomiUtil.splitEomi(eogan, ""); if (stomis[0] == null) return false; String[] irrs = IrregularUtil.restoreIrregularVerb(stomis[0], stomis[1]); if (irrs != null) { o.setStem(irrs[0]); o.addElist(irrs[1]); } else { o.setStem(stomis[0]); o.addElist(stomis[1]); } if (DictionaryUtil.getVerb(o.getStem()) != null) { o.setPatn(PatternConstants.PTN_VMXMJ); o.setPos(PatternConstants.POS_VERB); o.setScore(AnalysisOutput.SCORE_CORRECT); candidates.add(o); return true; } else if (analysisNSMXMJ(o, candidates)) { return true; } } return false; }
private void extractKeyword( List<AnalysisOutput> outputs, int startoffset, Map<String, KoreanToken> map, int position) { int maxDecompounds = 0; int maxStem = 0; for (AnalysisOutput output : outputs) { if (queryMode && hasOrigin && output.getScore() == AnalysisOutput.SCORE_ANALYSIS && output.getCNounList().size() < 2) break; if (output.getPos() == PatternConstants.POS_VERB) continue; // extract keywords from only noun if (!originCNoun && output.getCNounList().size() > 0) continue; // except compound nound int inc = map.size() > 0 ? 0 : 1; map.put( position + ":" + output.getStem(), new KoreanToken(output.getStem(), startoffset, inc)); if (output.getStem().length() > maxStem) maxStem = output.getStem().length(); if (output.getCNounList().size() > maxDecompounds) maxDecompounds = output.getCNounList().size(); // extract the first stem as the keyword for the query processing if (queryMode) break; } if (maxDecompounds > 1) { for (int i = 0; i < maxDecompounds; i++) { position += i; int cPosition = position; for (AnalysisOutput output : outputs) { if (output.getPos() == PatternConstants.POS_VERB || output.getCNounList().size() <= i) continue; CompoundEntry cEntry = output.getCNounList().get(i); int cStartoffset = getStartOffset(output, i) + startoffset; int inc = i == 0 ? 0 : 1; map.put( (cPosition) + ":" + cEntry.getWord(), new KoreanToken(cEntry.getWord(), cStartoffset, inc)); if (bigrammable && !cEntry.isExist()) cPosition = addBiagramToMap(cEntry.getWord(), cStartoffset, map, cPosition); // extract the words derived from the first stem as the keyword for the query processing if (queryMode) break; } } } else { for (AnalysisOutput output : outputs) { if (output.getPos() == PatternConstants.POS_VERB) continue; if (bigrammable && output.getScore() < AnalysisOutput.SCORE_COMPOUNDS) addBiagramToMap(output.getStem(), startoffset, map, position); } } }