/**
 * Adds overlapping bigrams of the input string to the token map. Alphanumeric
 * runs are kept intact as single tokens; all other characters are emitted as
 * two-character bigrams. The first token shares the position of the preceding
 * stem (increment 0); each later token advances the position by one.
 *
 * @return the position of the last token added
 */
private int addBiagramToMap(
    String input, int startoffset, Map<String, KoreanToken> map, int position) {
  int offset = 0;
  int strlen = input.length();
  if (strlen < 2) return position;

  while (offset < strlen - 1) {
    int inc = offset == 0 ? 0 : 1;
    if (isAlphaNumChar(input.charAt(offset))) {
      // keep an alphanumeric run as one token instead of splitting it into bigrams
      String text = findAlphaNumeric(input.substring(offset));
      map.put(position + ":" + text, new KoreanToken(text, startoffset + offset, inc));
      offset += text.length();
    } else {
      // emit a two-character bigram (or the remainder if fewer characters are left)
      String text = input.substring(offset, Math.min(offset + 2, strlen));
      map.put(position + ":" + text, new KoreanToken(text, startoffset + offset, inc));
      offset++;
    }
    position += 1;
  }

  return position - 1;
}
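/*
 * A minimal worked example of the bigramming above (illustrative values, not
 * from the original source): an out-of-dictionary string "한국어" passed in at
 * stream offset 10 and position 3 puts two entries into the map,
 *
 *   "3:한국" -> KoreanToken("한국", 10, 0)   // inc 0: stacked on the stem's position
 *   "4:국어" -> KoreanToken("국어", 11, 1)   // inc 1: advances one position
 *
 * and returns 4, the position of the last bigram emitted.
 */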
/**
 * Extracts index keywords from the analysis outputs into the token map.
 * Nouns and their compound-noun members are indexed; verbs are skipped.
 * In query mode only the first stem (and its derived words) is extracted.
 */
private void extractKeyword(
    List<AnalysisOutput> outputs, int startoffset, Map<String, KoreanToken> map, int position) {
  int maxDecompounds = 0;
  int maxStem = 0;

  for (AnalysisOutput output : outputs) {
    if (queryMode && hasOrigin
        && output.getScore() == AnalysisOutput.SCORE_ANALYSIS
        && output.getCNounList().size() < 2) break;

    if (output.getPos() == PatternConstants.POS_VERB) continue; // extract keywords only from nouns
    if (!originCNoun && output.getCNounList().size() > 0) continue; // skip compound nouns

    int inc = map.size() > 0 ? 0 : 1;
    map.put(position + ":" + output.getStem(),
        new KoreanToken(output.getStem(), startoffset, inc));

    if (output.getStem().length() > maxStem) maxStem = output.getStem().length();
    if (output.getCNounList().size() > maxDecompounds) maxDecompounds = output.getCNounList().size();

    // extract only the first stem as the keyword for query processing
    if (queryMode) break;
  }

  if (maxDecompounds > 1) {
    for (int i = 0; i < maxDecompounds; i++) {
      position += i; // positions advance cumulatively as i grows
      int cPosition = position;

      for (AnalysisOutput output : outputs) {
        if (output.getPos() == PatternConstants.POS_VERB
            || output.getCNounList().size() <= i) continue;

        CompoundEntry cEntry = output.getCNounList().get(i);
        int cStartoffset = getStartOffset(output, i) + startoffset;
        int inc = i == 0 ? 0 : 1;
        map.put(cPosition + ":" + cEntry.getWord(),
            new KoreanToken(cEntry.getWord(), cStartoffset, inc));

        // bigram compound members that are not in the dictionary
        if (bigrammable && !cEntry.isExist())
          cPosition = addBiagramToMap(cEntry.getWord(), cStartoffset, map, cPosition);

        // extract only words derived from the first stem for query processing
        if (queryMode) break;
      }
    }
  } else {
    for (AnalysisOutput output : outputs) {
      if (output.getPos() == PatternConstants.POS_VERB) continue;

      // bigram low-confidence stems that were not recognized as compounds
      if (bigrammable && output.getScore() < AnalysisOutput.SCORE_COMPOUNDS)
        addBiagramToMap(output.getStem(), startoffset, map, position);
    }
  }
}
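/*
 * A sketch of the resulting map for a hypothetical compound noun (assuming a
 * single noun analysis of "정보검색" that decompounds into "정보" + "검색",
 * both dictionary words, with startoffset 0, position 0, and an initially
 * empty map; the offsets assume getStartOffset returns each member's
 * character offset within the stem):
 *
 *   "0:정보검색" -> KoreanToken("정보검색", 0, 1)   // the full stem
 *   "0:정보"     -> KoreanToken("정보", 0, 0)       // first member, same position
 *   "1:검색"     -> KoreanToken("검색", 2, 1)       // second member, next position
 *
 * Tokens sharing a position (increment 0) are indexed like synonyms, so both
 * the compound and its members are searchable at the same spot in the stream.
 */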
/**
 * Analyzes Korean text and queues the resulting tokens.
 *
 * @throws MorphException if morphological analysis fails
 */
private void analysisKorean(String input) throws MorphException {
  input = trimHangul(input);
  List<AnalysisOutput> outputs = morph.analyze(input);
  if (outputs.size() == 0) return;

  Map<String, KoreanToken> map = new LinkedHashMap<String, KoreanToken>();
  if (hasOrigin) map.put("0:" + input, new KoreanToken(input, offsetAtt.startOffset()));

  extractKeyword(outputs, offsetAtt.startOffset(), map, 0);
  // (an earlier, now-disabled variant re-analyzed low-scoring inputs with the
  // whitespace analyzer to correct word-spacing errors before extracting keywords)

  for (KoreanToken kt : map.values()) {
    kt.setOutputs(outputs);
  }
  morphQueue.addAll(map.values());
}
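/*
 * How the pieces fit together (a sketch; incrementToken() is not shown here,
 * so its exact behavior is an assumption): each call analyzes one Hangul
 * chunk, extractKeyword() fills the position-keyed map with stems, compound
 * members, and bigrams, and the tokens are queued on morphQueue, from which
 * the filter presumably emits them one per incrementToken() call, using each
 * KoreanToken's position increment to stack overlapping tokens correctly.
 */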