@Override
public boolean incrementToken() throws IOException {
  // Drain tokens queued by a previous Korean analysis before pulling new input.
  if (!morphQueue.isEmpty()) {
    restoreState(currentState);
    setAttributesFromQueue(false);
    return true;
  }

  while (input.incrementToken()) {
    final String type = typeAtt.type();
    if (KOREAN_TYPE.equals(type)) {
      try {
        analysisKorean(termAtt.toString());
      } catch (MorphException e) {
        throw new RuntimeException(e);
      }
    } else {
      return true; // pass anything else through
    }

    if (!morphQueue.isEmpty()) {
      setAttributesFromQueue(true);
      return true;
    }
  }

  return false;
}
private void setAttributesFromQueue(boolean isFirst) {
  final KoreanToken iw = morphQueue.removeFirst();
  if (isFirst && !morphQueue.isEmpty()) {
    // our queue has more elements remaining (e.g. we decompounded);
    // capture state for those. We set the term attribute to be empty
    // so we save lots of array copying later.
    termAtt.setEmpty();
    currentState = captureState();
  }

  termAtt.setEmpty().append(iw.getTerm());
  offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + iw.getLength());
  morphAtt.setToken(iw);

  // on the first token we preserve the incoming position increment:
  if (!isFirst) {
    posIncrAtt.setPositionIncrement(iw.getPosInc());
  }

  // TODO: How to handle PositionLengthAttribute correctly?
}
/**
 * Analyzes Korean text and queues the resulting morpheme tokens.
 *
 * @param input the Korean text to analyze
 * @throws MorphException if morphological analysis fails
 */
private void analysisKorean(String input) throws MorphException {
  input = trimHangul(input);
  List<AnalysisOutput> outputs = morph.analyze(input);
  if (outputs.isEmpty()) return;

  Map<String, KoreanToken> map = new LinkedHashMap<String, KoreanToken>();
  if (hasOrigin) {
    map.put("0:" + input, new KoreanToken(input, offsetAtt.startOffset()));
  }

  extractKeyword(outputs, offsetAtt.startOffset(), map, 0);

  // if (outputs.get(0).getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
  //   extractKeyword(outputs, offsetAtt.startOffset(), map, 0);
  // } else {
  //   // check whether the input text has some insert spacing errors.
  //   List<AnalysisOutput> list = wsAnal.analyze(input);
  //   List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
  //   if (list.size() > 1 && wsAnal.getOutputScore(list) > AnalysisOutput.SCORE_ANALYSIS) {
  //     int offset = 0;
  //     for (AnalysisOutput o : list) {
  //       if (hasOrigin) map.put(o.getSource(), new Token(o.getSource(), offsetAtt.startOffset() + offset, 1));
  //       results.addAll(morph.analyze(o.getSource()));
  //       offset += o.getSource().length();
  //     }
  //   } else {
  //     results.addAll(outputs);
  //   }
  //   extractKeyword(results, offsetAtt.startOffset(), map, 0);
  // }

  // attach the full analysis results to every queued token
  Collection<KoreanToken> values = map.values();
  for (KoreanToken kt : values) {
    kt.setOutputs(outputs);
  }

  morphQueue.addAll(values);
}
@Override
public void reset() throws IOException {
  super.reset();
  morphQueue.clear();
  currentState = null;
}
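/*
 * A minimal consumption sketch, separate from the filter itself. The class name,
 * the field name "field", and the assumption that the passed-in Analyzer wires this
 * Korean filter into its chain are illustrative, not part of the source above.
 * It only uses the standard Lucene TokenStream contract to show how the queued
 * morphemes surface: after the first token of an analysis, further queued tokens
 * are returned on subsequent incrementToken() calls with the position increment
 * taken from KoreanToken.getPosInc() (see setAttributesFromQueue above).
 */
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class KoreanFilterUsageExample {

  /** Prints every token the analyzer emits for the given text. */
  public static void dumpTokens(Analyzer analyzer, String text) throws IOException {
    try (TokenStream ts = analyzer.tokenStream("field", text)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
      PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);

      ts.reset(); // mandatory before the first incrementToken()
      while (ts.incrementToken()) {
        System.out.printf("%s [%d-%d] posInc=%d%n",
            term.toString(), offset.startOffset(), offset.endOffset(),
            posInc.getPositionIncrement());
      }
      ts.end(); // records the final offset state
    }
  }
}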