/**
 * Renders every token the tokenizer produces as
 * term:type:partOfSpeech:semanticClass:posIncr:posLength:startOffset:endOffset,
 * one comma-terminated entry per token.
 */
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
  OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
  SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
  PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

  StringBuilder result = new StringBuilder();
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    result.append(new String(term.buffer(), 0, term.length())).append(":");
    result.append(type.type()).append(":");
    result.append(pos.partOfSpeech()).append(":");
    result.append(semanticClass.semanticClass()).append(":");
    result.append(posIncrAtt.getPositionIncrement()).append(":");
    result.append(posLengthAtt.getPositionLength()).append(":");
    result.append(extOffset.startOffset()).append(":");
    result.append(extOffset.endOffset());
    result.append(",");
  }
  tokenizer.end();
  return result.toString();
}
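// Usage sketch, hedged: it assumes the arirang KoreanTokenizer takes a
// Reader, as pre-4.x Lucene tokenizers did, and the sample text is
// illustrative only. Each token prints as
//   term:type:partOfSpeech:semanticClass:posIncr:posLength:start:end,
Tokenizer tokenizer = new KoreanTokenizer(new StringReader("한국어 형태소 분석"));
try {
  System.out.println(tokenizerToString(tokenizer));
} finally {
  tokenizer.close();
}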
/**
 * Analyzes Korean text: runs the morphological analyzer over the input,
 * extracts keywords into an insertion-ordered map, and queues the
 * resulting tokens for emission.
 *
 * @throws MorphException
 */
private void analysisKorean(String input) throws MorphException {
  input = trimHangul(input);
  List<AnalysisOutput> outputs = morph.analyze(input);
  if (outputs.isEmpty()) return;

  Map<String, KoreanToken> map = new LinkedHashMap<String, KoreanToken>();
  if (hasOrigin) map.put("0:" + input, new KoreanToken(input, offsetAtt.startOffset()));

  extractKeyword(outputs, offsetAtt.startOffset(), map, 0);

  // if (outputs.get(0).getScore() >= AnalysisOutput.SCORE_COMPOUNDS) {
  //   extractKeyword(outputs, offsetAtt.startOffset(), map, 0);
  //// } else {
  ////   // check whether the input text has some insert spacing errors.
  ////   List<AnalysisOutput> list = wsAnal.analyze(input);
  ////   List<AnalysisOutput> results = new ArrayList<AnalysisOutput>();
  ////   if (list.size() > 1 && wsAnal.getOutputScore(list) > AnalysisOutput.SCORE_ANALYSIS) {
  ////     int offset = 0;
  ////     for (AnalysisOutput o : list) {
  ////       if (hasOrigin) map.put(o.getSource(), new Token(o.getSource(), offsetAtt.startOffset() + offset, 1));
  ////       results.addAll(morph.analyze(o.getSource()));
  ////       offset += o.getSource().length();
  ////     }
  ////   } else {
  ////     results.addAll(outputs);
  ////   }
  ////   extractKeyword(results, offsetAtt.startOffset(), map, 0);
  // }

  Collection<KoreanToken> values = map.values();
  for (KoreanToken kt : values) {
    kt.setOutputs(outputs);
  }
  morphQueue.addAll(map.values());
}
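// A hedged sketch of how a filter's incrementToken() might drain the
// queue that analysisKorean() fills. It assumes morphQueue is a
// LinkedList<KoreanToken>, that termAtt/offsetAtt are the usual attribute
// fields, and that KoreanToken exposes getTerm()/getOffset(); those
// accessor names are illustrative assumptions, not the verified arirang API.
@Override
public boolean incrementToken() throws IOException {
  while (morphQueue.isEmpty()) {
    if (!input.incrementToken()) return false; // upstream exhausted
    analysisKorean(termAtt.toString());        // may enqueue zero or more tokens
  }
  KoreanToken token = morphQueue.removeFirst();
  termAtt.setEmpty().append(token.getTerm());
  offsetAtt.setOffset(token.getOffset(), token.getOffset() + token.getTerm().length());
  return true;
}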
public static void assertTokenStreamContents(
    TokenStream ts,
    String[] output,
    int[] startOffsets,
    int[] endOffsets,
    String[] types,
    int[] posIncrements,
    Integer finalOffset)
    throws IOException {
  assertNotNull(output);
  CheckClearAttributesAttribute checkClearAtt =
      (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class);

  assertTrue("has no TermAttribute", ts.hasAttribute(TermAttribute.class));
  TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class);

  OffsetAttribute offsetAtt = null;
  if (startOffsets != null || endOffsets != null || finalOffset != null) {
    assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class));
    offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class);
  }

  TypeAttribute typeAtt = null;
  if (types != null) {
    assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class));
    typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class);
  }

  PositionIncrementAttribute posIncrAtt = null;
  if (posIncrements != null) {
    assertTrue("has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class));
    posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class);
  }

  ts.reset();
  for (int i = 0; i < output.length; i++) {
    // extra safety to enforce that state is not preserved: assign bogus values first
    ts.clearAttributes();
    termAtt.setTermBuffer("bogusTerm");
    if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243);
    if (typeAtt != null) typeAtt.setType("bogusType");
    if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657);

    checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttributes() above
    assertTrue("token " + i + " does not exist", ts.incrementToken());
    assertTrue(
        "clearAttributes() was not called correctly in TokenStream chain",
        checkClearAtt.getAndResetClearCalled());

    assertEquals("term " + i, output[i], termAtt.term());
    if (startOffsets != null) assertEquals("startOffset " + i, startOffsets[i], offsetAtt.startOffset());
    if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset());
    if (types != null) assertEquals("type " + i, types[i], typeAtt.type());
    if (posIncrements != null)
      assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement());
  }
  assertFalse("end of stream", ts.incrementToken());
  ts.end();
  if (finalOffset != null) assertEquals("finalOffset", finalOffset.intValue(), offsetAtt.endOffset());
  ts.close();
}
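// Example call from a test, sketched with Lucene's WhitespaceTokenizer
// (Reader constructor as in Lucene 2.9/3.0, matching the TermAttribute API
// used above); the input string and expected values are illustrative.
TokenStream ts = new WhitespaceTokenizer(new StringReader("hello world"));
assertTokenStreamContents(
    ts,
    new String[] {"hello", "world"}, // expected terms
    new int[] {0, 6},                // start offsets
    new int[] {5, 11},               // end offsets
    new String[] {"word", "word"},   // default type for word tokens
    new int[] {1, 1},                // position increments
    Integer.valueOf(11));            // final offset == input length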