/**
 * Consumes {@code tokenizer} completely and renders each emitted token as one record of the form
 * {@code term:type:partOfSpeech:semanticClass:posIncr:posLength:startOffset:endOffset,}
 * (note the trailing comma after every record, including the last one).
 *
 * The stream is driven through the full consumer workflow: reset, incrementToken until it
 * returns false, end, and finally close. The tokenizer is closed even if consumption fails.
 *
 * @param tokenizer the tokenizer to drain; it is closed before this method returns
 * @return the concatenated token records, or an empty string when no token was produced
 * @throws Exception if the tokenizer fails while being reset, consumed, ended or closed
 */
private String tokenizerToString(Tokenizer tokenizer) throws Exception {
  OffsetAttribute extOffset = tokenizer.addAttribute(OffsetAttribute.class);
  PositionIncrementAttribute posIncrAtt = tokenizer.addAttribute(PositionIncrementAttribute.class);
  PositionLengthAttribute posLengthAtt = tokenizer.addAttribute(PositionLengthAttribute.class);
  CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
  TypeAttribute type = tokenizer.addAttribute(TypeAttribute.class);
  SemanticClassAttribute semanticClass = tokenizer.addAttribute(SemanticClassAttribute.class);
  PartOfSpeechAttribute pos = tokenizer.addAttribute(PartOfSpeechAttribute.class);

  StringBuilder result = new StringBuilder();
  try {
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      // Copy the term characters straight from the attribute's buffer; only the first
      // term.length() chars are valid.
      result.append(term.buffer(), 0, term.length()).append(":");
      result.append(type.type()).append(":");
      result.append(pos.partOfSpeech()).append(":");
      result.append(semanticClass.semanticClass()).append(":");
      result.append(posIncrAtt.getPositionIncrement()).append(":");
      result.append(posLengthAtt.getPositionLength()).append(":");
      result.append(extOffset.startOffset()).append(":");
      result.append(extOffset.endOffset());
      result.append(",");
    }
    tokenizer.end();
  } finally {
    // Last step of the consumer workflow; releases the underlying reader.
    tokenizer.close();
  }
  return result.toString();
}
public void testAppendableInterface() { CharTermAttributeImpl t = new CharTermAttributeImpl(); Formatter formatter = new Formatter(t, Locale.ROOT); formatter.format("%d", 1234); assertEquals("1234", t.toString()); formatter.format("%d", 5678); assertEquals("12345678", t.toString()); t.append('9'); assertEquals("123456789", t.toString()); t.append((CharSequence) "0"); assertEquals("1234567890", t.toString()); t.append((CharSequence) "0123456789", 1, 3); assertEquals("123456789012", t.toString()); t.append((CharSequence) CharBuffer.wrap("0123456789".toCharArray()), 3, 5); assertEquals("12345678901234", t.toString()); t.append((CharSequence) t); assertEquals("1234567890123412345678901234", t.toString()); t.append((CharSequence) new StringBuilder("0123456789"), 5, 7); assertEquals("123456789012341234567890123456", t.toString()); t.append((CharSequence) new StringBuffer(t)); assertEquals("123456789012341234567890123456123456789012341234567890123456", t.toString()); // very wierd, to test if a subSlice is wrapped correct :) CharBuffer buf = CharBuffer.wrap("0123456789".toCharArray(), 3, 5); assertEquals("34567", buf.toString()); t.setEmpty().append((CharSequence) buf, 1, 2); assertEquals("4", t.toString()); CharTermAttribute t2 = new CharTermAttributeImpl(); t2.append("test"); t.append((CharSequence) t2); assertEquals("4test", t.toString()); t.append((CharSequence) t2, 1, 2); assertEquals("4teste", t.toString()); try { t.append((CharSequence) t2, 1, 5); fail("Should throw IndexOutOfBoundsException"); } catch (IndexOutOfBoundsException iobe) { } try { t.append((CharSequence) t2, 1, 0); fail("Should throw IndexOutOfBoundsException"); } catch (IndexOutOfBoundsException iobe) { } t.append((CharSequence) null); assertEquals("4testenull", t.toString()); }
/**
 * Checks the {@code append} overloads selected by the static argument types {@code String},
 * {@code StringBuilder} and {@code CharTermAttribute}, including a null argument of each type.
 */
public void testNonCharSequenceAppend() {
  CharTermAttributeImpl termAtt = new CharTermAttributeImpl();

  // String arguments.
  termAtt.append("0123456789");
  termAtt.append("0123456789");
  assertEquals("01234567890123456789", termAtt.toString());

  // StringBuilder argument.
  termAtt.append(new StringBuilder("0123456789"));
  assertEquals("012345678901234567890123456789", termAtt.toString());

  // CharTermAttribute argument.
  CharTermAttribute other = new CharTermAttributeImpl();
  other.append("test");
  termAtt.append(other);
  assertEquals("012345678901234567890123456789test", termAtt.toString());

  // A null of each static type is appended as the text "null".
  termAtt.append((String) null);
  termAtt.append((StringBuilder) null);
  termAtt.append((CharTermAttribute) null);
  assertEquals("012345678901234567890123456789testnullnullnull", termAtt.toString());
}
public boolean incrementToken() throws IOException { if (!morphQueue.isEmpty()) { restoreState(currentState); setAttributesFromQueue(false); return true; } while (input.incrementToken()) { final String type = typeAtt.type(); if (KOREAN_TYPE.equals(type)) { try { analysisKorean(termAtt.toString()); } catch (MorphException e) { throw new RuntimeException(e); } } else { return true; // pass anything else thru } if (!morphQueue.isEmpty()) { setAttributesFromQueue(true); return true; } } return false; }
private void setAttributesFromQueue(boolean isFirst) { final KoreanToken iw = morphQueue.removeFirst(); if (isFirst && !morphQueue.isEmpty()) { // our queue has more elements remaining (e.g. we decompounded) // capture state for those. We set the term attribute to be empty // so we save lots of array copying later. termAtt.setEmpty(); currentState = captureState(); } termAtt.setEmpty().append(iw.getTerm()); offsetAtt.setOffset(iw.getOffset(), iw.getOffset() + iw.getLength()); morphAtt.setToken(iw); // on the first Token we preserve incoming increment: if (!isFirst) { posIncrAtt.setPositionIncrement(iw.getPosInc()); } // TODO: How to handle PositionLengthAttribute correctly? }