/**
 * Advances to the next token, expanding every {@code "phrase"} term into the two terms
 * {@code "phrase1"} and {@code "phrase2"} and skipping every {@code "stop"} term.
 *
 * <p>Both expansion tokens are given the start and end offsets of the {@code "phrase"} token
 * they replace. All other tokens from the input are passed through unchanged.
 *
 * @return {@code true} if a token was produced, {@code false} once the input has no more tokens
 * @throws IOException propagated from {@code input.incrementToken()}
 */
public boolean incrementToken() throws IOException {
  // A "phrase" was seen on the previous call: emit the second half of its expansion
  // before pulling anything else from the input.
  if (inPhrase) {
    inPhrase = false;
    clearAttributes();
    termAtt.setTermBuffer("phrase2");
    offsetAtt.setOffset(savedStart, savedEnd);
    return true;
  }

  while (input.incrementToken()) {
    final String text = termAtt.term();

    if (text.equals("phrase")) {
      // Remember the offsets so the follow-up "phrase2" token can reuse them.
      inPhrase = true;
      savedStart = offsetAtt.startOffset();
      savedEnd = offsetAtt.endOffset();
      termAtt.setTermBuffer("phrase1");
      offsetAtt.setOffset(savedStart, savedEnd);
      return true;
    }

    if (text.equals("stop")) {
      // Dropped: keep looking for the next token worth emitting.
      continue;
    }

    return true;
  }

  return false;
}
public boolean incrementToken() throws IOException { if (!input.incrementToken()) // Advance to next token return false; // When false, end has been reached String encoded; encoded = metaphoner.encode(termAttr.term()); // Convert term text to // Metaphone encoding termAttr.setTermBuffer(encoded); // Overwrite term text with encoded // text typeAttr.setType(METAPHONE); // Set token type return true; }
@Override public boolean incrementToken() throws IOException { if (!input.incrementToken()) return false; char[] termBuffer = termAtt.termBuffer(); int termBufferLength = termAtt.termLength(); char[] backup = null; if (factory.maxWordCount < CapitalizationFilterFactory.DEFAULT_MAX_WORD_COUNT) { // make a backup in case we exceed the word count backup = new char[termBufferLength]; System.arraycopy(termBuffer, 0, backup, 0, termBufferLength); } if (termBufferLength < factory.maxTokenLength) { int wordCount = 0; int lastWordStart = 0; for (int i = 0; i < termBufferLength; i++) { char c = termBuffer[i]; if (c <= ' ' || c == '.') { int len = i - lastWordStart; if (len > 0) { factory.processWord(termBuffer, lastWordStart, len, wordCount++); lastWordStart = i + 1; i++; } } } // process the last word if (lastWordStart < termBufferLength) { factory.processWord( termBuffer, lastWordStart, termBufferLength - lastWordStart, wordCount++); } if (wordCount > factory.maxWordCount) { termAtt.setTermBuffer(backup, 0, termBufferLength); } } return true; }
/**
 * Copies the term text, position increment and start/end offsets of {@code token} onto
 * {@code termAtt}, {@code posAtt} and {@code offsetAtt} respectively.
 *
 * @param token the token whose values are copied
 */
private void applyToken(Token token) {
  // Term text: only the first termLength() chars of the token's buffer are copied.
  final char[] text = token.termBuffer();
  final int textLength = token.termLength();
  termAtt.setTermBuffer(text, 0, textLength);

  final int increment = token.getPositionIncrement();
  posAtt.setPositionIncrement(increment);

  final int start = token.startOffset();
  final int end = token.endOffset();
  offsetAtt.setOffset(start, end);
}
public static void assertTokenStreamContents( TokenStream ts, String[] output, int startOffsets[], int endOffsets[], String types[], int posIncrements[], Integer finalOffset) throws IOException { assertNotNull(output); CheckClearAttributesAttribute checkClearAtt = (CheckClearAttributesAttribute) ts.addAttribute(CheckClearAttributesAttribute.class); assertTrue("has no TermAttribute", ts.hasAttribute(TermAttribute.class)); TermAttribute termAtt = (TermAttribute) ts.getAttribute(TermAttribute.class); OffsetAttribute offsetAtt = null; if (startOffsets != null || endOffsets != null || finalOffset != null) { assertTrue("has no OffsetAttribute", ts.hasAttribute(OffsetAttribute.class)); offsetAtt = (OffsetAttribute) ts.getAttribute(OffsetAttribute.class); } TypeAttribute typeAtt = null; if (types != null) { assertTrue("has no TypeAttribute", ts.hasAttribute(TypeAttribute.class)); typeAtt = (TypeAttribute) ts.getAttribute(TypeAttribute.class); } PositionIncrementAttribute posIncrAtt = null; if (posIncrements != null) { assertTrue( "has no PositionIncrementAttribute", ts.hasAttribute(PositionIncrementAttribute.class)); posIncrAtt = (PositionIncrementAttribute) ts.getAttribute(PositionIncrementAttribute.class); } ts.reset(); for (int i = 0; i < output.length; i++) { // extra safety to enforce, that the state is not preserved and also assign bogus values ts.clearAttributes(); termAtt.setTermBuffer("bogusTerm"); if (offsetAtt != null) offsetAtt.setOffset(14584724, 24683243); if (typeAtt != null) typeAtt.setType("bogusType"); if (posIncrAtt != null) posIncrAtt.setPositionIncrement(45987657); checkClearAtt.getAndResetClearCalled(); // reset it, because we called clearAttribute() before assertTrue("token " + i + " does not exist", ts.incrementToken()); assertTrue( "clearAttributes() was not called correctly in TokenStream chain", checkClearAtt.getAndResetClearCalled()); assertEquals("term " + i, output[i], termAtt.term()); if (startOffsets != null) assertEquals("startOffset " + i, 
startOffsets[i], offsetAtt.startOffset()); if (endOffsets != null) assertEquals("endOffset " + i, endOffsets[i], offsetAtt.endOffset()); if (types != null) assertEquals("type " + i, types[i], typeAtt.type()); if (posIncrements != null) assertEquals("posIncrement " + i, posIncrements[i], posIncrAtt.getPositionIncrement()); } assertFalse("end of stream", ts.incrementToken()); ts.end(); if (finalOffset != null) assertEquals("finalOffset ", finalOffset.intValue(), offsetAtt.endOffset()); ts.close(); }