/**
 * Tokenizes {@code text} with the given analyzer and prints every token with full details:
 * position, term text, start/end offsets, token type, and payload (when present).
 *
 * <p>Fixes over the naive version: the stream is {@code reset()} before consumption (required
 * by the TokenStream contract), {@code end()} is called after the last token, the stream is
 * closed via try-with-resources, and the payload is rendered with {@code utf8ToString()} so
 * the {@code BytesRef} offset/length are respected instead of dumping the whole backing array.
 *
 * @param analyzer analyzer used to produce the token stream
 * @param text text to tokenize
 * @throws IOException if the underlying token stream fails
 */
public static void displayTokensWithFullDetails(Analyzer analyzer, String text) throws IOException {
  try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

    stream.reset(); // mandatory before the first incrementToken() call
    int position = 0;
    while (stream.incrementToken()) {
      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        // A positive increment starts a new position; tokens with increment 0 stack on it.
        position = position + increment;
        System.out.println();
        System.out.print(position + ":");
      }

      BytesRef pl = payload.getPayload();
      if (pl != null) {
        // utf8ToString() honors pl.offset/pl.length; new String(pl.bytes) would not.
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + ":"
                + pl.utf8ToString()
                + "] ");
      } else {
        System.out.print(
            "["
                + term.toString()
                + ":"
                + offset.startOffset()
                + "->"
                + offset.endOffset()
                + ":"
                + type.type()
                + "] ");
      }
    }
    stream.end(); // finalize offsets per the TokenStream workflow
  }
  System.out.println();
}
/**
 * Passes tokens through unchanged, attaching a payload of the form
 * {@code "<term>:Entity:<pos>"} or {@code "<term>:Noise:<pos>"} to every term not listed in
 * {@code nopayload}, and advancing {@code pos} by the token's position increment.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  String token = termAtt.toString();
  if (!nopayload.contains(token)) {
    // Tag known entities distinctly from everything else ("noise").
    payloadAtt.setPayload(
        entities.contains(token)
            ? new BytesRef(token + ":Entity:" + pos)
            : new BytesRef(token + ":Noise:" + pos));
  }
  pos += posIncrAtt.getPositionIncrement();
  return true;
}
/**
 * Passes tokens through unchanged, stamping each with a one-byte payload taken from a
 * monotonically increasing counter.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  // One byte per token; the counter wraps silently past 127 (byte cast).
  byte stamp = (byte) payloadCount.incrementAndGet();
  payloadAtt.setPayload(new BytesRef(new byte[] {stamp}));
  return true;
}
@Override public boolean incrementToken() throws IOException { if (input.incrementToken()) { final char[] buffer = termAtt.buffer(); final int length = termAtt.length(); for (int i = 0; i < length; i++) { if (buffer[i] == delimiter) { payAtt.setPayload(encoder.encode(buffer, i + 1, (length - (i + 1)))); termAtt.setLength(i); // simply set a new length return true; } } // we have not seen the delimiter payAtt.setPayload(null); return true; } else return false; }
/**
 * Passes tokens through unchanged, attaching a field-dependent payload: a fixed payload for
 * {@code FIELD}, and one of two payloads alternating per token for {@code MULTI_FIELD}.
 * Other fields get no payload.
 */
@Override
public boolean incrementToken() throws IOException {
  if (!input.incrementToken()) {
    return false;
  }
  if (fieldName.equals(FIELD)) {
    payloadAtt.setPayload(new Payload(payloadField));
  } else if (fieldName.equals(MULTI_FIELD)) {
    // Alternate between the two payloads: even-numbered tokens get the first.
    payloadAtt.setPayload(
        numSeen % 2 == 0
            ? new Payload(payloadMultiField1)
            : new Payload(payloadMultiField2));
    numSeen++;
  }
  return true;
}
/**
 * Emits the pre-built tokens in order, copying each token's term, offsets, position
 * increment, flags, type, and payload into the stream's attributes. Returns false once the
 * array is exhausted.
 */
@Override
public boolean incrementToken() throws IOException {
  if (index >= tokens.length) {
    return false;
  }
  clearAttributes();
  Token source = tokens[index++];
  // Copy every attribute of the canned token into this stream.
  termAtt.setEmpty().append(source);
  offsetAtt.setOffset(source.startOffset(), source.endOffset());
  posIncAtt.setPositionIncrement(source.getPositionIncrement());
  flagsAtt.setFlags(source.getFlags());
  typeAtt.setType(source.type());
  payloadAtt.setPayload(source.getPayload());
  return true;
}
@Override public boolean incrementToken() { if (upto < tokens.length) { final Token token = tokens[upto++]; // TODO: can we just capture/restoreState so // we get all attrs...? clearAttributes(); termAtt.setEmpty(); termAtt.append(token.toString()); posIncrAtt.setPositionIncrement(token.getPositionIncrement()); posLengthAtt.setPositionLength(token.getPositionLength()); offsetAtt.setOffset(token.startOffset(), token.endOffset()); payloadAtt.setPayload(token.getPayload()); return true; } else { return false; } }
/**
 * Emits exactly one token whose payload is the integer row {@code data[idx]} serialized
 * through {@code encoder} into {@code buf}; subsequent calls return false.
 */
@Override
public boolean incrementToken() throws IOException {
  if (exhausted) {
    return false;
  }
  final int[] row = data[idx];
  // Rewind the output buffer and point the encoder at it before serializing.
  ubaos.reInit(buf);
  encoder.reInit(ubaos);
  for (int value : row) {
    encoder.encode(value);
  }
  encoder.close();
  payload.setPayload(new Payload(buf, 0, ubaos.length()));
  // Single-token stream: mark done only after the payload was built successfully.
  exhausted = true;
  return true;
}
/**
 * Copies this attribute's state into {@code target}: base state via the superclass, then the
 * flags, then a defensive clone of the payload (or null if there is none).
 */
@Override
public void copyTo(AttributeImpl target) {
  super.copyTo(target);
  FlagsAttribute flagsDest = (FlagsAttribute) target;
  flagsDest.setFlags(flags);
  PayloadAttribute payloadDest = (PayloadAttribute) target;
  if (payload == null) {
    payloadDest.setPayload(null);
  } else {
    // Clone so the target does not share mutable payload state with this instance.
    payloadDest.setPayload(payload.clone());
  }
}
/**
 * Copies this attribute's state into {@code target} like {@code copyTo}, except the payload
 * reference is shared rather than cloned.
 */
private void copyToWithoutPayloadClone(AttributeImpl target) {
  super.copyTo(target);
  FlagsAttribute flagsDest = (FlagsAttribute) target;
  PayloadAttribute payloadDest = (PayloadAttribute) target;
  flagsDest.setFlags(flags);
  // Intentionally shares the payload reference instead of cloning it.
  payloadDest.setPayload(payload);
}
/**
 * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
 * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
 * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
 * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
 * <code>KeywordTokenizer</code> or similar utilities.
 *
 * @param fieldName a name to be associated with the text
 * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
 *     matter what.
 * @param boost the boost factor for hits for this field
 * @param positionIncrementGap the position increment gap if fields with the same name are added
 *     more than once
 * @param offsetGap the offset gap if fields with the same name are added more than once
 * @see org.apache.lucene.document.Field#setBoost(float)
 */
public void addField(
    String fieldName,
    TokenStream tokenStream,
    float boost,
    int positionIncrementGap,
    int offsetGap) {
  // try-with-resources guarantees the stream is closed even when an exception escapes.
  try (TokenStream stream = tokenStream) {
    // Precondition checks: reject calls after freeze() and obviously invalid arguments.
    if (frozen)
      throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
    if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
    if (stream == null) throw new IllegalArgumentException("token stream must not be null");
    if (boost <= 0.0f)
      throw new IllegalArgumentException("boost factor must be greater than 0.0");
    int numTokens = 0;
    int numOverlapTokens = 0; // tokens with position increment 0 (stacked on a prior position)
    int pos = -1;
    final BytesRefHash terms;
    final SliceByteStartArray sliceArray;
    Info info;
    long sumTotalTermFreq = 0;
    int offset = 0;
    FieldInfo fieldInfo;
    if ((info = fields.get(fieldName)) != null) {
      // Field was added before: resume its existing term hash and counters, applying the
      // configured position/offset gaps between the previous content and this stream.
      fieldInfo = info.fieldInfo;
      numTokens = info.numTokens;
      numOverlapTokens = info.numOverlapTokens;
      pos = info.lastPosition + positionIncrementGap;
      offset = info.lastOffset + offsetGap;
      terms = info.terms;
      boost *= info.boost;
      sliceArray = info.sliceArray;
      sumTotalTermFreq = info.sumTotalTermFreq;
    } else {
      // First time this field is seen: create its FieldInfo and fresh postings storage.
      fieldInfo =
          new FieldInfo(
              fieldName,
              fields.size(),
              true,
              false,
              this.storePayloads,
              this.storeOffsets
                  ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                  : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
              DocValuesType.NONE,
              -1,
              Collections.<String, String>emptyMap());
      sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
      terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
    }
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute =
        stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    // Payload attribute is only needed when payload storage is enabled.
    PayloadAttribute payloadAtt =
        storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    BytesRef ref = termAtt.getBytesRef();
    stream.reset();

    while (stream.incrementToken()) {
      termAtt.fillBytesRef();
      // if (DEBUG) System.err.println("token='" + term + "'");
      numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) numOverlapTokens++;
      pos += posIncr;
      int ord = terms.add(ref);
      if (ord < 0) {
        // Term already in the hash: BytesRefHash encodes "existing" as -(ord)-1.
        // Resume appending to that term's postings slice.
        ord = (-ord) - 1;
        postingsWriter.reset(sliceArray.end[ord]);
      } else {
        // New term: open a fresh postings slice for it.
        sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      sliceArray.freq[ord]++;
      sumTotalTermFreq++;
      // Postings layout per occurrence: position, [startOffset, endOffset], [payloadIndex].
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          // -1 marks "no payload" for this occurrence.
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      // Remember where this term's slice currently ends so the next occurrence can resume.
      sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();

    // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
    if (numTokens > 0) {
      fields.put(
          fieldName,
          new Info(
              fieldInfo,
              terms,
              sliceArray,
              numTokens,
              numOverlapTokens,
              boost,
              pos,
              offsetAtt.endOffset() + offset,
              sumTotalTermFreq));
    }
  } catch (IOException e) {
    // This method's signature declares no checked exceptions; wrap and rethrow.
    throw new RuntimeException(e);
  }
}