Example #1
0
  /**
   * Prints every token that {@code analyzer} produces for {@code text} to standard output.
   *
   * <p>Tokens are analyzed under the field name {@code "contents"}. Each time a token has a
   * position increment greater than zero, a new output line is started with the running position
   * followed by {@code ':'}. Every token is then printed as {@code [term:start->end:type] }, or
   * {@code [term:start->end:type:payload] } when the token carries a payload.
   *
   * <p>The stream is consumed with the full reset / incrementToken / end / close workflow, and the
   * payload text is decoded only from the valid {@code offset}/{@code length} window of the
   * payload's backing array, as UTF-8.
   *
   * @param analyzer the analyzer used to tokenize {@code text}
   * @param text the text to analyze
   * @throws IOException if the token stream fails while being consumed
   */
  public static void displayTokensWithFullDetails(Analyzer analyzer, String text)
      throws IOException {

    // try-with-resources guarantees close() even when the consumer loop throws.
    try (TokenStream stream = analyzer.tokenStream("contents", new StringReader(text))) {

      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
      TypeAttribute type = stream.addAttribute(TypeAttribute.class);
      PayloadAttribute payload = stream.addAttribute(PayloadAttribute.class);

      // reset() must be called before the first incrementToken().
      stream.reset();

      int position = 0;
      while (stream.incrementToken()) {

        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
          position = position + increment;
          System.out.println();
          System.out.print(position + ":");
        }

        StringBuilder details = new StringBuilder("[");
        details
            .append(term.toString())
            .append(":")
            .append(offset.startOffset())
            .append("->")
            .append(offset.endOffset())
            .append(":")
            .append(type.type());

        BytesRef pl = payload.getPayload();
        if (pl != null) {
          // Only bytes[offset, offset + length) belong to the payload; the backing array may be
          // larger or shared, so decoding the whole array would print unrelated bytes.
          details
              .append(":")
              .append(
                  new String(
                      pl.bytes, pl.offset, pl.length, java.nio.charset.StandardCharsets.UTF_8));
        }

        details.append("] ");
        System.out.print(details);
      }

      // Signals end-of-stream so the stream can perform its end-of-stream operations.
      stream.end();
    }
    System.out.println();
  }
    /**
     * Advances the wrapped stream by one token and, unless the term is listed in {@code
     * nopayload}, attaches a payload of the form {@code <term>:Entity:<pos>} (term found in {@code
     * entities}) or {@code <term>:Noise:<pos>} (otherwise). {@code pos} is the running sum of
     * position increments seen before the current token; the current increment is added afterwards.
     *
     * @return {@code true} if a token was produced, {@code false} once the input is exhausted
     */
    @Override
    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }

      final String text = termAtt.toString();
      if (!nopayload.contains(text)) {
        final String label = entities.contains(text) ? ":Entity:" : ":Noise:";
        payloadAtt.setPayload(new BytesRef(text + label + pos));
      }

      // Accumulate after building the payload so the payload carries the pre-token position.
      pos += posIncrAtt.getPositionIncrement();
      return true;
    }
 /**
  * Advances the wrapped stream and, for every token it yields, sets a one-byte payload holding
  * the next value of {@code payloadCount} narrowed to a {@code byte}.
  *
  * @return {@code true} if a token was produced, {@code false} once the input is exhausted
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (!input.incrementToken()) {
     return false;
   }
   // The counter is only advanced for tokens that actually exist.
   final byte marker = (byte) payloadCount.incrementAndGet();
   payloadAtt.setPayload(new BytesRef(new byte[] {marker}));
   return true;
 }
 /**
  * Advances the wrapped stream and splits the current term at the first occurrence of {@code
  * delimiter}: the characters after the delimiter are passed to {@code encoder} and stored as
  * the payload, and the term is truncated to the characters before it. When the term contains no
  * delimiter the payload is set to {@code null} and the term is left untouched.
  *
  * @return {@code true} if a token was produced, {@code false} once the input is exhausted
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (!input.incrementToken()) {
     return false;
   }

   final char[] chars = termAtt.buffer();
   final int len = termAtt.length();

   // Locate the first delimiter, if any.
   int split = 0;
   while (split < len && chars[split] != delimiter) {
     split++;
   }

   if (split < len) {
     final int payloadStart = split + 1;
     payAtt.setPayload(encoder.encode(chars, payloadStart, len - payloadStart));
     termAtt.setLength(split); // shrinking the length drops the delimiter and payload text
   } else {
     // no delimiter in this term
     payAtt.setPayload(null);
   }
   return true;
 }
    /**
     * Advances the wrapped stream and attaches a payload chosen by field name: {@code
     * payloadField} for {@code FIELD}; for {@code MULTI_FIELD}, {@code payloadMultiField1} and
     * {@code payloadMultiField2} alternate depending on whether {@code numSeen} is even or odd,
     * and {@code numSeen} is incremented. Tokens of any other field get no payload set here.
     *
     * @return {@code true} if a token was produced, {@code false} once the input is exhausted
     */
    @Override
    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }

      if (fieldName.equals(FIELD)) {
        payloadAtt.setPayload(new Payload(payloadField));
        return true;
      }

      if (fieldName.equals(MULTI_FIELD)) {
        payloadAtt.setPayload(
            new Payload(numSeen % 2 == 0 ? payloadMultiField1 : payloadMultiField2));
        numSeen++;
      }
      return true;
    }
Example #6
0
 /**
  * Emits the next entry of {@code tokens}: clears all attributes, then copies the token's term
  * text, offsets, position increment, flags, type and payload into this stream's attributes.
  *
  * @return {@code true} if a token was emitted, {@code false} once {@code index} has reached the
  *     end of {@code tokens}
  */
 @Override
 public boolean incrementToken() throws IOException {
   if (index >= tokens.length) {
     return false;
   }

   clearAttributes();
   final Token current = tokens[index];
   index++;

   termAtt.setEmpty().append(current);
   offsetAtt.setOffset(current.startOffset(), current.endOffset());
   posIncAtt.setPositionIncrement(current.getPositionIncrement());
   flagsAtt.setFlags(current.getFlags());
   typeAtt.setType(current.type());
   payloadAtt.setPayload(current.getPayload());
   return true;
 }
Example #7
0
 /**
  * Emits the next entry of {@code tokens}: clears all attributes, then copies the token's term
  * text, position increment, position length, offsets and payload into this stream's attributes.
  *
  * @return {@code true} if a token was emitted, {@code false} once {@code upto} has reached the
  *     end of {@code tokens}
  */
 @Override
 public boolean incrementToken() {
   if (upto >= tokens.length) {
     return false;
   }

   final Token current = tokens[upto];
   upto++;

   // TODO: can we just capture/restoreState so
   // we get all attrs...?
   clearAttributes();
   termAtt.setEmpty().append(current.toString());
   posIncrAtt.setPositionIncrement(current.getPositionIncrement());
   posLengthAtt.setPositionLength(current.getPositionLength());
   offsetAtt.setOffset(current.startOffset(), current.endOffset());
   payloadAtt.setPayload(current.getPayload());
   return true;
 }
    /**
     * Produces exactly one token whose payload is the encoded form of {@code data[idx]}: the
     * values are written through {@code encoder} into {@code buf} (via {@code ubaos}), and the
     * first {@code ubaos.length()} bytes of {@code buf} become the payload. Subsequent calls
     * return {@code false} because {@code exhausted} is set.
     *
     * @return {@code true} on the first call, {@code false} afterwards
     */
    @Override
    public boolean incrementToken() throws IOException {
      if (!exhausted) {
        // Point the output stream at the reusable buffer, then the encoder at the stream.
        ubaos.reInit(buf);
        encoder.reInit(ubaos);

        final int[] row = data[idx];
        for (int i = 0; i < row.length; i++) {
          encoder.encode(row[i]);
        }
        encoder.close();

        payload.setPayload(new Payload(buf, 0, ubaos.length()));
        exhausted = true;
        return true;
      }
      return false;
    }
 /**
  * Copies this object's state into {@code target}: delegates to the superclass, then copies the
  * flags and a clone of the payload ({@code null} stays {@code null}), so the target does not
  * share the payload instance with this object.
  *
  * @param target the attribute implementation to copy into; cast to {@code FlagsAttribute} and
  *     {@code PayloadAttribute}
  */
 @Override
 public void copyTo(AttributeImpl target) {
   super.copyTo(target);

   final FlagsAttribute flagsTarget = (FlagsAttribute) target;
   flagsTarget.setFlags(flags);

   final PayloadAttribute payloadTarget = (PayloadAttribute) target;
   if (payload == null) {
     payloadTarget.setPayload(null);
   } else {
     payloadTarget.setPayload(payload.clone());
   }
 }
 /**
  * Same as copying via the superclass plus flags and payload, except that the payload reference
  * is handed over as-is instead of being cloned; the target ends up sharing the payload
  * instance with this object.
  *
  * @param target the attribute implementation to copy into; cast to {@code FlagsAttribute} and
  *     {@code PayloadAttribute}
  */
 private void copyToWithoutPayloadClone(AttributeImpl target) {
   super.copyTo(target);

   final FlagsAttribute flagsTarget = (FlagsAttribute) target;
   flagsTarget.setFlags(flags);

   final PayloadAttribute payloadTarget = (PayloadAttribute) target;
   payloadTarget.setPayload(payload);
 }
Example #11
0
  /**
   * Iterates over the given token stream and adds the resulting terms to the index; Equivalent to
   * adding a tokenized, indexed, termVectorStored, unstored, Lucene {@link
   * org.apache.lucene.document.Field}. Finally closes the token stream. Note that untokenized
   * keywords can be added with this method via {@link #keywordTokenStream(Collection)}, the Lucene
   * <code>KeywordTokenizer</code> or similar utilities.
   *
   * <p>If a field with the same name was added before, the new tokens are appended to it:
   * positions continue from the previous last position plus {@code positionIncrementGap}, offsets
   * are shifted by the previous last offset plus {@code offsetGap}, and {@code boost} is multiplied
   * with the boost already stored for the field. The field's entry is (re)stored only when the
   * accumulated token count is greater than zero.
   *
   * <p>Argument validation happens inside the try-with-resources block, so the stream is closed
   * even when an argument is rejected.
   *
   * @param fieldName a name to be associated with the text
   * @param tokenStream the token stream to retrieve tokens from. It's guaranteed to be closed no
   *     matter what.
   * @param boost the boost factor for hits for this field; must be greater than 0.0
   * @param positionIncrementGap the position increment gap if fields with the same name are added
   *     more than once
   * @param offsetGap the offset gap if fields with the same name are added more than once
   * @throws IllegalArgumentException if this index is frozen, {@code fieldName} or {@code
   *     tokenStream} is null, or {@code boost} is not greater than 0.0
   * @throws RuntimeException wrapping any {@link IOException} raised while consuming or closing
   *     the stream
   * @see org.apache.lucene.document.Field#setBoost(float)
   */
  public void addField(
      String fieldName,
      TokenStream tokenStream,
      float boost,
      int positionIncrementGap,
      int offsetGap) {
    // Binding the parameter as the resource closes it on every exit path, including the
    // validation failures below.
    try (TokenStream stream = tokenStream) {
      if (frozen)
        throw new IllegalArgumentException("Cannot call addField() when MemoryIndex is frozen");
      if (fieldName == null) throw new IllegalArgumentException("fieldName must not be null");
      if (stream == null) throw new IllegalArgumentException("token stream must not be null");
      if (boost <= 0.0f)
        throw new IllegalArgumentException("boost factor must be greater than 0.0");
      int numTokens = 0;
      int numOverlapTokens = 0;
      // Starts at -1 so that adding the first token's position increment yields its position.
      int pos = -1;
      final BytesRefHash terms;
      final SliceByteStartArray sliceArray;
      Info info;
      long sumTotalTermFreq = 0;
      int offset = 0;
      FieldInfo fieldInfo;
      if ((info = fields.get(fieldName)) != null) {
        // Field already exists: resume from its stored counters and reuse its term hash and
        // slice array, shifting positions/offsets by the requested gaps.
        fieldInfo = info.fieldInfo;
        numTokens = info.numTokens;
        numOverlapTokens = info.numOverlapTokens;
        pos = info.lastPosition + positionIncrementGap;
        offset = info.lastOffset + offsetGap;
        terms = info.terms;
        boost *= info.boost;
        sliceArray = info.sliceArray;
        sumTotalTermFreq = info.sumTotalTermFreq;
      } else {
        // First occurrence of this field name: create its metadata (numbered by the current
        // field count) plus a fresh term hash backed by a new slice array.
        fieldInfo =
            new FieldInfo(
                fieldName,
                fields.size(),
                true,
                false,
                this.storePayloads,
                this.storeOffsets
                    ? IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS
                    : IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
                DocValuesType.NONE,
                -1,
                Collections.<String, String>emptyMap());
        sliceArray = new SliceByteStartArray(BytesRefHash.DEFAULT_CAPACITY);
        terms = new BytesRefHash(byteBlockPool, BytesRefHash.DEFAULT_CAPACITY, sliceArray);
      }

      TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
      PositionIncrementAttribute posIncrAttribute =
          stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      // The payload attribute is only requested when payloads are stored; it stays null otherwise.
      PayloadAttribute payloadAtt =
          storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
      // Obtained once up front; presumably fillBytesRef() refreshes its contents for each token
      // -- confirm against the TermToBytesRefAttribute contract of the Lucene version in use.
      BytesRef ref = termAtt.getBytesRef();
      stream.reset();

      while (stream.incrementToken()) {
        termAtt.fillBytesRef();
        //        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        // A zero increment means this token sits at the same position as the previous one.
        if (posIncr == 0) numOverlapTokens++;
        pos += posIncr;
        int ord = terms.add(ref);
        if (ord < 0) {
          // A negative result is treated as "term already in the hash": recover its ordinal and
          // continue writing where that term's postings slice previously ended.
          ord = (-ord) - 1;
          postingsWriter.reset(sliceArray.end[ord]);
        } else {
          // New term: begin a new postings slice and remember where it starts.
          sliceArray.start[ord] = postingsWriter.startNewSlice();
        }
        sliceArray.freq[ord]++;
        sumTotalTermFreq++;
        // Per-occurrence record layout: position, [start offset, end offset], [payload index].
        postingsWriter.writeInt(pos);
        if (storeOffsets) {
          postingsWriter.writeInt(offsetAtt.startOffset() + offset);
          postingsWriter.writeInt(offsetAtt.endOffset() + offset);
        }
        if (storePayloads) {
          final BytesRef payload = payloadAtt.getPayload();
          final int pIndex;
          // -1 marks "no payload"; missing and empty payloads are treated alike.
          if (payload == null || payload.length == 0) {
            pIndex = -1;
          } else {
            pIndex = payloadsBytesRefs.append(payload);
          }
          postingsWriter.writeInt(pIndex);
        }
        // Remember where this term's slice ends so a later occurrence can resume from here.
        sliceArray.end[ord] = postingsWriter.getCurrentOffset();
      }
      stream.end();

      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()
      if (numTokens > 0) {
        // The end offset is read after stream.end(); it is stored (shifted by the accumulated
        // offset) as the field's last offset for any later addField() call on the same name.
        fields.put(
            fieldName,
            new Info(
                fieldInfo,
                terms,
                sliceArray,
                numTokens,
                numOverlapTokens,
                boost,
                pos,
                offsetAtt.endOffset() + offset,
                sumTotalTermFreq));
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
  }