Example #1
  /**
   * Sets the enum to operate in linear fashion, as we have found a looping transition at the
   * given position: we set an upper bound and act like a TermRangeQuery for this portion of the
   * term space.
   */
  private void setLinear(int position) {
    assert linear == false;

    int state = runAutomaton.getInitialState();
    int maxInterval = 0xff;
    for (int i = 0; i < position; i++) {
      state = runAutomaton.step(state, seekBytesRef.bytes[i] & 0xff);
      assert state >= 0 : "state=" + state;
    }
    for (int i = 0; i < allTransitions[state].length; i++) {
      Transition t = allTransitions[state][i];
      if (t.getMin() <= (seekBytesRef.bytes[position] & 0xff)
          && (seekBytesRef.bytes[position] & 0xff) <= t.getMax()) {
        maxInterval = t.getMax();
        break;
      }
    }
    // 0xff terms don't get the optimization... not worth the trouble.
    if (maxInterval != 0xff) maxInterval++;
    int length = position + 1; /* the position bytes, plus one byte for the adjusted maxInterval */
    if (linearUpperBound.bytes.length < length) linearUpperBound.bytes = new byte[length];
    System.arraycopy(seekBytesRef.bytes, 0, linearUpperBound.bytes, 0, position);
    linearUpperBound.bytes[position] = (byte) maxInterval;
    linearUpperBound.length = length;

    linear = true;
  }
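A hedged sketch of how the bound computed above might be consulted during enumeration; the helper below is hypothetical, and only seekBytesRef, linearUpperBound, and linear come from the surrounding class:

  // Hypothetical helper, not part of the original class. The bound is exclusive
  // because maxInterval was incremented above, so the linear scan is only valid
  // while the seek term stays strictly below linearUpperBound.
  private boolean withinLinearRange() {
    return linear && seekBytesRef.compareTo(linearUpperBound) < 0;
  }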
    private void process(int groupOrd, int facetOrd) {
      if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
        return;
      }

      int segmentGroupedFacetsIndex = groupOrd * (facetFieldNumTerms + 1) + facetOrd;
      if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
        return;
      }

      segmentTotalCount++;
      segmentFacetCounts[facetOrd]++;

      segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

      BytesRef groupKey;
      if (groupOrd == -1) {
        groupKey = null;
      } else {
        groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
      }

      final BytesRef facetValue;
      if (facetOrd == facetFieldNumTerms) {
        facetValue = null;
      } else {
        facetValue = BytesRef.deepCopyOf(facetFieldDocTermOrds.lookupOrd(facetOrd));
      }
      groupedFacetHits.add(new GroupedFacetHit(groupKey, facetValue));
    }
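The segmentGroupedFacetsIndex arithmetic above flattens a (group ordinal, facet ordinal) pair into a single slot, with facetOrd == facetFieldNumTerms reserved for documents that have no facet value. The same mapping in isolation (names are illustrative, not from the original class):

    // Illustrative only: the unique 1-D index for a (groupOrd, facetOrd) pair,
    // where facetOrd ranges over [0, facetFieldNumTerms] inclusive and the extra
    // slot encodes "document has no facet value".
    static int flattenedFacetIndex(int groupOrd, int facetOrd, int facetFieldNumTerms) {
      return groupOrd * (facetFieldNumTerms + 1) + facetOrd;
    }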
    @Override
    public BytesRef getPayload() throws IOException {
      if (!payloadPending) {
        return null;
      }

      if (pendingPayloadBytes == 0) {
        return payload;
      }

      assert pendingPayloadBytes >= payloadLength;

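      // earlier positions' payloads were never read; skip past them to reach ours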
      if (pendingPayloadBytes > payloadLength) {
        payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
      }

      if (payload == null) {
        payload = new BytesRef();
        payload.bytes = new byte[payloadLength];
      } else if (payload.bytes.length < payloadLength) {
        payload.grow(payloadLength);
      }

      payloadIn.readBytes(payload.bytes, 0, payloadLength);
      payload.length = payloadLength;
      pendingPayloadBytes = 0;
      return payload;
    }
 @Override
 public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
   if (include != null) {
     builder.field(INCLUDE_FIELD.getPreferredName(), include.getOriginalString());
   } else if (includeValues != null) {
     builder.startArray(INCLUDE_FIELD.getPreferredName());
     for (BytesRef value : includeValues) {
       builder.value(value.utf8ToString());
     }
     builder.endArray();
   } else if (isPartitionBased()) {
     builder.startObject(INCLUDE_FIELD.getPreferredName());
     builder.field(PARTITION_FIELD.getPreferredName(), incZeroBasedPartition);
     builder.field(NUM_PARTITIONS_FIELD.getPreferredName(), incNumPartitions);
     builder.endObject();
   }
   if (exclude != null) {
     builder.field(EXCLUDE_FIELD.getPreferredName(), exclude.getOriginalString());
   } else if (excludeValues != null) {
     builder.startArray(EXCLUDE_FIELD.getPreferredName());
     for (BytesRef value : excludeValues) {
       builder.value(value.utf8ToString());
     }
     builder.endArray();
   }
   return builder;
 }
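For reference, the include clause this method can emit takes one of three shapes, roughly as follows (a hand-written illustration assuming the preferred field names are include, partition, and num_partitions; not captured output):

   // "include": "user_.*"                                 (regex form)
   // "include": ["value1", "value2"]                      (explicit values)
   // "include": { "partition": 0, "num_partitions": 20 }  (partition-based)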
  // LUCENE-3870
  public void testLengthPrefixAcrossTwoPages() throws Exception {
    Directory d = newDirectory();
    IndexWriter w =
        new IndexWriter(d, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())));
    Document doc = new Document();
    byte[] bytes = new byte[32764];
    BytesRef b = new BytesRef();
    b.bytes = bytes;
    b.length = bytes.length;
    doc.add(new SortedDocValuesField("field", b));
    w.addDocument(doc);
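    // b wraps the same array, so this gives the second document a different first byte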
    bytes[0] = 1;
    w.addDocument(doc);
    w.forceMerge(1);
    DirectoryReader r = w.getReader();
    BinaryDocValues s = FieldCache.DEFAULT.getTerms(getOnlySegmentReader(r), "field");

    BytesRef bytes1 = new BytesRef();
    s.get(0, bytes1);
    assertEquals(bytes.length, bytes1.length);
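    // restore the first byte so b (which shares the array) again equals doc 0's stored value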
    bytes[0] = 0;
    assertEquals(b, bytes1);

    s.get(1, bytes1);
    assertEquals(bytes.length, bytes1.length);
    bytes[0] = 1;
    assertEquals(b, bytes1);
    r.close();
    w.close();
    d.close();
  }
 @Override
 public BytesRef binaryValue() {
   assertThread("Sorted doc values", creationThread);
   final BytesRef result = in.binaryValue();
   assert result.isValid();
   return result;
 }
Example #7
 /**
  * Find terms in the index based on a prefix. Useful for autocomplete.
  *
  * @param index the index
  * @param fieldName the field
  * @param prefix the prefix we're looking for (null or empty string for all terms)
  * @param sensitive match case-sensitively or not?
  * @param maxResults max. number of results to return (or -1 for all)
  * @return the matching terms
  */
 public static List<String> findTermsByPrefix(
     LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
   boolean allTerms = prefix == null || prefix.length() == 0;
   if (allTerms) {
     prefix = "";
     sensitive = true; // don't do unnecessary work in this case
   }
   try {
     if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase();
      org.apache.lucene.index.Terms terms = index.terms(fieldName);
      List<String> results = new ArrayList<>();
      if (terms == null) return results; // field does not exist in this reader
      TermsEnum termsEnum = terms.iterator();
     BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
     termsEnum.seekCeil(brPrefix); // find the prefix in the terms list
     while (maxResults < 0 || results.size() < maxResults) {
       BytesRef term = termsEnum.next();
       if (term == null) break;
       String termText = term.utf8ToString();
       String optDesensitized = termText;
       if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
        if (!allTerms && !optDesensitized.regionMatches(true, 0, prefix, 0, prefix.length())) {
          // Doesn't match the prefix; the terms are sorted, so there are no more matches
         break;
       }
       // Match, add term
       results.add(termText);
     }
     return results;
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
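A hypothetical call, assuming an already-open LeafReader named leafReader (the reader and field name are illustrative):

 // Up to 10 terms from "title" that start with "luc", matched case-insensitively.
 List<String> matches = findTermsByPrefix(leafReader, "title", "luc", false, 10);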
Example #8
    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      startIdx = -startIdx - 1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      // If we are already in the right block and the current term is at or before the
      // target, no seek is needed; otherwise seek to the start of the right block.
      if ((ord >> indexIntervalBits) != startIdx || term == null || term.compareTo(target) > 0) {
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null; // should be non-null since it's in the index
      }

      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }
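As a standalone reminder of the Arrays.binarySearch contract used above (a self-contained illustration, not project code): a miss returns (-(insertionPoint) - 1), so -startIdx - 1 recovers where the target would be inserted.

 // Arrays.binarySearch encodes a miss as (-(insertionPoint) - 1).
 String[] indexed = {"apple", "mango", "zebra"};
 int result = java.util.Arrays.binarySearch(indexed, "banana");
 assert result < 0; // not present
 int insertionPoint = -result - 1; // == 1: "banana" would sit before "mango"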
  @Override
  public BytesRef writeToBytes() {
    long start = System.nanoTime();
    int size = set.size();

    BytesRef bytes = new BytesRef(new byte[HEADER_SIZE + (int) bytesUsed.get()]);

    // Encode encoding type
    Bytes.writeInt(bytes, this.getEncoding().ordinal());

    // Encode flag
    bytes.bytes[bytes.offset++] = (byte) (this.isPruned() ? 1 : 0);

    // Encode size of the set
    Bytes.writeInt(bytes, size);

    // Encode longs
    BytesRef reusable = new BytesRef();
    for (int i = 0; i < this.set.size(); i++) {
      this.set.get(i, reusable);
      Bytes.writeBytesRef(reusable, bytes);
    }

    logger.debug(
        "Serialized {} terms - took {} ms", this.size(), (System.nanoTime() - start) / 1000000);

    bytes.length = bytes.offset;
    bytes.offset = 0;
    return bytes;
  }
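Read back-to-back with the writes above, the serialized layout is (a sketch inferred from this method, with the per-term encoding delegated to Bytes.writeBytesRef):

  // [int: encoding ordinal][byte: pruned flag][int: set size]
  // [term 0 as written by Bytes.writeBytesRef][term 1]...[term size-1]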
    @Override
    public void hitExecute(SearchContext context, HitContext hitContext) {
      if (context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded() == false) {
        return;
      }
      String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField();

      if (hitContext.hit().fieldsOrNull() == null) {
        hitContext.hit().fields(new HashMap<>());
      }
      SearchHitField hitField = hitContext.hit().fields().get(NAMES[0]);
      if (hitField == null) {
        hitField = new InternalSearchHitField(NAMES[0], new ArrayList<>(1));
        hitContext.hit().fields().put(NAMES[0], hitField);
      }
      TermVectorsResponse termVector =
          TermVectorsService.getTermVectors(
              context.indexShard(),
              new TermVectorsRequest(
                  context.indexShard().shardId().getIndex().getName(),
                  hitContext.hit().type(),
                  hitContext.hit().id()));
      try {
        Map<String, Integer> tv = new HashMap<>();
        TermsEnum terms = termVector.getFields().terms(field).iterator();
        BytesRef term;
        while ((term = terms.next()) != null) {
          tv.put(term.utf8ToString(), terms.postings(null, PostingsEnum.ALL).freq());
        }
        hitField.values().add(tv);
      } catch (IOException e) {
        ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName())
            .info("Swallowed exception", e);
      }
    }
 public void testIntStream() throws Exception {
   final NumericTokenStream stream = new NumericTokenStream().setIntValue(ivalue);
    // use getAttribute to test that the attributes really exist; if not, an IAE will be thrown
   final TermToBytesRefAttribute bytesAtt = stream.getAttribute(TermToBytesRefAttribute.class);
   final TypeAttribute typeAtt = stream.getAttribute(TypeAttribute.class);
   final NumericTokenStream.NumericTermAttribute numericAtt =
       stream.getAttribute(NumericTokenStream.NumericTermAttribute.class);
   final BytesRef bytes = bytesAtt.getBytesRef();
   stream.reset();
   assertEquals(32, numericAtt.getValueSize());
   for (int shift = 0; shift < 32; shift += NumericUtils.PRECISION_STEP_DEFAULT) {
     assertTrue("New token is available", stream.incrementToken());
     assertEquals("Shift value wrong", shift, numericAtt.getShift());
     final int hash = bytesAtt.fillBytesRef();
     assertEquals("Hash incorrect", bytes.hashCode(), hash);
     assertEquals(
         "Term is incorrectly encoded",
         ivalue & ~((1 << shift) - 1),
         NumericUtils.prefixCodedToInt(bytes));
     assertEquals(
         "Term raw value is incorrectly encoded",
         ((long) ivalue) & ~((1L << shift) - 1L),
         numericAtt.getRawValue());
     assertEquals(
         "Type incorrect",
         (shift == 0)
             ? NumericTokenStream.TOKEN_TYPE_FULL_PREC
             : NumericTokenStream.TOKEN_TYPE_LOWER_PREC,
         typeAtt.type());
   }
   assertFalse("More tokens available", stream.incrementToken());
   stream.end();
   stream.close();
 }
 private static BytesRef readBytesRef(IndexInput in) throws IOException {
   BytesRef bytes = new BytesRef();
   bytes.length = in.readVInt();
   bytes.bytes = new byte[bytes.length];
   in.readBytes(bytes.bytes, 0, bytes.length);
   return bytes;
 }
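A plausible counterpart writer for the reader above; the method below is an assumption for illustration, not the original encoder:

 // Hypothetical mirror of readBytesRef: a vInt length prefix, then the raw bytes.
 private static void writeBytesRef(IndexOutput out, BytesRef bytes) throws IOException {
   out.writeVInt(bytes.length);
   out.writeBytes(bytes.bytes, bytes.offset, bytes.length);
 }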
Example #13
  /**
   * Returns the next String in lexicographic order that will not put the machine into a reject
   * state.
   *
   * <p>This method traverses the DFA from the given position in the String, starting at the given
   * state.
   *
   * <p>If this cannot satisfy the machine, returns false. This method will walk the minimal path,
   * in lexicographic order, as long as possible.
   *
   * <p>If this method returns false, there might still be more solutions; it is necessary to
   * backtrack to find out.
   *
   * @param state current non-reject state
   * @param position useful portion of the string
   * @return true if more possible solutions exist for the DFA from this position
   */
  private boolean nextString(int state, int position) {
    /*
     * the next lexicographic character must be greater than the existing
     * character, if it exists.
     */
    int c = 0;
    if (position < seekBytesRef.length) {
      c = seekBytesRef.bytes[position] & 0xff;
      // if the next byte is 0xff and is not part of the useful portion,
      // then by definition it puts us in a reject state, and therefore this
      // path is dead. there cannot be any higher transitions. backtrack.
      if (c++ == 0xff) return false; // otherwise bump c: the next byte must be strictly greater
    }

    seekBytesRef.length = position;
    visited[state] = curGen;

    Transition[] transitions = allTransitions[state];

    // find the minimal path (lexicographic order) that is >= c

    for (int i = 0; i < transitions.length; i++) {
      Transition transition = transitions[i];
      if (transition.getMax() >= c) {
        int nextChar = Math.max(c, transition.getMin());
        // append either the next sequential char, or the minimum transition
        seekBytesRef.grow(seekBytesRef.length + 1);
        seekBytesRef.length++;
        seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar;
        state = transition.getDest().getNumber();
        /*
         * as long as is possible, continue down the minimal path in
         * lexicographic order. if a loop or accept state is encountered, stop.
         */
        while (visited[state] != curGen && !runAutomaton.isAccept(state)) {
          visited[state] = curGen;
          /*
           * Note: we work with a DFA with no transitions to dead states.
           * so the below is ok, if it is not an accept state,
           * then there MUST be at least one transition.
           */
          transition = allTransitions[state][0];
          state = transition.getDest().getNumber();

          // append the minimum transition
          seekBytesRef.grow(seekBytesRef.length + 1);
          seekBytesRef.length++;
          seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();

          // we found a loop, record it for faster enumeration
          if (!finite && !linear && visited[state] == curGen) {
            setLinear(seekBytesRef.length - 1);
          }
        }
        return true;
      }
    }
    return false;
  }
 @Override
 public BytesRef lookupOrd(long ord) {
   assertThread("Sorted set doc values", creationThread);
   assert ord >= 0 && ord < valueCount;
   final BytesRef result = in.lookupOrd(ord);
   assert result.isValid();
   return result;
 }
 @Override
 public BytesRef term() throws IOException {
   assertThread("Terms enums", creationThread);
   assert state == State.POSITIONED : "term() called on unpositioned TermsEnum";
   BytesRef ret = super.term();
   assert ret == null || ret.isValid();
   return ret;
 }
 /**
  * Ensure we own term.bytes so that it's safe to modify. We detect this via a kluge:
  * cellsByLevel[0].termBuf, when non-null, is a buffer pre-allocated to replace term.bytes.
  */
 void ensureOwnTermBytes() {
   NRCell cell0 = cellsByLevel[0];
   if (cell0.termBuf == null) return; // we already own the bytes
   System.arraycopy(term.bytes, term.offset, cell0.termBuf, 0, term.length);
   term.bytes = cell0.termBuf;
   term.offset = 0;
   cell0.termBuf = null;
 }
 protected static void copy(BytesRef from, BytesRef to) {
   if (to.bytes.length < from.length) {
     to.bytes = new byte[ArrayUtil.oversize(from.length, RamUsageEstimator.NUM_BYTES_BYTE)];
   }
   to.offset = 0;
   to.length = from.length;
   System.arraycopy(from.bytes, from.offset, to.bytes, 0, from.length);
 }
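A hedged usage sketch: the point of copy() over BytesRef.deepCopyOf is that one scratch target can be reused across an enumeration instead of allocating per term (termsEnum below is assumed to be an open TermsEnum):

 BytesRef scratch = new BytesRef();
 BytesRef term;
 while ((term = termsEnum.next()) != null) {
   copy(term, scratch); // scratch is overwritten on the next iteration
   // ... consume scratch before advancing ...
 }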
Example #18
 @Override
 public void seekExact(BytesRef target, TermState otherState) {
   if (!target.equals(term)) {
     state.copyFrom(otherState);
     term = BytesRef.deepCopyOf(target);
     seekPending = true;
   }
 }
 @Override
 public BytesRef getTokenBytesNoLeaf(BytesRef result) {
   if (result == null) result = new BytesRef();
   result.bytes = term.bytes;
   result.offset = term.offset;
   result.length = termLenByLevel[cellLevel];
   assert result.length <= term.length;
   return result;
 }
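Note that the method above hands back a shallow slice: result shares term.bytes rather than copying it. The same idiom in isolation (values are illustrative):

 // A shallow prefix "view" over an existing BytesRef; no bytes are copied,
 // so mutating source.bytes is visible through view.
 BytesRef source = new BytesRef("abcdef");
 BytesRef view = new BytesRef();
 view.bytes = source.bytes;
 view.offset = source.offset;
 view.length = 3; // view now reads "abc" from the shared backing array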
 @Test
 public void testNoCopy() throws Exception {
   BytesRef ref = new BytesRef("i do not want to be copied!");
   BytesRef sub1 = SubstrFunction.substring(ref, 0, 10);
   BytesRef sub2 = SubstrFunction.substring(ref, 5, 14);
   assertThat(sub1.utf8ToString(), is("i do not w"));
   assertThat(sub2.utf8ToString(), is("not want "));
   assertThat(ref.bytes, allOf(is(sub2.bytes), is(sub1.bytes)));
 }
 @Override
 public long lookupTerm(BytesRef key) {
   assertThread("Sorted set doc values", creationThread);
   assert key.isValid();
   long result = in.lookupTerm(key);
   assert result < valueCount;
   assert key.isValid();
   return result;
 }
Example #22
 public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field)
     throws Exception {
   BytesRef term;
   while ((term = termsEnum.next()) != null) {
     BytesRef r = new BytesRef();
     r.copyBytes(term);
     tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq()));
   }
 }
 @Override
 public int hashCode() {
   final int prime = 31;
   int result = super.hashCode();
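    // 1231 and 1237 match Boolean.hashCode's values for true and false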
   result = prime * result + (includeLower ? 1231 : 1237);
   result = prime * result + (includeUpper ? 1231 : 1237);
   result = prime * result + ((lowerTerm == null) ? 0 : lowerTerm.hashCode());
   result = prime * result + ((upperTerm == null) ? 0 : upperTerm.hashCode());
   return result;
 }
  @Override
  public final boolean incrementToken() throws IOException {
    clearAttributes();

    BytesRef bytes = termsEnum.next();
    if (bytes == null) return false;
    charTerm.setEmpty();
    charTerm.append(bytes.utf8ToString());
    return true;
  }
    @Override
    public void collect(int doc) throws IOException {
      if (doc > facetFieldTermsIndex.docID()) {
        facetFieldTermsIndex.advance(doc);
      }

      int facetOrd;
      if (doc == facetFieldTermsIndex.docID()) {
        facetOrd = facetFieldTermsIndex.ordValue();
      } else {
        facetOrd = -1;
      }

      if (facetOrd < startFacetOrd || facetOrd >= endFacetOrd) {
        return;
      }

      if (doc > groupFieldTermsIndex.docID()) {
        groupFieldTermsIndex.advance(doc);
      }

      int groupOrd;
      if (doc == groupFieldTermsIndex.docID()) {
        groupOrd = groupFieldTermsIndex.ordValue();
      } else {
        groupOrd = -1;
      }
      int segmentGroupedFacetsIndex =
          groupOrd * (facetFieldTermsIndex.getValueCount() + 1) + facetOrd;
      if (segmentGroupedFacetHits.exists(segmentGroupedFacetsIndex)) {
        return;
      }

      segmentTotalCount++;
      segmentFacetCounts[facetOrd + 1]++;

      segmentGroupedFacetHits.put(segmentGroupedFacetsIndex);

      BytesRef groupKey;
      if (groupOrd == -1) {
        groupKey = null;
      } else {
        groupKey = BytesRef.deepCopyOf(groupFieldTermsIndex.lookupOrd(groupOrd));
      }

      BytesRef facetKey;
      if (facetOrd == -1) {
        facetKey = null;
      } else {
        facetKey = BytesRef.deepCopyOf(facetFieldTermsIndex.lookupOrd(facetOrd));
      }

      groupedFacetHits.add(new GroupedFacetHit(groupKey, facetKey));
    }
 // for debugging
 @SuppressWarnings("unused")
 static String brToString(BytesRef b) {
   try {
     return b.utf8ToString() + " " + b;
   } catch (Throwable t) {
     // If BytesRef isn't actually UTF8, or it's eg a
     // prefix of UTF8 that ends mid-unicode-char, we
     // fallback to hex:
     return b.toString();
   }
 }
  @Test
  public void testTokenStream() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    ContextSuggestField field =
        new ContextSuggestField("field", "input", 1, "context1", "context2");
    BytesRef surfaceForm = new BytesRef("input");
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
      output.writeVInt(surfaceForm.length);
      output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
      output.writeVInt(1 + 1);
      output.writeByte(ContextSuggestField.TYPE);
    }
    BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
    String[] expectedOutputs = new String[2];
    CharsRefBuilder builder = new CharsRefBuilder();
    builder.append("context1");
    builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("input");
    expectedOutputs[0] = builder.toCharsRef().toString();
    builder.clear();
    builder.append("context2");
    builder.append(((char) ContextSuggestField.CONTEXT_SEPARATOR));
    builder.append(((char) CompletionAnalyzer.SEP_LABEL));
    builder.append("input");
    expectedOutputs[1] = builder.toCharsRef().toString();
    TokenStream stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            field.tokenStream(analyzer, null));
    assertTokenStreamContents(
        stream,
        expectedOutputs,
        null,
        null,
        new String[] {payload.utf8ToString(), payload.utf8ToString()},
        new int[] {1, 1},
        null,
        null);

    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
    stream =
        new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(
            field.tokenStream(completionAnalyzer, null));
    assertTokenStreamContents(
        stream,
        expectedOutputs,
        null,
        null,
        new String[] {payload.utf8ToString(), payload.utf8ToString()},
        new int[] {1, 1},
        null,
        null);
  }
 private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
   TermToBytesRefAttribute termAttr = stream.getAttribute(TermToBytesRefAttribute.class);
   BytesRef bytesRef = termAttr.getBytesRef();
   stream.reset();
   while (stream.incrementToken()) {
     termAttr.fillBytesRef(); // after the loop, bytesRef holds the last token's bytes
   }
   stream.end();
   stream.close();
   BytesRef copy = new BytesRef();
   copy.copyBytes(bytesRef);
   return copy;
 }
Example #29
 /** decodes the payload at the current position */
 protected BytesRef decodePayload(BytesRef scratch, ByteArrayDataInput tmpInput) {
    tmpInput.reset(scratch.bytes); // note: assumes scratch.offset == 0
   tmpInput.skipBytes(scratch.length - 2); // skip to payload size
   short payloadLength = tmpInput.readShort(); // read payload size
   tmpInput.setPosition(scratch.length - 2 - payloadLength); // setPosition to start of payload
   BytesRef payloadScratch = new BytesRef(payloadLength);
   tmpInput.readBytes(payloadScratch.bytes, 0, payloadLength); // read payload
   payloadScratch.length = payloadLength;
   scratch.length -= 2; // payload length info (short)
   scratch.length -= payloadLength; // payload
   return payloadScratch;
 }
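decodePayload implies records laid out as [entry bytes][payload bytes][2-byte payload length]. A hypothetical encoder for that layout, shown for illustration only (not the original writer):

 // Append the payload, then a trailing short holding its length; this is
 // the trailer that decodePayload above walks backwards over.
 protected void encodePayload(ByteArrayDataOutput tmpOutput, BytesRef payload) throws IOException {
   tmpOutput.writeBytes(payload.bytes, payload.offset, payload.length);
   tmpOutput.writeShort((short) payload.length);
 }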
    @Override
    public int nextPosition() throws IOException {
      final int pos;
      if (readPositions) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + POS.length,
            scratch.length - POS.length,
            scratchUTF16_2);
        pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      } else {
        pos = -1;
      }

      if (readOffsets) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, START_OFFSET)
            : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + START_OFFSET.length,
            scratch.length - START_OFFSET.length,
            scratchUTF16_2);
        startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + END_OFFSET.length,
            scratch.length - END_OFFSET.length,
            scratchUTF16_2);
        endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      }

      final long fp = in.getFilePointer();
      SimpleTextUtil.readLine(in, scratch);
      if (StringHelper.startsWith(scratch, PAYLOAD)) {
        final int len = scratch.length - PAYLOAD.length;
        if (scratch2.bytes.length < len) {
          scratch2.grow(len);
        }
        System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
        scratch2.length = len;
        payload = scratch2;
      } else {
        payload = null;
        in.seek(fp);
      }
      return pos;
    }