public void testRandomReads() throws IOException {
   int length = randomIntBetween(10, scaledRandomIntBetween(PAGE_SIZE * 2, PAGE_SIZE * 20));
   BytesReference pbr = newBytesReference(length);
   StreamInput streamInput = pbr.streamInput();
   BytesRefBuilder target = new BytesRefBuilder();
   while (target.length() < pbr.length()) {
     switch (randomIntBetween(0, 10)) {
       case 6:
       case 5:
         target.append(new BytesRef(new byte[] {streamInput.readByte()}));
         break;
       case 4:
       case 3:
         BytesRef bytesRef =
             streamInput.readBytesRef(scaledRandomIntBetween(1, pbr.length() - target.length()));
         target.append(bytesRef);
         break;
       default:
         byte[] buffer = new byte[scaledRandomIntBetween(1, pbr.length() - target.length())];
         int offset = scaledRandomIntBetween(0, buffer.length - 1);
         int read = streamInput.read(buffer, offset, buffer.length - offset);
         target.append(new BytesRef(buffer, offset, read));
         break;
     }
   }
   assertEquals(pbr.length(), target.length());
   BytesRef targetBytes = target.get();
   assertArrayEquals(
       pbr.toBytes(),
       Arrays.copyOfRange(targetBytes.bytes, targetBytes.offset, targetBytes.length));
 }
Пример #2
0
      /* Decodes only the term bytes of the next term.  If caller then asks for
      metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
      decode all metadata up to the current term. */
      private BytesRef _next() throws IOException {
        // System.out.println("BTR._next seg=" + segment + " this=" + this + " termCount=" +
        // state.termBlockOrd + " (vs " + blockTermCount + ")");
        if (state.termBlockOrd == blockTermCount && !nextBlock()) {
          // System.out.println("  eof");
          indexIsCurrent = false;
          return null;
        }

        // TODO: cutover to something better for these ints!  simple64?
        final int suffix = termSuffixesReader.readVInt();
        // System.out.println("  suffix=" + suffix);

        term.setLength(termBlockPrefix + suffix);
        term.grow(term.length());
        termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
        state.termBlockOrd++;

        // NOTE: meaningless in the non-ord case
        state.ord++;

        // System.out.println("  return term=" + fieldInfo.name + ":" + term.utf8ToString() + " " +
        // term + " tbOrd=" + state.termBlockOrd);
        return term.get();
      }
 private void readField(BytesRef type, FieldInfo fieldInfo, StoredFieldVisitor visitor)
     throws IOException {
   readLine();
   assert StringHelper.startsWith(scratch.get(), VALUE);
   if (type == TYPE_STRING) {
     byte[] bytes = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, bytes, 0, bytes.length);
     visitor.stringField(fieldInfo, bytes);
   } else if (type == TYPE_BINARY) {
     byte[] copy = new byte[scratch.length() - VALUE.length];
     System.arraycopy(scratch.bytes(), VALUE.length, copy, 0, copy.length);
     visitor.binaryField(fieldInfo, copy);
   } else if (type == TYPE_INT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.intField(fieldInfo, Integer.parseInt(scratchUTF16.toString()));
   } else if (type == TYPE_LONG) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.longField(fieldInfo, Long.parseLong(scratchUTF16.toString()));
   } else if (type == TYPE_FLOAT) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.floatField(fieldInfo, Float.parseFloat(scratchUTF16.toString()));
   } else if (type == TYPE_DOUBLE) {
     scratchUTF16.copyUTF8Bytes(scratch.bytes(), VALUE.length, scratch.length() - VALUE.length);
     visitor.doubleField(fieldInfo, Double.parseDouble(scratchUTF16.toString()));
   }
 }
  @Override
  public SeekStatus seekCeil(final BytesRef target) throws IOException {
    if (fr.index == null) {
      throw new IllegalStateException("terms index was not loaded");
    }

    term.grow(1 + target.length);

    assert clearEOF();

    // if (DEBUG) {
    // System.out.println("\nBTTR.seekCeil seg=" + segment + " target=" + fieldInfo.name + ":" +
    // target.utf8ToString() + " " + target + " current=" + brToString(term) + " (exists?=" +
    // termExists + ") validIndexPrefix=  " + validIndexPrefix);
    // printSeekState();
    // }

    FST.Arc<Pair<BytesRef, Long>> arc;
    int targetUpto;
    Pair<BytesRef, Long> output;

    targetBeforeCurrentLength = currentFrame.ord;

    if (currentFrame != staticFrame) {

      // We are already seek'd; find the common
      // prefix of new seek term vs current term and
      // re-use the corresponding seek state.  For
      // example, if app first seeks to foobar, then
      // seeks to foobaz, we can re-use the seek state
      // for the first 5 bytes.

      // if (DEBUG) {
      // System.out.println("  re-use current seek state validIndexPrefix=" + validIndexPrefix);
      // }

      arc = arcs[0];
      assert arc.isFinal();
      output = arc.output;
      targetUpto = 0;

      IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
      assert validIndexPrefix <= term.length();

      final int targetLimit = Math.min(target.length, validIndexPrefix);

      int cmp = 0;

      // TODO: we should write our vLong backwards (MSB
      // first) to get better sharing from the FST

      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
        // if (DEBUG) {
        // System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit +
        // ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + "
        // vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"   + " arc.output=" + arc.output +
        // " output=" + output);
        // }
        if (cmp != 0) {
          break;
        }
        arc = arcs[1 + targetUpto];
        assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF)
            : "arc.label="
                + (char) arc.label
                + " targetLabel="
                + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
        // TODO: we could save the outputs in local
        // byte[][] instead of making new objs ever
        // seek; but, often the FST doesn't have any
        // shared bytes (but this could change if we
        // reverse vLong byte order)
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }
        if (arc.isFinal()) {
          lastFrame = stack[1 + lastFrame.ord];
        }
        targetUpto++;
      }

      if (cmp == 0) {
        final int targetUptoMid = targetUpto;
        // Second compare the rest of the term, but
        // don't save arc/output/frame:
        final int targetLimit2 = Math.min(target.length, term.length());
        while (targetUpto < targetLimit2) {
          cmp =
              (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
          // if (DEBUG) {
          // System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit
          // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto])
          // + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
          // }
          if (cmp != 0) {
            break;
          }
          targetUpto++;
        }

        if (cmp == 0) {
          cmp = term.length() - target.length;
        }
        targetUpto = targetUptoMid;
      }

      if (cmp < 0) {
        // Common case: target term is after current
        // term, ie, app is seeking multiple terms
        // in sorted order
        // if (DEBUG) {
        // System.out.println("  target is after current (shares prefixLen=" + targetUpto + ");
        // clear frame.scanned ord=" + lastFrame.ord);
        // }
        currentFrame = lastFrame;

      } else if (cmp > 0) {
        // Uncommon case: target term
        // is before current term; this means we can
        // keep the currentFrame but we must rewind it
        // (so we scan from the start)
        targetBeforeCurrentLength = 0;
        // if (DEBUG) {
        // System.out.println("  target is before current (shares prefixLen=" + targetUpto + ");
        // rewind frame ord=" + lastFrame.ord);
        // }
        currentFrame = lastFrame;
        currentFrame.rewind();
      } else {
        // Target is exactly the same as current term
        assert term.length() == target.length;
        if (termExists) {
          // if (DEBUG) {
          // System.out.println("  target is same as current; return FOUND");
          // }
          return SeekStatus.FOUND;
        } else {
          // if (DEBUG) {
          // System.out.println("  target is same as current but term doesn't exist");
          // }
        }
      }

    } else {

      targetBeforeCurrentLength = -1;
      arc = fr.index.getFirstArc(arcs[0]);

      // Empty string prefix must have an output (block) in the index!
      assert arc.isFinal();
      assert arc.output != null;

      // if (DEBUG) {
      // System.out.println("    no seek state; push root frame");
      // }

      output = arc.output;

      currentFrame = staticFrame;

      // term.length = 0;
      targetUpto = 0;
      currentFrame =
          pushFrame(
              arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    // if (DEBUG) {
    // System.out.println("  start index loop targetUpto=" + targetUpto + " output=" + output + "
    // currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" +
    // targetBeforeCurrentLength);
    // }

    // We are done sharing the common prefix with the incoming target and where we are currently
    // seek'd; now continue walking the index:
    while (targetUpto < target.length) {

      final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

      final FST.Arc<Pair<BytesRef, Long>> nextArc =
          fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);

      if (nextArc == null) {

        // Index is exhausted
        // if (DEBUG) {
        //   System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " +
        // toHex(targetLabel));
        // }

        validIndexPrefix = currentFrame.prefix;
        // validIndexPrefix = targetUpto;

        currentFrame.scanToFloorFrame(target);

        currentFrame.loadBlock();

        final SeekStatus result = currentFrame.scanToTerm(target, false);
        if (result == SeekStatus.END) {
          term.copyBytes(target);
          termExists = false;

          if (next() != null) {
            // if (DEBUG) {
            // System.out.println("  return NOT_FOUND term=" + brToString(term) + " " + term);
            // }
            return SeekStatus.NOT_FOUND;
          } else {
            // if (DEBUG) {
            // System.out.println("  return END");
            // }
            return SeekStatus.END;
          }
        } else {
          // if (DEBUG) {
          // System.out.println("  return " + result + " term=" + brToString(term) + " " + term);
          // }
          return result;
        }
      } else {
        // Follow this arc
        term.setByteAt(targetUpto, (byte) targetLabel);
        arc = nextArc;
        // Aggregate output as we go:
        assert arc.output != null;
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        // if (DEBUG) {
        // System.out.println("    index: follow label=" + toHex(target.bytes[target.offset +
        // targetUpto]&0xff) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
        // }
        targetUpto++;

        if (arc.isFinal()) {
          // if (DEBUG) System.out.println("    arc is final!");
          currentFrame =
              pushFrame(
                  arc,
                  VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput),
                  targetUpto);
          // if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" +
          // currentFrame.hasTerms);
        }
      }
    }

    // validIndexPrefix = targetUpto;
    validIndexPrefix = currentFrame.prefix;

    currentFrame.scanToFloorFrame(target);

    currentFrame.loadBlock();

    final SeekStatus result = currentFrame.scanToTerm(target, false);

    if (result == SeekStatus.END) {
      term.copyBytes(target);
      termExists = false;
      if (next() != null) {
        // if (DEBUG) {
        // System.out.println("  return NOT_FOUND term=" + term.utf8ToString() + " " + term);
        // }
        return SeekStatus.NOT_FOUND;
      } else {
        // if (DEBUG) {
        // System.out.println("  return END");
        // }
        return SeekStatus.END;
      }
    } else {
      return result;
    }
  }
  /**
   * Optimized version of {@link #seekExact(BytesRef)} that can sometimes fail-fast if the version
   * indexed with the requested ID is less than the specified minIDVersion. Applications that index
   * a monotonically increasing global version with each document can use this for fast optimistic
   * concurrency.
   */
  public boolean seekExact(final BytesRef target, long minIDVersion) throws IOException {

    if (fr.index == null) {
      throw new IllegalStateException("terms index was not loaded");
    }

    term.grow(1 + target.length);

    assert clearEOF();

    //  if (DEBUG) {
    //    System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" +
    // fr.fieldInfo.name + ":" + brToString(target) + " minIDVersion=" + minIDVersion + " current="
    // + brToString(term) + " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix);
    //   printSeekState(System.out);
    //  }

    FST.Arc<Pair<BytesRef, Long>> arc;
    int targetUpto;
    Pair<BytesRef, Long> output;

    long startFrameFP = currentFrame.fp;

    targetBeforeCurrentLength = currentFrame.ord;

    boolean changed = false;

    // TODO: we could stop earlier w/ the version check, every time we traverse an index arc we can
    // check?

    if (currentFrame != staticFrame) {

      // We are already seek'd; find the common
      // prefix of new seek term vs current term and
      // re-use the corresponding seek state.  For
      // example, if app first seeks to foobar, then
      // seeks to foobaz, we can re-use the seek state
      // for the first 5 bytes.

      // if (DEBUG) {
      //    System.out.println("  re-use current seek state validIndexPrefix=" + validIndexPrefix);
      //  }

      arc = arcs[0];
      assert arc.isFinal();
      output = arc.output;
      targetUpto = 0;

      IDVersionSegmentTermsEnumFrame lastFrame = stack[0];
      assert validIndexPrefix <= term.length()
          : "validIndexPrefix="
              + validIndexPrefix
              + " term.length="
              + term.length()
              + " seg="
              + fr.parent;

      final int targetLimit = Math.min(target.length, validIndexPrefix);

      int cmp = 0;

      // TODO: reverse vLong byte order for better FST
      // prefix output sharing

      // First compare up to valid seek frames:
      while (targetUpto < targetLimit) {
        cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
        // if (DEBUG) {
        //    System.out.println("    cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit
        // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) +
        // " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"   + " arc.output=" + arc.output
        // + " output=" + output);
        // }
        if (cmp != 0) {
          break;
        }
        arc = arcs[1 + targetUpto];
        // if (arc.label != (target.bytes[target.offset + targetUpto] & 0xFF)) {
        // System.out.println("FAIL: arc.label=" + (char) arc.label + " targetLabel=" + (char)
        // (target.bytes[target.offset + targetUpto] & 0xFF));
        // }
        assert arc.label == (target.bytes[target.offset + targetUpto] & 0xFF)
            : "arc.label="
                + (char) arc.label
                + " targetLabel="
                + (char) (target.bytes[target.offset + targetUpto] & 0xFF);
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }
        if (arc.isFinal()) {
          lastFrame = stack[1 + lastFrame.ord];
        }
        targetUpto++;
      }

      if (cmp == 0) {
        final int targetUptoMid = targetUpto;

        // Second compare the rest of the term, but
        // don't save arc/output/frame; we only do this
        // to find out if the target term is before,
        // equal or after the current term
        final int targetLimit2 = Math.min(target.length, term.length());
        while (targetUpto < targetLimit2) {
          cmp =
              (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF);
          // if (DEBUG) {
          //    System.out.println("    cycle2 targetUpto=" + targetUpto + " (vs limit=" +
          // targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset +
          // targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")");
          // }
          if (cmp != 0) {
            break;
          }
          targetUpto++;
        }

        if (cmp == 0) {
          cmp = term.length() - target.length;
        }
        targetUpto = targetUptoMid;
      }

      if (cmp < 0) {
        // Common case: target term is after current
        // term, ie, app is seeking multiple terms
        // in sorted order
        // if (DEBUG) {
        //    System.out.println("  target is after current (shares prefixLen=" + targetUpto + ");
        // frame.ord=" + lastFrame.ord + "; targetUpto=" + targetUpto);
        //  }
        currentFrame = lastFrame;

      } else if (cmp > 0) {
        // Uncommon case: target term
        // is before current term; this means we can
        // keep the currentFrame but we must rewind it
        // (so we scan from the start)
        targetBeforeCurrentLength = 0;
        changed = true;
        // if (DEBUG) {
        //    System.out.println("  target is before current (shares prefixLen=" + targetUpto + ");
        // rewind frame ord=" + lastFrame.ord);
        //  }
        currentFrame = lastFrame;
        currentFrame.rewind();
      } else {
        // Target is exactly the same as current term
        assert term.length() == target.length;
        if (termExists) {

          if (currentFrame.maxIDVersion < minIDVersion) {
            // The max version for all terms in this block is lower than the minVersion
            // if (DEBUG) {
            //   System.out.println("  target is same as current maxIDVersion=" +
            // currentFrame.maxIDVersion + " is < minIDVersion=" + minIDVersion + "; return false");
            // }
            return false;
          }

          currentFrame.decodeMetaData();
          if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
            // This term's version is lower than the minVersion
            // if (DEBUG) {
            //   System.out.println("  target is same as current but version=" +
            // ((IDVersionTermState) currentFrame.state).idVersion + " is < minIDVersion=" +
            // minIDVersion + "; return false");
            // }
            return false;
          }
          // System.out.println("  term version=" + ((IDVersionTermState)
          // currentFrame.state).idVersion + " frame version=" + currentFrame.maxIDVersion + " frame
          // ord=" + currentFrame.ord);

          // if (DEBUG) {
          //    System.out.println("  target is same as current; return true");
          //  }
          return true;
        } else {
          // if (DEBUG) {
          //    System.out.println("  target is same as current but term doesn't exist");
          //  }
        }
        // validIndexPrefix = currentFrame.depth;
        // term.length = target.length;
        // return termExists;
      }

    } else {

      targetBeforeCurrentLength = -1;
      arc = fr.index.getFirstArc(arcs[0]);
      // System.out.println("first arc=" + arc);

      // Empty string prefix must have an output (block) in the index!
      assert arc.isFinal();
      assert arc.output != null;

      // if (DEBUG) {
      //    System.out.println("    no seek state; push root frame");
      //  }

      output = arc.output;

      currentFrame = staticFrame;

      // term.length = 0;
      targetUpto = 0;
      currentFrame =
          pushFrame(
              arc, VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput), 0);
    }

    // if (DEBUG) {
    //   System.out.println("  start index loop targetUpto=" + targetUpto + " output=" + output + "
    // currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" +
    // targetBeforeCurrentLength + " termExists=" + termExists);
    // }

    // We are done sharing the common prefix with the incoming target and where we are currently
    // seek'd; now continue walking the index:
    while (targetUpto < target.length) {

      final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF;

      final FST.Arc<Pair<BytesRef, Long>> nextArc =
          fr.index.findTargetArc(targetLabel, arc, getArc(1 + targetUpto), fstReader);

      if (nextArc == null) {

        // Index is exhausted
        // if (DEBUG) {
        //    System.out.println("    index: index exhausted label=" + ((char) targetLabel) + " " +
        // Integer.toHexString(targetLabel) + " termExists=" + termExists);
        //  }

        validIndexPrefix = currentFrame.prefix;
        // validIndexPrefix = targetUpto;

        currentFrame.scanToFloorFrame(target);

        if (!currentFrame.hasTerms) {
          termExists = false;
          term.setByteAt(targetUpto, (byte) targetLabel);
          term.setLength(1 + targetUpto);
          // if (DEBUG) {
          //    System.out.println("  FAST NOT_FOUND term=" + brToString(term));
          //  }
          return false;
        }
        // System.out.println("  check maxVersion=" + currentFrame.maxIDVersion + " vs " +
        // minIDVersion);

        // if (DEBUG) {
        //   System.out.println("  frame.maxIDVersion=" + currentFrame.maxIDVersion +  " vs
        // minIDVersion=" + minIDVersion);
        // }

        if (currentFrame.maxIDVersion < minIDVersion) {
          // The max version for all terms in this block is lower than the minVersion
          if (currentFrame.fp != startFrameFP || changed) {
            // if (targetUpto+1 > term.length) {
            termExists = false;
            term.setByteAt(targetUpto, (byte) targetLabel);
            term.setLength(1 + targetUpto);
            // if (DEBUG) {
            //   System.out.println("    reset current term");
            // }
            validIndexPrefix = Math.min(validIndexPrefix, term.length());
          }
          // if (currentFrame.ord != startFrameOrd) {
          // termExists = false;
          // }
          // if (DEBUG) {
          //   System.out.println("    FAST version NOT_FOUND term=" + brToString(term) + "
          // targetUpto=" + targetUpto + " currentFrame.maxIDVersion=" + currentFrame.maxIDVersion +
          // " validIndexPrefix=" + validIndexPrefix + " startFrameFP=" + startFrameFP + " vs " +
          // currentFrame.fp + " termExists=" + termExists);
          // }
          return false;
        }

        currentFrame.loadBlock();

        // if (DEBUG) {
        //   System.out.println("    scan currentFrame ord=" + currentFrame.ord);
        // }
        final SeekStatus result = currentFrame.scanToTerm(target, true);
        if (result == SeekStatus.FOUND) {
          currentFrame.decodeMetaData();
          if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
            // This term's version is lower than the minVersion
            // if (DEBUG) {
            //   System.out.println("    return NOT_FOUND: idVersion=" + ((IDVersionTermState)
            // currentFrame.state).idVersion + " vs minIDVersion=" + minIDVersion);
            // }
            return false;
          }

          // if (DEBUG) {
          //    System.out.println("  return FOUND term=" + term.utf8ToString() + " " + term);
          //  }

          return true;
        } else {
          // if (DEBUG) {
          //    System.out.println("  got " + result + "; return NOT_FOUND term=" +
          // brToString(term));
          // }
          return false;
        }
      } else {
        // Follow this arc
        arc = nextArc;
        if (term.byteAt(targetUpto) != (byte) targetLabel) {
          // if (DEBUG) {
          //   System.out.println("  now set termExists=false targetUpto=" + targetUpto + " term=" +
          // term.bytes[targetUpto] + " targetLabel=" + targetLabel);
          // }
          changed = true;
          term.setByteAt(targetUpto, (byte) targetLabel);
          termExists = false;
        }
        // Aggregate output as we go:
        assert arc.output != null;
        if (arc.output != VersionBlockTreeTermsWriter.NO_OUTPUT) {
          output = VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.output);
        }

        // if (DEBUG) {
        //    System.out.println("    index: follow label=" + (char) ((target.bytes[target.offset +
        // targetUpto]&0xff)) + " arc.output=" + arc.output + " arc.nfo=" + arc.nextFinalOutput);
        //  }
        targetUpto++;

        if (arc.isFinal()) {
          // if (DEBUG) System.out.println("    arc is final!");
          currentFrame =
              pushFrame(
                  arc,
                  VersionBlockTreeTermsWriter.FST_OUTPUTS.add(output, arc.nextFinalOutput),
                  targetUpto);
          // if (DEBUG) System.out.println("    curFrame.ord=" + currentFrame.ord + " hasTerms=" +
          // currentFrame.hasTerms);
        }
      }
    }

    // validIndexPrefix = targetUpto;
    validIndexPrefix = currentFrame.prefix;

    currentFrame.scanToFloorFrame(target);

    // Target term is entirely contained in the index:
    if (!currentFrame.hasTerms) {
      termExists = false;
      term.setLength(targetUpto);
      // if (DEBUG) {
      //    System.out.println("  FAST NOT_FOUND term=" + brToString(term));
      //  }
      return false;
    }

    // if (DEBUG) {
    //   System.out.println("  frame.maxIDVersion=" + currentFrame.maxIDVersion +  " vs
    // minIDVersion=" + minIDVersion);
    // }

    if (currentFrame.maxIDVersion < minIDVersion) {
      // The max version for all terms in this block is lower than the minVersion
      termExists = false;
      term.setLength(targetUpto);
      return false;
    }

    currentFrame.loadBlock();

    final SeekStatus result = currentFrame.scanToTerm(target, true);
    if (result == SeekStatus.FOUND) {
      // if (DEBUG) {
      //    System.out.println("  return FOUND term=" + term.utf8ToString() + " " + term);
      //  }
      currentFrame.decodeMetaData();
      if (((IDVersionTermState) currentFrame.state).idVersion < minIDVersion) {
        // This term's version is lower than the minVersion
        return false;
      }
      return true;
    } else {
      // if (DEBUG) {
      //    System.out.println("  got result " + result + "; return NOT_FOUND term=" +
      // term.utf8ToString());
      //  }

      return false;
    }
  }
  /* Decodes only the term bytes of the next term.  If caller then asks for
  metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily)
  decode all metadata up to the current term. */
  @Override
  public BytesRef next() throws IOException {

    if (in == null) {
      // Fresh TermsEnum; seek to first term:
      final FST.Arc<Pair<BytesRef, Long>> arc;
      if (fr.index != null) {
        arc = fr.index.getFirstArc(arcs[0]);
        // Empty string prefix must have an output in the index!
        assert arc.isFinal();
      } else {
        arc = null;
      }
      currentFrame = pushFrame(arc, fr.rootCode, 0);
      currentFrame.loadBlock();
    }

    targetBeforeCurrentLength = currentFrame.ord;

    assert !eof;
    // if (DEBUG) {
    // System.out.println("\nBTTR.next seg=" + segment + " term=" + brToString(term) + "
    // termExists?=" + termExists + " field=" + fieldInfo.name + " termBlockOrd=" +
    // currentFrame.state.termBlockOrd + " validIndexPrefix=" + validIndexPrefix);
    // printSeekState();
    // }

    if (currentFrame == staticFrame) {
      // If seek was previously called and the term was
      // cached, or seek(TermState) was called, usually
      // caller is just going to pull a D/&PEnum or get
      // docFreq, etc.  But, if they then call next(),
      // this method catches up all internal state so next()
      // works properly:
      // if (DEBUG) System.out.println("  re-seek to pending term=" + term.utf8ToString() + " " +
      // term);
      final boolean result = seekExact(term.get());
      assert result;
    }

    // Pop finished blocks
    while (currentFrame.nextEnt == currentFrame.entCount) {
      if (!currentFrame.isLastInFloor) {
        currentFrame.loadNextFloorBlock();
      } else {
        // if (DEBUG) System.out.println("  pop frame");
        if (currentFrame.ord == 0) {
          // if (DEBUG) System.out.println("  return null");
          assert setEOF();
          term.clear();
          validIndexPrefix = 0;
          currentFrame.rewind();
          termExists = false;
          return null;
        }
        final long lastFP = currentFrame.fpOrig;
        currentFrame = stack[currentFrame.ord - 1];

        if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) {
          // We popped into a frame that's not loaded
          // yet or not scan'd to the right entry
          currentFrame.scanToFloorFrame(term.get());
          currentFrame.loadBlock();
          currentFrame.scanToSubBlock(lastFP);
        }

        // Note that the seek state (last seek) has been
        // invalidated beyond this depth
        validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix);
        // if (DEBUG) {
        // System.out.println("  reset validIndexPrefix=" + validIndexPrefix);
        // }
      }
    }

    while (true) {
      if (currentFrame.next()) {
        // Push to new block:
        // if (DEBUG) System.out.println("  push frame");
        currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length());
        // This is a "next" frame -- even if it's
        // floor'd we must pretend it isn't so we don't
        // try to scan to the right floor frame:
        currentFrame.isFloor = false;
        // currentFrame.hasTerms = true;
        currentFrame.loadBlock();
      } else {
        // if (DEBUG) System.out.println("  return term=" + term.utf8ToString() + " " + term + "
        // currentFrame.ord=" + currentFrame.ord);
        return term.get();
      }
    }
  }
 private int parseIntAt(int offset) {
   scratchUTF16.copyUTF8Bytes(scratch.bytes(), offset, scratch.length() - offset);
   return ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
 }
 // helper method to strip strip away 'prefix' from 'scratch' and return as String
 private String stripPrefix(BytesRefBuilder scratch, BytesRef prefix) throws IOException {
   return new String(
       scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
 }
Пример #9
0
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; eg FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesreader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }
 private String readString(int offset, BytesRefBuilder scratch) {
   scratchUTF16.copyUTF8Bytes(scratch.bytes(), offset, scratch.length() - offset);
   return scratchUTF16.toString();
 }
  @Override
  public Fields get(int doc) throws IOException {
    SortedMap<String, SimpleTVTerms> fields = new TreeMap<>();
    in.seek(offsets[doc]);
    readLine();
    assert StringHelper.startsWith(scratch.get(), NUMFIELDS);
    int numFields = parseIntAt(NUMFIELDS.length);
    if (numFields == 0) {
      return null; // no vectors for this doc
    }
    for (int i = 0; i < numFields; i++) {
      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELD);
      // skip fieldNumber:
      parseIntAt(FIELD.length);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDNAME);
      String fieldName = readString(FIELDNAME.length, scratch);

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPOSITIONS);
      boolean positions = Boolean.parseBoolean(readString(FIELDPOSITIONS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDOFFSETS);
      boolean offsets = Boolean.parseBoolean(readString(FIELDOFFSETS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDPAYLOADS);
      boolean payloads = Boolean.parseBoolean(readString(FIELDPAYLOADS.length, scratch));

      readLine();
      assert StringHelper.startsWith(scratch.get(), FIELDTERMCOUNT);
      int termCount = parseIntAt(FIELDTERMCOUNT.length);

      SimpleTVTerms terms = new SimpleTVTerms(offsets, positions, payloads);
      fields.put(fieldName, terms);

      BytesRefBuilder term = new BytesRefBuilder();
      for (int j = 0; j < termCount; j++) {
        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMTEXT);
        int termLength = scratch.length() - TERMTEXT.length;
        term.grow(termLength);
        term.setLength(termLength);
        System.arraycopy(scratch.bytes(), TERMTEXT.length, term.bytes(), 0, termLength);

        SimpleTVPostings postings = new SimpleTVPostings();
        terms.terms.put(term.toBytesRef(), postings);

        readLine();
        assert StringHelper.startsWith(scratch.get(), TERMFREQ);
        postings.freq = parseIntAt(TERMFREQ.length);

        if (positions || offsets) {
          if (positions) {
            postings.positions = new int[postings.freq];
            if (payloads) {
              postings.payloads = new BytesRef[postings.freq];
            }
          }

          if (offsets) {
            postings.startOffsets = new int[postings.freq];
            postings.endOffsets = new int[postings.freq];
          }

          for (int k = 0; k < postings.freq; k++) {
            if (positions) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), POSITION);
              postings.positions[k] = parseIntAt(POSITION.length);
              if (payloads) {
                readLine();
                assert StringHelper.startsWith(scratch.get(), PAYLOAD);
                if (scratch.length() - PAYLOAD.length == 0) {
                  postings.payloads[k] = null;
                } else {
                  byte payloadBytes[] = new byte[scratch.length() - PAYLOAD.length];
                  System.arraycopy(
                      scratch.bytes(), PAYLOAD.length, payloadBytes, 0, payloadBytes.length);
                  postings.payloads[k] = new BytesRef(payloadBytes);
                }
              }
            }

            if (offsets) {
              readLine();
              assert StringHelper.startsWith(scratch.get(), STARTOFFSET);
              postings.startOffsets[k] = parseIntAt(STARTOFFSET.length);

              readLine();
              assert StringHelper.startsWith(scratch.get(), ENDOFFSET);
              postings.endOffsets[k] = parseIntAt(ENDOFFSET.length);
            }
          }
        }
      }
    }
    return new SimpleTVFields(fields);
  }