Beispiel #1
0
  /**
   * Returns the next String in lexicographic order that will not put the machine into a reject
   * state.
   *
   * <p>This method traverses the DFA from the given position in the String, starting at the given
   * state.
   *
   * <p>If this cannot satisfy the machine, returns false. This method will walk the minimal path,
   * in lexicographic order, as long as possible.
   *
   * <p>If this method returns false, then there might still be more solutions, it is necessary to
   * backtrack to find out.
   *
   * @param state current non-reject state
   * @param position useful portion of the string
   * @return true if more possible solutions exist for the DFA from this position
   */
  private boolean nextString(int state, int position) {
    /*
     * the next lexicographic character must be greater than the existing
     * character, if it exists.
     */
    int c = 0;
    if (position < seekBytesRef.length) {
      c = seekBytesRef.bytes[position] & 0xff;
      // if the next byte is 0xff and is not part of the useful portion,
      // then by definition it puts us in a reject state, and therefore this
      // path is dead. there cannot be any higher transitions. backtrack.
      if (c++ == 0xff) return false;
    }

    seekBytesRef.length = position;
    visited[state] = curGen;

    Transition transitions[] = allTransitions[state];

    // find the minimal path (lexicographic order) that is >= c

    for (int i = 0; i < transitions.length; i++) {
      Transition transition = transitions[i];
      if (transition.getMax() >= c) {
        int nextChar = Math.max(c, transition.getMin());
        // append either the next sequential char, or the minimum transition
        seekBytesRef.grow(seekBytesRef.length + 1);
        seekBytesRef.length++;
        seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) nextChar;
        state = transition.getDest().getNumber();
        /*
         * as long as is possible, continue down the minimal path in
         * lexicographic order. if a loop or accept state is encountered, stop.
         */
        while (visited[state] != curGen && !runAutomaton.isAccept(state)) {
          visited[state] = curGen;
          /*
           * Note: we work with a DFA with no transitions to dead states.
           * so the below is ok, if it is not an accept state,
           * then there MUST be at least one transition.
           */
          transition = allTransitions[state][0];
          state = transition.getDest().getNumber();

          // append the minimum transition
          seekBytesRef.grow(seekBytesRef.length + 1);
          seekBytesRef.length++;
          seekBytesRef.bytes[seekBytesRef.length - 1] = (byte) transition.getMin();

          // we found a loop, record it for faster enumeration
          if (!finite && !linear && visited[state] == curGen) {
            setLinear(seekBytesRef.length - 1);
          }
        }
        return true;
      }
    }
    return false;
  }
    @Override
    public BytesRef getPayload() throws IOException {
      if (!payloadPending) {
        return null;
      }

      if (pendingPayloadBytes == 0) {
        return payload;
      }

      assert pendingPayloadBytes >= payloadLength;

      if (pendingPayloadBytes > payloadLength) {
        payloadIn.seek(payloadIn.getFilePointer() + (pendingPayloadBytes - payloadLength));
      }

      if (payload == null) {
        payload = new BytesRef();
        payload.bytes = new byte[payloadLength];
      } else if (payload.bytes.length < payloadLength) {
        payload.grow(payloadLength);
      }

      payloadIn.readBytes(payload.bytes, 0, payloadLength);
      payload.length = payloadLength;
      pendingPayloadBytes = 0;
      return payload;
    }
    @Override
    public int nextPosition() throws IOException {
      final int pos;
      if (readPositions) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, POS) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + POS.length,
            scratch.length - POS.length,
            scratchUTF16_2);
        pos = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      } else {
        pos = -1;
      }

      if (readOffsets) {
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, START_OFFSET)
            : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + START_OFFSET.length,
            scratch.length - START_OFFSET.length,
            scratchUTF16_2);
        startOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
        SimpleTextUtil.readLine(in, scratch);
        assert StringHelper.startsWith(scratch, END_OFFSET) : "got line=" + scratch.utf8ToString();
        UnicodeUtil.UTF8toUTF16(
            scratch.bytes,
            scratch.offset + END_OFFSET.length,
            scratch.length - END_OFFSET.length,
            scratchUTF16_2);
        endOffset = ArrayUtil.parseInt(scratchUTF16_2.chars, 0, scratchUTF16_2.length);
      }

      final long fp = in.getFilePointer();
      SimpleTextUtil.readLine(in, scratch);
      if (StringHelper.startsWith(scratch, PAYLOAD)) {
        final int len = scratch.length - PAYLOAD.length;
        if (scratch2.bytes.length < len) {
          scratch2.grow(len);
        }
        System.arraycopy(scratch.bytes, PAYLOAD.length, scratch2.bytes, 0, len);
        scratch2.length = len;
        payload = scratch2;
      } else {
        payload = null;
        in.seek(fp);
      }
      return pos;
    }
Beispiel #4
0
 public Term next() {
   assert hasNext();
   try {
     int code = input.readVInt();
     if ((code & 1) != 0) {
       // new field
       field = input.readString();
     }
     int prefix = code >>> 1;
     int suffix = input.readVInt();
     bytes.grow(prefix + suffix);
     input.readBytes(bytes.bytes, prefix, suffix);
     bytes.length = prefix + suffix;
     term.set(field, bytes);
     return term;
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
 private void loadTerms() throws IOException {
   PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton(false);
   final Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>> b;
   final PairOutputs<Long, Long> outputsInner =
       new PairOutputs<Long, Long>(posIntOutputs, posIntOutputs);
   final PairOutputs<Long, PairOutputs.Pair<Long, Long>> outputs =
       new PairOutputs<Long, PairOutputs.Pair<Long, Long>>(posIntOutputs, outputsInner);
   b =
       new Builder<PairOutputs.Pair<Long, PairOutputs.Pair<Long, Long>>>(
           FST.INPUT_TYPE.BYTE1, outputs);
   IndexInput in = (IndexInput) SimpleTextFieldsReader.this.in.clone();
   in.seek(termsStart);
   final BytesRef lastTerm = new BytesRef(10);
   long lastDocsStart = -1;
   int docFreq = 0;
   long totalTermFreq = 0;
   OpenBitSet visitedDocs = new OpenBitSet();
   final IntsRef scratchIntsRef = new IntsRef();
   while (true) {
     SimpleTextUtil.readLine(in, scratch);
     if (scratch.equals(END) || StringHelper.startsWith(scratch, FIELD)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
         sumTotalTermFreq += totalTermFreq;
       }
       break;
     } else if (StringHelper.startsWith(scratch, DOC)) {
       docFreq++;
       sumDocFreq++;
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       int docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       visitedDocs.set(docID);
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, TERM)) {
       if (lastDocsStart != -1) {
         b.add(
             Util.toIntsRef(lastTerm, scratchIntsRef),
             outputs.newPair(
                 lastDocsStart, outputsInner.newPair((long) docFreq, totalTermFreq)));
       }
       lastDocsStart = in.getFilePointer();
       final int len = scratch.length - TERM.length;
       if (len > lastTerm.length) {
         lastTerm.grow(len);
       }
       System.arraycopy(scratch.bytes, TERM.length, lastTerm.bytes, 0, len);
       lastTerm.length = len;
       docFreq = 0;
       sumTotalTermFreq += totalTermFreq;
       totalTermFreq = 0;
       termCount++;
     }
   }
   docCount = (int) visitedDocs.cardinality();
   fst = b.finish();
   /*
   PrintStream ps = new PrintStream("out.dot");
   fst.toDot(ps);
   ps.close();
   System.out.println("SAVED out.dot");
   */
   // System.out.println("FST " + fst.sizeInBytes());
 }