コード例 #1
0
 @Override
 public void seekExact(long ord) throws IOException {
   // TODO: would be better to make this simpler and faster.
   // but we dont want to introduce a bug that corrupts our enum state!
   bytesReader.setPosition(0);
   fst.getFirstArc(firstArc);
   IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
   BytesRefBuilder scratchBytes = new BytesRefBuilder();
   scratchBytes.clear();
   Util.toBytesRef(output, scratchBytes);
   // TODO: we could do this lazily, better to try to push into FSTEnum though?
   in.seekExact(scratchBytes.get());
 }
コード例 #2
0
  private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws /*Bogus*/ IOException {
    assert 0 == fst.outputs.getNoOutput().longValue();
    long output = 0;
    BytesReader bytesReader = fst.getBytesReader(0);

    fst.getFirstArc(arc);

    byte[] bytes = scratch.bytes;
    int pos = scratch.offset;
    int end = pos + scratch.length;
    while (pos < end) {
      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
        return null;
      } else {
        output += arc.output.longValue();
      }
    }

    return output;
  }
コード例 #3
0
 static <T> void walk(FST<T> fst) throws IOException {
   final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
   final BitSet seen = new BitSet();
   final FST.BytesReader reader = fst.getBytesReader();
   final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
   queue.add(startArc);
   while (!queue.isEmpty()) {
     final FST.Arc<T> arc = queue.remove(0);
     final long node = arc.target;
     // System.out.println(arc);
     if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
       seen.set((int) node);
       fst.readFirstRealTargetArc(node, arc, reader);
       while (true) {
         queue.add(new FST.Arc<T>().copyFrom(arc));
         if (arc.isLast()) {
           break;
         } else {
           fst.readNextRealArc(arc, reader);
         }
       }
     }
   }
 }
コード例 #4
0
 /** Load frame for start arc(node) on fst */
 Frame loadFirstFrame(Frame frame) throws IOException {
   frame.fstArc = fst.getFirstArc(frame.fstArc);
   frame.fsaState = fsa.getInitialState();
   return frame;
 }
コード例 #5
0
  private void parse() throws IOException {
    // System.out.println("\nS: parse");

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken:
    while (true) {

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      // System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

      int inputEndOffset = 0;

      if (curNextRead == nextWrite) {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        if (finished) {
          break;
        } else {
          // System.out.println("  input.incrToken");
          assert futureInputs[nextWrite].consumed;
          // Not correct: a syn match whose output is longer
          // than its input can set future inputs keepOrig
          // to true:
          // assert !futureInputs[nextWrite].keepOrig;
          if (input.incrementToken()) {
            buffer = termAtt.buffer();
            bufferLen = termAtt.length();
            final PendingInput input = futureInputs[nextWrite];
            lastStartOffset = input.startOffset = offsetAtt.startOffset();
            lastEndOffset = input.endOffset = offsetAtt.endOffset();
            inputEndOffset = input.endOffset;
            // System.out.println("  new token=" + new String(buffer, 0, bufferLen));
            if (nextRead != nextWrite) {
              capture();
            } else {
              input.consumed = false;
            }

          } else {
            // No more input tokens
            // System.out.println("      set end");
            finished = true;
            break;
          }
        }
      } else {
        // Still in our lookahead
        buffer = futureInputs[curNextRead].term.chars();
        bufferLen = futureInputs[curNextRead].term.length();
        inputEndOffset = futureInputs[curNextRead].endOffset;
        // System.out.println("  old token=" + new String(buffer, 0, bufferLen));
      }

      tokenCount++;

      // Run each char in this token through the FST:
      int bufUpto = 0;
      while (bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(
                ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                scratchArc,
                scratchArc,
                fstReader)
            == null) {
          // System.out.println("    stop");
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        // System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + "
        // arc.output=" + scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      // OK, entire token matched; now see if this is a final
      // state:
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = tokenCount;
        matchEndOffset = inputEndOffset;
        // System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
      }

      // See if the FST wants to continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        if (nextRead == nextWrite) {
          capture();
        }
      }

      curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
      // System.out.println("  skip write slot=" + nextWrite);
      nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
      // System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
      inputSkipCount = matchInputLength;
      addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
      // Even though we had no match here, we set to 1
      // because we need to skip current input token before
      // trying to match again:
      inputSkipCount = 1;
    } else {
      assert finished;
    }

    // System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead
    // + " nextWrite=" + nextWrite);
  }