@Override
 public void finish(long termsFilePointer) throws IOException {
   fst = fstBuilder.finish();
   if (fst != null) {
     fst.save(out);
   }
 }
Пример #2
0
  /**
   * @param input input tokenstream
   * @param synonyms synonym map
   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. Note,
   *     if you set this to true, it's your responsibility to lowercase the input entries when you
   *     create the {@link SynonymMap}
   */
  public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;
    this.fst = synonyms.fst;
    if (fst == null) {
      throw new IllegalArgumentException("fst must be non-null");
    }
    this.fstReader = fst.getBytesReader();

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1 + synonyms.maxHorizontalContext;

    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for (int pos = 0; pos < rollBufferSize; pos++) {
      futureInputs[pos] = new PendingInput();
      futureOutputs[pos] = new PendingOutputs();
    }

    // System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

    scratchArc = new FST.Arc<>();
  }
Пример #3
0
      IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
        super();
        // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
        this.fst = dict;
        this.fstReader = fst.getBytesReader();
        this.fstOutputs = dict.outputs;
        this.fsa = compiled.runAutomaton;
        this.level = -1;
        this.stack = new Frame[16];
        for (int i = 0; i < stack.length; i++) {
          this.stack[i] = new Frame();
        }

        Frame frame;
        frame = loadVirtualFrame(newFrame());
        this.level++;
        frame = loadFirstFrame(newFrame());
        pushFrame(frame);

        this.meta = null;
        this.metaUpto = 1;
        this.decoded = false;
        this.pending = false;

        if (startTerm == null) {
          pending = isAccept(topFrame());
        } else {
          doSeekCeil(startTerm);
          pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
        }
      }
  private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws /*Bogus*/ IOException {
    assert 0 == fst.outputs.getNoOutput().longValue();
    long output = 0;
    BytesReader bytesReader = fst.getBytesReader(0);

    fst.getFirstArc(arc);

    byte[] bytes = scratch.bytes;
    int pos = scratch.offset;
    int end = pos + scratch.length;
    while (pos < end) {
      if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
        return null;
      } else {
        output += arc.output.longValue();
      }
    }

    return output;
  }
Пример #5
0
 /** Load frame for target arc(node) on fst */
 Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
   if (!canGrow(top)) {
     return null;
   }
   frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
   frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
   // if (TEST) System.out.println(" loadExpand frame="+frame);
   if (frame.fsaState == -1) {
     return loadNextFrame(top, frame);
   }
   return frame;
 }
 @Override
 public boolean store(OutputStream output) throws IOException {
   try {
     if (fst == null) {
       return false;
     }
     fst.save(new OutputStreamDataOutput(output));
   } finally {
     IOUtils.close(output);
   }
   return true;
 }
 @Override
 public void seekExact(long ord) throws IOException {
   // TODO: would be better to make this simpler and faster.
   // but we dont want to introduce a bug that corrupts our enum state!
   bytesReader.setPosition(0);
   fst.getFirstArc(firstArc);
   IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts);
   BytesRefBuilder scratchBytes = new BytesRefBuilder();
   scratchBytes.clear();
   Util.toBytesRef(output, scratchBytes);
   // TODO: we could do this lazily, better to try to push into FSTEnum though?
   in.seekExact(scratchBytes.get());
 }
Пример #8
0
 static <T> void walk(FST<T> fst) throws IOException {
   final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
   final BitSet seen = new BitSet();
   final FST.BytesReader reader = fst.getBytesReader();
   final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
   queue.add(startArc);
   while (!queue.isEmpty()) {
     final FST.Arc<T> arc = queue.remove(0);
     final long node = arc.target;
     // System.out.println(arc);
     if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
       seen.set((int) node);
       fst.readFirstRealTargetArc(node, arc, reader);
       while (true) {
         queue.add(new FST.Arc<T>().copyFrom(arc));
         if (arc.isLast()) {
           break;
         } else {
           fst.readNextRealArc(arc, reader);
         }
       }
     }
   }
 }
Пример #9
0
 /** Load frame for sibling arc(node) on fst */
 Frame loadNextFrame(Frame top, Frame frame) throws IOException {
   if (!canRewind(frame)) {
     return null;
   }
   while (!frame.fstArc.isLast()) {
     frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader);
     frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
     if (frame.fsaState != -1) {
       break;
     }
   }
   // if (TEST) System.out.println(" loadNext frame="+frame);
   if (frame.fsaState == -1) {
     return null;
   }
   return frame;
 }
Пример #10
0
 @Override
 public long ramBytesUsed() {
   return BASE_RAM_BYTES_USED + dict.ramBytesUsed();
 }
Пример #11
0
 @Override
 public long ramBytesUsed() {
   return ((index != null) ? index.ramBytesUsed() : 0);
 }
Пример #12
0
 boolean canGrow(Frame frame) { // can walk forward on both fst&fsa
   return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc);
 }
Пример #13
0
 /** Load frame for start arc(node) on fst */
 Frame loadFirstFrame(Frame frame) throws IOException {
   frame.fstArc = fst.getFirstArc(frame.fstArc);
   frame.fsaState = fsa.getInitialState();
   return frame;
 }
Пример #14
0
  private void parse() throws IOException {
    // System.out.println("\nS: parse");

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;
    int matchEndOffset = -1;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken:
    while (true) {

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      // System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

      int inputEndOffset = 0;

      if (curNextRead == nextWrite) {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        if (finished) {
          break;
        } else {
          // System.out.println("  input.incrToken");
          assert futureInputs[nextWrite].consumed;
          // Not correct: a syn match whose output is longer
          // than its input can set future inputs keepOrig
          // to true:
          // assert !futureInputs[nextWrite].keepOrig;
          if (input.incrementToken()) {
            buffer = termAtt.buffer();
            bufferLen = termAtt.length();
            final PendingInput input = futureInputs[nextWrite];
            lastStartOffset = input.startOffset = offsetAtt.startOffset();
            lastEndOffset = input.endOffset = offsetAtt.endOffset();
            inputEndOffset = input.endOffset;
            // System.out.println("  new token=" + new String(buffer, 0, bufferLen));
            if (nextRead != nextWrite) {
              capture();
            } else {
              input.consumed = false;
            }

          } else {
            // No more input tokens
            // System.out.println("      set end");
            finished = true;
            break;
          }
        }
      } else {
        // Still in our lookahead
        buffer = futureInputs[curNextRead].term.chars();
        bufferLen = futureInputs[curNextRead].term.length();
        inputEndOffset = futureInputs[curNextRead].endOffset;
        // System.out.println("  old token=" + new String(buffer, 0, bufferLen));
      }

      tokenCount++;

      // Run each char in this token through the FST:
      int bufUpto = 0;
      while (bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(
                ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
                scratchArc,
                scratchArc,
                fstReader)
            == null) {
          // System.out.println("    stop");
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        // System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + "
        // arc.output=" + scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      // OK, entire token matched; now see if this is a final
      // state:
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = tokenCount;
        matchEndOffset = inputEndOffset;
        // System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
      }

      // See if the FST wants to continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        if (nextRead == nextWrite) {
          capture();
        }
      }

      curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
      // System.out.println("  skip write slot=" + nextWrite);
      nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
      // System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
      inputSkipCount = matchInputLength;
      addOutput(matchOutput, matchInputLength, matchEndOffset);
    } else if (nextRead != nextWrite) {
      // Even though we had no match here, we set to 1
      // because we need to skip current input token before
      // trying to match again:
      inputSkipCount = 1;
    } else {
      assert finished;
    }

    // System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead
    // + " nextWrite=" + nextWrite);
  }
 FSTTermsEnum(FST<Long> fst) {
   this.fst = fst;
   in = new BytesRefFSTEnum<>(fst);
   bytesReader = fst.getBytesReader();
 }
  @Override
  public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);
    if (entry.numOrds == 0) {
      return DocValues.emptySortedSet(); // empty FST!
    }
    FST<Long> instance;
    synchronized (this) {
      instance = fstInstances.get(field.name);
      if (instance == null) {
        data.seek(entry.offset);
        instance = new FST<>(data, PositiveIntOutputs.getSingleton());
        if (!merging) {
          ramBytesUsed.addAndGet(instance.ramBytesUsed());
          fstInstances.put(field.name, instance);
        }
      }
    }
    final BinaryDocValues docToOrds = getBinary(field);
    final FST<Long> fst = instance;

    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);
    final ByteArrayDataInput input = new ByteArrayDataInput();
    return new SortedSetDocValues() {
      final BytesRefBuilder term = new BytesRefBuilder();
      BytesRef ordsRef;
      long currentOrd;

      @Override
      public long nextOrd() {
        if (input.eof()) {
          return NO_MORE_ORDS;
        } else {
          currentOrd += input.readVLong();
          return currentOrd;
        }
      }

      @Override
      public void setDocument(int docID) {
        ordsRef = docToOrds.get(docID);
        input.reset(ordsRef.bytes, ordsRef.offset, ordsRef.length);
        currentOrd = 0;
      }

      @Override
      public BytesRef lookupOrd(long ord) {
        try {
          in.setPosition(0);
          fst.getFirstArc(firstArc);
          IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
          term.grow(output.length);
          term.clear();
          return Util.toBytesRef(output, term);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public long lookupTerm(BytesRef key) {
        try {
          InputOutput<Long> o = fstEnum.seekCeil(key);
          if (o == null) {
            return -getValueCount() - 1;
          } else if (o.input.equals(key)) {
            return o.output.intValue();
          } else {
            return -o.output - 1;
          }
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public long getValueCount() {
        return entry.numOrds;
      }

      @Override
      public TermsEnum termsEnum() {
        return new FSTTermsEnum(fst);
      }
    };
  }
  @Override
  public SortedDocValues getSorted(FieldInfo field) throws IOException {
    final FSTEntry entry = fsts.get(field.name);
    FST<Long> instance;
    synchronized (this) {
      instance = fstInstances.get(field.name);
      if (instance == null) {
        data.seek(entry.offset);
        instance = new FST<>(data, PositiveIntOutputs.getSingleton());
        if (!merging) {
          ramBytesUsed.addAndGet(instance.ramBytesUsed());
          fstInstances.put(field.name, instance);
        }
      }
    }
    final NumericDocValues docToOrd = getNumeric(field);
    final FST<Long> fst = instance;

    // per-thread resources
    final BytesReader in = fst.getBytesReader();
    final Arc<Long> firstArc = new Arc<>();
    final Arc<Long> scratchArc = new Arc<>();
    final IntsRefBuilder scratchInts = new IntsRefBuilder();
    final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst);

    return new SortedDocValues() {

      final BytesRefBuilder term = new BytesRefBuilder();

      @Override
      public int getOrd(int docID) {
        return (int) docToOrd.get(docID);
      }

      @Override
      public BytesRef lookupOrd(int ord) {
        try {
          in.setPosition(0);
          fst.getFirstArc(firstArc);
          IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts);
          term.grow(output.length);
          term.clear();
          return Util.toBytesRef(output, term);
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public int lookupTerm(BytesRef key) {
        try {
          InputOutput<Long> o = fstEnum.seekCeil(key);
          if (o == null) {
            return -getValueCount() - 1;
          } else if (o.input.equals(key)) {
            return o.output.intValue();
          } else {
            return (int) -o.output - 1;
          }
        } catch (IOException bogus) {
          throw new RuntimeException(bogus);
        }
      }

      @Override
      public int getValueCount() {
        return (int) entry.numOrds;
      }

      @Override
      public TermsEnum termsEnum() {
        return new FSTTermsEnum(fst);
      }
    };
  }