/**
 * Finishes the FST under construction and, if the builder produced one, serializes it to
 * {@code out}.
 *
 * @param termsFilePointer not read by this method
 * @throws IOException propagated from saving the FST
 */
@Override
public void finish(long termsFilePointer) throws IOException {
  fst = fstBuilder.finish();
  if (fst == null) {
    // The builder returned no FST, so there is nothing to write.
    return;
  }
  fst.save(out);
}
/**
 * Creates a filter that matches the tokens of {@code input} against the rules in {@code
 * synonyms}.
 *
 * @param input input tokenstream
 * @param synonyms synonym map
 * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. Note,
 *     if you set this to true, it's your responsibility to lowercase the input entries when you
 *     create the {@link SynonymMap}
 * @throws IllegalArgumentException if {@code synonyms.fst} is {@code null}
 */
public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
  super(input);
  this.synonyms = synonyms;
  this.ignoreCase = ignoreCase;

  this.fst = synonyms.fst;
  if (this.fst == null) {
    throw new IllegalArgumentException("fst must be non-null");
  }
  this.fstReader = this.fst.getBytesReader();

  // The roll buffer gets one slot more than the maximum lookahead so that a completely
  // full buffer can be told apart from an empty one.
  this.rollBufferSize = 1 + synonyms.maxHorizontalContext;
  this.futureInputs = new PendingInput[this.rollBufferSize];
  this.futureOutputs = new PendingOutputs[this.rollBufferSize];
  for (int slot = 0; slot < this.rollBufferSize; slot++) {
    this.futureInputs[slot] = new PendingInput();
    this.futureOutputs[slot] = new PendingOutputs();
  }

  this.scratchArc = new FST.Arc<>();
}
/**
 * Sets up an enumeration over the terms of {@code dict} that is driven by the run automaton of
 * {@code compiled}, optionally positioned relative to {@code startTerm}.
 *
 * @param compiled supplies the automaton ({@code compiled.runAutomaton}) stepped alongside the
 *     FST
 * @param startTerm term to seek to via {@code doSeekCeil}, or {@code null} to start from the
 *     first frame
 * @throws IOException propagated from the frame-loading and seek helpers
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;
  // level starts below zero; it is bumped once after the virtual frame is loaded.
  this.level = -1;
  // Pre-allocate 16 frames up front.
  this.stack = new Frame[16];
  for (int i = 0; i < stack.length; i++) {
    this.stack[i] = new Frame();
  }

  Frame frame;
  // NOTE(review): the virtual frame's return value is overwritten below without being passed
  // to pushFrame; presumably loadVirtualFrame/newFrame act through side effects — confirm.
  frame = loadVirtualFrame(newFrame());
  this.level++;
  frame = loadFirstFrame(newFrame());
  pushFrame(frame);

  this.meta = null;
  this.metaUpto = 1;

  this.decoded = false;
  this.pending = false;

  if (startTerm == null) {
    // No start term: pending reflects whether the top frame is accepted.
    pending = isAccept(topFrame());
  } else {
    doSeekCeil(startTerm);
    // After seeking, pending is set only when the current term differs from startTerm and
    // the top frame is both valid and accepted.
    pending = !startTerm.equals(term) && isValid(topFrame()) && isAccept(topFrame());
  }
}
/**
 * Follows the bytes of {@code scratch} through the FST from its first arc, summing the output of
 * every arc taken.
 *
 * @param scratch byte sequence to follow; only {@code [offset, offset + length)} is read
 * @param arc reusable arc that is overwritten while walking
 * @return the summed output, or {@code null} as soon as a byte has no matching arc
 */
private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws /*Bogus*/ IOException {
  assert 0 == fst.outputs.getNoOutput().longValue();

  final BytesReader reader = fst.getBytesReader(0);
  fst.getFirstArc(arc);

  final byte[] data = scratch.bytes;
  final int limit = scratch.offset + scratch.length;
  long sum = 0;
  for (int i = scratch.offset; i < limit; i++) {
    // Bytes are treated as unsigned labels.
    final int label = data[i] & 0xff;
    if (fst.findTargetArc(label, arc, arc, reader) == null) {
      return null;
    }
    sum += arc.output.longValue();
  }
  return sum;
}
/** Load frame for target arc(node) on fst */ Frame loadExpandFrame(Frame top, Frame frame) throws IOException { if (!canGrow(top)) { return null; } frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader); frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label); // if (TEST) System.out.println(" loadExpand frame="+frame); if (frame.fsaState == -1) { return loadNextFrame(top, frame); } return frame; }
/**
 * Writes the FST to {@code output} and closes the stream.
 *
 * @param output destination stream; always closed before this method returns
 * @return {@code true} if an FST was written, {@code false} if there is no FST to store
 * @throws IOException propagated from saving the FST or closing the stream
 */
@Override
public boolean store(OutputStream output) throws IOException {
  try {
    if (fst != null) {
      fst.save(new OutputStreamDataOutput(output));
      return true;
    }
    return false;
  } finally {
    // Runs on every path, including the early "no FST" return.
    IOUtils.close(output);
  }
}
@Override public void seekExact(long ord) throws IOException { // TODO: would be better to make this simpler and faster. // but we dont want to introduce a bug that corrupts our enum state! bytesReader.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, bytesReader, firstArc, scratchArc, scratchInts); BytesRefBuilder scratchBytes = new BytesRefBuilder(); scratchBytes.clear(); Util.toBytesRef(output, scratchBytes); // TODO: we could do this lazily, better to try to push into FSTEnum though? in.seekExact(scratchBytes.get()); }
static <T> void walk(FST<T> fst) throws IOException { final ArrayList<FST.Arc<T>> queue = new ArrayList<>(); final BitSet seen = new BitSet(); final FST.BytesReader reader = fst.getBytesReader(); final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); queue.add(startArc); while (!queue.isEmpty()) { final FST.Arc<T> arc = queue.remove(0); final long node = arc.target; // System.out.println(arc); if (FST.targetHasArcs(arc) && !seen.get((int) node)) { seen.set((int) node); fst.readFirstRealTargetArc(node, arc, reader); while (true) { queue.add(new FST.Arc<T>().copyFrom(arc)); if (arc.isLast()) { break; } else { fst.readNextRealArc(arc, reader); } } } } }
/** Load frame for sibling arc(node) on fst */ Frame loadNextFrame(Frame top, Frame frame) throws IOException { if (!canRewind(frame)) { return null; } while (!frame.fstArc.isLast()) { frame.fstArc = fst.readNextRealArc(frame.fstArc, fstReader); frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label); if (frame.fsaState != -1) { break; } } // if (TEST) System.out.println(" loadNext frame="+frame); if (frame.fsaState == -1) { return null; } return frame; }
/** Returns the fixed base size plus whatever {@code dict} reports for itself. */
@Override
public long ramBytesUsed() {
  final long dictBytes = dict.ramBytesUsed();
  return BASE_RAM_BYTES_USED + dictBytes;
}
/** Returns the size reported by {@code index}, or 0 when there is no index. */
@Override
public long ramBytesUsed() {
  if (index == null) {
    return 0;
  }
  return index.ramBytesUsed();
}
boolean canGrow(Frame frame) { // can walk forward on both fst&fsa return frame.fsaState != -1 && FST.targetHasArcs(frame.fstArc); }
/**
 * Load frame for start arc(node) on fst.
 *
 * <p>Fills {@code frame} with the FST's first arc and the automaton's initial state.
 *
 * @param frame frame to populate; its arc object is handed to the FST for reuse
 * @return the same {@code frame} instance
 */
Frame loadFirstFrame(Frame frame) throws IOException {
  frame.fstArc = fst.getFirstArc(frame.fstArc);
  frame.fsaState = fsa.getInitialState();
  return frame;
}
/**
 * Searches for the longest synonym rule that matches the input starting at the {@code nextRead}
 * position.
 *
 * <p>Tokens are taken from the lookahead buffer ({@code futureInputs}) when available, otherwise
 * pulled from the wrapped {@code input} stream. Each token's code points are run through the FST;
 * whenever a token ends on a final arc the accumulated output is remembered as the current best
 * match. On exit, a match is handed to {@code addOutput} and {@code inputSkipCount} is set to the
 * number of matched tokens; without a match {@code inputSkipCount} is set to 1 if buffered input
 * remains.
 *
 * @throws IOException propagated from the input stream or the FST
 */
private void parse() throws IOException {
  // System.out.println("\nS: parse");

  assert inputSkipCount == 0;

  // Local read cursor; nextRead itself is not advanced here.
  int curNextRead = nextRead;

  // Holds the longest match we've seen so far:
  BytesRef matchOutput = null;
  int matchInputLength = 0;
  int matchEndOffset = -1;

  BytesRef pendingOutput = fst.outputs.getNoOutput();
  fst.getFirstArc(scratchArc);

  assert scratchArc.output == fst.outputs.getNoOutput();

  // Number of tokens consumed so far in this matching attempt.
  int tokenCount = 0;

  byToken:
  while (true) {

    // Pull next token's chars:
    final char[] buffer;
    final int bufferLen;
    // System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

    int inputEndOffset = 0;

    if (curNextRead == nextWrite) {

      // We used up our lookahead buffer of input tokens
      // -- pull next real input token:

      if (finished) {
        break;
      } else {
        // System.out.println("  input.incrToken");
        assert futureInputs[nextWrite].consumed;
        // Not correct: a syn match whose output is longer
        // than its input can set future inputs keepOrig
        // to true:
        // assert !futureInputs[nextWrite].keepOrig;
        if (input.incrementToken()) {
          buffer = termAtt.buffer();
          bufferLen = termAtt.length();
          // Note: this local shadows the 'input' token stream for the rest of this branch.
          final PendingInput input = futureInputs[nextWrite];
          lastStartOffset = input.startOffset = offsetAtt.startOffset();
          lastEndOffset = input.endOffset = offsetAtt.endOffset();
          inputEndOffset = input.endOffset;
          // System.out.println("  new token=" + new String(buffer, 0, bufferLen));
          if (nextRead != nextWrite) {
            capture();
          } else {
            input.consumed = false;
          }

        } else {
          // No more input tokens
          // System.out.println("      set end");
          finished = true;
          break;
        }
      }
    } else {
      // Still in our lookahead
      buffer = futureInputs[curNextRead].term.chars();
      bufferLen = futureInputs[curNextRead].term.length();
      inputEndOffset = futureInputs[curNextRead].endOffset;
      // System.out.println("  old token=" + new String(buffer, 0, bufferLen));
    }

    tokenCount++;

    // Run each char in this token through the FST:
    int bufUpto = 0;
    while (bufUpto < bufferLen) {
      final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
      if (fst.findTargetArc(
              ignoreCase ? Character.toLowerCase(codePoint) : codePoint,
              scratchArc,
              scratchArc,
              fstReader)
          == null) {
        // No arc for this code point: abandon the whole token loop, keeping any match
        // recorded on an earlier token.
        // System.out.println("    stop");
        break byToken;
      }

      // Accum the output
      pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
      // System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput
      //     + " arc.output=" + scratchArc.output);
      // Advance by one or two chars depending on whether the code point is supplementary.
      bufUpto += Character.charCount(codePoint);
    }

    // OK, entire token matched; now see if this is a final
    // state:
    if (scratchArc.isFinal()) {
      matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
      matchInputLength = tokenCount;
      matchEndOffset = inputEndOffset;
      // System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
    }

    // See if the FST wants to continue matching (ie, needs to
    // see the next input token):
    if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc, fstReader) == null) {
      // No further rules can match here; we're done
      // searching for matching rules starting at the
      // current input position.
      break;
    } else {
      // More matching is possible -- accum the output (if
      // any) of the WORD_SEP arc:
      pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
      if (nextRead == nextWrite) {
        capture();
      }
    }

    curNextRead = rollIncr(curNextRead);
  }

  if (nextRead == nextWrite && !finished) {
    // System.out.println("  skip write slot=" + nextWrite);
    nextWrite = rollIncr(nextWrite);
  }

  if (matchOutput != null) {
    // System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
    inputSkipCount = matchInputLength;
    addOutput(matchOutput, matchInputLength, matchEndOffset);
  } else if (nextRead != nextWrite) {
    // Even though we had no match here, we set to 1
    // because we need to skip current input token before
    // trying to match again:
    inputSkipCount = 1;
  } else {
    assert finished;
  }

  // System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead
  //     + " nextWrite=" + nextWrite);
}
/**
 * Wraps {@code fst} in a terms enum, creating a {@link BytesRefFSTEnum} over it and a bytes
 * reader used for lookups.
 *
 * @param fst the FST to enumerate
 */
FSTTermsEnum(FST<Long> fst) {
  this.fst = fst;
  this.in = new BytesRefFSTEnum<>(fst);
  this.bytesReader = fst.getBytesReader();
}
@Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); if (entry.numOrds == 0) { return DocValues.emptySortedSet(); // empty FST! } FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final BinaryDocValues docToOrds = getBinary(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); final ByteArrayDataInput input = new ByteArrayDataInput(); return new SortedSetDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); BytesRef ordsRef; long currentOrd; @Override public long nextOrd() { if (input.eof()) { return NO_MORE_ORDS; } else { currentOrd += input.readVLong(); return currentOrd; } } @Override public void setDocument(int docID) { ordsRef = docToOrds.get(docID); input.reset(ordsRef.bytes, ordsRef.offset, ordsRef.length); currentOrd = 0; } @Override public BytesRef lookupOrd(long ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public long lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override 
public long getValueCount() { return entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }
@Override public SortedDocValues getSorted(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final NumericDocValues docToOrd = getNumeric(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); return new SortedDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); @Override public int getOrd(int docID) { return (int) docToOrd.get(docID); } @Override public BytesRef lookupOrd(int ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return (int) -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int getValueCount() { return (int) entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }