/**
 * Sets up an enumerator that walks the dictionary FST together with the run automaton of the
 * given compiled automaton.
 *
 * <p>When {@code startTerm} is {@code null}, the enum is marked pending if the top frame is
 * accepted. Otherwise the enum first seeks to {@code startTerm}; it is marked pending only when
 * the seek did not land exactly on {@code startTerm} and the top frame is both valid and
 * accepted.
 *
 * @param compiled compiled automaton; only its {@code runAutomaton} is read here
 * @param startTerm term to seek to before enumeration, or {@code null} for no initial seek
 * @throws IOException if an I/O error is raised while loading frames or seeking
 */
IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  super();
  // if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
  this.fst = dict;
  this.fstReader = this.fst.getBytesReader();
  this.fstOutputs = dict.outputs;
  this.fsa = compiled.runAutomaton;

  // Pre-allocate a fixed pool of 16 frames; level starts at -1, i.e. before the first slot.
  this.level = -1;
  this.stack = new Frame[16];
  for (int slot = 0; slot < this.stack.length; slot++) {
    this.stack[slot] = new Frame();
  }

  // The frame returned for the virtual load is not kept; the calls must still happen in
  // exactly this order: virtual frame, level bump, first real frame, push.
  loadVirtualFrame(newFrame());
  this.level++;
  pushFrame(loadFirstFrame(newFrame()));

  this.meta = null;
  this.metaUpto = 1;
  this.decoded = false;
  this.pending = false;

  if (startTerm != null) {
    doSeekCeil(startTerm);
    if (startTerm.equals(term)) {
      // Landed exactly on the start term: nothing pending.
      pending = false;
    } else {
      pending = isValid(topFrame()) && isAccept(topFrame());
    }
  } else {
    pending = isAccept(topFrame());
  }
}
/** * @param input input tokenstream * @param synonyms synonym map * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}. Note, * if you set this to true, it's your responsibility to lowercase the input entries when you * create the {@link SynonymMap} */ public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) { super(input); this.synonyms = synonyms; this.ignoreCase = ignoreCase; this.fst = synonyms.fst; if (fst == null) { throw new IllegalArgumentException("fst must be non-null"); } this.fstReader = fst.getBytesReader(); // Must be 1+ so that when roll buffer is at full // lookahead we can distinguish this full buffer from // the empty buffer: rollBufferSize = 1 + synonyms.maxHorizontalContext; futureInputs = new PendingInput[rollBufferSize]; futureOutputs = new PendingOutputs[rollBufferSize]; for (int pos = 0; pos < rollBufferSize; pos++) { futureInputs[pos] = new PendingInput(); futureOutputs[pos] = new PendingOutputs(); } // System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext); scratchArc = new FST.Arc<>(); }
/**
 * Follows the bytes of {@code scratch} through the FST, starting at its first arc, and sums the
 * output of every arc that was matched along the way.
 *
 * @param scratch bytes to look up; the slice {@code [offset, offset + length)} is used
 * @param arc reusable arc; overwritten with the first arc and then with each matched arc
 * @return the accumulated output, or {@code null} as soon as one byte has no matching arc
 * @throws IOException declared because the FST API declares it
 */
private Long lookupPrefix(BytesRef scratch, Arc<Long> arc) throws /*Bogus*/ IOException {
  assert 0 == fst.outputs.getNoOutput().longValue();

  final BytesReader reader = fst.getBytesReader(0);
  fst.getFirstArc(arc);

  final byte[] data = scratch.bytes;
  final int limit = scratch.offset + scratch.length;
  long sum = 0;
  for (int i = scratch.offset; i < limit; i++) {
    // Each byte is looked up as an unsigned label (0..255).
    final int label = data[i] & 0xff;
    if (fst.findTargetArc(label, arc, arc, reader) == null) {
      return null;
    }
    sum += arc.output.longValue();
  }
  return sum;
}
static <T> void walk(FST<T> fst) throws IOException { final ArrayList<FST.Arc<T>> queue = new ArrayList<>(); final BitSet seen = new BitSet(); final FST.BytesReader reader = fst.getBytesReader(); final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); queue.add(startArc); while (!queue.isEmpty()) { final FST.Arc<T> arc = queue.remove(0); final long node = arc.target; // System.out.println(arc); if (FST.targetHasArcs(arc) && !seen.get((int) node)) { seen.set((int) node); fst.readFirstRealTargetArc(node, arc, reader); while (true) { queue.add(new FST.Arc<T>().copyFrom(arc)); if (arc.isLast()) { break; } else { fst.readNextRealArc(arc, reader); } } } } }
/**
 * Creates a terms enum over the given FST.
 *
 * @param fst FST to enumerate; it also supplies the bytes reader held by this enum
 */
FSTTermsEnum(FST<Long> fst) {
  this.fst = fst;
  this.in = new BytesRefFSTEnum<Long>(fst);
  this.bytesReader = fst.getBytesReader();
}
@Override public SortedSetDocValues getSortedSet(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); if (entry.numOrds == 0) { return DocValues.emptySortedSet(); // empty FST! } FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final BinaryDocValues docToOrds = getBinary(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); final ByteArrayDataInput input = new ByteArrayDataInput(); return new SortedSetDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); BytesRef ordsRef; long currentOrd; @Override public long nextOrd() { if (input.eof()) { return NO_MORE_ORDS; } else { currentOrd += input.readVLong(); return currentOrd; } } @Override public void setDocument(int docID) { ordsRef = docToOrds.get(docID); input.reset(ordsRef.bytes, ordsRef.offset, ordsRef.length); currentOrd = 0; } @Override public BytesRef lookupOrd(long ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public long lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override 
public long getValueCount() { return entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }
@Override public SortedDocValues getSorted(FieldInfo field) throws IOException { final FSTEntry entry = fsts.get(field.name); FST<Long> instance; synchronized (this) { instance = fstInstances.get(field.name); if (instance == null) { data.seek(entry.offset); instance = new FST<>(data, PositiveIntOutputs.getSingleton()); if (!merging) { ramBytesUsed.addAndGet(instance.ramBytesUsed()); fstInstances.put(field.name, instance); } } } final NumericDocValues docToOrd = getNumeric(field); final FST<Long> fst = instance; // per-thread resources final BytesReader in = fst.getBytesReader(); final Arc<Long> firstArc = new Arc<>(); final Arc<Long> scratchArc = new Arc<>(); final IntsRefBuilder scratchInts = new IntsRefBuilder(); final BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<>(fst); return new SortedDocValues() { final BytesRefBuilder term = new BytesRefBuilder(); @Override public int getOrd(int docID) { return (int) docToOrd.get(docID); } @Override public BytesRef lookupOrd(int ord) { try { in.setPosition(0); fst.getFirstArc(firstArc); IntsRef output = Util.getByOutput(fst, ord, in, firstArc, scratchArc, scratchInts); term.grow(output.length); term.clear(); return Util.toBytesRef(output, term); } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int lookupTerm(BytesRef key) { try { InputOutput<Long> o = fstEnum.seekCeil(key); if (o == null) { return -getValueCount() - 1; } else if (o.input.equals(key)) { return o.output.intValue(); } else { return (int) -o.output - 1; } } catch (IOException bogus) { throw new RuntimeException(bogus); } } @Override public int getValueCount() { return (int) entry.numOrds; } @Override public TermsEnum termsEnum() { return new FSTTermsEnum(fst); } }; }