/** * Logically casts input to UTF32 ints then looks up the output or null if the input is not * accepted. FST must be INPUT_TYPE.BYTE4. */ public static <T> T get(FST<T> fst, CharSequence input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE4; // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); int charIdx = 0; final int charLimit = input.length(); // Accumulate output as we go final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; while (charIdx < charLimit) { final int utf32 = Character.codePointAt(input, charIdx); charIdx += Character.charCount(utf32); if (fst.findTargetArc(utf32, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } }
/** Rewinds enum state to match the shared prefix between current term and target term */ protected final void rewindPrefix() throws IOException { if (upto == 0) { // System.out.println(" init"); upto = 1; fst.readFirstTargetArc(getArc(0), getArc(1)); return; } // System.out.println(" rewind upto=" + upto + " vs targetLength=" + targetLength); final int currentLimit = upto; upto = 1; while (upto < currentLimit && upto <= targetLength + 1) { final int cmp = getCurrentLabel() - getTargetLabel(); if (cmp < 0) { // seek forward break; } else if (cmp > 0) { // seek backwards -- reset this arc to the first arc final FST.Arc<T> arc = getArc(upto); fst.readFirstTargetArc(getArc(upto - 1), arc); // System.out.println(" seek first arc"); break; } upto++; } }
/** * Looks up the output for this input, or null if the input is not accepted. FST must be * INPUT_TYPE.BYTE4. */ public static <T> T get(FST<T> fst, IntsRef input) throws IOException { assert fst.inputType == FST.INPUT_TYPE.BYTE4; // TODO: would be nice not to alloc this on every lookup final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>()); // Accumulate output as we go final T NO_OUTPUT = fst.outputs.getNoOutput(); T output = NO_OUTPUT; for (int i = 0; i < input.length; i++) { if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { output = fst.outputs.add(output, arc.output); } } if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { return null; } else if (arc.output != NO_OUTPUT) { return fst.outputs.add(output, arc.output); } else { return output; } }
@Test public void testFST2() throws IOException { String inputValues[] = { "brats", "cat", "dog", "dogs", "rat", }; int outputValues[] = {1, 3, 5, 7, 11}; Builder builder = new Builder(); builder.build(inputValues, outputValues); for (int i = 0; i < inputValues.length; i++) { assertEquals(outputValues[i], builder.transduce(inputValues[i])); } Compiler compiledFST = builder.getCompiler(); FST fst = new FST(compiledFST.getByteArray()); assertEquals(0, fst.lookup("brat")); // Prefix match assertEquals(1, fst.lookup("brats")); assertEquals(3, fst.lookup("cat")); assertEquals(5, fst.lookup("dog")); assertEquals(7, fst.lookup("dogs")); assertEquals(11, fst.lookup("rat")); assertEquals(-1, fst.lookup("rats")); // No match }
protected void doNext() throws IOException { // System.out.println("FE: next upto=" + upto); if (upto == 0) { // System.out.println(" init"); upto = 1; fst.readFirstTargetArc(getArc(0), getArc(1)); } else { // pop // System.out.println(" check pop curArc target=" + arcs[upto].target + " label=" + // arcs[upto].label + " isLast?=" + arcs[upto].isLast()); while (arcs[upto].isLast()) { upto--; if (upto == 0) { // System.out.println(" eof"); return; } } fst.readNextArc(arcs[upto]); } pushFirst(); }
// Recurses from current arc, appending last arc all the // way to the first final node private void pushLast() throws IOException { FST.Arc<T> arc = arcs[upto]; assert arc != null; while (true) { setCurrentLabel(arc.label); output[upto] = fst.outputs.add(output[upto - 1], arc.output); if (arc.label == FST.END_LABEL) { // Final node break; } incr(); arc = fst.readLastTargetArc(arc, getArc(upto)); } }
// Appends current arc, and then recurses from its target, // appending first arc all the way to the final node private void pushFirst() throws IOException { FST.Arc<T> arc = arcs[upto]; assert arc != null; while (true) { output[upto] = fst.outputs.add(output[upto - 1], arc.output); if (arc.label == FST.END_LABEL) { // Final node break; } // System.out.println(" pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" + // fst.outputs.outputToString(output[upto])); setCurrentLabel(arc.label); incr(); final FST.Arc<T> nextArc = getArc(upto); fst.readFirstTargetArc(arc, nextArc); arc = nextArc; } }
/**
 * Appends a one-field summary of {@code value} to {@code out}: the literal {@code type=}
 * followed by the short name of the value's type.
 *
 * @param out the buffer to append to
 * @param value the FST to describe
 */
@Override
public void print(StringBuilder out, FST value) {
  // The prefix is appended before the type is resolved, matching left-to-right chain order.
  out.append("type=");
  out.append(value.getType().getShortName());
}
/** * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to * the biggest term before target. */ protected FSTEnum(FST<T> fst) { this.fst = fst; NO_OUTPUT = fst.outputs.getNoOutput(); fst.getFirstArc(getArc(0)); output[0] = NO_OUTPUT; }
/**
 * Seeks to largest term that's <= target.
 *
 * <p>After rewinding to the prefix shared with the current term, the method matches the target
 * label by label. A node whose arcs are stored as a fixed-size array is binary searched; any
 * other node is scanned arc by arc. As soon as an exact label match is impossible, the method
 * settles on the closest smaller arc (walking back up the path if the current node has none) and
 * finishes with {@link #pushLast()}. It returns with {@code upto == 0} when the walk back
 * reaches the root without finding a smaller arc.
 *
 * @throws IOException propagated from the underlying FST reads
 */
protected void doSeekFloor() throws IOException {

  // TODO: possibly caller could/should provide common
  // prefix length?  ie this work may be redundant if
  // caller is in fact intersecting against its own
  // automaton

  // Save CPU by starting at the end of the shared prefix
  // b/w our current term & the target:
  rewindPrefix();

  FST.Arc<T> arc = getArc(upto);
  int targetLabel = getTargetLabel();

  // Now scan forward, matching the new suffix of the target
  while (true) {

    if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
      // Arcs are fixed array -- use binary search to find
      // the target.

      final FST<T>.BytesReader in = fst.getBytesReader(0);
      // Search window is [arc.arcIdx, arc.numArcs - 1]
      int low = arc.arcIdx;
      int high = arc.numArcs - 1;
      int mid = 0;
      boolean found = false;
      while (low <= high) {
        mid = (low + high) >>> 1;
        // Position the reader on the label of arc number mid
        in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
        final int midLabel = fst.readLabel(in);
        // NOTE(review): assumes labels are small enough that this
        // subtraction cannot overflow -- TODO confirm
        final int cmp = midLabel - targetLabel;
        if (cmp < 0) low = mid + 1;
        else if (cmp > 0) high = mid - 1;
        else {
          found = true;
          break;
        }
      }

      // NOTE: this code is dup'd w/ the code below (in
      // the outer else clause):
      if (found) {
        // Match -- recurse
        // Step back one slot so that reading the "next" arc loads
        // arc number mid (checked by the assert below)
        arc.arcIdx = mid - 1;
        fst.readNextRealArc(arc, in);
        assert arc.arcIdx == mid;
        assert arc.label == targetLabel : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
        output[upto] = fst.outputs.add(output[upto - 1], arc.output);
        if (targetLabel == FST.END_LABEL) {
          // Exact match of the whole target
          return;
        }
        setCurrentLabel(arc.label);
        incr();
        arc = fst.readFirstTargetArc(arc, getArc(upto));
        targetLabel = getTargetLabel();
        continue;
      } else if (high == -1) {
        // Very first arc is after our target
        // TODO: if each arc could somehow read the arc just
        // before, we can save this re-scan.  The ceil case
        // doesn't need this because it reads the next arc
        // instead:
        while (true) {
          // First, walk backwards until we find a first arc
          // that's before our target label:
          fst.readFirstTargetArc(getArc(upto - 1), arc);
          if (arc.label < targetLabel) {
            // Then, scan forwards to the arc just before
            // the targetLabel:
            while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
              fst.readNextArc(arc);
            }
            pushLast();
            return;
          }
          upto--;
          if (upto == 0) {
            // Walked back to the root without finding a smaller arc
            return;
          }
          targetLabel = getTargetLabel();
          arc = getArc(upto);
        }
      } else {
        // There is a floor arc:
        // (the search loop only ends without a match once low > high,
        // so this expression selects high)
        arc.arcIdx = (low > high ? high : low) - 1;
        fst.readNextRealArc(arc, in);
        assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
        assert arc.label < targetLabel;
        pushLast();
        return;
      }
    } else {

      if (arc.label == targetLabel) {
        // Match -- recurse
        output[upto] = fst.outputs.add(output[upto - 1], arc.output);
        if (targetLabel == FST.END_LABEL) {
          // Exact match of the whole target
          return;
        }
        setCurrentLabel(arc.label);
        incr();
        arc = fst.readFirstTargetArc(arc, getArc(upto));
        targetLabel = getTargetLabel();
      } else if (arc.label > targetLabel) {
        // TODO: if each arc could somehow read the arc just
        // before, we can save this re-scan.  The ceil case
        // doesn't need this because it reads the next arc
        // instead:
        while (true) {
          // First, walk backwards until we find a first arc
          // that's before our target label:
          fst.readFirstTargetArc(getArc(upto - 1), arc);
          if (arc.label < targetLabel) {
            // Then, scan forwards to the arc just before
            // the targetLabel:
            while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
              fst.readNextArc(arc);
            }
            pushLast();
            return;
          }
          upto--;
          if (upto == 0) {
            // Walked back to the root without finding a smaller arc
            return;
          }
          targetLabel = getTargetLabel();
          arc = getArc(upto);
        }
      } else if (!arc.isLast()) {
        // Current label is smaller than the target and more arcs follow
        if (fst.readNextArcLabel(arc) > targetLabel) {
          // The next arc would overshoot, so the current arc is the floor
          pushLast();
          return;
        } else {
          // keep scanning
          fst.readNextArc(arc);
        }
      } else {
        // Last arc of the node and still smaller than the target
        pushLast();
        return;
      }
    }
  }
}
/**
 * Seeks to smallest term that's >= target.
 *
 * <p>After rewinding to the prefix shared with the current term, the method matches the target
 * label by label. A node whose arcs are stored as a fixed-size array is binary searched; any
 * other node is scanned arc by arc. As soon as an exact label match is impossible, the method
 * settles on the closest larger arc and finishes with {@link #pushFirst()}. If the current node
 * has no larger arc it rolls back up the path to the nearest arc that still has a following
 * sibling. It returns with {@code upto == 0} when the rollback reaches the root.
 *
 * @throws IOException propagated from the underlying FST reads
 */
protected void doSeekCeil() throws IOException {

  // TODO: possibly caller could/should provide common
  // prefix length?  ie this work may be redundant if
  // caller is in fact intersecting against its own
  // automaton

  // Save time by starting at the end of the shared prefix
  // b/w our current term & the target:
  rewindPrefix();

  FST.Arc<T> arc = getArc(upto);
  int targetLabel = getTargetLabel();

  // Now scan forward, matching the new suffix of the target
  while (true) {

    // NOTE(review): doSeekFloor tests this same condition against
    // FST.END_LABEL rather than the literal -1 -- presumably the same
    // value; confirm and consider using the constant here too
    if (arc.bytesPerArc != 0 && arc.label != -1) {

      // Arcs are fixed array -- use binary search to find
      // the target.

      final FST<T>.BytesReader in = fst.getBytesReader(0);
      // Search window is [arc.arcIdx, arc.numArcs - 1]
      int low = arc.arcIdx;
      int high = arc.numArcs - 1;
      int mid = 0;
      boolean found = false;
      while (low <= high) {
        mid = (low + high) >>> 1;
        // Position the reader on the label of arc number mid
        in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
        final int midLabel = fst.readLabel(in);
        // NOTE(review): assumes labels are small enough that this
        // subtraction cannot overflow -- TODO confirm
        final int cmp = midLabel - targetLabel;
        if (cmp < 0) low = mid + 1;
        else if (cmp > 0) high = mid - 1;
        else {
          found = true;
          break;
        }
      }

      // NOTE: this code is dup'd w/ the code below (in
      // the outer else clause):
      if (found) {
        // Match
        // Step back one slot so that reading the "next" arc loads
        // arc number mid (checked by the assert below)
        arc.arcIdx = mid - 1;
        fst.readNextRealArc(arc, in);
        assert arc.arcIdx == mid;
        assert arc.label == targetLabel : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
        output[upto] = fst.outputs.add(output[upto - 1], arc.output);
        if (targetLabel == FST.END_LABEL) {
          // Exact match of the whole target
          return;
        }
        setCurrentLabel(arc.label);
        incr();
        arc = fst.readFirstTargetArc(arc, getArc(upto));
        targetLabel = getTargetLabel();
        continue;
      } else if (low == arc.numArcs) {
        // Dead end
        // Load the final arc of the array (the assert below checks
        // that it is flagged as last)
        arc.arcIdx = arc.numArcs - 2;
        fst.readNextRealArc(arc, in);
        assert arc.isLast();
        // Dead end (target is after the last arc);
        // rollback to last fork then push
        upto--;
        while (true) {
          if (upto == 0) {
            // Rolled back to the root: no larger arc anywhere on the path
            return;
          }
          final FST.Arc<T> prevArc = getArc(upto);
          if (!prevArc.isLast()) {
            fst.readNextArc(prevArc);
            pushFirst();
            return;
          }
          upto--;
        }
      } else {
        // There is a ceiling arc:
        // (the search loop only ends without a match once low > high,
        // so this expression selects low)
        arc.arcIdx = (low > high ? low : high) - 1;
        fst.readNextRealArc(arc, in);
        assert arc.label > targetLabel;
        pushFirst();
        return;
      }
    } else {
      // Arcs are not array'd -- must do linear scan:
      if (arc.label == targetLabel) {
        // recurse
        output[upto] = fst.outputs.add(output[upto - 1], arc.output);
        if (targetLabel == FST.END_LABEL) {
          // Exact match of the whole target
          return;
        }
        setCurrentLabel(arc.label);
        incr();
        arc = fst.readFirstTargetArc(arc, getArc(upto));
        targetLabel = getTargetLabel();
      } else if (arc.label > targetLabel) {
        // First arc past the target: this is the ceiling
        pushFirst();
        return;
      } else if (arc.isLast()) {
        // Dead end (target is after the last arc);
        // rollback to last fork then push
        upto--;
        while (true) {
          if (upto == 0) {
            // Rolled back to the root: no larger arc anywhere on the path
            return;
          }
          final FST.Arc<T> prevArc = getArc(upto);
          if (!prevArc.isLast()) {
            fst.readNextArc(prevArc);
            pushFirst();
            return;
          }
          upto--;
        }
      } else {
        // keep scanning
        fst.readNextArc(arc);
      }
    }
  }
}
/** * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description for visualization. * Example of use: * * <pre> * PrintStream ps = new PrintStream("out.dot"); * fst.toDot(ps); * ps.close(); * </pre> * * and then, from command line: * * <pre> * dot -Tpng -o out.png out.dot * </pre> * * <p>Note: larger FSTs (a few thousand nodes) won't even render, don't bother. * * @param sameRank If <code>true</code>, the resulting <code>dot</code> file will try to order * states in layers of breadth-first traversal. This may mess up arcs, but makes the output * FST's structure a bit clearer. * @param labelStates If <code>true</code> states will have labels equal to their offsets in their * binary format. Expands the graph considerably. * @see "http://www.graphviz.org/" */ public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates) throws IOException { final String expandedNodeColor = "blue"; // This is the start arc in the automaton (from the epsilon state to the first state // with outgoing transitions. final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>()); // A queue of transitions to consider for the next level. final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>(); // A queue of transitions to consider when processing the next level. final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>(); nextLevelQueue.add(startArc); // A list of states on the same level (for ranking). final List<Integer> sameLevelStates = new ArrayList<Integer>(); // A bitset of already seen states (target offset). final BitSet seen = new BitSet(); seen.set(startArc.target); // Shape for states. final String stateShape = "circle"; // Emit DOT prologue. 
out.write("digraph FST {\n"); out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); if (!labelStates) { out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); } emitDotState(out, "initial", "point", "white", ""); emitDotState( out, Integer.toString(startArc.target), stateShape, fst.isExpandedTarget(startArc) ? expandedNodeColor : null, ""); out.write(" initial -> " + startArc.target + "\n"); final T NO_OUTPUT = fst.outputs.getNoOutput(); int level = 0; while (!nextLevelQueue.isEmpty()) { // we could double buffer here, but it doesn't matter probably. thisLevelQueue.addAll(nextLevelQueue); nextLevelQueue.clear(); level++; out.write("\n // Transitions and states at level: " + level + "\n"); while (!thisLevelQueue.isEmpty()) { final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); if (fst.targetHasArcs(arc)) { // scan all arcs final int node = arc.target; fst.readFirstTargetArc(arc, arc); while (true) { // Emit the unseen state and add it to the queue for the next level. if (arc.target >= 0 && !seen.get(arc.target)) { final boolean isExpanded = fst.isExpandedTarget(arc); emitDotState( out, Integer.toString(arc.target), stateShape, isExpanded ? expandedNodeColor : null, labelStates ? Integer.toString(arc.target) : ""); seen.set(arc.target); nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc)); sameLevelStates.add(arc.target); } String outs; if (arc.output != NO_OUTPUT) { outs = "/" + fst.outputs.outputToString(arc.output); } else { outs = ""; } final String cl; if (arc.label == FST.END_LABEL) { cl = "~"; } else { cl = printableLabel(arc.label); } out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); // Break the loop if we're on the last arc of this state. if (arc.isLast()) { break; } fst.readNextArc(arc); } } } // Emit state ranking information. 
if (sameRank && sameLevelStates.size() > 1) { out.write(" {rank=same; "); for (int state : sameLevelStates) { out.write(state + "; "); } out.write(" }\n"); } sameLevelStates.clear(); } // Emit terminating state (always there anyway). out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); out.write(" {rank=sink; -1 }\n"); out.write("}\n"); out.flush(); }