Пример #1
0
  /**
   * Logically casts input to UTF32 ints then looks up the output or null if the input is not
   * accepted. FST must be INPUT_TYPE.BYTE4.
   */
  public static <T> T get(FST<T> fst, CharSequence input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE4;

    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());

    int charIdx = 0;
    final int charLimit = input.length();

    // Accumulate output as we go
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    T output = NO_OUTPUT;

    while (charIdx < charLimit) {
      final int utf32 = Character.codePointAt(input, charIdx);
      charIdx += Character.charCount(utf32);

      if (fst.findTargetArc(utf32, arc, arc) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        output = fst.outputs.add(output, arc.output);
      }
    }

    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
      return null;
    } else if (arc.output != NO_OUTPUT) {
      return fst.outputs.add(output, arc.output);
    } else {
      return output;
    }
  }
  /** Rewinds enum state to match the shared prefix between current term and target term */
  protected final void rewindPrefix() throws IOException {
    if (upto == 0) {
      // System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1));
      return;
    }
    // System.out.println("  rewind upto=" + upto + " vs targetLength=" + targetLength);

    final int currentLimit = upto;
    upto = 1;
    while (upto < currentLimit && upto <= targetLength + 1) {
      final int cmp = getCurrentLabel() - getTargetLabel();
      if (cmp < 0) {
        // seek forward
        break;
      } else if (cmp > 0) {
        // seek backwards -- reset this arc to the first arc
        final FST.Arc<T> arc = getArc(upto);
        fst.readFirstTargetArc(getArc(upto - 1), arc);
        // System.out.println("    seek first arc");
        break;
      }
      upto++;
    }
  }
Пример #3
0
  /**
   * Looks up the output for this input, or null if the input is not accepted. FST must be
   * INPUT_TYPE.BYTE4.
   */
  public static <T> T get(FST<T> fst, IntsRef input) throws IOException {
    assert fst.inputType == FST.INPUT_TYPE.BYTE4;

    // TODO: would be nice not to alloc this on every lookup
    final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());

    // Accumulate output as we go
    final T NO_OUTPUT = fst.outputs.getNoOutput();
    T output = NO_OUTPUT;
    for (int i = 0; i < input.length; i++) {
      if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
        return null;
      } else if (arc.output != NO_OUTPUT) {
        output = fst.outputs.add(output, arc.output);
      }
    }

    if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
      return null;
    } else if (arc.output != NO_OUTPUT) {
      return fst.outputs.add(output, arc.output);
    } else {
      return output;
    }
  }
Пример #4
0
  @Test
  public void testFST2() throws IOException {
    String inputValues[] = {
      "brats", "cat", "dog", "dogs", "rat",
    };

    int outputValues[] = {1, 3, 5, 7, 11};

    Builder builder = new Builder();
    builder.build(inputValues, outputValues);

    for (int i = 0; i < inputValues.length; i++) {
      assertEquals(outputValues[i], builder.transduce(inputValues[i]));
    }

    Compiler compiledFST = builder.getCompiler();
    FST fst = new FST(compiledFST.getByteArray());

    assertEquals(0, fst.lookup("brat")); // Prefix match
    assertEquals(1, fst.lookup("brats"));
    assertEquals(3, fst.lookup("cat"));
    assertEquals(5, fst.lookup("dog"));
    assertEquals(7, fst.lookup("dogs"));
    assertEquals(11, fst.lookup("rat"));
    assertEquals(-1, fst.lookup("rats")); // No match
  }
  protected void doNext() throws IOException {
    // System.out.println("FE: next upto=" + upto);
    if (upto == 0) {
      // System.out.println("  init");
      upto = 1;
      fst.readFirstTargetArc(getArc(0), getArc(1));
    } else {
      // pop
      // System.out.println("  check pop curArc target=" + arcs[upto].target + " label=" +
      // arcs[upto].label + " isLast?=" + arcs[upto].isLast());
      while (arcs[upto].isLast()) {
        upto--;
        if (upto == 0) {
          // System.out.println("  eof");
          return;
        }
      }
      fst.readNextArc(arcs[upto]);
    }

    pushFirst();
  }
  // Recurses from current arc, appending last arc all the
  // way to the first final node
  private void pushLast() throws IOException {

    FST.Arc<T> arc = arcs[upto];
    assert arc != null;

    while (true) {
      setCurrentLabel(arc.label);
      output[upto] = fst.outputs.add(output[upto - 1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      incr();

      arc = fst.readLastTargetArc(arc, getArc(upto));
    }
  }
  // Appends current arc, and then recurses from its target,
  // appending first arc all the way to the final node
  private void pushFirst() throws IOException {

    FST.Arc<T> arc = arcs[upto];
    assert arc != null;

    while (true) {
      output[upto] = fst.outputs.add(output[upto - 1], arc.output);
      if (arc.label == FST.END_LABEL) {
        // Final node
        break;
      }
      // System.out.println("  pushFirst label=" + (char) arc.label + " upto=" + upto + " output=" +
      // fst.outputs.outputToString(output[upto]));
      setCurrentLabel(arc.label);
      incr();

      final FST.Arc<T> nextArc = getArc(upto);
      fst.readFirstTargetArc(arc, nextArc);
      arc = nextArc;
    }
  }
Пример #8
0
 @Override
 public void print(StringBuilder out, FST value) {
   out.append("type=").append(value.getType().getShortName());
 }
 /**
  * doFloor controls the behavior of advance: if it's true doFloor is true, advance positions to
  * the biggest term before target.
  */
 protected FSTEnum(FST<T> fst) {
   this.fst = fst;
   NO_OUTPUT = fst.outputs.getNoOutput();
   fst.getFirstArc(getArc(0));
   output[0] = NO_OUTPUT;
 }
  /** Seeks to largest term that's <= target. */
  protected void doSeekFloor() throws IOException {

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton
    // System.out.println("FE: seek floor upto=" + upto);

    // Save CPU by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();

    // System.out.println("FE: after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();

    // System.out.println("FE: init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while (true) {
      // System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char)
      // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());

      if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs - 1;
        int mid = 0;
        // System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" +
        // targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          // System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + "
          // midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0) low = mid + 1;
          else if (cmp > 0) high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match -- recurse
          // System.out.println("  match!  arcIdx=" + mid);
          arc.arcIdx = mid - 1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel
              : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (high == -1) {
          // System.out.println("  before first");
          // Very first arc is after our target
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while (true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto - 1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else {
          // There is a floor arc:
          arc.arcIdx = (low > high ? high : low) - 1;
          // System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
          fst.readNextRealArc(arc, in);
          assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
          assert arc.label < targetLabel;
          pushLast();
          return;
        }
      } else {

        if (arc.label == targetLabel) {
          // Match -- recurse
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while (true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto - 1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else if (!arc.isLast()) {
          // System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char)
          // fst.readNextArcLabel(arc) + ")");
          if (fst.readNextArcLabel(arc) > targetLabel) {
            pushLast();
            return;
          } else {
            // keep scanning
            fst.readNextArc(arc);
          }
        } else {
          pushLast();
          return;
        }
      }
    }
  }
  /** Seeks to smallest term that's >= target. */
  protected void doSeekCeil() throws IOException {

    // System.out.println("    advance len=" + target.length + " curlen=" + current.length);

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    // System.out.println("FE.seekCeil upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();
    // System.out.println("  after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();
    // System.out.println("  init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while (true) {

      // System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char)
      // arc.label + ") vs targetLabel=" + targetLabel);

      if (arc.bytesPerArc != 0 && arc.label != -1) {

        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs - 1;
        int mid = 0;
        // System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" +
        // targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          // System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + "
          // midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0) low = mid + 1;
          else if (cmp > 0) high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match
          arc.arcIdx = mid - 1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel
              : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (low == arc.numArcs) {
          // Dead end
          arc.arcIdx = arc.numArcs - 2;
          fst.readNextRealArc(arc, in);
          assert arc.isLast();
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while (true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            // System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + "
            // isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          arc.arcIdx = (low > high ? low : high) - 1;
          fst.readNextRealArc(arc, in);
          assert arc.label > targetLabel;
          pushFirst();
          return;
        }
      } else {
        // Arcs are not array'd -- must do linear scan:
        if (arc.label == targetLabel) {
          // recurse
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          pushFirst();
          return;
        } else if (arc.isLast()) {
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while (true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            // System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + "
            // isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          // keep scanning
          // System.out.println("    next scan");
          fst.readNextArc(arc);
        }
      }
    }
  }
Пример #12
0
  /**
   * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description for visualization.
   * Example of use:
   *
   * <pre>
   * PrintStream ps = new PrintStream(&quot;out.dot&quot;);
   * fst.toDot(ps);
   * ps.close();
   * </pre>
   *
   * and then, from command line:
   *
   * <pre>
   * dot -Tpng -o out.png out.dot
   * </pre>
   *
   * <p>Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
   *
   * @param sameRank If <code>true</code>, the resulting <code>dot</code> file will try to order
   *     states in layers of breadth-first traversal. This may mess up arcs, but makes the output
   *     FST's structure a bit clearer.
   * @param labelStates If <code>true</code> states will have labels equal to their offsets in their
   *     binary format. Expands the graph considerably.
   * @see "http://www.graphviz.org/"
   */
  public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
      throws IOException {
    final String expandedNodeColor = "blue";

    // This is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions.
    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());

    // A queue of transitions to consider for the next level.
    final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();

    // A queue of transitions to consider when processing the next level.
    final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
    nextLevelQueue.add(startArc);

    // A list of states on the same level (for ranking).
    final List<Integer> sameLevelStates = new ArrayList<Integer>();

    // A bitset of already seen states (target offset).
    final BitSet seen = new BitSet();
    seen.set(startArc.target);

    // Shape for states.
    final String stateShape = "circle";

    // Emit DOT prologue.
    out.write("digraph FST {\n");
    out.write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

    if (!labelStates) {
      out.write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    emitDotState(out, "initial", "point", "white", "");
    emitDotState(
        out,
        Integer.toString(startArc.target),
        stateShape,
        fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
        "");
    out.write("  initial -> " + startArc.target + "\n");

    final T NO_OUTPUT = fst.outputs.getNoOutput();
    int level = 0;

    while (!nextLevelQueue.isEmpty()) {
      // we could double buffer here, but it doesn't matter probably.
      thisLevelQueue.addAll(nextLevelQueue);
      nextLevelQueue.clear();

      level++;
      out.write("\n  // Transitions and states at level: " + level + "\n");
      while (!thisLevelQueue.isEmpty()) {
        final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);

        if (fst.targetHasArcs(arc)) {
          // scan all arcs
          final int node = arc.target;
          fst.readFirstTargetArc(arc, arc);

          while (true) {
            // Emit the unseen state and add it to the queue for the next level.
            if (arc.target >= 0 && !seen.get(arc.target)) {
              final boolean isExpanded = fst.isExpandedTarget(arc);
              emitDotState(
                  out,
                  Integer.toString(arc.target),
                  stateShape,
                  isExpanded ? expandedNodeColor : null,
                  labelStates ? Integer.toString(arc.target) : "");
              seen.set(arc.target);
              nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
              sameLevelStates.add(arc.target);
            }

            String outs;
            if (arc.output != NO_OUTPUT) {
              outs = "/" + fst.outputs.outputToString(arc.output);
            } else {
              outs = "";
            }

            final String cl;
            if (arc.label == FST.END_LABEL) {
              cl = "~";
            } else {
              cl = printableLabel(arc.label);
            }

            out.write("  " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");

            // Break the loop if we're on the last arc of this state.
            if (arc.isLast()) {
              break;
            }
            fst.readNextArc(arc);
          }
        }
      }

      // Emit state ranking information.
      if (sameRank && sameLevelStates.size() > 1) {
        out.write("  {rank=same; ");
        for (int state : sameLevelStates) {
          out.write(state + "; ");
        }
        out.write(" }\n");
      }
      sameLevelStates.clear();
    }

    // Emit terminating state (always there anyway).
    out.write("  -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
    out.write("  {rank=sink; -1 }\n");

    out.write("}\n");
    out.flush();
  }