/** Seeks to smallest term that's >= target. */
  protected void doSeekCeil() throws IOException {

    // System.out.println("    advance len=" + target.length + " curlen=" + current.length);

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton

    // System.out.println("FE.seekCeil upto=" + upto);

    // Save time by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();
    // System.out.println("  after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();
    // System.out.println("  init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while (true) {

      // System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char)
      // arc.label + ") vs targetLabel=" + targetLabel);

      if (arc.bytesPerArc != 0 && arc.label != -1) {

        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs - 1;
        int mid = 0;
        // System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" +
        // targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          // System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + "
          // midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0) low = mid + 1;
          else if (cmp > 0) high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match
          arc.arcIdx = mid - 1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel
              : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (low == arc.numArcs) {
          // Dead end
          arc.arcIdx = arc.numArcs - 2;
          fst.readNextRealArc(arc, in);
          assert arc.isLast();
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while (true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            // System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + "
            // isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          arc.arcIdx = (low > high ? low : high) - 1;
          fst.readNextRealArc(arc, in);
          assert arc.label > targetLabel;
          pushFirst();
          return;
        }
      } else {
        // Arcs are not array'd -- must do linear scan:
        if (arc.label == targetLabel) {
          // recurse
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          pushFirst();
          return;
        } else if (arc.isLast()) {
          // Dead end (target is after the last arc);
          // rollback to last fork then push
          upto--;
          while (true) {
            if (upto == 0) {
              return;
            }
            final FST.Arc<T> prevArc = getArc(upto);
            // System.out.println("  rollback upto=" + upto + " arc.label=" + prevArc.label + "
            // isLast?=" + prevArc.isLast());
            if (!prevArc.isLast()) {
              fst.readNextArc(prevArc);
              pushFirst();
              return;
            }
            upto--;
          }
        } else {
          // keep scanning
          // System.out.println("    next scan");
          fst.readNextArc(arc);
        }
      }
    }
  }
  /** Seeks to largest term that's <= target. */
  protected void doSeekFloor() throws IOException {

    // TODO: possibly caller could/should provide common
    // prefix length?  ie this work may be redundant if
    // caller is in fact intersecting against its own
    // automaton
    // System.out.println("FE: seek floor upto=" + upto);

    // Save CPU by starting at the end of the shared prefix
    // b/w our current term & the target:
    rewindPrefix();

    // System.out.println("FE: after rewind upto=" + upto);

    FST.Arc<T> arc = getArc(upto);
    int targetLabel = getTargetLabel();

    // System.out.println("FE: init targetLabel=" + targetLabel);

    // Now scan forward, matching the new suffix of the target
    while (true) {
      // System.out.println("  cycle upto=" + upto + " arc.label=" + arc.label + " (" + (char)
      // arc.label + ") targetLabel=" + targetLabel + " isLast?=" + arc.isLast());

      if (arc.bytesPerArc != 0 && arc.label != FST.END_LABEL) {
        // Arcs are fixed array -- use binary search to find
        // the target.

        final FST<T>.BytesReader in = fst.getBytesReader(0);
        int low = arc.arcIdx;
        int high = arc.numArcs - 1;
        int mid = 0;
        // System.out.println("do arc array low=" + low + " high=" + high + " targetLabel=" +
        // targetLabel);
        boolean found = false;
        while (low <= high) {
          mid = (low + high) >>> 1;
          in.pos = arc.posArcsStart - arc.bytesPerArc * mid - 1;
          final int midLabel = fst.readLabel(in);
          final int cmp = midLabel - targetLabel;
          // System.out.println("  cycle low=" + low + " high=" + high + " mid=" + mid + "
          // midLabel=" + midLabel + " cmp=" + cmp);
          if (cmp < 0) low = mid + 1;
          else if (cmp > 0) high = mid - 1;
          else {
            found = true;
            break;
          }
        }

        // NOTE: this code is dup'd w/ the code below (in
        // the outer else clause):
        if (found) {
          // Match -- recurse
          // System.out.println("  match!  arcIdx=" + mid);
          arc.arcIdx = mid - 1;
          fst.readNextRealArc(arc, in);
          assert arc.arcIdx == mid;
          assert arc.label == targetLabel
              : "arc.label=" + arc.label + " vs targetLabel=" + targetLabel + " mid=" + mid;
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
          continue;
        } else if (high == -1) {
          // System.out.println("  before first");
          // Very first arc is after our target
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while (true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto - 1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else {
          // There is a floor arc:
          arc.arcIdx = (low > high ? high : low) - 1;
          // System.out.println(" hasFloor arcIdx=" + (arc.arcIdx+1));
          fst.readNextRealArc(arc, in);
          assert arc.isLast() || fst.readNextArcLabel(arc) > targetLabel;
          assert arc.label < targetLabel;
          pushLast();
          return;
        }
      } else {

        if (arc.label == targetLabel) {
          // Match -- recurse
          output[upto] = fst.outputs.add(output[upto - 1], arc.output);
          if (targetLabel == FST.END_LABEL) {
            return;
          }
          setCurrentLabel(arc.label);
          incr();
          arc = fst.readFirstTargetArc(arc, getArc(upto));
          targetLabel = getTargetLabel();
        } else if (arc.label > targetLabel) {
          // TODO: if each arc could somehow read the arc just
          // before, we can save this re-scan.  The ceil case
          // doesn't need this because it reads the next arc
          // instead:
          while (true) {
            // First, walk backwards until we find a first arc
            // that's before our target label:
            fst.readFirstTargetArc(getArc(upto - 1), arc);
            if (arc.label < targetLabel) {
              // Then, scan forwards to the arc just before
              // the targetLabel:
              while (!arc.isLast() && fst.readNextArcLabel(arc) < targetLabel) {
                fst.readNextArc(arc);
              }
              pushLast();
              return;
            }
            upto--;
            if (upto == 0) {
              return;
            }
            targetLabel = getTargetLabel();
            arc = getArc(upto);
          }
        } else if (!arc.isLast()) {
          // System.out.println("  check next label=" + fst.readNextArcLabel(arc) + " (" + (char)
          // fst.readNextArcLabel(arc) + ")");
          if (fst.readNextArcLabel(arc) > targetLabel) {
            pushLast();
            return;
          } else {
            // keep scanning
            fst.readNextArc(arc);
          }
        } else {
          pushLast();
          return;
        }
      }
    }
  }
示例#3
0
  /**
   * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description for visualization.
   * Example of use:
   *
   * <pre>
   * PrintStream ps = new PrintStream(&quot;out.dot&quot;);
   * fst.toDot(ps);
   * ps.close();
   * </pre>
   *
   * and then, from command line:
   *
   * <pre>
   * dot -Tpng -o out.png out.dot
   * </pre>
   *
   * <p>Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
   *
   * @param sameRank If <code>true</code>, the resulting <code>dot</code> file will try to order
   *     states in layers of breadth-first traversal. This may mess up arcs, but makes the output
   *     FST's structure a bit clearer.
   * @param labelStates If <code>true</code> states will have labels equal to their offsets in their
   *     binary format. Expands the graph considerably.
   * @see "http://www.graphviz.org/"
   */
  public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
      throws IOException {
    final String expandedNodeColor = "blue";

    // This is the start arc in the automaton (from the epsilon state to the first state
    // with outgoing transitions.
    final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());

    // A queue of transitions to consider for the next level.
    final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();

    // A queue of transitions to consider when processing the next level.
    final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
    nextLevelQueue.add(startArc);

    // A list of states on the same level (for ranking).
    final List<Integer> sameLevelStates = new ArrayList<Integer>();

    // A bitset of already seen states (target offset).
    final BitSet seen = new BitSet();
    seen.set(startArc.target);

    // Shape for states.
    final String stateShape = "circle";

    // Emit DOT prologue.
    out.write("digraph FST {\n");
    out.write("  rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");

    if (!labelStates) {
      out.write("  node [shape=circle, width=.2, height=.2, style=filled]\n");
    }

    emitDotState(out, "initial", "point", "white", "");
    emitDotState(
        out,
        Integer.toString(startArc.target),
        stateShape,
        fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
        "");
    out.write("  initial -> " + startArc.target + "\n");

    final T NO_OUTPUT = fst.outputs.getNoOutput();
    int level = 0;

    while (!nextLevelQueue.isEmpty()) {
      // we could double buffer here, but it doesn't matter probably.
      thisLevelQueue.addAll(nextLevelQueue);
      nextLevelQueue.clear();

      level++;
      out.write("\n  // Transitions and states at level: " + level + "\n");
      while (!thisLevelQueue.isEmpty()) {
        final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);

        if (fst.targetHasArcs(arc)) {
          // scan all arcs
          final int node = arc.target;
          fst.readFirstTargetArc(arc, arc);

          while (true) {
            // Emit the unseen state and add it to the queue for the next level.
            if (arc.target >= 0 && !seen.get(arc.target)) {
              final boolean isExpanded = fst.isExpandedTarget(arc);
              emitDotState(
                  out,
                  Integer.toString(arc.target),
                  stateShape,
                  isExpanded ? expandedNodeColor : null,
                  labelStates ? Integer.toString(arc.target) : "");
              seen.set(arc.target);
              nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
              sameLevelStates.add(arc.target);
            }

            String outs;
            if (arc.output != NO_OUTPUT) {
              outs = "/" + fst.outputs.outputToString(arc.output);
            } else {
              outs = "";
            }

            final String cl;
            if (arc.label == FST.END_LABEL) {
              cl = "~";
            } else {
              cl = printableLabel(arc.label);
            }

            out.write("  " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");

            // Break the loop if we're on the last arc of this state.
            if (arc.isLast()) {
              break;
            }
            fst.readNextArc(arc);
          }
        }
      }

      // Emit state ranking information.
      if (sameRank && sameLevelStates.size() > 1) {
        out.write("  {rank=same; ");
        for (int state : sameLevelStates) {
          out.write(state + "; ");
        }
        out.write(" }\n");
      }
      sameLevelStates.clear();
    }

    // Emit terminating state (always there anyway).
    out.write("  -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
    out.write("  {rank=sink; -1 }\n");

    out.write("}\n");
    out.flush();
  }