Пример #1
0
  /**
   * Runs Ukkonen's algorithm for the string stored at position {@code arrayIdx}
   * of {@code strings}.
   *
   * <p>A fresh {@link UkkonenState} is installed in {@code extState}, then one
   * phase ({@link #ukkonenSPA(int)}) is run for every index from
   * {@code extState.nextPhaseStart} up to (but excluding) the string length,
   * and finally {@code extState.finishFinalEdges()} is called.  The tree is
   * dumped to {@code System.err} after every phase and once more at the end.
   *
   * @param arrayIdx index into {@code strings} of the string to process
   */
  private void ukkonenExtendSuffixTree(int arrayIdx) {
    logger.entering("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
    logger.log(Level.FINEST, String.format("Ukkonen Algorithm String #%d", arrayIdx));

    TreeString current = strings.get(arrayIdx);
    extState = new UkkonenState(current);

    logger.log(
        Level.FINEST,
        String.format("Ukkonen: (%d,%d)", extState.nextPhaseStart, extState.string.length()));

    // One phase per remaining character; the bound is re-read every iteration.
    int phase = extState.nextPhaseStart;
    while (phase < extState.string.length()) {
      ukkonenSPA(phase);

      // Debug dump of the tree as it looks after this phase.
      System.err.println(String.format("Phase %d results: ", phase));
      print(System.err);
      System.err.println();
      System.err.flush();

      phase++;
    }

    logger.log(Level.FINEST, String.format("Finishing edges: %d", extState.lastE));
    extState.finishFinalEdges();

    // Debug dump of the completed tree.
    System.err.println("Finished results: ");
    print(System.err);
    System.err.println();
    System.err.flush();

    logger.exiting("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
  }
Пример #2
0
  /**
   * Performs extension {@code j} of phase {@code i} of Ukkonen's algorithm
   * (the "single extension algorithm", pg. 100 of Gusfield).  This means that
   * we're making sure that array[j,i] (note the inclusivity!) is a part of the
   * current suffix tree.
   *
   * <p>The starting position for the match is taken from
   * {@code extState.nextNode} and {@code extState.gammaLength}; both are
   * rewritten before returning, together with {@code extState.rule2Node},
   * which is then available to the following extension.
   *
   * @param i index of the character that this phase adds to the tree
   * @param j start index of the suffix being extended; must satisfy {@code j <= i}
   * @return {@code false} if extension rule 3 applied (the caller should stop
   *     extending in this phase), {@code true} otherwise
   */
  private boolean ukkonenSEA(int i, int j) {
    logger.entering("UkkonenSuffixTree", "ukkonenSEA");
    logger.log(Level.FINEST, String.format("j=%d", j));

    assert j <= i;

    boolean rule3 = false;

    TreeNode newRule2Node = null;

    EdgeMatch m = extState.matcher;
    char lastChar = extState.string.getChar(i);
    boolean lastCharIsTerminal = isTerminal(lastChar);

    /*
     * SEA Step 1:
     * "Find the first node v at or above the end of S[j-1,i] that either
     * has a suffix link from it or is the root.  This requires walking up
     * at most one edge from the end of S[j-1,i] in the current tree.  Let
     * \gamma (possibly empty) denote the string between v and the
     * end of S[j-1,i]."
     *
     * (The walking-up itself is done at the end of the previous extension;
     * see walkUpToSuffixLinkedNode().)
     */

    /*
     * SEA Step 2:
     * "If v is not the root, traverse the suffix link from v to node
     * s(v) and then walk down from s(v) following the path for string
     * gamma.  If v is the root, then follow the path for S[j,i] from the
     * root (as in the naive algorithm)."
     */

    int gammaEnd = i;
    int gammaStart = gammaEnd - extState.gammaLength;

    if (extState.nextNode == null || extState.nextNode.isRoot()) {
      String beta = extState.string.substring(j, i);
      logger.log(Level.FINEST, String.format("beta: %d,%d <%s>%c", j, i, beta, lastChar));

      m.reset(j, i);
      m.matchFrom(root, true);
    } else {
      logger.log(Level.FINEST, String.format("gammaLength:%d", extState.gammaLength));
      String gamma = extState.string.substring(gammaStart, gammaEnd);
      logger.log(
          Level.FINEST,
          String.format("gamma: %d,%d <%s>%c", gammaStart, gammaEnd, gamma, lastChar));

      m.reset(gammaStart, gammaEnd);
      m.matchFrom(extState.nextNode, true);
    }

    /*
     * SEA Step 3:
     * "Using the extension rules, ensure that the string S[j,i]S(i+1) is
     * in the tree."
     *
     * In our coordinates, this is array[j,i)+array[i]
     * \beta = array[j,i)
     *
     * Rule 1: the path \beta ends at a leaf.  (we shouldn't see this case).
     * Rule 2: the path \beta is not continued by array[i].  That is, \beta
     *         ends either at a node (in which case, no child of the node
     *         starts with array[i]), or in an edge (in which case, the edge
     *         doesn't continue with array[i]). Either way, we create a new
     *         edge that is labeled with array[i] (coordinates: [i,i+1) ).
     * Rule 3: \beta+array[i] is already in the tree -- either \beta ends in
     *         an edge that continues with array[i], or at a node that has
     *         a child under array[i].  Either way, return false (break!).
     */

    TreeEdge newEdge = null;

    if (m.lastEdge == null) {
      logger.log(Level.FINEST, "Found root.");
      // the \beta string matched to the root (was empty).  So we need
      // to simply check the children of the root.

      boolean foundLastChar =
          !lastCharIsTerminal
              ? root.childEdges.containsKey(lastChar)
              : root.terminalEdges.containsKey(extState.string.getIndex());

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Root");

        extState.nextNode = null;
        extState.gammaLength = 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2
        logger.log(Level.FINEST, "Rule #2, Root");
        newEdge = new TreeEdge(extState.string, i, null, root);
        root.addEdge(newEdge);

        extState.nextNode = null;
        extState.gammaLength = 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      }

    } else if (m.inEdgeMiddle()) {
      int offset = m.lastMatchLength();
      logger.log(Level.FINEST, String.format("Found edge middle: %d", offset));

      boolean foundLastChar =
          !lastCharIsTerminal
              ? m.lastEdge.getChar(offset) == lastChar
              : (m.lastEdge.string.getIndex() == extState.string.getIndex()
                  && offset == m.lastEdge.length() - 1);

      logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Edge");

        extState.nextNode = m.lastEdge.headNode;
        extState.gammaLength = m.lastMatchLength() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2: split the edge at the mismatch and hang the new edge off
        // the node that the split leaves at the tail of m.lastEdge.
        logger.log(Level.FINEST, "Rule #2, Edge");

        TreeEdge newLowerEdge = m.lastEdge.split(offset);
        extState.edgesWithE.add(newLowerEdge);

        newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
        m.lastEdge.tailNode.addEdge(newEdge);
        newRule2Node = m.lastEdge.tailNode;

        extState.nextNode = m.lastEdge.headNode;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      }

      walkUpToSuffixLinkedNode();

    } else {
      logger.log(Level.FINEST, "Found node.");

      boolean foundLastChar =
          !lastCharIsTerminal
              ? m.lastEdge.tailNode.childEdges.containsKey(lastChar)
              : m.lastEdge.tailNode.terminalEdges.containsKey(extState.string.getIndex());

      logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Node");

        extState.nextNode = m.lastEdge.headNode;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;

        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2
        logger.log(Level.FINEST, "Rule #2, Node");
        newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
        m.lastEdge.tailNode.addEdge(newEdge);

        extState.nextNode = m.lastEdge.headNode;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
        assert extState.gammaLength >= 0;
      }

      walkUpToSuffixLinkedNode();
    }

    if (extState.nextNode != null) {
      logger.log(Level.FINEST, "Following suffix link.");
      extState.nextNode = extState.nextNode.suffixLink;
    } else {
      logger.log(Level.FINEST, "Suffix link not found.");
    }

    if (newEdge != null) {
      // A rule 2 extension created a new leaf edge: record which suffix it
      // belongs to and remember the edge in extState.edgesWithE.
      newEdge.tailNode.suffixes.add(extState.currentSuffix);
      extState.nextSuffix();

      extState.edgesWithE.add(newEdge);
      logger.log(Level.FINEST, String.format("Added suffix: %d", j));
    }

    /*
     * SEA Step 4:
     * "If a new internal node w was created in extension j-1 (by extension rule 2)
     * then by Lemma 6.1.1 string alpha must end at node s(w), the end node for the
     * suffix link from w.  Create the suffix link (w, s(w)) from w to s(w)."
     *
     * Here, w is extState.rule2Node (set by the previous extension), and
     * s(w) is the node at which this extension's match ended (or the root,
     * if the match was empty).
     */
    if (extState.rule2Node != null) {
      if (m.lastEdge != null) {
        extState.rule2Node.suffixLink = m.lastEdge.tailNode;
        logger.log(Level.FINEST, "Adding suffix link --> internal node.");
      } else {
        extState.rule2Node.suffixLink = root;
        logger.log(Level.FINEST, "Adding suffix link --> root.");
      }
    }

    /*
     * Update any state that will be needed in the next extension.
     */
    extState.rule2Node = newRule2Node;

    logger.exiting("UkkonenSuffixTree", "ukkonenSEA");

    // "Rule 3 is a show stopper" means that, if we encounter rule 3,
    // we *don't* continue.
    return !rule3;
  }

  /**
   * If {@code extState.nextNode} has no suffix link and is not the root,
   * moves it up one edge (to the head node of its parent edge) and grows
   * {@code extState.gammaLength} by the length of that edge.  Otherwise
   * leaves the state untouched.
   */
  private void walkUpToSuffixLinkedNode() {
    if (extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) {
      logger.log(
          Level.FINEST,
          String.format("Walking up edge: %d", extState.nextNode.parentEdge.length()));
      extState.gammaLength += extState.nextNode.parentEdge.length();
      extState.nextNode = extState.nextNode.parentEdge.headNode;
    }
  }
Пример #3
0
  /**
   * Performs phase {@code i} of Ukkonen's algorithm (the "single phase
   * algorithm", pg. 106 of Gusfield).  This means that we're making sure that
   * array[0,i] (note the inclusivity!) is a part of the current suffix tree.
   *
   * <p>Extensions are run through {@link #ukkonenSEA(int, int)}, starting at
   * {@code extState.nextExtStart}.  The tree is dumped to {@code System.out}
   * after every extension.  On return, {@code extState.nextExtStart} holds
   * the extension index at which the loop stopped.
   *
   * @param i the phase to run; must be non-negative
   */
  private void ukkonenSPA(int i) {
    logger.entering("UkkonenSuffixTree", "ukkonenSPA");
    logger.log(Level.FINEST, String.format("i=%d", i));

    assert i >= 0;

    // SPA Step 1: "Increment index e to i+1".  Gusfield's coordinates are
    // inclusive while the tree edges here use exclusive ends, so the value
    // stored in lastE is exactly i + 1.
    extState.lastE = i + 1;
    logger.log(Level.FINEST, String.format("e=%d", extState.lastE));

    // SPA Step 2: run explicit extensions from extState.nextExtStart until
    // ukkonenSEA reports rule 3 (returns false) or extension i has been done.
    logger.log(Level.FINEST, String.format("jstart=%d", extState.nextExtStart));

    int ext = extState.nextExtStart;
    while (ext <= i) {
      boolean stop;
      if (!ukkonenSEA(i, ext)) {
        // Rule 3 applied: nothing more to do in this phase.
        stop = true;
      } else {
        ext++;
        // In the final phase, stop before the extension that would insert
        // just the terminal character on its own.
        stop = (i == extState.string.length() - 1) && (ext == i);
      }

      // Debug dump of the tree after this extension.
      System.out.println(String.format("Phase %d, Extension %d tree: ", i, ext));
      print(System.out);
      System.out.println();
      System.out.flush();
      System.err.println();
      System.err.flush();

      if (stop) {
        break;
      }
    }

    // SPA Step 3: remember where the next phase has to start extending.
    extState.nextExtStart = ext;

    logger.log(Level.FINEST, String.format("j*=%d", extState.nextExtStart));
    logger.exiting("UkkonenSuffixTree", "ukkonenSPA");
  }