Exemplo n.º 1
0
  /**
   * Inserts the suffix of {@code string} that begins at {@code start} by matching it from the
   * root, adding a leaf edge for any unmatched remainder, and recording the suffix on the leaf
   * node where it ends.
   *
   * @param string the string whose suffix is being inserted
   * @param start index in {@code string} at which the suffix begins
   */
  private void naiveExtendSuffix(TreeString string, int start) {
    EdgeMatch em = findEdge(root, string, start, string.length(), false);
    StringSuffix stringSuffix = new StringSuffix(string, start);

    TreeEdge leafEdge;
    if (em.completedMatch()) {
      // The match ran to completion: reuse the edge it finished on.
      leafEdge = em.lastEdge;
    } else if (em.lastEdge == null) {
      // No edge was matched at all: hang the entire suffix directly off the root.
      leafEdge = new TreeEdge(string, start, string.length(), root);
      root.addEdge(leafEdge);
    } else {
      // Split BEFORE building the leaf edge. ukkonenSEA relies on split() leaving
      // em.lastEdge.tailNode pointing at the newly created internal node; creating the
      // leaf first would give it the pre-split node as its head while attaching it to
      // the post-split node.
      if (em.inEdgeMiddle()) {
        em.lastEdge.split(em.lastMatchLength());
      }
      leafEdge = new TreeEdge(string, em.matchedTo, string.length(), em.lastEdge.tailNode);
      em.lastEdge.tailNode.addEdge(leafEdge);
    }

    leafEdge.tailNode.suffixes.add(stringSuffix);
  }
Exemplo n.º 2
0
  /**
   * Performs extension {@code j} of phase {@code i} of Ukkonen's algorithm (the "single
   * extension algorithm", SEA). This means that we're making sure that array[j,i] (note the
   * inclusivity!) is a part of the current suffix tree.
   *
   * <p>Besides modifying the tree, this method updates {@code extState} ({@code nextNode},
   * {@code gammaLength}, {@code rule2Node}, {@code edgesWithE} and, when a leaf is added, the
   * current suffix) so that the next extension can start from where this one ended.
   *
   * <p>Original Description: pg. 100 of Gusfield
   *
   * @param i index of the character being appended in the current phase
   * @param j start index of the suffix handled by this extension; must satisfy {@code j <= i}
   * @return {@code true} if the extension was carried out by rule 2 (a new edge was added) and
   *     the caller may continue with the next extension; {@code false} if rule 3 applied
   */
  private boolean ukkonenSEA(int i, int j) {
    // Log method entry here; the matching exiting() call is at the end of the method.
    logger.entering("UkkonenSuffixTree", "ukkonenSEA");
    logger.log(Level.FINEST, String.format("j=%d", j));

    assert j <= i;

    boolean rule3 = false;

    // Internal node created by an edge split in this extension (if any); it receives its
    // suffix link during the next extension (see SEA Step 4 below).
    TreeNode newRule2Node = null;

    EdgeMatch m = extState.matcher;
    char lastChar = extState.string.getChar(i);
    boolean lastCharIsTerminal = isTerminal(lastChar);

    /*
     * SEA Step 1:
     * "Find the first node v at or above the end of S[j-1,i] that either
     * has a suffix link from it or is the root.  This requires walking up
     * at most one edge from the end of S[j-1,i] in the current tree.  Let
     * \gamma (possibly empty) denote the string between v and the
     * end of S[j-1,i]."
     *
     * (In this implementation the walk-up is done at the end of the previous
     * extension; its result is carried in extState.nextNode / extState.gammaLength.)
     */

    /*
     * SEA Step 2:
     * "If v is not the root, traverse the suffix link from v to node
     * s(v) and then walk down from s(v) following the path for string
     * gamma.  If v is the root, then follow the path for S[j,i] from the
     * root (as in the naive algorithm)."
     */

    int gammaEnd = i;
    int gammaStart = gammaEnd - extState.gammaLength;

    if (extState.nextNode == null || extState.nextNode.isRoot()) {
      // No usable suffix link: match all of beta = array[j,i) from the root.
      String beta = extState.string.substring(j, i);
      logger.log(Level.FINEST, String.format("beta: %d,%d <%s>%c", j, i, beta, lastChar));

      m.reset(j, i);
      m.matchFrom(root, true);
    } else {
      // Resume from the node reached via the suffix link and match only gamma.
      logger.log(Level.FINEST, String.format("gammaLength:%d", extState.gammaLength));
      String gamma = extState.string.substring(gammaStart, gammaEnd);
      logger.log(
          Level.FINEST,
          String.format("gamma: %d,%d <%s>%c", gammaStart, gammaEnd, gamma, lastChar));

      m.reset(gammaStart, gammaEnd);
      m.matchFrom(extState.nextNode, true);
    }

    /*
     * SEA Step 3:
     * "Using the extension rules, ensure that the string S[j,i]S(i+1) is
     * in the tree."
     *
     * In our coordinates, this is array[j,i)+array[i]
     * \beta = array[j,i)
     *
     * Rule 1: the path \beta ends at a leaf.  (we shouldn't see this case).
     * Rule 2: the path \beta is not continued by array[i].  That is, \beta
     *         ends either at a node (in which case, no child of the node
     *         starts with array[i]), or in an edge (in which case, the edge
     *         doesn't continue with array[i]). Either way, we create a new
     *         edge that is labeled with array[i] (coordinates: [i,i+1) ).
     * Rule 3: \beta+array[i] is already in the tree -- either \beta ends in
     *         an edge that continues with array[i], or at a node that has
     *         a child under array[i].  Either way, return false (break!).
     */

    TreeEdge newEdge = null;

    if (m.lastEdge == null) {
      logger.log(Level.FINEST, "Found root.");
      // the \beta string matched to the root (was empty).  So we need
      // to simply check the children of the root.

      // Terminal characters are looked up by the owning string's index rather than by char.
      boolean foundLastChar =
          !lastCharIsTerminal
              ? root.childEdges.containsKey(lastChar)
              : root.terminalEdges.containsKey(extState.string.getIndex());

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Root");

        extState.nextNode = null;
        extState.gammaLength = 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2
        logger.log(Level.FINEST, "Rule #2, Root");
        newEdge = new TreeEdge(extState.string, i, null, root);
        root.addEdge(newEdge);

        extState.nextNode = null;
        extState.gammaLength = 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      }

    } else if (m.inEdgeMiddle()) {
      int offset = m.lastMatchLength();
      logger.log(Level.FINEST, String.format("Found edge middle: %d", offset));

      // For a terminal character, the edge "continues" with it only if the edge belongs to
      // the same string and the match stopped right before the edge's last position.
      boolean foundLastChar =
          !lastCharIsTerminal
              ? m.lastEdge.getChar(offset) == lastChar
              : (m.lastEdge.string.getIndex() == extState.string.getIndex()
                  && offset == m.lastEdge.length() - 1);

      logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Edge");

        extState.nextNode = m.lastEdge.headNode;
        // extState.gammaLength = m.lastMatchLength() + 1;
        extState.gammaLength = m.lastMatchLength() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2
        logger.log(Level.FINEST, "Rule #2, Edge");

        // Split first; afterwards m.lastEdge.tailNode is the new internal node that the
        // new leaf edge hangs from.
        TreeEdge newLowerEdge = m.lastEdge.split(offset);
        extState.edgesWithE.add(newLowerEdge);

        newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
        m.lastEdge.tailNode.addEdge(newEdge);
        newRule2Node = m.lastEdge.tailNode;

        extState.nextNode = m.lastEdge.headNode;
        // extState.gammaLength = m.lastEdge.length() + 1;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;
        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      }

      // SEA Step 1 for the next extension: if the chosen node has no suffix link and is not
      // the root, walk up one edge and extend gamma by that edge's length.
      if (extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) {
        logger.log(
            Level.FINEST,
            String.format("Walking up edge: %d", extState.nextNode.parentEdge.length()));
        extState.gammaLength += extState.nextNode.parentEdge.length();
        extState.nextNode = extState.nextNode.parentEdge.headNode;
      }

    } else {
      logger.log(Level.FINEST, "Found node.");

      boolean foundLastChar =
          !lastCharIsTerminal
              ? m.lastEdge.tailNode.childEdges.containsKey(lastChar)
              : m.lastEdge.tailNode.terminalEdges.containsKey(extState.string.getIndex());

      logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar));

      if (foundLastChar) {
        // Rule 3
        rule3 = true;
        logger.log(Level.FINEST, "Rule #3, Node");

        extState.nextNode = m.lastEdge.headNode;
        // extState.gammaLength = m.lastEdge.length() + 1;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        assert extState.gammaLength >= 0;

        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
      } else {
        // Rule 2
        logger.log(Level.FINEST, "Rule #2, Node");
        newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode);
        m.lastEdge.tailNode.addEdge(newEdge);

        extState.nextNode = m.lastEdge.headNode;
        // extState.gammaLength = m.lastEdge.length() + 1;
        extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0);

        logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength));
        assert extState.gammaLength >= 0;
      }

      // Same walk-up as in the edge-middle case above.
      if (extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) {
        logger.log(
            Level.FINEST,
            String.format("Walking up edge: %d", extState.nextNode.parentEdge.length()));
        extState.gammaLength += extState.nextNode.parentEdge.length();
        extState.nextNode = extState.nextNode.parentEdge.headNode;
      }
    }

    // Pre-follow the suffix link so the next extension starts matching from s(v).
    if (extState.nextNode != null) {
      logger.log(Level.FINEST, "Following suffix link.");
      extState.nextNode = extState.nextNode.suffixLink;
    } else {
      logger.log(Level.FINEST, "Suffix link not found.");
    }

    if (newEdge != null) {
      // Rule 2 created a new leaf: record the suffix that ends there and advance to the next.
      newEdge.tailNode.suffixes.add(extState.currentSuffix);
      extState.nextSuffix();

      extState.edgesWithE.add(newEdge);
      logger.log(Level.FINEST, String.format("Added suffix: %d", j));
    }

    /*
     * SEA Step 4:
     * "If a new internal node w was created in extension j-1 (by extension rule 2)
     * then by Lemma 6.1.1 string alpha must end at node s(w), the end node for the
     * suffix link from w.  Create the suffix link (w, s(w)) from w to s(w)."
     *
     * This wording is confusing -- is there a typo in Gusfield?  I'm not sure where
     * the 'w' comes from.
     */
    if (extState.rule2Node != null) {
      if (m.lastEdge != null) {
        extState.rule2Node.suffixLink = m.lastEdge.tailNode;
        logger.log(Level.FINEST, "Adding suffix link --> internal node.");
      } else {
        extState.rule2Node.suffixLink = root;
        logger.log(Level.FINEST, "Adding suffix link --> root.");
      }
    }

    /*
     * Update any state that will be needed in the next extension.
     */
    extState.rule2Node = newRule2Node;

    logger.exiting("UkkonenSuffixTree", "ukkonenSEA");

    // "Rule 3 is a show stopper" means that, if we encounter rule 3,
    // we *don't* continue.
    return !rule3;
  }