/*
 * Runs Ukkonen's algorithm for the string stored at strings.get(arrayIdx).
 *
 * A fresh UkkonenState is created for that string and stored in extState;
 * then ukkonenSPA is invoked once per phase, from extState.nextPhaseStart up
 * to (but not including) the string's length. After the last phase,
 * extState.finishFinalEdges() is called.
 *
 * The tree is dumped to System.err after every phase and once more at the
 * end, in addition to the FINEST-level log messages.
 */
private void ukkonenExtendSuffixTree(int arrayIdx) {
  logger.entering("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
  logger.log(Level.FINEST, String.format("Ukkonen Algorithm String #%d", arrayIdx));

  // Per-string extension state; ukkonenSPA reads and updates it through the field.
  extState = new UkkonenState(strings.get(arrayIdx));

  logger.log(
      Level.FINEST,
      String.format("Ukkonen: (%d,%d)", extState.nextPhaseStart, extState.string.length()));

  int phase = extState.nextPhaseStart;
  while (phase < extState.string.length()) {
    ukkonenSPA(phase);

    // Debug dump of the tree as it stands after this phase.
    System.err.println(String.format("Phase %d results: ", phase));
    print(System.err);
    System.err.println();
    System.err.flush();

    phase++;
  }

  logger.log(Level.FINEST, String.format("Finishing edges: %d", extState.lastE));
  extState.finishFinalEdges();

  // Debug dump of the completed tree.
  System.err.println("Finished results: ");
  print(System.err);
  System.err.println();
  System.err.flush();

  logger.exiting("UkkonenSuffixTree", "ukkonenExtendSuffixTree");
}
/* * ukkonenSEA(i, j) performs extension j of phase i of Ukkonen's algorithm. * This means that we're making sure that array[j,i] (note the inclusivity!) * is a part of the current suffix tree. * * Original Description: pg. 100 of Gusfield */ private boolean ukkonenSEA(int i, int j) { logger.exiting("UkkonenSuffixTree", "ukkonenSEA"); logger.log(Level.FINEST, String.format("j=%d", j)); assert j <= i; boolean rule3 = false; TreeNode newRule2Node = null; EdgeMatch m = extState.matcher; char lastChar = extState.string.getChar(i); boolean lastCharIsTerminal = isTerminal(lastChar); /* * SEA Step 1: * "Find the first node v at or above the end of S[j-1,i] that either * has a suffix link from it or is the root. This requires walking up * at most one edge from the end of S[j-1,i] in the current tree. Let * \gamma (possibly empty) denote the string between v and the * end of S[j-1,i]." */ /* * SEA Step 2: * "If v is not the root, traverse the suffix link from v to node * s(v) and then walk down from s(v) following the path for string * gamma. If v is the root, then follow the path for S[j,i] from the * root (as in the naive algorithm)." */ int gammaEnd = i; int gammaStart = gammaEnd - extState.gammaLength; if (extState.nextNode == null || extState.nextNode.isRoot()) { String beta = extState.string.substring(j, i); logger.log(Level.FINEST, String.format("beta: %d,%d <%s>%c", j, i, beta, lastChar)); m.reset(j, i); m.matchFrom(root, true); } else { logger.log(Level.FINEST, String.format("gammaLength:%d", extState.gammaLength)); String gamma = extState.string.substring(gammaStart, gammaEnd); logger.log( Level.FINEST, String.format("gamma: %d,%d <%s>%c", gammaStart, gammaEnd, gamma, lastChar)); m.reset(gammaStart, gammaEnd); m.matchFrom(extState.nextNode, true); } /* * SEA Step 3: * "Using the extension rules, ensure that the string S[j,i]S(i+1) is * in the tree." * * In our coordinates, this is array[j,i)+array[i] * \beta = array[j,i) * * Rule 1: the path \beta ends at a leaf. 
(we shouldn't see this case). * Rule 2: the path \beta is not continued by array[i]. That is, \beta * ends either at a node (in which case, no child of the node * starts with array[i]), or in an edge (in which case, the edge * doesn't continue with array[i]). Either way, we create a new * edge that is labeled with array[i] (coordinates: [i,i+1) ). * Rule 3: \beta+array[i] is already in the tree -- either \beta ends in * an edge that continues with array[i], or at a node that has * a child under array[i]. Either way, return false (break!). */ TreeEdge newEdge = null; if (m.lastEdge == null) { logger.log(Level.FINEST, String.format("Found root.")); // the \beta string matched to the root (was empty). So we need // to simply check the children of the root. boolean foundLastChar = !lastCharIsTerminal ? root.childEdges.containsKey(lastChar) : root.terminalEdges.containsKey(extState.string.getIndex()); if (foundLastChar) { // Rule 3 rule3 = true; logger.log(Level.FINEST, "Rule #3, Root"); extState.nextNode = null; extState.gammaLength = 0; logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); } else { // Rule 2 logger.log(Level.FINEST, "Rule #2, Root"); newEdge = new TreeEdge(extState.string, i, null, root); root.addEdge(newEdge); extState.nextNode = null; extState.gammaLength = 0; logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); } } else if (m.inEdgeMiddle()) { int offset = m.lastMatchLength(); logger.log(Level.FINEST, String.format("Found edge middle: %d", offset)); boolean foundLastChar = !lastCharIsTerminal ? 
m.lastEdge.getChar(offset) == lastChar : (m.lastEdge.string.getIndex() == extState.string.getIndex() && offset == m.lastEdge.length() - 1); logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar)); if (foundLastChar) { // Rule 3 rule3 = true; logger.log(Level.FINEST, "Rule #3, Edge"); extState.nextNode = m.lastEdge.headNode; // extState.gammaLength = m.lastMatchLength() + 1; extState.gammaLength = m.lastMatchLength() + (j == i ? 1 : 0); assert extState.gammaLength >= 0; logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); } else { // Rule 2 logger.log(Level.FINEST, "Rule #2, Edge"); TreeEdge newLowerEdge = m.lastEdge.split(offset); extState.edgesWithE.add(newLowerEdge); newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode); m.lastEdge.tailNode.addEdge(newEdge); newRule2Node = m.lastEdge.tailNode; extState.nextNode = m.lastEdge.headNode; // extState.gammaLength = m.lastEdge.length() + 1; extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0); assert extState.gammaLength >= 0; logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); } if (extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) { logger.log( Level.FINEST, String.format("Walking up edge: %d", extState.nextNode.parentEdge.length())); extState.gammaLength += extState.nextNode.parentEdge.length(); extState.nextNode = extState.nextNode.parentEdge.headNode; } } else { logger.log(Level.FINEST, String.format("Found node.")); boolean foundLastChar = !lastCharIsTerminal ? m.lastEdge.tailNode.childEdges.containsKey(lastChar) : m.lastEdge.tailNode.terminalEdges.containsKey(extState.string.getIndex()); logger.log(Level.FINEST, String.format("foundLastChar: %s", foundLastChar)); if (foundLastChar) { // Rule 3 rule3 = true; logger.log(Level.FINEST, "Rule #3, Node"); extState.nextNode = m.lastEdge.headNode; // extState.gammaLength = m.lastEdge.length() + 1; extState.gammaLength = m.lastEdge.length() + (j == i ? 
1 : 0); assert extState.gammaLength >= 0; logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); } else { // Rule 2 logger.log(Level.FINEST, "Rule #2, Node"); newEdge = new TreeEdge(extState.string, i, null, m.lastEdge.tailNode); m.lastEdge.tailNode.addEdge(newEdge); extState.nextNode = m.lastEdge.headNode; // extState.gammaLength = m.lastEdge.length() + 1; extState.gammaLength = m.lastEdge.length() + (j == i ? 1 : 0); logger.log(Level.FINEST, String.format("nextGamma: %d", extState.gammaLength)); assert extState.gammaLength >= 0; } if (extState.nextNode.suffixLink == null && !extState.nextNode.isRoot()) { logger.log( Level.FINEST, String.format("Walking up edge: %d", extState.nextNode.parentEdge.length())); extState.gammaLength += extState.nextNode.parentEdge.length(); extState.nextNode = extState.nextNode.parentEdge.headNode; } } if (extState.nextNode != null) { logger.log(Level.FINEST, "Following suffix link."); extState.nextNode = extState.nextNode.suffixLink; } else { logger.log(Level.FINEST, "Suffix link not found."); } if (newEdge != null) { newEdge.tailNode.suffixes.add(extState.currentSuffix); extState.nextSuffix(); extState.edgesWithE.add(newEdge); logger.log(Level.FINEST, String.format("Added suffix: %d", j)); } /* * SEA Step 4: * "If a new internal node w was created in extension j-1 (by extension rule 2) * then by Lemma 6.1.1 string alpha must end at node s(w), the end node for the * suffix link from w. Create the suffix link (w, s(w)) from w to s(w)." * * This wording is confusing -- is there a typo in Gusfield? I'm not sure where * the 'w' comes from. */ if (extState.rule2Node != null) { if (m.lastEdge != null) { extState.rule2Node.suffixLink = m.lastEdge.tailNode; logger.log(Level.FINEST, "Adding suffix link --> internal node."); } else { extState.rule2Node.suffixLink = root; logger.log(Level.FINEST, "Adding suffix link --> root."); } } /* * Update any state that will be needed in the next extension. 
*/ extState.rule2Node = newRule2Node; logger.exiting("UkkonenSuffixTree", "ukkonenSEA"); // "Rule 3 is a show stopper" means that, if we encounter rule 3, // we *don't* continue. return !rule3; }
/* ukkonenSPA(i) performs phase i of Ukkonen's algorithm. This * means that we're making sure that array[0,i] (note the inclusivity!) * is a part of the current suffix tree. * * Original Description: pg. 106 of Gusfield */ private void ukkonenSPA(int i) { logger.entering("UkkonenSuffixTree", "ukkonenSPA"); logger.log(Level.FINEST, String.format("i=%d", i)); assert i >= 0; /* * SPA Step 1: * "Increment index e to i+1" * * The equivalent of Gusfield's i+1 is, in our situation, just i. * However, the coordinates are inclusive in Gusfield, * and exclusive in our case (along the tree edges). Therefore, * lastE should be updated to be i+1, exactly. */ extState.lastE = i + 1; logger.log(Level.FINEST, String.format("e=%d", extState.lastE)); /* * SPA Step 2: * "Explicitly compute successive extensions, using the SEA algorithm, * starting at j_i + 1 until reaching the first extension j* where rule3 * applies or until all extensions are done in this phase." * * extState.nextExtStart encodes the (j_i)+1 value. We start there, and * iterate forward until all extensions have been performed, or until * ukkonenSEA returns false (ukkonenSEA returns a true if rule 1 or rule 2 * applies in its extension). * * We extend until j==i, because the last extension of each phase is * the extension that *just* adds the new character into the tree. */ logger.log(Level.FINEST, String.format("jstart=%d", extState.nextExtStart)); boolean keepExtending = true; int j = extState.nextExtStart; while (keepExtending && j <= i) { if (ukkonenSEA(i, j)) { j++; // we don't want to just put in the terminal character. if (i == extState.string.length() - 1 && j == i) { keepExtending = false; } } else { keepExtending = false; } System.out.println(String.format("Phase %d, Extension %d tree: ", i, j)); print(System.out); System.out.println(); System.out.flush(); System.err.println(); System.err.flush(); } /* * SPA Step 3: * "Set j_{i+1} to j*-1, to prepare for the next phase." 
*/ extState.nextExtStart = j; logger.log(Level.FINEST, String.format("j*=%d", extState.nextExtStart)); logger.exiting("UkkonenSuffixTree", "ukkonenSPA"); }