boolean isEqualLMSSubstring(long pos1, long pos2) { boolean prevLS = SType; long offset = 0; for (; offset < N; ++offset) { long p1 = (pos1 + offset) % N; long p2 = (pos2 + offset) % N; if (T.lookup(p1) == T.lookup(p2) && LS.get(p1) == LS.get(p2)) { if (prevLS == LType && LS.get(p1) == SType) return true; // equal LMS substring prevLS = LS.get(p1); continue; } else return false; } return false; }
private void induceSA(LSeq SA) { long[] cursorInBucket = Arrays.copyOf(bucketEnd, bucketEnd.length); // induce left for (long i = 0; i < N; ++i) { long si = SA.lookup(i); if (si == N) continue; si = (si - 1 + N) % N; if (LS.get(si) == LType) SA.set(cursorInBucket[(int) T.lookup(si) - 1]++, si); } // induce right System.arraycopy(bucketEnd, 0, cursorInBucket, 0, bucketEnd.length); for (long i = N - 1; i >= 0; --i) { long si = SA.lookup(i); if (si == N) continue; si = (si - 1 + N) % N; if (LS.get(si) == SType) SA.set(--cursorInBucket[(int) T.lookup(si)], si); } }
/**
 * Tells whether {@code pos} is a left-most S-type (LMS) position: the character there is S-type
 * and its cyclic predecessor is L-type. Both indices are reduced modulo {@code N}, so position 0
 * is compared against position {@code N - 1}.
 *
 * @param pos text position to test
 * @return {@code true} if {@code pos} is an LMS position
 */
boolean isLMS(long pos) {
    final long here = pos % N;
    if (LS.get(here) != SType) {
        return false;
    }
    final long before = (pos - 1 + N) % N;
    return LS.get(before) == LType;
}
public void SAIS(LSeq SA) { StopWatch timer = new StopWatch(); _logger.info("SAIS: N=" + SA.textSize()); // Determin T[N-1]'s LS-type // T[i] is SType if T[i,_) < T[i+1,_) // T[i] is LType if T[i,_) > T[i+1,_) { long i = 0; for (; i < N; ++i) { long x = T.lookup((N + i - 1) % N); long y = T.lookup((N + i) % N); if (x == y) continue; if (x < y) { LS.set(N - 1, SType); break; } else { LS.set(N - 1, LType); break; } } if (i == N) { // When T = AAAA... , etc. LS.set(N - 1, LType); } } // T[i] is SType if T[i] < T[i+1] or T[i] = T[i+1] and T[i+1] is S-type // T[i] is LType if T[i] > T[i+1] or T[i] = T[i+1] and T[i+1] is L-type // Set the LS type of each character for (long i = N - 1; i > 0; --i) { long x = T.lookup(i); long y = T.lookup(i - 1); if (x < y) LS.set(i - 1, LType); else if (x > y) LS.set(i - 1, SType); else LS.set(i - 1, LS.get(i)); } // Initialize the buckets. // A bucket is a container of the suffixes sharing the same first character { _logger.trace("Initialize the buckets"); Arrays.fill(bucketEnd, 0); // Compute the size of each bucket for (long i = 0; i < N; ++i) { ++bucketEnd[(int) T.lookup(i)]; } // Accumulate the character counts. 
The bucketStart holds the pointers to beginning of the // buckets in SA for (int i = 1; i < bucketEnd.length; ++i) { bucketEnd[i] += bucketEnd[i - 1]; } _logger.trace("Done."); } // initialize the suffix array for (long i = 0; i < N; ++i) SA.set(i, N); // Step 1: reduce the problem by at least 1/2 // Sort all the S-substrings // Find LMS characters long[] cursorInBucket = Arrays.copyOf(bucketEnd, bucketEnd.length); for (long i = 0; i < N; ++i) { if (isLMS(i)) SA.set(--cursorInBucket[(int) T.lookup(i)], i); } // Induced sorting LMS prefixes { _logger.trace(String.format("[N=%,d] induceSA", SA.textSize())); induceSA(SA); _logger.trace("Done."); } int numLMS = 0; // Compact all the sorted substrings into the first M items of SA // 2*M must be not larger than N for (long i = 0; i < N; ++i) { long si = SA.lookup(i); if (si != N && isLMS(si)) SA.set(numLMS++, si); } // Initialize the name array buffer for (long i = numLMS; i < N; ++i) SA.set(i, N); // Find the lexicographic names of the LMS substrings _logger.trace("Sorting LMS substrings: N=" + SA.textSize()); int name = 1; SA.set(numLMS + (SA.lookup(0) / 2), name++); for (long i = 1; i < numLMS; ++i) { final long prev = SA.lookup(i - 1); final long current = SA.lookup(i); if (!isEqualLMSSubstring(prev, current)) { name++; } SA.set(numLMS + (current / 2), name - 1); } for (long i = N - 1, j = N - 1; i >= numLMS; --i) { if (SA.lookup(i) != N) SA.set(j--, SA.lookup(i) - 1); } // Step 2: solve the reduced problem // Create SA1, a view of SA[0, numLMS-1] _logger.trace("Solving the reduced problem: N=" + SA.textSize()); LSeq SA1 = new ArrayWrap(SA, 0, numLMS); LSeq T1 = new ArrayWrap(SA, N - numLMS, numLMS); if (name - 1 != numLMS) { new CyclicSAIS(T1, name - 1).SAIS(SA1); } else { // When all LMS substrings have unique names for (long i = 0; i < numLMS; i++) SA1.set(T1.lookup(i), i); } // Step 3: Induce SA from SA1 // Construct P1 using T1 buffer for (long i = 0, j = 0; i < N; ++i) { if (isLMS(i)) T1.set(j++, i); // } // 
Translate short name into the original index at T // SA1 now holds the LMS-substring indexes for (long i = 0; i < numLMS; ++i) { SA1.set(i, T1.lookup(SA1.lookup(i))); } // Step 3-1: Put all the items in SA1 into corresponding S-type buckets of SA // Clear SA[N1 .. N-1] for (long i = numLMS; i < N; ++i) { SA.set(i, N); } // Put SA1 contents into S-type buckets of SA System.arraycopy(bucketEnd, 0, cursorInBucket, 0, bucketEnd.length); for (int i = numLMS - 1; i >= 0; --i) { long si = SA1.lookup(i); SA.set(i, N); long index = --cursorInBucket[(int) T.lookup(si)]; SA.set(index, si); } // SA.set(0, T.textSize() - 1); // Step 3-2, 3-3 _logger.trace("Inducing SA from SA1: N=" + SA.textSize()); induceSA(SA); _logger.info(String.format("done. %.2f sec.", timer.getElapsedTime())); }