/** * Determines the pointer on a token. * * @param token token looking for * @return int pointer or {@code -1} if token was not found */ private int token(final byte[] token) { final int tl = token.length; // left limit int l = tp[tl]; if (l == -1) return -1; int i = 1; int r; // find right limit do r = tp[tl + i++]; while (r == -1); final int x = r; // binary search final int o = tl + ENTRY; while (l < r) { final int m = l + (r - l >> 1) / o * o; final int c = diff(inY.readBytes(m, tl), token); if (c == 0) return m; if (c < 0) l = m + o; else r = m - o; } // accept entry if pointer is inside relevant tokens return r != x && l == r && eq(inY.readBytes(l, tl), token) ? l : -1; }
/**
 * Performs a wildcard search for the specified token.
 *
 * <p>All token lengths from the length of the wildcard prefix up to the maximum length
 * reported by the wildcard are visited. Within each length group, the scan starts at
 * the position returned by {@code find} for the prefix and stops at the first token
 * that does not start with the prefix.</p>
 *
 * @param token token to look for
 * @return iterator
 */
private synchronized IndexIterator wc(final byte[] token) {
  final FTWildcard wildcard = new FTWildcard(token);
  if (!wildcard.parse()) return FTIndexIterator.FTEMPTY;

  final IntList pres = new IntList();
  final IntList poss = new IntList();
  final byte[] prefix = wildcard.prefix();
  final int slots = tp.length;
  final int maxLen = Math.min(slots - 1, wildcard.max());

  for (int len = prefix.length; len <= maxLen; len++) {
    int pos = tp[len];
    if (pos == -1) continue;

    // end of this group: start of the next populated token length
    int end = -1;
    for (int next = len + 1; next < slots && end == -1; next++) end = tp[next];

    final int width = len + ENTRY;
    for (pos = find(prefix, pos, end, len); pos < end; pos += width) {
      final byte[] text = inY.readBytes(pos, len);
      if (!startsWith(text, prefix)) break;
      if (!wildcard.match(text)) continue;

      // collect all value pairs stored for the matching token
      inZ.cursor(pointer(pos, len));
      final int count = size(pos, len);
      for (int n = 0; n < count; n++) {
        pres.add(inZ.readNum());
        poss.add(inZ.readNum());
      }
    }
  }
  return iter(new FTCache(pres, poss), token);
}
/**
 * Returns an iterator for an index entry.
 *
 * <p>Moves the cursor of the data source to the given offset and reads {@code size}
 * pairs of numbers, which are collected in two lists and wrapped in a cache.</p>
 *
 * @param off offset on entries
 * @param size number of id/pos entries
 * @param da data source
 * @param token index token
 * @return iterator
 */
private static FTIndexIterator iter(final long off, final int size, final DataAccess da,
    final byte[] token) {

  da.cursor(off);
  final IntList pres = new IntList(size);
  final IntList poss = new IntList(size);
  int remaining = size;
  while (remaining-- > 0) {
    // each entry consists of two consecutive numbers
    pres.add(da.readNum());
    poss.add(da.readNum());
  }
  final FTCache cache = new FTCache(pres, poss);
  return iter(cache, token);
}
/**
 * Builds the information output for this index: the {@code LI_NAMES} line with the
 * {@code ftinclude} meta value, the {@code LI_SIZE} line with the summed length of the
 * three index files, and the token statistics collected by {@code addOccs}.
 */
@Override
public synchronized byte[] info(final MainOptions options) {
  final TokenBuilder output = new TokenBuilder();
  final long bytes = inX.length() + inY.length() + inZ.length();
  output.add(LI_NAMES).add(data.meta.ftinclude).add(NL);
  output.add(LI_SIZE + Performance.format(bytes, true) + NL);

  // statistics are limited by the MAXSTAT option
  final IndexStats occurrences = new IndexStats(options.get(MainOptions.MAXSTAT));
  addOccs(occurrences);
  occurrences.print(output);
  return output.finish();
}
/** * Constructor, initializing the index structure. * * @param data data reference * @throws IOException I/O Exception */ public FTIndex(final Data data) throws IOException { super(data, true); // cache token length index inY = new DataAccess(data.meta.dbfile(DATAFTX + 'y')); inZ = new DataAccess(data.meta.dbfile(DATAFTX + 'z')); inX = new DataAccess(data.meta.dbfile(DATAFTX + 'x')); tp = new int[data.meta.maxlen + 3]; final int tl = tp.length; for (int i = 0; i < tl; ++i) tp[i] = -1; int is = inX.readNum(); while (--is >= 0) { int p = inX.readNum(); final int r; if (p < tl) { r = inX.read4(); } else { // legacy issue (7.0.2 -> 7.1) r = p << 24 | (inX.read1() & 0xFF) << 16 | (inX.read1() & 0xFF) << 8 | inX.read1() & 0xFF; p = p >> 8 | 0x40; } tp[p] = r; } tp[tl - 1] = (int) inY.length(); }
/**
 * Collects all tokens and their sizes found in the index structure.
 *
 * <p>Walks sequentially through all entries, starting at the first populated token
 * length and switching to the next populated length whenever the current position
 * reaches its start offset.</p>
 *
 * @param stats statistics
 */
private void addOccs(final IndexStats stats) {
  final int slots = tp.length;

  // first populated token length
  int len = 0;
  while (len < slots && tp[len] == -1) ++len;
  int pos = tp[len];

  // next populated token length
  int next = len + 1;
  while (next < slots && tp[next] == -1) ++next;

  final int end = tp[slots - 1];
  while (pos < end) {
    final int count = size(pos, len);
    if (stats.adding(count)) stats.add(inY.readBytes(pos, len), count);
    pos += len + ENTRY;
    if (pos != tp[next]) continue;

    // reached the next group: continue with its token length
    len = next;
    while (next + 1 < slots) {
      if (tp[++next] != -1) break;
    }
  }
}
/**
 * Binary search over the fixed-width entries in the range {@code [start, end)}.
 *
 * <p>Token texts that are read from disk are stored in the {@code ctext} cache, keyed
 * by their position.</p>
 *
 * @param token token to look for
 * @param start start position (inclusive)
 * @param end end position (exclusive)
 * @param ti entry length (length of the tokens in this range)
 * @return position where the key was found, or would have been found
 */
private int find(final byte[] token, final int start, final int end, final int ti) {
  final int tl = ti + ENTRY;
  // index of the last entry inside the range; the entry at 'end' itself belongs to
  // the next token length (or lies behind the data) and must not be probed or cached
  int l = 0, h = (end - start) / tl - 1;
  while (l <= h) {
    final int m = l + h >>> 1;
    final int p = start + m * tl;
    byte[] txt = ctext.get(p);
    if (txt == null) {
      txt = inY.readBytes(p, ti);
      ctext.put(p, txt);
    }
    final int d = diff(txt, token);
    if (d == 0) return p;
    if (d < 0) l = m + 1;
    else h = m - 1;
  }
  // not found: position of the first entry that is larger than the token
  return start + l * tl;
}
/**
 * Performs a fuzzy search for the specified token with a maximum number of errors.
 *
 * <p>All indexed tokens whose length differs by at most {@code k} from the length of
 * the search token are compared with it; the iterators of all similar tokens are
 * combined with {@code FTIndexIterator.union}.</p>
 *
 * @param token token to look for
 * @param k number of errors allowed
 * @return iterator
 */
private synchronized IndexIterator fuzzy(final byte[] token, final int k) {
  FTIndexIterator result = FTIndexIterator.FTEMPTY;
  final int length = token.length, slots = tp.length;
  final int last = Math.min(slots - 1, length + k);

  for (int len = Math.max(1, length - k); len <= last; len++) {
    int pos = tp[len];
    if (pos == -1) continue;

    // end of this group: start of the next populated token length
    int end = -1;
    for (int next = len + 1; next < slots && end == -1; next++) end = tp[next];

    final int width = len + ENTRY;
    for (; pos < end; pos += width) {
      if (!ls.similar(inY.readBytes(pos, len), token, k)) continue;
      result = FTIndexIterator.union(iter(pointer(pos, len), size(pos, len), inZ, token), result);
    }
  }
  return result;
}
/**
 * Reads the size of ftdata from disk.
 *
 * @param pt pointer on token
 * @param lt length of the token
 * @return size of the ftdata
 */
private int size(final long pt, final int lt) {
  // the 4-byte size is stored 5 bytes behind the end of the token
  final long sizePos = pt + lt + 5;
  return inY.read4(sizePos);
}
/**
 * Gets the pointer on ftdata for a token.
 *
 * @param pt pointer on token
 * @param lt length of the token
 * @return pointer on ftdata
 */
private long pointer(final long pt, final int lt) {
  // the 5-byte pointer directly follows the token
  final long pointerPos = pt + lt;
  return inY.read5(pointerPos);
}
/**
 * Closes the three data accessors of this index, in the order x, y, z.
 */
@Override
public synchronized void close() {
  final DataAccess[] accessors = { inX, inY, inZ };
  for (final DataAccess accessor : accessors) accessor.close();
}