/**
 * Extracts and indexes words from the specified data reference.
 * @throws IOException I/O Exception
 */
final void index() throws IOException {
  // delete old index
  abort();

  final Performance perf = Prop.debug ? new Performance() : null;
  Util.debug(det());

  for (pre = 0; pre < size; ++pre) {
    if ((pre & 0xFFFF) == 0) check();

    final int k = data.kind(pre);
    if (k != Data.TEXT) {
      if (scm == 1 && k == Data.DOC) unit.add(pre);
      continue;
    }
    if (scm == 2) unit.add(pre);

    pos = -1;
    final StopWords sw = lex.ftOpt().sw;
    lex.init(data.text(pre, true));
    while (lex.hasNext()) {
      final byte[] tok = lex.nextToken();
      ++pos;
      // skip too long and stopword tokens
      if (tok.length <= data.meta.maxlen && (sw.size() == 0 || !sw.contains(tok))) {
        // check if main memory is exhausted
        if ((ntok++ & 0xFFF) == 0 && scm == 0 && memFull()) {
          // currently no frequency support for tf/idf based scoring
          writeIndex(csize++);
          Performance.gc(2);
        }
        index(tok);
      }
    }
  }

  // calculate term frequencies
  if (scm > 0) {
    maxfreq = new int[unit.size() + 1];
    ntoken = new int[nrTokens()];
    token = 0;
    calcFreq();
  }

  // write tokens
  token = 0;
  write();

  // set meta data
  if (scm > 0) {
    data.meta.maxscore = max;
    data.meta.minscore = min;
  }
  data.meta.ftxtindex = true;
  Util.memory(perf);
}
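/* The two bitmask tests above ((pre & 0xFFFF) == 0 and (ntok++ & 0xFFF) == 0)
 * are a cheap idiom for running an expensive operation only once every 2^16
 * resp. 2^12 iterations, without a modulo. A minimal stand-alone sketch of the
 * idiom follows; the class and the checkResources method are hypothetical
 * stand-ins for check() / memFull() above, not part of the indexer. */
public final class PeriodicCheck {
  public static void main(final String[] args) {
    for (int i = 0; i < 1_000_000; i++) {
      // (i & 0xFFF) == 0 holds once every 4096 iterations,
      // because 0xFFF masks the lowest 12 bits
      if ((i & 0xFFF) == 0) checkResources();
      // ... per-item work would go here ...
    }
  }

  // hypothetical stand-in for the indexer's memory check
  private static void checkResources() {
    final Runtime rt = Runtime.getRuntime();
    final long used = rt.totalMemory() - rt.freeMemory();
    if (used > rt.maxMemory() * 0.8) {
      // in the indexer: flush a partial index to disk and trigger gc
    }
  }
}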
@Override
public boolean indexAccessible(final IndexInfo ii) {
  /* If the following conditions yield true, the index is accessed:
   * - all query terms are statically available
   * - no FTTimes option is specified
   * - explicitly set case, diacritics and stemming match options do not
   *   conflict with the index options. */
  data = ii.ic.data;
  final MetaData md = data.meta;
  final FTOpt fto = ftt.opt;

  /* The index will be applied if no explicit match options have been set
   * that conflict with the index options. As a consequence, though, index-
   * based querying might yield different results than sequential scanning. */
  if (occ != null ||
      fto.cs != null && md.casesens == (fto.cs == FTCase.INSENSITIVE) ||
      fto.isSet(DC) && md.diacritics != fto.is(DC) ||
      fto.isSet(ST) && md.stemming != fto.is(ST) ||
      fto.ln != null && !fto.ln.equals(md.language)) return false;

  // adopt database options to tokenizer
  fto.copy(md);

  // estimate costs if text is not known at compile time
  if (tokens == null) {
    ii.costs = Math.max(2, data.meta.size / 30);
    return true;
  }

  // summarize number of hits; break loop if no hits are expected
  final FTLexer ft = new FTLexer(fto);
  ii.costs = 0;
  for (byte[] t : tokens) {
    ft.init(t);
    while (ft.hasNext()) {
      final byte[] tok = ft.nextToken();
      if (fto.sw != null && fto.sw.contains(tok)) continue;

      if (fto.is(WC)) {
        // don't use index if one of the terms starts with a wildcard
        t = ft.get();
        if (t[0] == '.') return false;
        // don't use index if certain characters or more than 1 dot are found
        int d = 0;
        for (final byte w : t) {
          if (w == '{' || w == '\\' || w == '.' && ++d > 1) return false;
        }
      }
      // favor full-text index requests over exact queries
      final int costs = data.costs(ft);
      if (costs != 0) ii.costs += Math.max(2, costs / 100);
    }
  }
  return true;
}
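/* The wildcard guard above can be read in isolation: a wildcard term is only
 * answered from the index if it does not start with '.' and contains neither
 * '{', '\', nor more than one '.'. A self-contained sketch of that predicate;
 * the method name is made up for illustration and is not part of the class. */
static boolean indexableWildcard(final byte[] term) {
  // a leading '.' would match arbitrary characters at the term start
  if (term.length == 0 || term[0] == '.') return false;
  int dots = 0;
  for (final byte b : term) {
    // repetitions and escapes cannot be resolved via the token index
    if (b == '{' || b == '\\') return false;
    if (b == '.' && ++dots > 1) return false;
  }
  return true;
}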
@Override
public synchronized IndexIterator iter(final IndexToken it) {
  final byte[] tok = it.get();

  // wildcard search
  final FTLexer lexer = (FTLexer) it;
  final FTOpt opt = lexer.ftOpt();
  if (opt.is(WC)) return wc(tok);

  // fuzzy search
  if (opt.is(FZ)) return fuzzy(tok, lexer.lserror(tok));

  // return cached or new result
  final IndexEntry e = entry(tok);
  return e.size > 0 ? iter(e.offset, e.size, inZ, tok) : FTIndexIterator.FTEMPTY;
}
/**
 * Caches and returns all unique tokens specified in a query.
 * @param list token list
 * @return token set
 */
private TokenSet unique(final TokenList list) {
  // cache all query tokens in a set (duplicates are removed)
  final TokenSet ts = new TokenSet();
  switch (mode) {
    case ALL:
    case ANY:
      for (final byte[] t : list) ts.add(t);
      break;
    case ALL_WORDS:
    case ANY_WORD:
      final FTLexer l = new FTLexer(ftt.opt);
      for (final byte[] t : list) {
        l.init(t);
        while (l.hasNext()) ts.add(l.nextToken());
      }
      break;
    case PHRASE:
      final TokenBuilder tb = new TokenBuilder();
      for (final byte[] t : list) tb.add(t).add(' ');
      ts.add(tb.trim().finish());
  }
  return ts;
}
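/* To see what ends up in the set for each mode family, consider the query
 * tokens "web site" and "search". A hedged illustration follows; it uses
 * plain Java strings and whitespace splitting instead of TokenList/TokenSet
 * and the FTLexer, so the class and all names are illustrative only. */
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

final class UniqueDemo {
  public static void main(final String[] args) {
    final List<String> list = List.of("web site", "search");

    // ALL / ANY: tokens are cached as given
    final Set<String> all = new LinkedHashSet<>(list);
    System.out.println(all);                             // [web site, search]

    // ALL_WORDS / ANY_WORD: each token is split into single words
    final Set<String> words = new LinkedHashSet<>();
    for (final String t : list)
      for (final String w : t.split("\\s+")) words.add(w);
    System.out.println(words);                           // [web, site, search]

    // PHRASE: all tokens are joined to a single phrase
    System.out.println(Set.of(String.join(" ", list)));  // [web site search]
  }
}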
/**
 * Returns a scan-based index iterator.
 * @param lex lexer, including the queried value
 * @return node iterator
 * @throws QueryException query exception
 */
private FTIndexIterator scan(final FTLexer lex) throws QueryException {
  final FTLexer input = new FTLexer(ftt.opt);
  final FTTokens fttokens = ftt.cache(lex.get());

  return new FTIndexIterator() {
    final int sz = data.meta.size;
    int pre = -1, ps;

    @Override
    public int pre() {
      return pre;
    }

    @Override
    public boolean more() {
      // advance to the next text node that contains the query tokens
      while (++pre < sz) {
        if (data.kind(pre) != Data.TEXT) continue;
        input.init(data.text(pre, true));
        matches.reset(ps);
        try {
          if (ftt.contains(fttokens, input) != 0) return true;
        } catch (final QueryException ignore) {
          // ignore exceptions
        }
      }
      return false;
    }

    @Override
    public FTMatches matches() {
      return matches;
    }

    @Override
    public void pos(final int p) {
      ps = p;
    }

    @Override
    public int size() {
      // worst case: assume that every second node is a text node
      return Math.max(1, sz >>> 1);
    }
  };
}
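/* The anonymous class above follows a simple cursor protocol: more() advances
 * to the next matching pre value, pre() returns it, and size() only delivers
 * an upper-bound estimate (sz >>> 1: at most every second node is a text
 * node). A stripped-down sketch of the same protocol over an int array;
 * the interface is simplified and is not BaseX's actual FTIndexIterator. */
interface Cursor {
  boolean more();
  int pre();
}

final class ArrayCursor implements Cursor {
  private final int[] pres;
  private int pos = -1;

  ArrayCursor(final int[] pres) { this.pres = pres; }

  @Override
  public boolean more() { return ++pos < pres.length; }

  @Override
  public int pre() { return pres[pos]; }
}

// usage: while(cursor.more()) process(cursor.pre());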
/**
 * Builds full-text information.
 * @param d data reference
 * @param p pre value
 * @param str string value
 * @return list of text snippets, or {@code null} if no full-text positions exist
 */
TokenList build(final Data d, final int p, final byte[] str) {
  final FTPos ftp = ftpos.get(d, p);
  if (ftp == null) return null;

  boolean marked = false;
  final TokenList tl = new TokenList();
  final TokenBuilder tb = new TokenBuilder();
  final FTLexer lex = new FTLexer().sc().init(str);
  int len = -ftlen;
  while (lex.hasNext()) {
    final FTSpan span = lex.next();
    // check if the current text is still to be marked or already marked
    if (ftp.contains(span.pos) || marked) {
      if (tb.size() != 0) {
        // write the current text node
        tl.add(tb.finish());
        len += tb.size();
        tb.reset();
        // skip construction
        if (len >= 0 && tl.size() > 1 && !marked) break;
      }
      // a null entry toggles the marked state of the following texts
      if (!marked) tl.add((byte[]) null);
      marked ^= true;
    }
    // add span
    tb.add(span.text);
  }
  // write last text node
  if (tb.size() != 0) {
    tl.add(tb.finish());
    len += tb.size();
  }

  // chop first and last text
  if (len > 0) {
    final int ts = tl.size();
    // get first text (empty if it is a full-text match)
    final byte[] first = tl.get(0) != null ? tl.get(0) : EMPTY;
    // get last text (empty if it is a full-text match, i.e., preceded by null)
    final byte[] last = tl.get(ts - 2) != null ? tl.get(ts - 1) : EMPTY;
    if (first != EMPTY) {
      // remove leading characters of the first text
      final double l = first.length + last.length;
      final int ll = Math.min(first.length, (int) (first.length / l * len));
      tl.set(0, concat(DOTS, subtoken(first, ll)));
      len -= ll;
    }
    if (last != EMPTY && len > 0) {
      // remove trailing characters of the last text
      final int ll = Math.min(last.length, len);
      tl.set(ts - 1, concat(subtoken(last, 0, last.length - ll), DOTS));
      len -= ll;
    }
    // still too much text: shorten inner texts
    for (int t = ts - 2; t > 0 && len > 0; t--) {
      final byte[] txt = tl.get(t);
      // skip marker entries and marked texts
      if (txt == null || tl.get(t - 1) == null) continue;
      final int ll = Math.min(txt.length, len);
      tl.set(t, concat(subtoken(txt, 0, (txt.length - ll) / 2), DOTS,
          subtoken(txt, (txt.length + ll) / 2)));
      len -= ll;
    }
  }
  return tl;
}
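/* The chopping arithmetic distributes the excess length len proportionally:
 * the first text may lose up to first.length / (first.length + last.length)
 * * len leading characters, and the last text loses up to the remainder.
 * A small worked example with hypothetical numbers (plain ints, no BaseX
 * types); the class is illustrative only. */
public final class ChopDemo {
  public static void main(final String[] args) {
    // hypothetical: first text 40 bytes, last text 10 bytes, excess len 20
    final int firstLen = 40, lastLen = 10;
    int len = 20;
    final double l = firstLen + lastLen;                                // 50.0
    final int llFirst = Math.min(firstLen, (int) (firstLen / l * len)); // 16
    len -= llFirst;                                                     // 4 left
    final int llLast = Math.min(lastLen, len);                          // 4
    // 16 leading bytes are cut from the first text and 4 trailing bytes
    // from the last one; both cuts are replaced by "..." (DOTS) above.
    System.out.println("cut " + llFirst + " leading, " + llLast + " trailing");
  }
}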