/** * Extracts and indexes words from the specified data reference. * * @throws IOException I/O Exception */ final void index() throws IOException { // delete old index abort(); final Performance perf = Prop.debug ? new Performance() : null; Util.debug(det()); for (pre = 0; pre < size; ++pre) { if ((pre & 0xFFFF) == 0) check(); final int k = data.kind(pre); if (k != Data.TEXT) { if (scm == 1 && k == Data.DOC) unit.add(pre); continue; } if (scm == 2) unit.add(pre); pos = -1; final StopWords sw = lex.ftOpt().sw; lex.init(data.text(pre, true)); while (lex.hasNext()) { final byte[] tok = lex.nextToken(); ++pos; // skip too long and stopword tokens if (tok.length <= data.meta.maxlen && (sw.size() == 0 || !sw.contains(tok))) { // check if main memory is exhausted if ((ntok++ & 0xFFF) == 0 && scm == 0 && memFull()) { // currently no frequency support for tf/idf based scoring writeIndex(csize++); Performance.gc(2); } index(tok); } } } // calculate term frequencies if (scm > 0) { maxfreq = new int[unit.size() + 1]; ntoken = new int[nrTokens()]; token = 0; calcFreq(); } // write tokens token = 0; write(); // set meta data if (scm > 0) { data.meta.maxscore = max; data.meta.minscore = min; } data.meta.ftxtindex = true; Util.memory(perf); }
@Override public boolean indexAccessible(final IndexInfo ii) { /* If the following conditions yield true, the index is accessed: * - all query terms are statically available * - no FTTimes option is specified * - explicitly set case, diacritics and stemming match options do not * conflict with index options. */ data = ii.ic.data; final MetaData md = data.meta; final FTOpt fto = ftt.opt; /* Index will be applied if no explicit match options have been set * that conflict with the index options. As a consequence, though, index- * based querying might yield other results than sequential scanning. */ if (occ != null || fto.cs != null && md.casesens == (fto.cs == FTCase.INSENSITIVE) || fto.isSet(DC) && md.diacritics != fto.is(DC) || fto.isSet(ST) && md.stemming != fto.is(ST) || fto.ln != null && !fto.ln.equals(md.language)) return false; // adopt database options to tokenizer fto.copy(md); // estimate costs if text is not known at compile time if (tokens == null) { ii.costs = Math.max(2, data.meta.size / 30); return true; } // summarize number of hits; break loop if no hits are expected final FTLexer ft = new FTLexer(fto); ii.costs = 0; for (byte[] t : tokens) { ft.init(t); while (ft.hasNext()) { final byte[] tok = ft.nextToken(); if (fto.sw != null && fto.sw.contains(tok)) continue; if (fto.is(WC)) { // don't use index if one of the terms starts with a wildcard t = ft.get(); if (t[0] == '.') return false; // don't use index if certain characters or more than 1 dot are found int d = 0; for (final byte w : t) { if (w == '{' || w == '\\' || w == '.' && ++d > 1) return false; } } // favor full-text index requests over exact queries final int costs = data.costs(ft); if (costs != 0) ii.costs += Math.max(2, costs / 100); } } return true; }
/** * Caches and returns all unique tokens specified in a query. * * @param list token list * @return token set */ private TokenSet unique(final TokenList list) { // cache all query tokens in a set (duplicates are removed) final TokenSet ts = new TokenSet(); switch (mode) { case ALL: case ANY: for (final byte[] t : list) ts.add(t); break; case ALL_WORDS: case ANY_WORD: final FTLexer l = new FTLexer(ftt.opt); for (final byte[] t : list) { l.init(t); while (l.hasNext()) ts.add(l.nextToken()); } break; case PHRASE: final TokenBuilder tb = new TokenBuilder(); for (final byte[] t : list) tb.add(t).add(' '); ts.add(tb.trim().finish()); } return ts; }