/** * Extracts and indexes words from the specified data reference. * * @throws IOException I/O Exception */ final void index() throws IOException { // delete old index abort(); final Performance perf = Prop.debug ? new Performance() : null; Util.debug(det()); for (pre = 0; pre < size; ++pre) { if ((pre & 0xFFFF) == 0) check(); final int k = data.kind(pre); if (k != Data.TEXT) { if (scm == 1 && k == Data.DOC) unit.add(pre); continue; } if (scm == 2) unit.add(pre); pos = -1; final StopWords sw = lex.ftOpt().sw; lex.init(data.text(pre, true)); while (lex.hasNext()) { final byte[] tok = lex.nextToken(); ++pos; // skip too long and stopword tokens if (tok.length <= data.meta.maxlen && (sw.size() == 0 || !sw.contains(tok))) { // check if main memory is exhausted if ((ntok++ & 0xFFF) == 0 && scm == 0 && memFull()) { // currently no frequency support for tf/idf based scoring writeIndex(csize++); Performance.gc(2); } index(tok); } } } // calculate term frequencies if (scm > 0) { maxfreq = new int[unit.size() + 1]; ntoken = new int[nrTokens()]; token = 0; calcFreq(); } // write tokens token = 0; write(); // set meta data if (scm > 0) { data.meta.maxscore = max; data.meta.minscore = min; } data.meta.ftxtindex = true; Util.memory(perf); }
@Override public synchronized IndexIterator iter(final IndexToken it) { final byte[] tok = it.get(); // wildcard search final FTLexer lexer = (FTLexer) it; final FTOpt opt = lexer.ftOpt(); if (opt.is(WC)) return wc(tok); // fuzzy search if (opt.is(FZ)) return fuzzy(tok, lexer.lserror(tok)); // return cached or new result final IndexEntry e = entry(tok); return e.size > 0 ? iter(e.offset, e.size, inZ, tok) : FTIndexIterator.FTEMPTY; }