Пример #1
0
  /**
   * Extracts and indexes words from the specified data reference.
   *
   * @throws IOException I/O Exception
   */
  final void index() throws IOException {
    // delete old index
    abort();

    final Performance perf = Prop.debug ? new Performance() : null;
    Util.debug(det());

    for (pre = 0; pre < size; ++pre) {
      if ((pre & 0xFFFF) == 0) check();

      final int k = data.kind(pre);
      if (k != Data.TEXT) {
        if (scm == 1 && k == Data.DOC) unit.add(pre);
        continue;
      }
      if (scm == 2) unit.add(pre);

      pos = -1;
      final StopWords sw = lex.ftOpt().sw;
      lex.init(data.text(pre, true));
      while (lex.hasNext()) {
        final byte[] tok = lex.nextToken();
        ++pos;
        // skip too long and stopword tokens
        if (tok.length <= data.meta.maxlen && (sw.size() == 0 || !sw.contains(tok))) {
          // check if main memory is exhausted
          if ((ntok++ & 0xFFF) == 0 && scm == 0 && memFull()) {
            // currently no frequency support for tf/idf based scoring
            writeIndex(csize++);
            Performance.gc(2);
          }
          index(tok);
        }
      }
    }

    // calculate term frequencies
    if (scm > 0) {
      maxfreq = new int[unit.size() + 1];
      ntoken = new int[nrTokens()];
      token = 0;
      calcFreq();
    }

    // write tokens
    token = 0;
    write();

    // set meta data
    if (scm > 0) {
      data.meta.maxscore = max;
      data.meta.minscore = min;
    }
    data.meta.ftxtindex = true;
    Util.memory(perf);
  }
Пример #2
0
  @Override
  public boolean indexAccessible(final IndexInfo ii) {
    /* If the following conditions yield true, the index is accessed:
     * - all query terms are statically available
     * - no FTTimes option is specified
     * - explicitly set case, diacritics and stemming match options do not
     *   conflict with index options. */
    data = ii.ic.data;
    final MetaData md = data.meta;
    final FTOpt fto = ftt.opt;

    /* Index will be applied if no explicit match options have been set
     * that conflict with the index options. As a consequence, though, index-
     * based querying might yield other results than sequential scanning. */
    if (occ != null
        || fto.cs != null && md.casesens == (fto.cs == FTCase.INSENSITIVE)
        || fto.isSet(DC) && md.diacritics != fto.is(DC)
        || fto.isSet(ST) && md.stemming != fto.is(ST)
        || fto.ln != null && !fto.ln.equals(md.language)) return false;

    // adopt database options to tokenizer
    fto.copy(md);

    // estimate costs if text is not known at compile time
    if (tokens == null) {
      ii.costs = Math.max(2, data.meta.size / 30);
      return true;
    }

    // summarize number of hits; break loop if no hits are expected
    final FTLexer ft = new FTLexer(fto);
    ii.costs = 0;
    for (byte[] t : tokens) {
      ft.init(t);
      while (ft.hasNext()) {
        final byte[] tok = ft.nextToken();
        if (fto.sw != null && fto.sw.contains(tok)) continue;

        if (fto.is(WC)) {
          // don't use index if one of the terms starts with a wildcard
          t = ft.get();
          if (t[0] == '.') return false;
          // don't use index if certain characters or more than 1 dot are found
          int d = 0;
          for (final byte w : t) {
            if (w == '{' || w == '\\' || w == '.' && ++d > 1) return false;
          }
        }
        // favor full-text index requests over exact queries
        final int costs = data.costs(ft);
        if (costs != 0) ii.costs += Math.max(2, costs / 100);
      }
    }
    return true;
  }
Пример #3
0
 /**
  * Caches and returns all unique tokens specified in a query.
  *
  * @param list token list
  * @return token set
  */
 private TokenSet unique(final TokenList list) {
   // cache all query tokens in a set (duplicates are removed)
   final TokenSet ts = new TokenSet();
   switch (mode) {
     case ALL:
     case ANY:
       for (final byte[] t : list) ts.add(t);
       break;
     case ALL_WORDS:
     case ANY_WORD:
       final FTLexer l = new FTLexer(ftt.opt);
       for (final byte[] t : list) {
         l.init(t);
         while (l.hasNext()) ts.add(l.nextToken());
       }
       break;
     case PHRASE:
       final TokenBuilder tb = new TokenBuilder();
       for (final byte[] t : list) tb.add(t).add(' ');
       ts.add(tb.trim().finish());
   }
   return ts;
 }
Пример #4
0
  /**
   * Builds full-text information.
   *
   * @param d data reference
   * @param p pre value
   * @param str string value
   * @return number of added nodes
   */
  TokenList build(final Data d, final int p, final byte[] str) {
    final FTPos ftp = ftpos.get(d, p);
    if (ftp == null) return null;

    boolean marked = false;
    final TokenList tl = new TokenList();
    final TokenBuilder tb = new TokenBuilder();
    final FTLexer lex = new FTLexer().sc().init(str);
    int len = -ftlen;
    while (lex.hasNext()) {
      final FTSpan span = lex.next();
      // check if current text is still to be marked or already marked
      if (ftp.contains(span.pos) || marked) {
        if (tb.size() != 0) {
          // write current text node
          tl.add(tb.finish());
          len += tb.size();
          tb.reset();
          // skip construction
          if (len >= 0 && tl.size() > 1 && !marked) break;
        }
        if (!marked) tl.add((byte[]) null);
        marked ^= true;
      }
      // add span
      tb.add(span.text);
    }
    // write last text node
    if (tb.size() != 0) {
      tl.add(tb.finish());
      len += tb.size();
    }

    // chop first and last text
    if (len > 0) {
      final int ts = tl.size();
      // get first text (empty if it is a full-text match)
      final byte[] first = tl.get(0) != null ? tl.get(0) : EMPTY;
      // get last text (empty if it is a full-text match)
      final byte[] last = tl.get(ts - 2) != null ? tl.get(ts - 1) : EMPTY;

      if (first != EMPTY) {
        // remove leading characters of first text
        final double l = first.length + last.length;
        final int ll = Math.min(first.length, (int) (first.length / l * len));
        tl.set(0, concat(DOTS, subtoken(first, ll)));
        len -= ll;
      }
      if (last != EMPTY && len > 0) {
        // remove trailing characters of last text
        final int ll = Math.min(last.length, len);
        tl.set(ts - 1, concat(subtoken(last, 0, last.length - ll), DOTS));
        len -= ll;
      }
      // still too much text: shorten inner texts
      for (int t = ts - 2; t > 0 && len > 0; t--) {
        final byte[] txt = tl.get(t);
        // skip elements, marked texts and too short text snippets
        if (txt == null || tl.get(t - 1) == null) continue;
        final int ll = Math.min(txt.length, len);
        tl.set(
            t,
            concat(
                subtoken(txt, 0, (txt.length - ll) / 2),
                DOTS,
                subtoken(txt, (txt.length + ll) / 2)));
        len -= ll;
      }
    }
    return tl;
  }