Java FTLexer.nextToken Examples

Programming Language: Java

Namespace/Package Name: org.basex.util.ft

Class/Type: FTLexer

Method/Function: nextToken

Examples at hotexamples.com: 3

Java FTLexer.nextToken - 3 examples found. These are the top-rated real-world Java examples of org.basex.util.ft.FTLexer.nextToken, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

hasNext(4)

init(3)

nextToken(3)

ftOpt(2)

get(2)

lserror(1)

next(1)

Example #1

Show file

File: FTBuilder.java Project: charles-dyfis-net/basex

  /**
   * Tokenizes the text nodes of the data reference and adds every accepted
   * token to the full-text index.
   *
   * <p>Tokens that are longer than {@code data.meta.maxlen} or that are
   * contained in the stop word list are skipped. Afterwards the index is
   * written and the full-text flag is set in the meta data.</p>
   *
   * @throws IOException I/O Exception
   */
  final void index() throws IOException {
    // delete old index
    abort();

    // only measure performance in debug mode
    Performance timer = null;
    if (Prop.debug) timer = new Performance();
    Util.debug(det());

    for (pre = 0; pre < size; ++pre) {
      // run the periodic check whenever the lower 16 bits of pre are zero
      if ((pre & 0xFFFF) == 0) check();

      final int kind = data.kind(pre);
      if (kind == Data.TEXT) {
        // mode 2: every text node is registered as a unit
        if (scm == 2) unit.add(pre);

        pos = -1;
        final StopWords stop = lex.ftOpt().sw;
        lex.init(data.text(pre, true));
        while (lex.hasNext()) {
          final byte[] term = lex.nextToken();
          ++pos;

          // skip too long tokens
          if (term.length > data.meta.maxlen) continue;
          // skip stopword tokens
          if (stop.size() != 0 && stop.contains(term)) continue;

          // every 4096th accepted token: check if main memory is exhausted
          final boolean probe = (ntok++ & 0xFFF) == 0;
          if (probe && scm == 0 && memFull()) {
            // currently no frequency support for tf/idf based scoring
            writeIndex(csize++);
            Performance.gc(2);
          }
          index(term);
        }
      } else if (scm == 1 && kind == Data.DOC) {
        // mode 1: every document node is registered as a unit
        unit.add(pre);
      }
    }

    // calculate term frequencies
    if (scm > 0) {
      maxfreq = new int[unit.size() + 1];
      ntoken = new int[nrTokens()];
      token = 0;
      calcFreq();
    }

    // write tokens
    token = 0;
    write();

    // set meta data
    if (scm > 0) {
      data.meta.maxscore = max;
      data.meta.minscore = min;
    }
    data.meta.ftxtindex = true;
    Util.memory(timer);
  }

Example #2

Show file

File: FTWords.java Project: jefferya/basex

  @Override
  public boolean indexAccessible(final IndexInfo ii) {
    /* The index is accessed if all of the following holds:
     * - all query terms are statically available
     * - no FTTimes option is specified
     * - explicitly set case, diacritics and stemming match options do not
     *   conflict with index options. */
    data = ii.ic.data;
    final MetaData md = data.meta;
    final FTOpt fto = ftt.opt;

    /* The index is only rejected if explicit match options conflict with
     * the index options. As a consequence, though, index-based querying
     * might yield other results than sequential scanning. */
    if (occ != null) return false;
    if (fto.cs != null && md.casesens == (fto.cs == FTCase.INSENSITIVE)) return false;
    if (fto.isSet(DC) && md.diacritics != fto.is(DC)) return false;
    if (fto.isSet(ST) && md.stemming != fto.is(ST)) return false;
    if (fto.ln != null && !fto.ln.equals(md.language)) return false;

    // adopt database options to tokenizer
    fto.copy(md);

    // estimate costs if text is not known at compile time
    if (tokens == null) {
      ii.costs = Math.max(2, data.meta.size / 30);
      return true;
    }

    // sum up the estimated costs of all query tokens
    final FTLexer ft = new FTLexer(fto);
    ii.costs = 0;
    for (final byte[] query : tokens) {
      ft.init(query);
      while (ft.hasNext()) {
        final byte[] tok = ft.nextToken();
        // stopwords do not contribute to the costs
        if (fto.sw != null && fto.sw.contains(tok)) continue;

        if (fto.is(WC)) {
          // don't use index if one of the terms starts with a wildcard
          final byte[] current = ft.get();
          if (current[0] == '.') return false;
          // don't use index if certain characters or more than 1 dot are found
          int dots = 0;
          for (final byte b : current) {
            if (b == '{' || b == '\\') return false;
            if (b == '.' && ++dots > 1) return false;
          }
        }
        // favor full-text index requests over exact queries
        final int estimate = data.costs(ft);
        if (estimate != 0) ii.costs += Math.max(2, estimate / 100);
      }
    }
    return true;
  }

Example #3

Show file

File: FTWords.java Project: jefferya/basex

 /**
  * Collects the tokens of a query in a set, so that duplicates are dropped.
  * Depending on the search mode, the list entries are added as they are,
  * split into single tokens by a lexer, or joined to one phrase.
  *
  * @param list token list
  * @return token set
  */
 private TokenSet unique(final TokenList list) {
   final TokenSet result = new TokenSet();
   switch (mode) {
     case ALL:
     case ANY: {
       // every list entry is one token
       for (final byte[] entry : list) result.add(entry);
       break;
     }
     case ALL_WORDS:
     case ANY_WORD: {
       // every list entry is tokenized into single words
       final FTLexer lexer = new FTLexer(ftt.opt);
       for (final byte[] entry : list) {
         lexer.init(entry);
         while (lexer.hasNext()) result.add(lexer.nextToken());
       }
       break;
     }
     case PHRASE: {
       // all list entries are joined to a single, space-separated token
       final TokenBuilder phrase = new TokenBuilder();
       for (final byte[] entry : list) phrase.add(entry).add(' ');
       result.add(phrase.trim().finish());
       break;
     }
     default:
       // other modes leave the set empty
       break;
   }
   return result;
 }