Пример #1
0
  /**
   * Loads the term-vector columns of a single term, restricted to the given document numbers.
   *
   * <p>Issues one {@code get_slice} against the term-vector column family for the row key derived
   * from {@code indexName} and {@code term}, requesting only the columns named after the entries of
   * {@code docNums}. {@code termBuffer} is always reset: it holds just {@code term} when at least
   * one column came back and is empty otherwise. {@code termDocFreqBuffer} is replaced only in the
   * former case; when nothing matches it keeps whatever it held before the call.
   *
   * @param term the term whose row is queried
   * @param docNums document numbers used as column names in the slice predicate
   * @throws RuntimeException wrapping any exception raised by the Cassandra client call
   */
  void loadFilteredTerms(Term term, List<String> docNums) {
    long start = System.currentTimeMillis();

    ColumnParent parent = new ColumnParent();
    parent.setColumn_family(CassandraUtils.termVecColumnFamily);

    String key =
        CassandraUtils.hashKey(
            indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(term));

    // Encode column names with a fixed charset so the bytes sent to Cassandra do not depend on
    // the platform default encoding of the JVM running this reader.
    java.nio.charset.Charset utf8 = java.nio.charset.Charset.forName("UTF-8");

    SlicePredicate slicePredicate = new SlicePredicate();
    for (String docNum : docNums) {
      slicePredicate.addToColumn_names(docNum.getBytes(utf8));
    }

    List<ColumnOrSuperColumn> columnsList;
    try {
      columnsList =
          client.get_slice(
              CassandraUtils.keySpace, key, parent, slicePredicate, ConsistencyLevel.ONE);
    } catch (Exception e) {
      // Every failure mode (invalid request, unavailable, timeout, transport, anything else) is
      // surfaced the same way, so a single handler is sufficient.
      throw new RuntimeException(e);
    }

    termBuffer = new Term[0];

    if (columnsList != null && columnsList.size() > 0) {
      termBuffer = new Term[1];
      termBuffer[0] = term;
      termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();
      termDocFreqBuffer.put(term, columnsList);
    }

    long end = System.currentTimeMillis();
    logger.debug(
        "loadFilterdTerms: " + term + "(" + termBuffer.length + ") took " + (end - start) + "ms");
  }
Пример #2
0
  /**
   * Positions this enumerator at {@code skipTo}, filling {@code termBuffer} and
   * {@code termDocFreqBuffer} either from {@code termCache} or from a Cassandra range query.
   *
   * <p>Resolution order:
   *
   * <ol>
   *   <li>If a cache exists and either {@code skipTo} is not the current chunk boundary or
   *       {@code termPosition} is 0, the buffers become the cache view from {@code skipTo}
   *       (inclusive) up to the cache's last key (exclusive) and the method returns.
   *   <li>Otherwise, if more than one chunk has been fetched and the last range query returned
   *       fewer than {@code maxChunkSize} keys, no further query is issued; the buffer holds at
   *       most the boundary term itself.
   *   <li>Otherwise a range query is issued. The first query starts at {@code skipTo} with an open
   *       end and is limited to {@code maxInitSize} keys; later queries start at the chunk
   *       boundary term, end at the field boundary key and are limited to {@code maxChunkSize}.
   * </ol>
   *
   * <p>After a range query the accepted terms plus {@code finalTerm} are merged into
   * {@code termCache}, each of them (and {@code skipTo}) is registered with
   * {@code indexReader.addTermEnumCache}, and {@code termPosition} is reset to 0.
   *
   * @param skipTo the term to position at; also recorded as {@code initTerm} on first use
   * @throws RuntimeException wrapping any Thrift/Cassandra exception raised by the range query
   */
  private void loadTerms(Term skipTo) {

    if (initTerm == null) {
      initTerm = skipTo;
    }

    // Row key at which the range query starts.
    String startTerm =
        CassandraUtils.hashKey(
            indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(skipTo));

    // End of the key range. Left open for the initial query, which is bounded by its key count
    // (maxInitSize) instead.
    String endTerm = "";

    // The boundary condition for this search: currently the last possible key of skipTo's field.
    String boundryTerm =
        CassandraUtils.hashKey(
            indexName
                + CassandraUtils.delimeter
                + CassandraUtils.createColumnName(skipTo.field(), CassandraUtils.finalToken));

    // Serve from the cache unless we are sitting on the chunk boundary mid-enumeration, in which
    // case the next chunk may have to be fetched.
    if ((!skipTo.equals(chunkBoundryTerm) || termPosition == 0) && termCache != null) {
      termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
    } else {
      termDocFreqBuffer = null;
    }

    if (termDocFreqBuffer != null) {

      termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});
      termPosition = 0;

      logger.debug("Found " + startTerm + " in cache");
      return;
    } else if (chunkCount > 1 && actualInitSize < maxChunkSize) {
      // The last range query came back short of a full chunk, so no further query is issued.

      // include last term
      if (skipTo.equals(chunkBoundryTerm) && termCache.containsKey(skipTo)) {
        termBuffer = new Term[] {skipTo};
        termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey());
      } else {
        termBuffer = new Term[] {};
      }

      termPosition = 0;
      return; // done!
    }

    chunkCount++;

    // The first time we grab just a few keys
    int count = maxInitSize;

    // otherwise we grab all the rest of the keys
    if (chunkBoundryTerm != null) {
      count = maxChunkSize;
      startTerm =
          CassandraUtils.hashKey(
              indexName
                  + CassandraUtils.delimeter
                  + CassandraUtils.createColumnName(chunkBoundryTerm));

      // After first pass use the boundary term, since we know on pass 2 we are using the OPP
      endTerm = boundryTerm;
    }

    long start = System.currentTimeMillis();

    termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();

    ColumnParent columnParent = new ColumnParent(CassandraUtils.termVecColumnFamily);
    SlicePredicate slicePredicate = new SlicePredicate();

    // Get all columns
    SliceRange sliceRange = new SliceRange(new byte[] {}, new byte[] {}, true, Integer.MAX_VALUE);
    slicePredicate.setSlice_range(sliceRange);

    List<KeySlice> columns;
    try {
      columns =
          client.get_range_slice(
              CassandraUtils.keySpace,
              columnParent,
              slicePredicate,
              startTerm,
              endTerm,
              count,
              ConsistencyLevel.ONE);
    } catch (InvalidRequestException e) {
      throw new RuntimeException(e);
    } catch (TException e) {
      throw new RuntimeException(e);
    } catch (UnavailableException e) {
      throw new RuntimeException(e);
    } catch (TimedOutException e) {
      throw new RuntimeException(e);
    }

    // Remember how many keys came back; a short result stops further fetching on later calls.
    actualInitSize = columns.size();
    logger.debug(
        "Found "
            + columns.size()
            + " keys in range:"
            + startTerm
            + " to "
            + endTerm
            + " in "
            + (System.currentTimeMillis() - start)
            + "ms");

    if (actualInitSize > 0) {
      for (KeySlice entry : columns) {

        // term keys look like wikipedia/body/wiki
        String termStr =
            entry
                .getKey()
                .substring(
                    entry.getKey().indexOf(CassandraUtils.delimeter)
                        + CassandraUtils.delimeter.length());
        Term term = CassandraUtils.parseTerm(termStr);

        logger.debug(termStr + " has " + entry.getColumns().size());

        // check for tombstone keys or incorrect keys (from RP)
        if (entry.getColumns().size() > 0
            && term.field().equals(skipTo.field())
            &&
            // from this index
            entry
                .getKey()
                .equals(
                    CassandraUtils.hashKey(
                        indexName
                            + CassandraUtils.delimeter
                            + term.field()
                            + CassandraUtils.delimeter
                            + term.text()))) {
          termDocFreqBuffer.put(term, entry.getColumns());
        }
      }

      // term to start with next time
      if (!termDocFreqBuffer.isEmpty()) {
        chunkBoundryTerm = termDocFreqBuffer.lastKey();
      }
    }

    // add a final key (excluded in submap below)
    termDocFreqBuffer.put(finalTerm, null);

    // Put in cache. The merge is done once up front: the buffer is never empty here (finalTerm was
    // just added), so this is equivalent to merging on every loop iteration, without the repeated
    // copying.
    if (termCache == null) {
      termCache = termDocFreqBuffer;
    } else {
      termCache.putAll(termDocFreqBuffer);
    }

    for (Term termKey : termDocFreqBuffer.keySet()) {
      indexReader.addTermEnumCache(termKey, this);
    }

    // cache the initial term too
    indexReader.addTermEnumCache(skipTo, this);

    termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {});

    termPosition = 0;

    long end = System.currentTimeMillis();

    logger.debug(
        "loadTerms: " + startTerm + "(" + termBuffer.length + ") took " + (end - start) + "ms");
  }