/**
 * Loads the term-vector columns for a single {@code term}, restricted to the given
 * document numbers, into {@code termBuffer}/{@code termDocFreqBuffer}.
 *
 * <p>If no columns come back, {@code termBuffer} is left empty; otherwise it holds
 * exactly the one requested term and {@code termDocFreqBuffer} maps it to the columns.
 *
 * @param term the term whose postings should be loaded
 * @param docNums document-number column names to restrict the slice to
 * @throws RuntimeException wrapping any Thrift/transport failure
 */
void loadFilteredTerms(Term term, List<String> docNums) {
  long start = System.currentTimeMillis();

  ColumnParent parent = new ColumnParent();
  parent.setColumn_family(CassandraUtils.termVecColumnFamily);

  String key =
      CassandraUtils.hashKey(
          indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(term));

  SlicePredicate slicePredicate = new SlicePredicate();
  for (String docNum : docNums) {
    // Use an explicit charset: the column names must match the bytes written at index
    // time regardless of the JVM's platform-default encoding.
    slicePredicate.addToColumn_names(docNum.getBytes(java.nio.charset.Charset.forName("UTF-8")));
  }

  List<ColumnOrSuperColumn> columsList = null;
  try {
    columsList =
        client.get_slice(
            CassandraUtils.keySpace, key, parent, slicePredicate, ConsistencyLevel.ONE);
  } catch (Exception e) {
    // InvalidRequestException, UnavailableException, TimedOutException, TException and
    // anything else were all rethrown identically, so one catch suffices; the cause is
    // preserved for the caller.
    throw new RuntimeException(e);
  }

  termBuffer = new Term[0];

  if (columsList != null && columsList.size() > 0) {
    termBuffer = new Term[1];
    termBuffer[0] = term;

    termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>();
    termDocFreqBuffer.put(term, columsList);
  }

  long end = System.currentTimeMillis();
  logger.debug(
      "loadFilteredTerms: " + term + "(" + termBuffer.length + ") took " + (end - start) + "ms");
}
private void loadTerms(Term skipTo) { if (initTerm == null) initTerm = skipTo; // chose starting term String startTerm = CassandraUtils.hashKey( indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(skipTo)); // ending term. the initial query we don't care since // we only pull 2 terms, also we don't String endTerm = ""; // The boundary condition for this search. currently the field. String boundryTerm = CassandraUtils.hashKey( indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(skipTo.field(), CassandraUtils.finalToken)); if ((!skipTo.equals(chunkBoundryTerm) || termPosition == 0) && termCache != null) { termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey()); } else { termDocFreqBuffer = null; } if (termDocFreqBuffer != null) { termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {}); termPosition = 0; logger.debug("Found " + startTerm + " in cache"); return; } else if (chunkCount > 1 && actualInitSize < maxChunkSize) { // include last term if (skipTo.equals(chunkBoundryTerm) && termCache.containsKey(skipTo)) { termBuffer = new Term[] {skipTo}; termDocFreqBuffer = termCache.subMap(skipTo, termCache.lastKey()); } else { termBuffer = new Term[] {}; } termPosition = 0; return; // done! 
} chunkCount++; // The first time we grab just a few keys int count = maxInitSize; // otherwise we grab all the rest of the keys if (chunkBoundryTerm != null) { count = maxChunkSize; startTerm = CassandraUtils.hashKey( indexName + CassandraUtils.delimeter + CassandraUtils.createColumnName(chunkBoundryTerm)); // After first pass use the boundary term, since we know on pass 2 we are using the OPP endTerm = boundryTerm; } long start = System.currentTimeMillis(); termDocFreqBuffer = new TreeMap<Term, List<ColumnOrSuperColumn>>(); ColumnParent columnParent = new ColumnParent(CassandraUtils.termVecColumnFamily); SlicePredicate slicePredicate = new SlicePredicate(); // Get all columns SliceRange sliceRange = new SliceRange(new byte[] {}, new byte[] {}, true, Integer.MAX_VALUE); slicePredicate.setSlice_range(sliceRange); List<KeySlice> columns; try { columns = client.get_range_slice( CassandraUtils.keySpace, columnParent, slicePredicate, startTerm, endTerm, count, ConsistencyLevel.ONE); } catch (InvalidRequestException e) { throw new RuntimeException(e); } catch (TException e) { throw new RuntimeException(e); } catch (UnavailableException e) { throw new RuntimeException(e); } catch (TimedOutException e) { throw new RuntimeException(e); } // term to start with next time actualInitSize = columns.size(); logger.debug( "Found " + columns.size() + " keys in range:" + startTerm + " to " + endTerm + " in " + (System.currentTimeMillis() - start) + "ms"); if (actualInitSize > 0) { for (KeySlice entry : columns) { // term keys look like wikipedia/body/wiki String termStr = entry .getKey() .substring( entry.getKey().indexOf(CassandraUtils.delimeter) + CassandraUtils.delimeter.length()); Term term = CassandraUtils.parseTerm(termStr); logger.debug(termStr + " has " + entry.getColumns().size()); // check for tombstone keys or incorrect keys (from RP) if (entry.getColumns().size() > 0 && term.field().equals(skipTo.field()) && // from this index entry .getKey() .equals( 
CassandraUtils.hashKey( indexName + CassandraUtils.delimeter + term.field() + CassandraUtils.delimeter + term.text()))) termDocFreqBuffer.put(term, entry.getColumns()); } if (!termDocFreqBuffer.isEmpty()) { chunkBoundryTerm = termDocFreqBuffer.lastKey(); } } // add a final key (excluded in submap below) termDocFreqBuffer.put(finalTerm, null); // put in cache for (Term termKey : termDocFreqBuffer.keySet()) { if (termCache == null) { termCache = termDocFreqBuffer; } else { termCache.putAll(termDocFreqBuffer); } indexReader.addTermEnumCache(termKey, this); } // cache the initial term too indexReader.addTermEnumCache(skipTo, this); termBuffer = termDocFreqBuffer.keySet().toArray(new Term[] {}); termPosition = 0; long end = System.currentTimeMillis(); logger.debug( "loadTerms: " + startTerm + "(" + termBuffer.length + ") took " + (end - start) + "ms"); }