Example #1
 SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords) {
   super(tenum);
   this.terms = terms;
   this.ords = ords;
   comparator = BytesRef.getUTF8SortedAsUnicodeComparator();
   lastElement = terms.size() - 1;
   lastTerm = terms.get(ords[lastElement], new BytesRef());
   seekTerm = terms.get(ords[upto], spare);
 }
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
    PostingsReaderBase postingsReader =
        new SepPostingsReader(
            state.dir,
            state.segmentInfo,
            state.context,
            new MockIntFactory(blockSize),
            state.segmentSuffix);

    TermsIndexReaderBase indexReader;
    boolean success = false;
    try {
      indexReader =
          new FixedGapTermsIndexReader(
              state.dir,
              state.fieldInfos,
              state.segmentInfo.name,
              state.termsIndexDivisor,
              BytesRef.getUTF8SortedAsUnicodeComparator(),
              state.segmentSuffix,
              IOContext.DEFAULT);
      success = true;
    } finally {
      if (!success) {
        postingsReader.close();
      }
    }

    success = false;
    try {
      FieldsProducer ret =
          new BlockTermsReader(
              indexReader,
              state.dir,
              state.fieldInfos,
              state.segmentInfo.name,
              postingsReader,
              state.context,
              1024,
              state.segmentSuffix);
      success = true;
      return ret;
    } finally {
      if (!success) {
        try {
          postingsReader.close();
        } finally {
          indexReader.close();
        }
      }
    }
  }
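
Both blocks above use the same open-or-close idiom: a success flag flipped only after the last constructor returns, with a finally clause that closes everything opened so far when the flag is still false. On success, ownership of the readers transfers to the returned FieldsProducer, which closes them later. A minimal sketch of the idiom in isolation (OwningProducer and its two resources are illustrative, not Lucene classes):

import java.io.Closeable;
import java.io.IOException;

final class OwningProducer implements Closeable {
  private final Closeable postings, index;

  OwningProducer(Closeable postings, Closeable index) {
    this.postings = postings;
    this.index = index;
  }

  static OwningProducer open(Closeable postings, Closeable index) throws IOException {
    boolean success = false;
    try {
      OwningProducer ret = new OwningProducer(postings, index); // may throw
      success = true;
      return ret; // ownership transferred; the caller closes via close()
    } finally {
      if (!success) {
        try {
          postings.close();
        } finally {
          index.close(); // runs even if postings.close() threw
        }
      }
    }
  }

  @Override
  public void close() throws IOException {
    try {
      postings.close();
    } finally {
      index.close();
    }
  }
}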
Example #3
 @Override
 public boolean seekExact(BytesRef text) {
   termUpto =
       binarySearch(
           text,
           br,
           0,
           info.terms.size() - 1,
           info.terms,
           info.sortedTerms,
           BytesRef.getUTF8SortedAsUnicodeComparator());
   return termUpto >= 0;
 }
 TermsIncludingScoreQuery(
     String field,
     boolean multipleValuesPerDocument,
     BytesRefHash terms,
     float[] scores,
     Query originalQuery) {
   this.field = field;
   this.multipleValuesPerDocument = multipleValuesPerDocument;
   this.terms = terms;
   this.scores = scores;
   this.originalQuery = originalQuery;
   this.ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
   this.unwrittenOriginalQuery = originalQuery;
 }
    private static final class LeafSourceQueue extends PriorityQueue<LeafSource> {

      private final Comparator<BytesRef> termComp = BytesRef.getUTF8SortedAsUnicodeComparator();

      LeafSourceQueue(int size) {
        super(size);
      }

      @Override
      protected boolean lessThan(LeafSource termsA, LeafSource termsB) {
        final int cmp = termComp.compare(termsA.current, termsB.current);
        if (cmp != 0) {
          return cmp < 0;
        } else {
          return termsA.context.ord < termsB.context.ord;
        }
      }
    }
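
LeafSourceQueue implements the standard k-way merge over per-segment term streams: the queue is ordered by each source's current term, with the segment ord as tie-break so equal terms surface in a stable, per-segment order. A hedged sketch of the same pattern with java.util.PriorityQueue (SegmentSource is illustrative, not a Lucene class):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;
import org.apache.lucene.util.BytesRef;

public class KWayMergeSketch {

  // Illustrative per-segment source: a sorted term iterator plus its segment ord.
  static final class SegmentSource {
    final Iterator<BytesRef> terms;
    final int ord;
    BytesRef current;

    SegmentSource(Iterator<BytesRef> terms, int ord) {
      this.terms = terms;
      this.ord = ord;
    }

    boolean advance() {
      current = terms.hasNext() ? terms.next() : null;
      return current != null;
    }
  }

  public static void main(String[] args) {
    final Comparator<BytesRef> termComp = BytesRef.getUTF8SortedAsUnicodeComparator();
    PriorityQueue<SegmentSource> queue = new PriorityQueue<SegmentSource>(2,
        new Comparator<SegmentSource>() {
          @Override
          public int compare(SegmentSource a, SegmentSource b) {
            int cmp = termComp.compare(a.current, b.current); // same logic as lessThan() above
            return cmp != 0 ? cmp : a.ord - b.ord;            // tie-break on segment ord
          }
        });

    List<List<String>> segments = Arrays.asList(
        Arrays.asList("apple", "cherry"), Arrays.asList("apple", "banana"));
    for (int ord = 0; ord < segments.size(); ord++) {
      List<BytesRef> refs = new ArrayList<BytesRef>();
      for (String s : segments.get(ord)) {
        refs.add(new BytesRef(s));
      }
      SegmentSource source = new SegmentSource(refs.iterator(), ord);
      if (source.advance()) {
        queue.add(source);
      }
    }

    while (!queue.isEmpty()) {
      SegmentSource top = queue.poll();
      System.out.println(top.current.utf8ToString() + "  (ord=" + top.ord + ")");
      if (top.advance()) {
        queue.add(top); // reinsert with its next term
      }
    }
    // Prints: apple (ord=0), apple (ord=1), banana (ord=1), cherry (ord=0)
  }
}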
Example #6
 @Override
 public SeekStatus seekCeil(BytesRef text) {
   termUpto =
       binarySearch(
           text,
           br,
           0,
           info.terms.size() - 1,
           info.terms,
           info.sortedTerms,
           BytesRef.getUTF8SortedAsUnicodeComparator());
   if (termUpto < 0) { // not found; choose successor
     termUpto = -termUpto - 1;
     if (termUpto >= info.terms.size()) {
       return SeekStatus.END;
     } else {
       info.terms.get(info.sortedTerms[termUpto], br);
       return SeekStatus.NOT_FOUND;
     }
   } else {
     return SeekStatus.FOUND;
   }
 }
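
The binarySearch result follows the java.util.Arrays.binarySearch convention: a hit returns the slot, while a miss returns -(insertionPoint) - 1, so -termUpto - 1 recovers the index of the smallest term greater than the target, which is exactly the ceiling seekCeil must position on. A toy illustration of the convention:

import java.util.Arrays;

public class SeekCeilSketch {
  public static void main(String[] args) {
    String[] sorted = {"apple", "cherry", "melon"};
    int pos = Arrays.binarySearch(sorted, "banana");
    if (pos < 0) {
      int successor = -pos - 1; // insertion point = index of smallest greater element
      if (successor >= sorted.length) {
        System.out.println("END");  // target is past the last term
      } else {
        System.out.println("NOT_FOUND, ceil = " + sorted[successor]); // cherry
      }
    } else {
      System.out.println("FOUND at " + pos);
    }
  }
}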
  @Override
  public void build(TermFreqIterator iterator) throws IOException {
    BytesRef scratch = new BytesRef();
    TermFreqIterator iter =
        new WFSTTermFreqIteratorWrapper(iterator, BytesRef.getUTF8SortedAsUnicodeComparator());
    IntsRef scratchInts = new IntsRef();
    BytesRef previous = null;
    PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
    Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
    while ((scratch = iter.next()) != null) {
      long cost = iter.weight();

      if (previous == null) {
        previous = new BytesRef();
      } else if (scratch.equals(previous)) {
        continue; // duplicate suggestion: the best weight for this term was already added
      }
      Util.toIntsRef(scratch, scratchInts);
      builder.add(scratchInts, cost);
      previous.copyBytes(scratch);
    }
    fst = builder.finish();
  }
 @Override
 public Comparator<BytesRef> getComparator() {
   return BytesRef.getUTF8SortedAsUnicodeComparator();
 }
Example #9
 /**
  * Sorts hashed terms into ascending order, reusing memory along the way. Note that sorting is
  * deferred until required (often it's not required at all). If a sorted view is required,
  * hashing + sort + binary search is still faster and uses less memory than a TreeMap (which
  * would be an alternative, somewhat more elegant approach, short of more sophisticated
  * tries / prefix trees).
  */
 public void sortTerms() {
   if (sortedTerms == null) {
     sortedTerms = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
   }
 }
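
A minimal sketch of the hash + sort + binary search combination the javadoc argues for, assuming the Lucene 4.x BytesRefHash API used throughout these examples:

import java.util.Comparator;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;

public class SortedTermsSketch {
  public static void main(String[] args) {
    BytesRefHash terms = new BytesRefHash();
    for (String s : new String[] {"banana", "apple", "cherry"}) {
      terms.add(new BytesRef(s)); // hash insert; a duplicate would return a negative id
    }

    // One-time sort yields term ids ordered by unicode code point order:
    Comparator<BytesRef> cmp = BytesRef.getUTF8SortedAsUnicodeComparator();
    int[] sortedIds = terms.sort(cmp);

    // Binary search over the sorted view, reusing a single scratch BytesRef:
    BytesRef scratch = new BytesRef();
    BytesRef target = new BytesRef("banana");
    int lo = 0, hi = terms.size() - 1;
    while (lo <= hi) {
      int mid = (lo + hi) >>> 1;
      terms.get(sortedIds[mid], scratch);
      int d = cmp.compare(scratch, target);
      if (d < 0) {
        lo = mid + 1;
      } else if (d > 0) {
        hi = mid - 1;
      } else {
        System.out.println("found at sorted position " + mid);
        return;
      }
    }
    System.out.println("not found");
  }
}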
Example #10
 /**
  * @param field The field that should contain the terms specified in the terms parameter
  * @param terms The terms that matching documents should have. The terms must be sorted in
  *     natural order.
  */
 TermsQuery(String field, Query fromQuery, BytesRefHash terms) {
   super(field);
   this.fromQuery = fromQuery;
   this.terms = terms;
   ords = terms.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
 }
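
Natural order here means unsigned byte order of the UTF-8 encoding, which matches unicode code point order (this is what getUTF8SortedAsUnicodeComparator guarantees, unlike String's UTF-16 comparison, which mis-orders supplementary characters). A small demonstration; the accented term sorts after every ASCII term because its lead byte is 0xC3:

import java.util.Arrays;
import org.apache.lucene.util.BytesRef;

public class NaturalOrderSketch {
  public static void main(String[] args) {
    BytesRef[] terms = {new BytesRef("zebra"), new BytesRef("école"), new BytesRef("apple")};
    Arrays.sort(terms, BytesRef.getUTF8SortedAsUnicodeComparator());
    for (BytesRef t : terms) {
      System.out.println(t.utf8ToString()); // apple, zebra, école
    }
  }
}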
 @Test
 public void testInMemorySorter() throws Exception {
   check(new InMemorySorter(BytesRef.getUTF8SortedAsUnicodeComparator()));
 }
Example #12
  private IndexIterationContext createContext(
      int nDocs,
      RandomIndexWriter fromWriter,
      RandomIndexWriter toWriter,
      boolean multipleValuesPerDocument,
      boolean scoreDocsInOrder)
      throws IOException {
    IndexIterationContext context = new IndexIterationContext();
    int numRandomValues = nDocs / 2;
    context.randomUniqueValues = new String[numRandomValues];
    Set<String> trackSet = new HashSet<String>();
    context.randomFrom = new boolean[numRandomValues];
    for (int i = 0; i < numRandomValues; i++) {
      String uniqueRandomValue;
      do {
        uniqueRandomValue = _TestUtil.randomRealisticUnicodeString(random());
        //        uniqueRandomValue = _TestUtil.randomSimpleString(random);
      } while ("".equals(uniqueRandomValue) || trackSet.contains(uniqueRandomValue));
      // Values must be unique; the empty string isn't allowed.
      trackSet.add(uniqueRandomValue);
      context.randomFrom[i] = random().nextBoolean();
      context.randomUniqueValues[i] = uniqueRandomValue;
    }

    RandomDoc[] docs = new RandomDoc[nDocs];
    for (int i = 0; i < nDocs; i++) {
      String id = Integer.toString(i);
      int randomI = random().nextInt(context.randomUniqueValues.length);
      String value = context.randomUniqueValues[randomI];
      Document document = new Document();
      document.add(newTextField(random(), "id", id, Field.Store.NO));
      document.add(newTextField(random(), "value", value, Field.Store.NO));

      boolean from = context.randomFrom[randomI];
      int numberOfLinkValues = multipleValuesPerDocument ? 2 + random().nextInt(10) : 1;
      docs[i] = new RandomDoc(id, numberOfLinkValues, value, from);
      for (int j = 0; j < numberOfLinkValues; j++) {
        String linkValue =
            context.randomUniqueValues[random().nextInt(context.randomUniqueValues.length)];
        docs[i].linkValues.add(linkValue);
        if (from) {
          if (!context.fromDocuments.containsKey(linkValue)) {
            context.fromDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueFromDocs.containsKey(value)) {
            context.randomValueFromDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.fromDocuments.get(linkValue).add(docs[i]);
          context.randomValueFromDocs.get(value).add(docs[i]);
          document.add(newTextField(random(), "from", linkValue, Field.Store.NO));
        } else {
          if (!context.toDocuments.containsKey(linkValue)) {
            context.toDocuments.put(linkValue, new ArrayList<RandomDoc>());
          }
          if (!context.randomValueToDocs.containsKey(value)) {
            context.randomValueToDocs.put(value, new ArrayList<RandomDoc>());
          }

          context.toDocuments.get(linkValue).add(docs[i]);
          context.randomValueToDocs.get(value).add(docs[i]);
          document.add(newTextField(random(), "to", linkValue, Field.Store.NO));
        }
      }

      final RandomIndexWriter w;
      if (from) {
        w = fromWriter;
      } else {
        w = toWriter;
      }

      w.addDocument(document);
      if (random().nextInt(10) == 4) {
        w.commit();
      }
      if (VERBOSE) {
        System.out.println("Added document[" + docs[i].id + "]: " + document);
      }
    }

    // Pre-compute all possible hits for all unique random values, and on top of that compute the
    // possible scores for any ScoreMode.
    IndexSearcher fromSearcher = newSearcher(fromWriter.getReader());
    IndexSearcher toSearcher = newSearcher(toWriter.getReader());
    for (int i = 0; i < context.randomUniqueValues.length; i++) {
      String uniqueRandomValue = context.randomUniqueValues[i];
      final String fromField;
      final String toField;
      final Map<String, Map<Integer, JoinScore>> queryVals;
      if (context.randomFrom[i]) {
        fromField = "from";
        toField = "to";
        queryVals = context.fromHitsToJoinScore;
      } else {
        fromField = "to";
        toField = "from";
        queryVals = context.toHitsToJoinScore;
      }
      final Map<BytesRef, JoinScore> joinValueToJoinScores = new HashMap<BytesRef, JoinScore>();
      if (multipleValuesPerDocument) {
        fromSearcher.search(
            new TermQuery(new Term("value", uniqueRandomValue)),
            new Collector() {

              private Scorer scorer;
              private SortedSetDocValues docTermOrds;
              final BytesRef joinValue = new BytesRef();

              @Override
              public void collect(int doc) throws IOException {
                docTermOrds.setDocument(doc);
                long ord;
                while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                  docTermOrds.lookupOrd(ord, joinValue);
                  JoinScore joinScore = joinValueToJoinScores.get(joinValue);
                  if (joinScore == null) {
                    joinValueToJoinScores.put(
                        BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
                  }
                  joinScore.addScore(scorer.score());
                }
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), fromField);
              }

              @Override
              public void setScorer(Scorer scorer) {
                this.scorer = scorer;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }
            });
      } else {
        fromSearcher.search(
            new TermQuery(new Term("value", uniqueRandomValue)),
            new Collector() {

              private Scorer scorer;
              private BinaryDocValues terms;
              private Bits docsWithField;
              private final BytesRef spare = new BytesRef();

              @Override
              public void collect(int doc) throws IOException {
                terms.get(doc, spare);
                BytesRef joinValue = spare;
                if (joinValue.length == 0 && !docsWithField.get(doc)) {
                  return;
                }

                JoinScore joinScore = joinValueToJoinScores.get(joinValue);
                if (joinScore == null) {
                  joinValueToJoinScores.put(
                      BytesRef.deepCopyOf(joinValue), joinScore = new JoinScore());
                }
                joinScore.addScore(scorer.score());
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                terms = FieldCache.DEFAULT.getTerms(context.reader(), fromField, true);
                docsWithField = FieldCache.DEFAULT.getDocsWithField(context.reader(), fromField);
              }

              @Override
              public void setScorer(Scorer scorer) {
                this.scorer = scorer;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }
            });
      }

      final Map<Integer, JoinScore> docToJoinScore = new HashMap<Integer, JoinScore>();
      if (multipleValuesPerDocument) {
        if (scoreDocsInOrder) {
          AtomicReader slowCompositeReader =
              SlowCompositeReaderWrapper.wrap(toSearcher.getIndexReader());
          Terms terms = slowCompositeReader.terms(toField);
          if (terms != null) {
            DocsEnum docsEnum = null;
            TermsEnum termsEnum = null;
            SortedSet<BytesRef> joinValues =
                new TreeSet<BytesRef>(BytesRef.getUTF8SortedAsUnicodeComparator());
            joinValues.addAll(joinValueToJoinScores.keySet());
            for (BytesRef joinValue : joinValues) {
              termsEnum = terms.iterator(termsEnum);
              if (termsEnum.seekExact(joinValue)) {
                docsEnum =
                    termsEnum.docs(slowCompositeReader.getLiveDocs(), docsEnum, DocsEnum.FLAG_NONE);
                JoinScore joinScore = joinValueToJoinScores.get(joinValue);

                for (int doc = docsEnum.nextDoc();
                    doc != DocIdSetIterator.NO_MORE_DOCS;
                    doc = docsEnum.nextDoc()) {
                  // First encountered join value determines the score.
                  // Something to keep in mind for many-to-many relations.
                  if (!docToJoinScore.containsKey(doc)) {
                    docToJoinScore.put(doc, joinScore);
                  }
                }
              }
            }
          }
        } else {
          toSearcher.search(
              new MatchAllDocsQuery(),
              new Collector() {

                private SortedSetDocValues docTermOrds;
                private final BytesRef scratch = new BytesRef();
                private int docBase;

                @Override
                public void collect(int doc) throws IOException {
                  docTermOrds.setDocument(doc);
                  long ord;
                  while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                    docTermOrds.lookupOrd(ord, scratch);
                    JoinScore joinScore = joinValueToJoinScores.get(scratch);
                    if (joinScore == null) {
                      continue;
                    }
                    Integer basedDoc = docBase + doc;
                    // First encountered join value determines the score.
                    // Something to keep in mind for many-to-many relations.
                    if (!docToJoinScore.containsKey(basedDoc)) {
                      docToJoinScore.put(basedDoc, joinScore);
                    }
                  }
                }

                @Override
                public void setNextReader(AtomicReaderContext context) throws IOException {
                  docBase = context.docBase;
                  docTermOrds = FieldCache.DEFAULT.getDocTermOrds(context.reader(), toField);
                }

                @Override
                public boolean acceptsDocsOutOfOrder() {
                  return false;
                }

                @Override
                public void setScorer(Scorer scorer) {}
              });
        }
      } else {
        toSearcher.search(
            new MatchAllDocsQuery(),
            new Collector() {

              private BinaryDocValues terms;
              private int docBase;
              private final BytesRef spare = new BytesRef();

              @Override
              public void collect(int doc) {
                terms.get(doc, spare);
                JoinScore joinScore = joinValueToJoinScores.get(spare);
                if (joinScore == null) {
                  return;
                }
                docToJoinScore.put(docBase + doc, joinScore);
              }

              @Override
              public void setNextReader(AtomicReaderContext context) throws IOException {
                terms = FieldCache.DEFAULT.getTerms(context.reader(), toField, false);
                docBase = context.docBase;
              }

              @Override
              public boolean acceptsDocsOutOfOrder() {
                return false;
              }

              @Override
              public void setScorer(Scorer scorer) {}
            });
      }
      queryVals.put(uniqueRandomValue, docToJoinScore);
    }

    fromSearcher.getIndexReader().close();
    toSearcher.getIndexReader().close();

    return context;
  }
Example #13
 @Override
 int compareTerm(Terms.Bucket other) {
   return BytesRef.getUTF8SortedAsUnicodeComparator()
       .compare(termBytes, ((Bucket) other).termBytes);
 }
Example #14
      // TODO: we may want an alternate mode here which is
      // "if you are about to return NOT_FOUND I won't use
      // the terms data from that"; eg FuzzyTermsEnum will
      // (usually) just immediately call seek again if we
      // return NOT_FOUND so it's a waste for us to fill in
      // the term that was actually NOT_FOUND
      @Override
      public SeekStatus seekCeil(final BytesRef target) throws IOException {

        if (indexEnum == null) {
          throw new IllegalStateException("terms index was not loaded");
        }

        // System.out.println("BTR.seek seg=" + segment + " target=" + fieldInfo.name + ":" +
        // target.utf8ToString() + " " + target + " current=" + term().utf8ToString() + " " + term()
        // + " indexIsCurrent=" + indexIsCurrent + " didIndexNext=" + didIndexNext + " seekPending="
        // + seekPending + " divisor=" + indexReader.getDivisor() + " this="  + this);
        if (didIndexNext) {
          if (nextIndexTerm == null) {
            // System.out.println("  nextIndexTerm=null");
          } else {
            // System.out.println("  nextIndexTerm=" + nextIndexTerm.utf8ToString());
          }
        }

        boolean doSeek = true;

        // See if we can avoid seeking, because target term
        // is after current term but before next index term:
        if (indexIsCurrent) {

          final int cmp = BytesRef.getUTF8SortedAsUnicodeComparator().compare(term.get(), target);

          if (cmp == 0) {
            // Already at the requested term
            return SeekStatus.FOUND;
          } else if (cmp < 0) {

            // Target term is after current term
            if (!didIndexNext) {
              if (indexEnum.next() == -1) {
                nextIndexTerm = null;
              } else {
                nextIndexTerm = indexEnum.term();
              }
              // System.out.println("  now do index next() nextIndexTerm=" + (nextIndexTerm == null
              // ? "null" : nextIndexTerm.utf8ToString()));
              didIndexNext = true;
            }

            if (nextIndexTerm == null
                || BytesRef.getUTF8SortedAsUnicodeComparator().compare(target, nextIndexTerm) < 0) {
              // Optimization: requested term is within the
              // same term block we are now in; skip seeking
              // (but do scanning):
              doSeek = false;
              // System.out.println("  skip seek: nextIndexTerm=" + (nextIndexTerm == null ? "null"
              // : nextIndexTerm.utf8ToString()));
            }
          }
        }

        if (doSeek) {
          // System.out.println("  seek");

          // Ask terms index to find biggest indexed term (=
          // first term in a block) that's <= our text:
          in.seek(indexEnum.seek(target));
          boolean result = nextBlock();

          // Block must exist since, at least, the indexed term
          // is in the block:
          assert result;

          indexIsCurrent = true;
          didIndexNext = false;

          if (doOrd) {
            state.ord = indexEnum.ord() - 1;
          }

          term.copyBytes(indexEnum.term());
          // System.out.println("  seek: term=" + term.utf8ToString());
        } else {
          // System.out.println("  skip seek");
          if (state.termBlockOrd == blockTermCount && !nextBlock()) {
            indexIsCurrent = false;
            return SeekStatus.END;
          }
        }

        seekPending = false;

        int common = 0;

        // Scan within block.  We could do this by calling
        // _next() and testing the resulting term, but this
        // is wasteful.  Instead, we first confirm the
        // target matches the common prefix of this block,
        // and then we scan the term bytes directly from the
        // termSuffixesReader's byte[], saving a copy into
        // the BytesRef term per term.  Only when we return
        // do we then copy the bytes into the term.

        while (true) {

          // First, see if target term matches common prefix
          // in this block:
          if (common < termBlockPrefix) {
            final int cmp =
                (term.byteAt(common) & 0xFF) - (target.bytes[target.offset + common] & 0xFF);
            if (cmp < 0) {

              // TODO: maybe we should store common prefix
              // in block header?  (instead of relying on
              // last term of previous block)

              // Target's prefix is after the common block
              // prefix, so term cannot be in this block
              // but it could be in next block.  We
              // must scan to end-of-block to set common
              // prefix for next block:
              if (state.termBlockOrd < blockTermCount) {
                while (state.termBlockOrd < blockTermCount - 1) {
                  state.termBlockOrd++;
                  state.ord++;
                  termSuffixesReader.skipBytes(termSuffixesReader.readVInt());
                }
                final int suffix = termSuffixesReader.readVInt();
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              }
              state.ord++;

              if (!nextBlock()) {
                indexIsCurrent = false;
                return SeekStatus.END;
              }
              common = 0;

            } else if (cmp > 0) {
              // Target's prefix is before the common prefix
              // of this block, so we position to start of
              // block and return NOT_FOUND:
              assert state.termBlockOrd == 0;

              final int suffix = termSuffixesReader.readVInt();
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              return SeekStatus.NOT_FOUND;
            } else {
              common++;
            }

            continue;
          }

          // Test every term in this block
          while (true) {
            state.termBlockOrd++;
            state.ord++;

            final int suffix = termSuffixesReader.readVInt();

            // We know the prefix matches, so just compare the new suffix:
            final int termLen = termBlockPrefix + suffix;
            int bytePos = termSuffixesReader.getPosition();

            boolean next = false;
            final int limit = target.offset + (termLen < target.length ? termLen : target.length);
            int targetPos = target.offset + termBlockPrefix;
            while (targetPos < limit) {
              final int cmp = (termSuffixes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF);
              if (cmp < 0) {
                // Current term is still before the target;
                // keep scanning
                next = true;
                break;
              } else if (cmp > 0) {
                // Done!  Current term is after target. Stop
                // here, fill in real term, return NOT_FOUND.
                term.setLength(termBlockPrefix + suffix);
                term.grow(term.length());
                termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (!next && target.length <= termLen) {
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);

              if (target.length == termLen) {
                // Done!  Exact match.  Stop here, fill in
                // real term, return FOUND.
                // System.out.println("  FOUND");
                return SeekStatus.FOUND;
              } else {
                // System.out.println("  NOT_FOUND");
                return SeekStatus.NOT_FOUND;
              }
            }

            if (state.termBlockOrd == blockTermCount) {
              // Must pre-fill term for next block's common prefix
              term.setLength(termBlockPrefix + suffix);
              term.grow(term.length());
              termSuffixesReader.readBytes(term.bytes(), termBlockPrefix, suffix);
              break;
            } else {
              termSuffixesReader.skipBytes(suffix);
            }
          }

          // The purpose of the terms dict index is to seek
          // the enum to the closest index term before the
          // term we are looking for.  So, we should never
          // cross another index term (besides the first
          // one) while we are scanning:

          assert indexIsCurrent;

          if (!nextBlock()) {
            // System.out.println("  END");
            indexIsCurrent = false;
            return SeekStatus.END;
          }
          common = 0;
        }
      }
  /**
   * Called once per field per document if term vectors are enabled, to write the vectors to
   * RAMOutputStream, which is then quickly flushed to the real term vectors files in the Directory.
   */
  @Override
  void finish() throws IOException {

    assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");

    final int numPostings = termsHashPerField.bytesHash.size();

    final BytesRef flushTerm = perThread.flushTerm;

    assert numPostings >= 0;

    if (!doVectors || numPostings == 0) return;

    if (numPostings > maxNumPostings) maxNumPostings = numPostings;

    final IndexOutput tvf = perThread.doc.perDocTvf;

    // This is called once, after inverting all occurrences
    // of a given field in the doc.  At this point we flush
    // our hash into the DocWriter.

    assert fieldInfo.storeTermVector;
    assert perThread.vectorFieldsInOrder(fieldInfo);

    perThread.doc.addField(termsHashPerField.fieldInfo.number);
    TermVectorsPostingsArray postings = (TermVectorsPostingsArray) termsHashPerField.postingsArray;

    // TODO: we may want to make this sort in same order
    // as Codec's terms dict?
    final int[] termIDs =
        termsHashPerField.sortPostings(BytesRef.getUTF8SortedAsUnicodeComparator());

    tvf.writeVInt(numPostings);
    byte bits = 0x0;
    if (doVectorPositions) bits |= TermVectorsReader.STORE_POSITIONS_WITH_TERMVECTOR;
    if (doVectorOffsets) bits |= TermVectorsReader.STORE_OFFSET_WITH_TERMVECTOR;
    tvf.writeByte(bits);

    int lastLen = 0;
    byte[] lastBytes = null;
    int lastStart = 0;

    final ByteSliceReader reader = perThread.vectorSliceReader;
    final ByteBlockPool termBytePool = perThread.termsHashPerThread.termBytePool;

    for (int j = 0; j < numPostings; j++) {
      final int termID = termIDs[j];
      final int freq = postings.freqs[termID];

      // Get BytesRef
      termBytePool.setBytesRef(flushTerm, postings.textStarts[termID]);

      // Compute common byte prefix between last term and
      // this term
      int prefix = 0;
      if (j > 0) {
        while (prefix < lastLen && prefix < flushTerm.length) {
          if (lastBytes[lastStart + prefix] != flushTerm.bytes[flushTerm.offset + prefix]) {
            break;
          }
          prefix++;
        }
      }

      lastLen = flushTerm.length;
      lastBytes = flushTerm.bytes;
      lastStart = flushTerm.offset;

      final int suffix = flushTerm.length - prefix;
      tvf.writeVInt(prefix);
      tvf.writeVInt(suffix);
      tvf.writeBytes(flushTerm.bytes, lastStart + prefix, suffix);
      tvf.writeVInt(freq);

      if (doVectorPositions) {
        termsHashPerField.initReader(reader, termID, 0);
        reader.writeTo(tvf);
      }

      if (doVectorOffsets) {
        termsHashPerField.initReader(reader, termID, 1);
        reader.writeTo(tvf);
      }
    }

    termsHashPerField.reset();

    // NOTE: we clear, per-field, at the thread level,
    // because term vectors fully write themselves on each
    // field; this saves RAM (eg if large doc has two large
    // fields w/ term vectors on) because we recycle/reuse
    // all RAM after each field:
    perThread.termsHashPerThread.reset(false);
  }
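
The write loop above prefix-codes the sorted terms: each term is stored as the length of the byte prefix it shares with the previous term plus the remaining suffix bytes, and sorting with the unicode comparator first is what makes adjacent terms share long prefixes. A standalone sketch of just the encoding arithmetic (plain arrays, no Lucene I/O):

import java.nio.charset.StandardCharsets;

public class PrefixCodeSketch {
  public static void main(String[] args) {
    String[] sortedTerms = {"apple", "applet", "banana"};
    byte[] lastBytes = new byte[0];

    for (String term : sortedTerms) {
      byte[] bytes = term.getBytes(StandardCharsets.UTF_8);
      // Length of the byte prefix shared with the previous term:
      int prefix = 0;
      int max = Math.min(lastBytes.length, bytes.length);
      while (prefix < max && lastBytes[prefix] == bytes[prefix]) {
        prefix++;
      }
      int suffix = bytes.length - prefix;
      // On disk this becomes writeVInt(prefix), writeVInt(suffix), then the suffix bytes:
      System.out.println(term + " -> prefix=" + prefix + ", suffixBytes=" + suffix);
      lastBytes = bytes;
    }
    // apple  -> prefix=0, suffixBytes=5
    // applet -> prefix=5, suffixBytes=1
    // banana -> prefix=0, suffixBytes=6
  }
}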
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {

    final String seedFileName =
        IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, SEED_EXT);
    final IndexInput in = state.dir.openInput(seedFileName, state.context);
    final long seed = in.readLong();
    if (LuceneTestCase.VERBOSE) {
      System.out.println(
          "MockRandomCodec: reading from seg="
              + state.segmentInfo.name
              + " formatID="
              + state.segmentSuffix
              + " seed="
              + seed);
    }
    in.close();

    final Random random = new Random(seed);

    int readBufferSize = _TestUtil.nextInt(random, 1, 4096);
    if (LuceneTestCase.VERBOSE) {
      System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize);
    }

    PostingsReaderBase postingsReader;

    if (random.nextBoolean()) {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: reading Sep postings");
      }
      postingsReader =
          new SepPostingsReader(
              state.dir,
              state.segmentInfo,
              state.context,
              new MockIntStreamFactory(random),
              state.segmentSuffix);
    } else {
      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: reading Standard postings");
      }
      postingsReader =
          new Lucene40PostingsReader(
              state.dir, state.segmentInfo, state.context, state.segmentSuffix);
    }

    if (random.nextBoolean()) {
      final int totTFCutoff = _TestUtil.nextInt(random, 1, 20);
      if (LuceneTestCase.VERBOSE) {
        System.out.println(
            "MockRandomCodec: reading pulsing postings with totTFCutoff=" + totTFCutoff);
      }
      postingsReader = new PulsingPostingsReader(postingsReader);
    }

    final FieldsProducer fields;

    if (random.nextBoolean()) {
      // Use BlockTree terms dict
      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: reading BlockTree terms dict");
      }

      boolean success = false;
      try {
        fields =
            new BlockTreeTermsReader(
                state.dir,
                state.fieldInfos,
                state.segmentInfo.name,
                postingsReader,
                state.context,
                state.segmentSuffix,
                state.termsIndexDivisor);
        success = true;
      } finally {
        if (!success) {
          postingsReader.close();
        }
      }
    } else {

      if (LuceneTestCase.VERBOSE) {
        System.out.println("MockRandomCodec: reading Block terms dict");
      }
      final TermsIndexReaderBase indexReader;
      boolean success = false;
      try {
        final boolean doFixedGap = random.nextBoolean();

        // randomness diverges from writer, here:
        // if termsIndexDivisor is set to -1, we should not touch it. It means a
        // test explicitly instructed not to load the terms index.
        if (state.termsIndexDivisor != -1) {
          state.termsIndexDivisor = _TestUtil.nextInt(random, 1, 10);
        }

        if (doFixedGap) {
          if (LuceneTestCase.VERBOSE) {
            System.out.println(
                "MockRandomCodec: fixed-gap terms index (divisor=" + state.termsIndexDivisor + ")");
          }
          indexReader =
              new FixedGapTermsIndexReader(
                  state.dir,
                  state.fieldInfos,
                  state.segmentInfo.name,
                  state.termsIndexDivisor,
                  BytesRef.getUTF8SortedAsUnicodeComparator(),
                  state.segmentSuffix,
                  state.context);
        } else {
          final int n2 = random.nextInt(3);
          if (n2 == 1) {
            random.nextInt();
          } else if (n2 == 2) {
            random.nextLong();
          }
          if (LuceneTestCase.VERBOSE) {
            System.out.println(
                "MockRandomCodec: variable-gap terms index (divisor="
                    + state.termsIndexDivisor
                    + ")");
          }
          indexReader =
              new VariableGapTermsIndexReader(
                  state.dir,
                  state.fieldInfos,
                  state.segmentInfo.name,
                  state.termsIndexDivisor,
                  state.segmentSuffix,
                  state.context);
        }

        success = true;
      } finally {
        if (!success) {
          postingsReader.close();
        }
      }

      final int termsCacheSize = _TestUtil.nextInt(random, 1, 1024);

      success = false;
      try {
        fields =
            new BlockTermsReader(
                indexReader,
                state.dir,
                state.fieldInfos,
                state.segmentInfo.name,
                postingsReader,
                state.context,
                termsCacheSize,
                state.segmentSuffix);
        success = true;
      } finally {
        if (!success) {
          try {
            postingsReader.close();
          } finally {
            indexReader.close();
          }
        }
      }
    }

    return fields;
  }
Example #17
 /**
  * Creates a new sorted wrapper, using {@link BytesRef#getUTF8SortedAsUnicodeComparator} for
  * sorting.
  */
 public SortedInputIterator(InputIterator source) throws IOException {
   this(source, BytesRef.getUTF8SortedAsUnicodeComparator());
 }
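
A hedged in-memory analogue of the contract the sorted wrapper provides: drain the source, sort once with the unicode comparator, replay in order. (The real SortedInputIterator sorts externally rather than buffering everything on the heap; this sketch only illustrates the contract.)

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import org.apache.lucene.util.BytesRef;

// In-memory stand-in for the sorted-wrapper idea, for illustration only.
final class InMemorySortedWrapper {
  private final Iterator<BytesRef> sorted;

  InMemorySortedWrapper(Iterator<BytesRef> source) {
    List<BytesRef> buffer = new ArrayList<BytesRef>();
    while (source.hasNext()) {
      buffer.add(BytesRef.deepCopyOf(source.next())); // sources may reuse their BytesRef
    }
    Collections.sort(buffer, BytesRef.getUTF8SortedAsUnicodeComparator());
    sorted = buffer.iterator();
  }

  BytesRef next() {
    return sorted.hasNext() ? sorted.next() : null; // null signals exhaustion, as in BytesRefIterator
  }
}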
 @Override
 public PerDocProducer docsProducer(SegmentReadState state) throws IOException {
   return new SimpleTextPerDocProducer(
       state, BytesRef.getUTF8SortedAsUnicodeComparator(), DOC_VALUES_SEG_SUFFIX);
 }