Example #1
  public void testDocsWithField() throws Exception {
    Directory dir = newDirectory();
    IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random()));
    IndexWriter writer = new IndexWriter(dir, conf);
    Document doc = new Document();
    doc.add(new NumericDocValuesField("dv", 0L));
    writer.addDocument(doc);

    doc = new Document();
    doc.add(new TextField("dv", "some text", Field.Store.NO));
    doc.add(new NumericDocValuesField("dv", 0L));
    writer.addDocument(doc);

    DirectoryReader r = writer.getReader();
    writer.close();

    AtomicReader subR = r.leaves().get(0).reader();
    assertEquals(2, subR.numDocs());

    Bits bits = FieldCache.DEFAULT.getDocsWithField(subR, "dv");
    assertTrue(bits.get(0));
    assertTrue(bits.get(1));
    r.close();
    dir.close();
  }
Example #2
  private static DocSet createBigSet(
      List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxDoc, int firstReader)
      throws IOException {
    long[] bits = new long[FixedBitSet.bits2words(maxDoc)];
    int sz = 0;
    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
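        // (1L << globalId) works because Java masks a long shift count to its low 6 bits,
        // so this sets bit (globalId & 63) within word (globalId >> 6):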
        bits[globalId >> 6] |= (1L << globalId);
        sz++;
      }
    }

    BitDocSet docSet = new BitDocSet(new FixedBitSet(bits, maxDoc), sz);

    int smallSetSize = smallSetSize(maxDoc);
    if (sz < smallSetSize) {
      // make this optional?
      DocSet smallSet = toSmallSet(docSet);
      // assert equals(docSet, smallSet);
      return smallSet;
    }

    return docSet;
  }
    public DocumentFilteredLeafIndexReader(
        LeafReaderContext context, Filter preserveFilter, boolean negateFilter) throws IOException {
      super(context.reader());
      final int maxDoc = in.maxDoc();
      final FixedBitSet bits = new FixedBitSet(maxDoc);
      // ignore livedocs here, as we filter them later:
      final DocIdSet docs = preserveFilter.getDocIdSet(context, null);
      if (docs != null) {
        final DocIdSetIterator it = docs.iterator();
        if (it != null) {
          bits.or(it);
        }
      }
      if (negateFilter) {
        bits.flip(0, maxDoc);
      }

      if (in.hasDeletions()) {
        final Bits oldLiveDocs = in.getLiveDocs();
        assert oldLiveDocs != null;
        final DocIdSetIterator it = new BitSetIterator(bits, 0L); // the cost is not useful here
        for (int i = it.nextDoc(); i < maxDoc; i = it.nextDoc()) {
          if (!oldLiveDocs.get(i)) {
            // we can safely modify the current bit, as the iterator already stepped over it:
            bits.clear(i);
          }
        }
      }

      this.liveDocs = bits;
      this.numDocs = bits.cardinality();
    }
Example #4
 @Override
 public boolean get(int index) {
   for (Bits bit : bits) {
     if (!bit.get(index)) {
       return false;
     }
   }
   return true;
 }
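This get(int) override only makes sense inside a wrapper that holds several Bits instances and reports a document as live only when every wrapped instance does. A minimal sketch of such a wrapper (the class name, field, and constructor are assumptions, not part of the snippet above) could look like this:

import org.apache.lucene.util.Bits;

/** ANDs together several Bits instances; get(i) is true only if all of them have bit i set. */
final class IntersectionBits implements Bits {
  private final Bits[] bits;

  IntersectionBits(Bits... bits) {
    this.bits = bits;
  }

  @Override
  public boolean get(int index) {
    for (Bits bit : bits) {
      if (!bit.get(index)) {
        return false;
      }
    }
    return true;
  }

  @Override
  public int length() {
    // all wrapped instances are expected to cover the same doc id range
    return bits.length == 0 ? 0 : bits[0].length();
  }
}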
Example #5
 @Override
 public int nextDoc() throws IOException {
   if (docID == NO_MORE_DOCS) {
     return docID;
   }
   boolean first = true;
   int termFreq = 0;
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);
     if (StringHelper.startsWith(scratch, DOC)) {
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       termFreq = 0;
       first = false;
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       termFreq = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
     } else if (StringHelper.startsWith(scratch, POS)) {
       // skip termFreq++;
     } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
       // skip
     } else {
       assert StringHelper.startsWith(scratch, TERM)
               || StringHelper.startsWith(scratch, FIELD)
               || StringHelper.startsWith(scratch, END)
           : "scratch=" + scratch.utf8ToString();
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         in.seek(lineStart);
         if (!omitTF) {
           tf = termFreq;
         }
         return docID;
       }
       return docID = NO_MORE_DOCS;
     }
   }
 }
  /**
   * @param reader the index reader to scan
   * @param lireFeature the features for which distances are computed
   * @return the maximum distance found for normalizing.
   * @throws java.io.IOException if reading the index fails
   */
  @SuppressWarnings("unchecked")
  private float[] findSimilar(IndexReader reader, LireFeature[] lireFeature) throws IOException {
    float[] maxDistance = new float[lireFeature.length];
    float[] overallMaxDistance = new float[lireFeature.length];

    for (int i = 0; i < overallMaxDistance.length; i++) {
      overallMaxDistance[i] = -1f;
      maxDistance[i] = -1f;
    }

    parDocs = new TreeSet[lireFeature.length];
    for (int i = 0; i < parDocs.length; i++) {
      parDocs[i] = new TreeSet<SimpleResult>();
    }

    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    // clear result set ...

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document d = reader.document(i);
      float[] distance = getDistance(d, lireFeature);
      // calculate the overall max distance to normalize score afterwards
      for (int j = 0; j < distance.length; j++) {
        float f = distance[j];
        if (overallMaxDistance[j] < f) {
          overallMaxDistance[j] = f;
        }
        // if it is the first document:
        if (maxDistance[j] < 0) {
          maxDistance[j] = f;
        }
        // if the array is not full yet:
        if (this.parDocs[j].size() < maxHits) {
          this.parDocs[j].add(new SimpleResult(f, d));
          if (f > maxDistance[j]) {
            maxDistance[j] = f;
          }
        } else if (f < maxDistance[j]) {
          // if it is nearer to the sample than at least one of the current set:
          // remove the last one ...
          this.parDocs[j].remove(this.parDocs[j].last());
          // add the new one ...
          this.parDocs[j].add(new SimpleResult(f, d));
          // and set our new distance border ...
          maxDistance[j] = this.parDocs[j].last().getDistance();
        }
      }
    }
    return maxDistance;
  }
    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {

      Explanation expl = subQueryWeight.explain(context, doc);
      if (!expl.isMatch()) {
        return expl;
      }
      // First: Gather explanations for all filters
      List<Explanation> filterExplanations = new ArrayList<>();
      for (int i = 0; i < filterFunctions.length; ++i) {
        Bits docSet =
            Lucene.asSequentialAccessBits(
                context.reader().maxDoc(), filterWeights[i].scorer(context));
        if (docSet.get(doc)) {
          FilterFunction filterFunction = filterFunctions[i];
          Explanation functionExplanation =
              filterFunction.function.getLeafScoreFunction(context).explainScore(doc, expl);
          double factor = functionExplanation.getValue();
          float sc = CombineFunction.toFloat(factor);
          Explanation filterExplanation =
              Explanation.match(
                  sc,
                  "function score, product of:",
                  Explanation.match(1.0f, "match filter: " + filterFunction.filter.toString()),
                  functionExplanation);
          filterExplanations.add(filterExplanation);
        }
      }
      if (filterExplanations.size() > 0) {
        FiltersFunctionFactorScorer scorer = functionScorer(context);
        int actualDoc = scorer.iterator().advance(doc);
        assert (actualDoc == doc);
        double score = scorer.computeScore(doc, expl.getValue());
        Explanation factorExplanation =
            Explanation.match(
                CombineFunction.toFloat(score),
                "function score, score mode ["
                    + scoreMode.toString().toLowerCase(Locale.ROOT)
                    + "]",
                filterExplanations);
        expl = combineFunction.explain(expl, factorExplanation, maxBoost);
      }
      if (minScore != null && minScore > expl.getValue()) {
        expl =
            Explanation.noMatch(
                "Score value is too low, expected at least "
                    + minScore
                    + " but got "
                    + expl.getValue(),
                expl);
      }
      return expl;
    }
Example #8
 @Override
 public Bits getLiveDocs() {
   Bits liveDocs = super.getLiveDocs();
   if (liveDocs != null) {
     assert maxDoc() == liveDocs.length();
     liveDocs = new AssertingBits(liveDocs);
   } else {
     assert maxDoc() == numDocs();
     assert !hasDeletions();
   }
   return liveDocs;
 }
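The AssertingBits wrapper used above is not part of the snippet; its role is to validate every access to the live-docs bitset. A plausible minimal version, assuming it only bounds-checks the index before delegating (the real class may do more), might look like this:

import org.apache.lucene.util.Bits;

/** Delegating Bits that asserts each lookup stays within [0, length()). */
final class AssertingBits implements Bits {
  private final Bits in;

  AssertingBits(Bits in) {
    assert in != null;
    this.in = in;
  }

  @Override
  public boolean get(int index) {
    assert index >= 0 && index < length() : "index=" + index + " vs length=" + length();
    return in.get(index);
  }

  @Override
  public int length() {
    return in.length();
  }
}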
  /**
   * We assume that the initial indexing has been done and a set of reference objects has been found
   * and indexed in the separate directory. However, further documents have since been added, and they
   * now need a ranked list of reference objects. So we (i) fetch all new documents missing the field
   * "ro-order" and (ii) add this field.
   *
   * @param indexPath the index to update
   * @throws IOException if reading or writing the index fails
   */
  public void updateIndex(String indexPath) throws IOException {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
    int numDocs = reader.numDocs();
    boolean hasDeletions = reader.hasDeletions();
    int countUpdated = 0;

    IndexReader readerRo = DirectoryReader.open(FSDirectory.open(new File(indexPath + "-ro")));
    ImageSearcher searcher =
        new GenericImageSearcher(numReferenceObjectsUsed, featureClass, featureFieldName);
    Map<String, Analyzer> perField = new HashMap<String, Analyzer>(1);
    perField.put("ro-order", new WhitespaceAnalyzer(LuceneUtils.LUCENE_VERSION));
    PerFieldAnalyzerWrapper aWrapper =
        new PerFieldAnalyzerWrapper(new SimpleAnalyzer(LuceneUtils.LUCENE_VERSION), perField);

    IndexWriter iw =
        new IndexWriter(
            FSDirectory.open(new File(indexPath)),
            new IndexWriterConfig(LuceneUtils.LUCENE_VERSION, aWrapper)
                .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
    StringBuilder sb = new StringBuilder(256);
    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    for (int i = 0; i < numDocs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.
      Document document = reader.document(i);
      if (document.getField("ro-order") == null) { // if the field is not here we create it.
        ImageSearchHits hits = searcher.search(document, readerRo);
        sb.delete(0, sb.length());
        for (int j = 0; j < numReferenceObjectsUsed; j++) {
          sb.append(hits.doc(j).getValues("ro-id")[0]);
          sb.append(' ');
        }
        // System.out.println(sb.toString());
        document.add(new TextField("ro-order", sb.toString(), Field.Store.YES));
        iw.updateDocument(
            new Term(
                DocumentBuilder.FIELD_NAME_IDENTIFIER,
                document.getValues(DocumentBuilder.FIELD_NAME_IDENTIFIER)[0]),
            document);
        countUpdated++;
      }

      // progress report
      progress.setNumDocsProcessed(progress.getNumDocsProcessed() + 1);

      // debug:
      System.out.println("countUpdated = " + countUpdated);
    }
    iw.commit();
    iw.close();
  }
Example #10
 @Override
 public int nextDoc() throws IOException {
   boolean first = true;
   in.seek(nextDocStart);
   long posStart = 0;
   while (true) {
     final long lineStart = in.getFilePointer();
     SimpleTextUtil.readLine(in, scratch);
     // System.out.println("NEXT DOC: " + scratch.utf8ToString());
     if (StringHelper.startsWith(scratch, DOC)) {
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         nextDocStart = lineStart;
         in.seek(posStart);
         return docID;
       }
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + DOC.length,
           scratch.length - DOC.length,
           scratchUTF16);
       docID = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       tf = 0;
       first = false;
     } else if (StringHelper.startsWith(scratch, FREQ)) {
       UnicodeUtil.UTF8toUTF16(
           scratch.bytes,
           scratch.offset + FREQ.length,
           scratch.length - FREQ.length,
           scratchUTF16);
       tf = ArrayUtil.parseInt(scratchUTF16.chars, 0, scratchUTF16.length);
       posStart = in.getFilePointer();
     } else if (StringHelper.startsWith(scratch, POS)) {
       // skip
     } else if (StringHelper.startsWith(scratch, START_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, END_OFFSET)) {
       // skip
     } else if (StringHelper.startsWith(scratch, PAYLOAD)) {
       // skip
     } else {
       assert StringHelper.startsWith(scratch, TERM)
           || StringHelper.startsWith(scratch, FIELD)
           || StringHelper.startsWith(scratch, END);
       if (!first && (liveDocs == null || liveDocs.get(docID))) {
         nextDocStart = lineStart;
         in.seek(posStart);
         return docID;
       }
       return docID = NO_MORE_DOCS;
     }
   }
 }
Example #11
  public void tttestGetDistribution() throws IOException {
    BufferedWriter bw = new BufferedWriter(new FileWriter("data.csv"));
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    // get the first document:
    //        if (!IndexReader.indexExists(reader.directory()))
    //            throw new FileNotFoundException("No index found at this specific location.");

    CEDD cedd1 = new CEDD();
    FCTH fcth1 = new FCTH();

    CEDD cedd2 = new CEDD();
    FCTH fcth2 = new FCTH();

    JCD jcd1 = new JCD();
    JCD jcd2 = new JCD();
    String[] cls;

    // Needed to check whether the document is deleted.
    Bits liveDocs = MultiFields.getLiveDocs(reader);

    int docs = reader.numDocs();
    for (int i = 0; i < docs; i++) {
      if (reader.hasDeletions() && !liveDocs.get(i)) continue; // if it is deleted, just ignore it.

      Document doc = reader.document(i);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_CEDD);
      if (cls != null && cls.length > 0) cedd1.setStringRepresentation(cls[0]);
      cls = doc.getValues(DocumentBuilder.FIELD_NAME_FCTH);
      if (cls != null && cls.length > 0) fcth1.setStringRepresentation(cls[0]);

      for (int j = i + 1; j < docs; j++) {
        if (reader.hasDeletions() && !liveDocs.get(j))
          continue; // if it is deleted, just ignore it.
        Document doc2 = reader.document(j);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_CEDD);
        if (cls != null && cls.length > 0) cedd2.setStringRepresentation(cls[0]);
        cls = doc2.getValues(DocumentBuilder.FIELD_NAME_FCTH);
        if (cls != null && cls.length > 0) fcth2.setStringRepresentation(cls[0]);
        jcd1.init(cedd1, fcth1);
        jcd2.init(cedd2, fcth2);
        bw.write(
            cedd1.getDistance(cedd2)
                + ";"
                + fcth1.getDistance(fcth2)
                + ";"
                + jcd1.getDistance(jcd2)
                + "\n");
      }
      if (i % 100 == 0) System.out.println(i + " entries processed ... ");
    }
    bw.close();
  }
Example #12
  public void writeResponse() throws IOException {
    Boolean omitHeader = req.getParams().getBool(CommonParams.OMIT_HEADER);
    if (omitHeader != null && omitHeader) rsp.getValues().remove("responseHeader");

    SolrIndexSearcher searcher = req.getSearcher();

    if (liveDocs == null) {
      liveDocs = searcher.getLeafReader().getLiveDocs();
    }

    int maxDoc = searcher.maxDoc();

    try {

      // responseWriter.write(sw,req,rsp);

      ReturnFields fields = rsp.getReturnFields(); // return everything
      Set<String> fnames = fields.getLuceneFieldNames();
      int docCounter = 0;
      for (int i = 0; i < maxDoc; i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
          continue;
        }
        Document doc = searcher.doc(i);
        SolrDocument sdoc = toSolrDocument(doc, schema);
        writeSolrDocument(null, sdoc, fields, docCounter++);
        getWriter().write("\n");
      }
    } finally {
      close();
      writer.write('\n'); // ending with a newline looks much better from the command line
      writer.close();
    }
  }
Example #13
 @Override
 public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) throws IOException {
   FixedBitSet bits = new FixedBitSet(context.reader().maxDoc());
   bits.set(doc);
   if (acceptDocs != null && !acceptDocs.get(doc)) bits.clear(doc);
   return new BitDocIdSet(bits);
 }
Example #14
  /**
   * Used when base query is highly constraining vs the drilldowns, or when the docs must be scored
   * at once (i.e., like BooleanScorer2, not BooleanScorer). In this case we just .next() on base
   * and .advance() on the dim filters.
   */
  private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims)
      throws IOException {
    // if (DEBUG) {
    //  System.out.println("  doQueryFirstScoring");
    // }
    int docID = baseScorer.docID();

    nextDoc:
    while (docID != PostingsEnum.NO_MORE_DOCS) {
      if (acceptDocs != null && acceptDocs.get(docID) == false) {
        docID = baseScorer.nextDoc();
        continue;
      }
      LeafCollector failedCollector = null;
      for (DocsAndCost dim : dims) {
        // TODO: should we sort this 2nd dimension of
        // docsEnums from most frequent to least?
        if (dim.approximation.docID() < docID) {
          dim.approximation.advance(docID);
        }

        boolean matches = false;
        if (dim.approximation.docID() == docID) {
          if (dim.twoPhase == null) {
            matches = true;
          } else {
            matches = dim.twoPhase.matches();
          }
        }

        if (matches == false) {
          if (failedCollector != null) {
            // More than one dim fails on this document, so
            // it's neither a hit nor a near-miss; move to
            // next doc:
            docID = baseScorer.nextDoc();
            continue nextDoc;
          } else {
            failedCollector = dim.sidewaysLeafCollector;
          }
        }
      }

      collectDocID = docID;

      // TODO: we could score on demand instead since we are
      // daat here:
      collectScore = baseScorer.score();

      if (failedCollector == null) {
        // Hit passed all filters, so it's "real":
        collectHit(collector, dims);
      } else {
        // Hit missed exactly one filter:
        collectNearMiss(failedCollector);
      }

      docID = baseScorer.nextDoc();
    }
  }
  /**
   * Returns doc IDs by searching the index for documents having the matching spatial hash cell id at
   * the given grid level.
   *
   * @param context the atomic reader context to search
   * @param acceptDocs documents that are allowed to match, or null if all documents are accepted
   */
  @Override
  public DocIdSet getDocIdSet(AtomicReaderContext context, Bits acceptDocs) throws IOException {
    if (spatialHashCellsIds.size() == 0) {
      return null;
    }

    final AtomicReader atomicReader = context.reader();

    OpenBitSet matchedDocumentsIds = new OpenBitSet(atomicReader.maxDoc());
    boolean found = false;
    for (int i = 0; i < spatialHashCellsIds.size(); i++) {
      Term spatialHashCellTerm = new Term(fieldName, spatialHashCellsIds.get(i));
      DocsEnum spatialHashCellsDocs = atomicReader.termDocsEnum(spatialHashCellTerm);
      if (spatialHashCellsDocs != null) {
        while (true) {
          final int docId = spatialHashCellsDocs.nextDoc();
          if (docId == DocIdSetIterator.NO_MORE_DOCS) {
            break;
          } else {
            if (acceptDocs == null || acceptDocs.get(docId)) {
              matchedDocumentsIds.fastSet(docId);
              found = true;
            }
          }
        }
      }
    }

    if (found) {
      return matchedDocumentsIds;
    } else {
      return null;
    }
  }
    @Override
    public int nextDoc() throws IOException {

      while (true) {
        if (count == docFreq) {
          return doc = NO_MORE_DOCS;
        }

        count++;

        // TODO: maybe we should do the 1-bit trick for encoding
        // freq=1 case?

        // Decode next doc
        // System.out.println("  sep d&p read doc");
        accum += docReader.next();

        // System.out.println("  sep d&p read freq");
        freq = freqReader.next();

        pendingPosCount += freq;

        if (liveDocs == null || liveDocs.get(accum)) {
          break;
        }
      }

      position = 0;
      return (doc = accum);
    }
Example #17
    static DocMap build(final int maxDoc, final Bits liveDocs) {
      assert liveDocs != null;
      final MonotonicAppendingLongBuffer docMap = new MonotonicAppendingLongBuffer();
      int del = 0;
      for (int i = 0; i < maxDoc; ++i) {
        docMap.add(i - del);
        if (!liveDocs.get(i)) {
          ++del;
        }
      }
      final int numDeletedDocs = del;
      assert docMap.size() == maxDoc;
      return new DocMap() {

        @Override
        public int get(int docID) {
          if (!liveDocs.get(docID)) {
            return -1;
          }
          return (int) docMap.get(docID);
        }

        @Override
        public int maxDoc() {
          return maxDoc;
        }

        @Override
        public int numDeletedDocs() {
          return numDeletedDocs;
        }
      };
    }
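As a concrete illustration of the mapping built above: with maxDoc = 5 and liveDocs marking docs 1 and 4 as deleted, the buffer holds [0, 1, 1, 2, 3], so get(0) = 0, get(2) = 1 and get(3) = 2, while get(1) and get(4) return -1 and numDeletedDocs() is 2.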
    @Override
    public int nextDoc() throws IOException {

      while (true) {
        if (count == docFreq) {
          return doc = NO_MORE_DOCS;
        }

        count++;

        // Decode next doc
        // System.out.println("decode docDelta:");
        accum += docReader.next();

        if (!omitTF) {
          // System.out.println("decode freq:");
          freq = freqReader.next();
        }

        if (liveDocs == null || liveDocs.get(accum)) {
          break;
        }
      }
      return (doc = accum);
    }
Example #19
 private static void printDocs(DirectoryReader r) throws Throwable {
   for (AtomicReaderContext ctx : r.leaves()) {
     // TODO: improve this
     AtomicReader sub = ctx.reader();
     Bits liveDocs = sub.getLiveDocs();
     System.out.println("  " + ((SegmentReader) sub).getSegmentInfo());
     for (int docID = 0; docID < sub.maxDoc(); docID++) {
       StoredDocument doc = sub.document(docID);
       if (liveDocs == null || liveDocs.get(docID)) {
         System.out.println("    docID=" + docID + " id:" + doc.get("id"));
       } else {
         System.out.println("    DEL docID=" + docID + " id:" + doc.get("id"));
       }
     }
   }
 }
 private Set<Uid> getShardDocUIDs(final IndexShard shard) throws IOException {
   shard.refresh("get_uids");
   try (Engine.Searcher searcher = shard.acquireSearcher("test")) {
     Set<Uid> ids = new HashSet<>();
     for (LeafReaderContext leafContext : searcher.reader().leaves()) {
       LeafReader reader = leafContext.reader();
       Bits liveDocs = reader.getLiveDocs();
       for (int i = 0; i < reader.maxDoc(); i++) {
         if (liveDocs == null || liveDocs.get(i)) {
           Document uuid = reader.document(i, Collections.singleton(UidFieldMapper.NAME));
           ids.add(Uid.createUid(uuid.get(UidFieldMapper.NAME)));
         }
       }
     }
     return ids;
   }
 }
Example #21
  public void testDocsWithField() throws Exception {
    Directory dir = newDirectory();

    IndexWriterConfig iwc = newIndexWriterConfig(random(), null);
    iwc.setMergePolicy(newLogMergePolicy());
    RandomIndexWriter iw = new RandomIndexWriter(random(), dir, iwc);

    int numDocs = TEST_NIGHTLY ? atLeast(500) : atLeast(50);
    for (int i = 0; i < numDocs; i++) {
      Document doc = new Document();
      if (random().nextInt(4) >= 0) {
        doc.add(new NumericDocValuesField("numbers", random().nextLong()));
      }
      doc.add(new NumericDocValuesField("numbersAlways", random().nextLong()));
      iw.addDocument(doc);
      if (random().nextInt(17) == 0) {
        iw.commit();
      }
    }
    DirectoryReader ir = iw.getReader();
    iw.forceMerge(1);
    DirectoryReader ir2 = iw.getReader();
    LeafReader merged = getOnlyLeafReader(ir2);
    iw.close();

    Bits multi = MultiDocValues.getDocsWithField(ir, "numbers");
    Bits single = merged.getDocsWithField("numbers");
    if (multi == null) {
      assertNull(single);
    } else {
      assertEquals(single.length(), multi.length());
      for (int i = 0; i < numDocs; i++) {
        assertEquals(single.get(i), multi.get(i));
      }
    }

    multi = MultiDocValues.getDocsWithField(ir, "numbersAlways");
    single = merged.getDocsWithField("numbersAlways");
    assertEquals(single.length(), multi.length());
    for (int i = 0; i < numDocs; i++) {
      assertEquals(single.get(i), multi.get(i));
    }
    ir.close();
    ir2.close();
    dir.close();
  }
 @Override
 protected boolean useRandomAccess(Bits bits, long filterCost) {
   int multiplier = threshold;
   if (threshold == -1) {
     // default
     multiplier = 100;
   }
   return filterCost * multiplier > bits.length();
 }
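As a worked example of this heuristic: with the default multiplier of 100 and bits.length() (the segment's maxDoc) equal to 1,000,000, random access is preferred once filterCost exceeds 10,000, i.e. once the filter is expected to match more than roughly 1% of the documents.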
Example #23
 @Override
 public int nextDoc() {
   pos = -1;
   if (hasNext && (liveDocs == null || liveDocs.get(0))) {
     hasNext = false;
     return doc = 0;
   } else {
     return doc = NO_MORE_DOCS;
   }
 }
Example #24
 /**
  * Merges in the term vectors from the readers in <code>mergeState</code>. The default
  * implementation skips over deleted documents, and uses {@link #startDocument(int)}, {@link
  * #startField(FieldInfo, int, boolean, boolean)}, {@link #startTerm(BytesRef, int)}, {@link
  * #addPosition(int, int, int)}, and {@link #finish(FieldInfos, int)}, returning the number of
  * documents that were written. Implementations can override this method for more sophisticated
  * merging (bulk-byte copying, etc).
  */
 public int merge(MergeState mergeState) throws IOException {
   int docCount = 0;
   for (MergeState.IndexReaderAndLiveDocs reader : mergeState.readers) {
     final int maxDoc = reader.reader.maxDoc();
     final Bits liveDocs = reader.liveDocs;
     for (int docID = 0; docID < maxDoc; docID++) {
       if (liveDocs != null && !liveDocs.get(docID)) {
         // skip deleted docs
         continue;
       }
       // NOTE: it's very important to first assign to vectors then pass it to
       // termVectorsWriter.addAllDocVectors; see LUCENE-1282
       Fields vectors = reader.reader.getTermVectors(docID);
       addAllDocVectors(vectors, mergeState.fieldInfos);
       docCount++;
       mergeState.checkAbort.work(300);
     }
   }
   finish(mergeState.fieldInfos, docCount);
   return docCount;
 }
 @Override
 protected void search(List<LeafReaderContext> leaves, Weight weight, Collector collector)
     throws IOException {
   for (LeafReaderContext ctx : leaves) { // search each subreader
     // we force the use of Scorer (not BulkScorer) to make sure
     // that the scorer passed to LeafCollector.setScorer supports
     // Scorer.getChildren
     Scorer scorer = weight.scorer(ctx);
     if (scorer != null) {
       final LeafCollector leafCollector = collector.getLeafCollector(ctx);
       leafCollector.setScorer(scorer);
       final Bits liveDocs = ctx.reader().getLiveDocs();
       for (int doc = scorer.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = scorer.nextDoc()) {
         if (liveDocs == null || liveDocs.get(doc)) {
           leafCollector.collect(doc);
         }
       }
     }
   }
 }
Example #26
  private static DocSet createSmallSet(
      List<LeafReaderContext> leaves, PostingsEnum[] postList, int maxPossible, int firstReader)
      throws IOException {
    int[] docs = new int[maxPossible];
    int sz = 0;
    for (int i = firstReader; i < postList.length; i++) {
      PostingsEnum postings = postList[i];
      if (postings == null) continue;
      LeafReaderContext ctx = leaves.get(i);
      Bits liveDocs = ctx.reader().getLiveDocs();
      int base = ctx.docBase;
      for (; ; ) {
        int subId = postings.nextDoc();
        if (subId == DocIdSetIterator.NO_MORE_DOCS) break;
        if (liveDocs != null && !liveDocs.get(subId)) continue;
        int globalId = subId + base;
        docs[sz++] = globalId;
      }
    }

    return new SortedIntDocSet(docs, sz);
  }
 @Override
 public int nextDoc() throws IOException {
   int d;
   do {
     d = bitSet.nextSetBit(docBase + doc + 1);
     if (d == -1 || d >= maxDoc + docBase) {
       doc = NO_MORE_DOCS;
     } else {
       doc = d - docBase;
     }
   } while (doc != NO_MORE_DOCS
       && d < maxDoc + docBase
       && liveDocs != null
       && !liveDocs.get(doc));
   return doc;
 }
Example #28
  private ConciseDocument readNextDocument() throws Exception {
    if (reader != null) {
      ConciseDocument cd = null;
      while (docID < reader.maxDoc()) {
        // check if document is deleted
        // see LUCENE-2600 at https://lucene.apache.org/core/4_0_0/MIGRATE.html
        if (liveDocs != null && !liveDocs.get(docID)) {
          docID++;
          continue;
        }

        Document doc = reader.document(docID);
        cd = new ConciseDocument(doc);
        cd.docID = docID;
        cd.documentFile = new File(originalFolder, cd.filename);
        docID++;
        return cd;
      }
    }
    return null;
  }
  /**
   * Merges the sortedset docvalues from <code>toMerge</code>.
   *
   * <p>The default implementation calls {@link #addSortedSetField}, passing an Iterable that merges
   * ordinals and values and filters deleted documents.
   */
  public void mergeSortedSetField(
      FieldInfo fieldInfo, final MergeState mergeState, List<SortedSetDocValues> toMerge)
      throws IOException {

    mergeState.checkAbort.work(mergeState.segmentInfo.getDocCount());

    final AtomicReader readers[] = mergeState.readers.toArray(new AtomicReader[toMerge.size()]);
    final SortedSetDocValues dvs[] = toMerge.toArray(new SortedSetDocValues[toMerge.size()]);

    // step 1: iterate thru each sub and mark terms still in use
    TermsEnum liveTerms[] = new TermsEnum[dvs.length];
    long[] weights = new long[liveTerms.length];
    for (int sub = 0; sub < liveTerms.length; sub++) {
      AtomicReader reader = readers[sub];
      SortedSetDocValues dv = dvs[sub];
      Bits liveDocs = reader.getLiveDocs();
      if (liveDocs == null) {
        liveTerms[sub] = dv.termsEnum();
        weights[sub] = dv.getValueCount();
      } else {
        LongBitSet bitset = new LongBitSet(dv.getValueCount());
        for (int i = 0; i < reader.maxDoc(); i++) {
          if (liveDocs.get(i)) {
            dv.setDocument(i);
            long ord;
            while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
              bitset.set(ord);
            }
          }
        }
        liveTerms[sub] = new BitsFilteredTermsEnum(dv.termsEnum(), bitset);
        weights[sub] = bitset.cardinality();
      }
    }

    // step 2: create ordinal map (this conceptually does the "merging")
    final OrdinalMap map = OrdinalMap.build(this, liveTerms, weights, PackedInts.COMPACT);

    // step 3: add field
    addSortedSetField(
        fieldInfo,
        // ord -> value
        new Iterable<BytesRef>() {
          @Override
          public Iterator<BytesRef> iterator() {
            return new Iterator<BytesRef>() {
              long currentOrd;

              @Override
              public boolean hasNext() {
                return currentOrd < map.getValueCount();
              }

              @Override
              public BytesRef next() {
                if (!hasNext()) {
                  throw new NoSuchElementException();
                }
                int segmentNumber = map.getFirstSegmentNumber(currentOrd);
                long segmentOrd = map.getFirstSegmentOrd(currentOrd);
                final BytesRef term = dvs[segmentNumber].lookupOrd(segmentOrd);
                currentOrd++;
                return term;
              }

              @Override
              public void remove() {
                throw new UnsupportedOperationException();
              }
            };
          }
        },
        // doc -> ord count
        new Iterable<Number>() {
          @Override
          public Iterator<Number> iterator() {
            return new Iterator<Number>() {
              int readerUpto = -1;
              int docIDUpto;
              int nextValue;
              AtomicReader currentReader;
              Bits currentLiveDocs;
              boolean nextIsSet;

              @Override
              public boolean hasNext() {
                return nextIsSet || setNext();
              }

              @Override
              public void remove() {
                throw new UnsupportedOperationException();
              }

              @Override
              public Number next() {
                if (!hasNext()) {
                  throw new NoSuchElementException();
                }
                assert nextIsSet;
                nextIsSet = false;
                // TODO make a mutable number
                return nextValue;
              }

              private boolean setNext() {
                while (true) {
                  if (readerUpto == readers.length) {
                    return false;
                  }

                  if (currentReader == null || docIDUpto == currentReader.maxDoc()) {
                    readerUpto++;
                    if (readerUpto < readers.length) {
                      currentReader = readers[readerUpto];
                      currentLiveDocs = currentReader.getLiveDocs();
                    }
                    docIDUpto = 0;
                    continue;
                  }

                  if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                    nextIsSet = true;
                    SortedSetDocValues dv = dvs[readerUpto];
                    dv.setDocument(docIDUpto);
                    nextValue = 0;
                    while (dv.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
                      nextValue++;
                    }
                    docIDUpto++;
                    return true;
                  }

                  docIDUpto++;
                }
              }
            };
          }
        },
        // ords
        new Iterable<Number>() {
          @Override
          public Iterator<Number> iterator() {
            return new Iterator<Number>() {
              int readerUpto = -1;
              int docIDUpto;
              long nextValue;
              AtomicReader currentReader;
              Bits currentLiveDocs;
              LongValues currentMap;
              boolean nextIsSet;
              long ords[] = new long[8];
              int ordUpto;
              int ordLength;

              @Override
              public boolean hasNext() {
                return nextIsSet || setNext();
              }

              @Override
              public void remove() {
                throw new UnsupportedOperationException();
              }

              @Override
              public Number next() {
                if (!hasNext()) {
                  throw new NoSuchElementException();
                }
                assert nextIsSet;
                nextIsSet = false;
                // TODO make a mutable number
                return nextValue;
              }

              private boolean setNext() {
                while (true) {
                  if (readerUpto == readers.length) {
                    return false;
                  }

                  if (ordUpto < ordLength) {
                    nextValue = ords[ordUpto];
                    ordUpto++;
                    nextIsSet = true;
                    return true;
                  }

                  if (currentReader == null || docIDUpto == currentReader.maxDoc()) {
                    readerUpto++;
                    if (readerUpto < readers.length) {
                      currentReader = readers[readerUpto];
                      currentLiveDocs = currentReader.getLiveDocs();
                      currentMap = map.getGlobalOrds(readerUpto);
                    }
                    docIDUpto = 0;
                    continue;
                  }

                  if (currentLiveDocs == null || currentLiveDocs.get(docIDUpto)) {
                    assert docIDUpto < currentReader.maxDoc();
                    SortedSetDocValues dv = dvs[readerUpto];
                    dv.setDocument(docIDUpto);
                    ordUpto = ordLength = 0;
                    long ord;
                    while ((ord = dv.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
                      if (ordLength == ords.length) {
                        ords = ArrayUtil.grow(ords, ordLength + 1);
                      }
                      ords[ordLength] = currentMap.get(ord);
                      ordLength++;
                    }
                    docIDUpto++;
                    continue;
                  }

                  docIDUpto++;
                }
              }
            };
          }
        });
  }
Example #30
 /**
  * Reconstruct document fields.
  *
  * @param docNum document number. If this document is deleted, but the index is not optimized yet,
  *     the reconstruction process may still yield the reconstructed field content even from
  *     deleted documents.
  * @return reconstructed document
   * @throws Exception if the document number is out of range or the document has been deleted
  */
 public Reconstructed reconstruct(int docNum) throws Exception {
    if (docNum < 0 || docNum >= reader.maxDoc()) { // valid doc numbers are 0 .. maxDoc()-1
     throw new Exception("Document number outside of valid range.");
   }
   Reconstructed res = new Reconstructed();
   if (deleted != null && deleted.get(docNum)) {
     throw new Exception("Document is deleted.");
   } else {
     Document doc = reader.document(docNum);
     for (int i = 0; i < fieldNames.length; i++) {
       Field[] fs = doc.getFields(fieldNames[i]);
       if (fs != null && fs.length > 0) {
         res.getStoredFields().put(fieldNames[i], fs);
       }
     }
   }
   // collect values from unstored fields
   HashSet<String> fields = new HashSet<String>(Arrays.asList(fieldNames));
   // try to use term vectors if available
   progress.maxValue = fieldNames.length;
   progress.curValue = 0;
   progress.minValue = 0;
   for (int i = 0; i < fieldNames.length; i++) {
     TermFreqVector tvf = reader.getTermFreqVector(docNum, fieldNames[i]);
     if (tvf != null && tvf.size() > 0 && (tvf instanceof TermPositionVector)) {
       TermPositionVector tpv = (TermPositionVector) tvf;
       progress.message = "Reading term vectors ...";
       progress.curValue = i;
       setChanged();
       notifyObservers(progress);
       BytesRef[] tv = tpv.getTerms();
       for (int k = 0; k < tv.length; k++) {
         // do we have positions?
         int[] posArr = tpv.getTermPositions(k);
         if (posArr == null) {
           // only offsets
           TermVectorOffsetInfo[] offsets = tpv.getOffsets(k);
           if (offsets.length == 0) {
             continue;
           }
           // convert offsets into positions
           posArr = convertOffsets(offsets);
         }
         GrowableStringArray gsa = res.getReconstructedFields().get(fieldNames[i]);
         if (gsa == null) {
           gsa = new GrowableStringArray();
           res.getReconstructedFields().put(fieldNames[i], gsa);
         }
         for (int m = 0; m < posArr.length; m++) {
           gsa.append(posArr[m], "|", tv[k].utf8ToString());
         }
       }
       fields.remove(fieldNames[i]); // got what we wanted
     }
   }
   // this loop collects data only from left-over fields
   // not yet collected through term vectors
   progress.maxValue = fields.size();
   progress.curValue = 0;
   progress.minValue = 0;
   for (String fld : fields) {
     progress.message = "Collecting terms in " + fld + " ...";
     progress.curValue++;
     setChanged();
     notifyObservers(progress);
     Terms terms = MultiFields.getTerms(reader, fld);
     if (terms == null) { // no terms in this field
       continue;
     }
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       DocsAndPositionsEnum dpe = te.docsAndPositions(deleted, null);
       if (dpe == null) { // no position info for this field
         break;
       }
       int num = dpe.advance(docNum);
       if (num != docNum) { // either greater than or NO_MORE_DOCS
         continue; // no data for this term in this doc
       }
       String term = te.term().utf8ToString();
       GrowableStringArray gsa = (GrowableStringArray) res.getReconstructedFields().get(fld);
       if (gsa == null) {
         gsa = new GrowableStringArray();
         res.getReconstructedFields().put(fld, gsa);
       }
       for (int k = 0; k < dpe.freq(); k++) {
         int pos = dpe.nextPosition();
         gsa.append(pos, "|", term);
       }
     }
   }
   progress.message = "Done.";
   progress.curValue = 100;
   setChanged();
   notifyObservers(progress);
   return res;
 }