public void testDocsAndPositionsEnumStart() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    int numIters = atLeast(3);
    MemoryIndex memory = new MemoryIndex(true, false, random().nextInt(50) * 1024 * 1024);
    for (int i = 0; i < numIters; i++) { // check reuse
      memory.addField("foo", "bar", analyzer);
      LeafReader reader = (LeafReader) memory.createSearcher().getIndexReader();
      TestUtil.checkReader(reader);
      assertEquals(1, reader.terms("foo").getSumTotalTermFreq());
      PostingsEnum disi = reader.postings(new Term("foo", "bar"), PostingsEnum.ALL);
      int docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(0, disi.nextPosition());
      assertEquals(0, disi.startOffset());
      assertEquals(3, disi.endOffset());

      // now reuse and check again
      TermsEnum te = reader.terms("foo").iterator();
      assertTrue(te.seekExact(new BytesRef("bar")));
      disi = te.postings(disi);
      docid = disi.docID();
      assertEquals(-1, docid);
      assertTrue(disi.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      reader.close();
      memory.reset();
    }
  }
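The reuse pattern this test exercises, reduced to its minimal form (a hedged sketch; the analyzer, field, and query are illustrative, not taken from the test):

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.TermQuery;

class MemoryIndexReuseSketch {
  static void demo() {
    MemoryIndex memory = new MemoryIndex(true); // store offsets, as the test does
    StandardAnalyzer analyzer = new StandardAnalyzer();
    for (int i = 0; i < 3; i++) {
      memory.addField("foo", "bar", analyzer);
      float score = memory.search(new TermQuery(new Term("foo", "bar")));
      System.out.println("score=" + score); // non-zero when the query matches
      memory.reset(); // clears the in-memory index so the instance can be reused
    }
  }
}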
  public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException {
    ShapeFieldCache<T> idx = sidx.get(reader);
    if (idx != null) {
      return idx;
    }
    long startTime = System.currentTimeMillis();

    log.fine("Building Cache [" + reader.maxDoc() + "]");
    idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize);
    int count = 0;
    DocsEnum docs = null;
    Terms terms = reader.terms(shapeField);
    TermsEnum te = null;
    if (terms != null) {
      te = terms.iterator(te);
      BytesRef term = te.next();
      while (term != null) {
        T shape = readShape(term);
        if (shape != null) {
          docs = te.docs(null, docs, DocsEnum.FLAG_NONE);
          int docid = docs.nextDoc();
          while (docid != DocIdSetIterator.NO_MORE_DOCS) {
            idx.add(docid, shape);
            docid = docs.nextDoc();
            count++;
          }
        }
        term = te.next();
      }
    }
    sidx.put(reader, idx);
    long elapsed = System.currentTimeMillis() - startTime;
    log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);
    return idx;
  }
    @Override
    public void hitExecute(SearchContext context, HitContext hitContext) {
      if (context.getFetchSubPhaseContext(CONTEXT_FACTORY).hitExecutionNeeded() == false) {
        return;
      }
      String field = context.getFetchSubPhaseContext(CONTEXT_FACTORY).getField();

      if (hitContext.hit().fieldsOrNull() == null) {
        hitContext.hit().fields(new HashMap<>());
      }
      SearchHitField hitField = hitContext.hit().fields().get(NAMES[0]);
      if (hitField == null) {
        hitField = new InternalSearchHitField(NAMES[0], new ArrayList<>(1));
        hitContext.hit().fields().put(NAMES[0], hitField);
      }
      TermVectorsResponse termVector =
          TermVectorsService.getTermVectors(
              context.indexShard(),
              new TermVectorsRequest(
                  context.indexShard().shardId().getIndex().getName(),
                  hitContext.hit().type(),
                  hitContext.hit().id()));
      try {
        Map<String, Integer> tv = new HashMap<>();
        TermsEnum termsEnum = termVector.getFields().terms(field).iterator();
        BytesRef term;
        PostingsEnum postings = null;
        while ((term = termsEnum.next()) != null) {
          postings = termsEnum.postings(postings, PostingsEnum.FREQS);
          postings.nextDoc(); // a term vector holds a single (virtual) document
          tv.put(term.utf8ToString(), postings.freq());
        }
        hitField.values().add(tv);
      } catch (IOException e) {
        ESLoggerFactory.getLogger(FetchSubPhasePluginIT.class.getName())
            .info("Swallowed exception", e);
      }
    }
  public void testSeekCeilNotFound() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    // Get empty string in there!
    doc.add(newStringField("field", "", Field.Store.NO));
    w.addDocument(doc);

    for (int i = 0; i < 36; i++) {
      doc = new Document();
      String term = "" + (char) (97 + i);
      String term2 = "a" + (char) (97 + i);
      doc.add(newTextField("field", term + " " + term2, Field.Store.NO));
      w.addDocument(doc);
    }

    w.forceMerge(1);
    IndexReader r = w.getReader();
    TermsEnum te = MultiFields.getTerms(r, "field").iterator(null);
    assertEquals(TermsEnum.SeekStatus.NOT_FOUND, te.seekCeil(new BytesRef(new byte[] {0x22})));
    assertEquals("a", te.term().utf8ToString());
    assertEquals(1L, te.ord());
    r.close();
    w.close();
    dir.close();
  }
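Several of the snippets below rely on the same TermsEnum.seekCeil contract this test asserts. A minimal sketch of that contract, using the no-argument Terms.iterator() available in recent Lucene versions (the helper name is illustrative):

import java.io.IOException;

import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

class SeekCeilSketch {
  /** Returns the first term that sorts at or after {@code target}, or null if there is none. */
  static BytesRef firstTermAtOrAfter(Terms terms, BytesRef target) throws IOException {
    TermsEnum te = terms.iterator();
    if (te.seekCeil(target) == TermsEnum.SeekStatus.END) {
      return null; // every term sorts before target
    }
    // FOUND: positioned exactly on target; NOT_FOUND: positioned on the next greater term.
    return BytesRef.deepCopyOf(te.term());
  }
}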
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   * @param fieldName Optional field name of the terms, used when checking the skip-terms list
   */
  private void addTermFrequencies(
      Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      if (isSkipTerm(fieldName, term)) {
        continue;
      }

      final PostingsEnum docs = termsEnum.postings(null, null);
      int freq = 0;
      while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        freq += docs.freq();
      }

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
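The Int type used by these frequency maps is a small mutable counter from the surrounding MoreLikeThis-style class; a minimal sketch of what it is assumed to look like (hypothetical, the real class may differ):

/** Minimal mutable int holder assumed by the termFreqMap code above (hypothetical sketch). */
class Int {
  int x; // current count; the callers above set or increment it directly
}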
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty (or its size is unavailable).");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
    }
  }
Example #7
  public void testReuseDocsEnumNoReuse() throws IOException {
    Directory dir = newDirectory();
    Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
    int numdocs = atLeast(20);
    createRandomIndex(numdocs, writer, random());
    writer.commit();

    DirectoryReader open = DirectoryReader.open(dir);
    for (LeafReaderContext ctx : open.leaves()) {
      LeafReader indexReader = ctx.reader();
      Terms terms = indexReader.terms("body");
      TermsEnum iterator = terms.iterator();
      IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
      MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
      while ((iterator.next()) != null) {
        PostingsEnum docs =
            iterator.postings(
                random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()),
                null,
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }

      assertEquals(terms.size(), enums.size());
    }
    writer.commit();
    IOUtils.close(writer, open, dir);
  }
  private void getPrefixTerms(
      ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
    // into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf
    // individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
      Terms _terms = leaf.reader().terms(field);
      if (_terms == null) {
        continue;
      }

      TermsEnum termsEnum = _terms.iterator();
      TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
      if (TermsEnum.SeekStatus.END == seekStatus) {
        continue;
      }

      for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
        if (!StringHelper.startsWith(term, prefix.bytes())) {
          break;
        }

        terms.add(new Term(field, BytesRef.deepCopyOf(term)));
        if (terms.size() >= maxExpansions) {
          return;
        }
      }
    }
  }
Example #9
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param field2termFreqMap a Map of terms and their frequencies per field
   * @param vector List of terms and their frequencies for a doc/field
   * @param fieldName the field the terms belong to; used as the key into field2termFreqMap
   */
  private void addTermFrequencies(
      Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
      throws IOException {
    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
    if (termFreqMap == null) {
      termFreqMap = new HashMap<>();
      field2termFreqMap.put(fieldName, termFreqMap);
    }
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
Example #10
 public String[] getTerms() {
   IndexReader reader = null;
   int maxSize = 100;
   Set<String> searchResults = new HashSet<String>();
   try {
     reader = DirectoryReader.open(dir);
     Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("contents");
     TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
     BytesRef byteRef = null;
     while ((byteRef = termsEnum.next()) != null) {
       String term = byteRef.utf8ToString(); // term bytes are UTF-8
       searchResults.add(term);
       if (searchResults.size() >= maxSize) {
         break;
       }
     }
   } catch (IOException e) {
     // TODO Auto-generated catch block
     e.printStackTrace();
   } finally {
     try {
       if (reader != null) {
         reader.close();
       }
     } catch (IOException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
     }
   }
   return searchResults.toArray(new String[searchResults.size()]);
 }
Example #11
 /**
  * Prepare a document reconstructor.
  *
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null or not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
  */
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   }
   this.reader = reader;
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   }
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     while ((fld = fe.next()) != null) {
       TermsEnum te = fe.terms();
       while (te.next() != null) {
         numTerms++;
       }
     }
     this.numTerms = numTerms;
   }
   deleted = MultiFields.getDeletedDocs(reader);
 }
Example #12
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Collect concordance words from the term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += postingsEnum.freq(); // within-document frequency from the term vector
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
    Query createCandidateQuery(IndexReader indexReader) throws IOException {
      List<Term> extractedTerms = new ArrayList<>();
      // include extractionResultField:failed, because docs with this term have no
      // extractedTermsField
      // and otherwise we would fail to return these docs. Docs that failed query term extraction
      // always need to be verified by MemoryIndex:
      extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED));

      LeafReader reader = indexReader.leaves().get(0).reader();
      Fields fields = reader.fields();
      for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
          continue;
        }

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
          BytesRefBuilder builder = new BytesRefBuilder();
          builder.append(fieldBr);
          builder.append(FIELD_VALUE_SEPARATOR);
          builder.append(term);
          extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef()));
        }
      }
      return new TermsQuery(extractedTerms);
    }
Example #14
 /**
  * Find terms in the index based on a prefix. Useful for autocomplete.
  *
  * @param index the index
  * @param fieldName the field
  * @param prefix the prefix we're looking for (null or empty string for all terms)
  * @param sensitive match case-sensitively or not?
  * @param maxResults max. number of results to return (or -1 for all)
  * @return the matching terms
  */
 public static List<String> findTermsByPrefix(
     LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
   boolean allTerms = prefix == null || prefix.length() == 0;
   if (allTerms) {
     prefix = "";
     sensitive = true; // don't do unnecessary work in this case
   }
   try {
      if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase();
      List<String> results = new ArrayList<>();
      org.apache.lucene.index.Terms terms = index.terms(fieldName);
      if (terms == null) return results; // field has no indexed terms
      TermsEnum termsEnum = terms.iterator();
      BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
      if (termsEnum.seekCeil(brPrefix) == TermsEnum.SeekStatus.END) {
        return results; // no term sorts at or after the prefix
      }
      // seekCeil leaves the enum positioned on the first candidate, so read term() before next()
      for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
        if (maxResults >= 0 && results.size() >= maxResults) break;
        String termText = term.utf8ToString();
        String optDesensitized = termText;
        if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
        if (!allTerms && !optDesensitized.startsWith(prefix)) {
          // Doesn't match the prefix; no more matches
          break;
        }
        // Match, add term
        results.add(termText);
      }
      return results;
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
Example #15
  public TermInfo collect(String term) throws IOException {
    TermInfo info = new TermInfo();
    BytesRef luceneTerm = new BytesRef(term); // BytesRef(CharSequence) encodes the term as UTF-8
    // this gives documents in which the term is found, but no offset information can be retrieved
    PostingsEnum postings =
        MultiFields.getTermDocsEnum(indexReader, ngramInfoFieldname, luceneTerm);
    if (postings == null) {
      return info; // the term does not occur in this field
    }
    // now go through each document
    int docId = postings.nextDoc();
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // get the term vector for that document.
      TermsEnum it = indexReader.getTermVector(docId, ngramInfoFieldname).iterator();
      // find the term of interest
      it.seekExact(luceneTerm);
      // get its posting info. this will contain offset info
      PostingsEnum postingsInDoc = it.postings(null, PostingsEnum.OFFSETS);
      postingsInDoc.nextDoc();

      Document doc = indexReader.document(docId);
      String id = doc.get(idFieldname);
      JATEDocument jd = new JATEDocument(id);
      Set<int[]> offsets = new HashSet<>();
      int totalFreq = postingsInDoc.freq();
      for (int i = 0; i < totalFreq; i++) {
        postingsInDoc.nextPosition();
        offsets.add(new int[] {postingsInDoc.startOffset(), postingsInDoc.endOffset()});
      }
      info.getOffsets().put(jd, offsets);

      docId = postings.nextDoc();
    }

    return info;
  }
Example #16
    @Override
    public void seekExact(long targetOrd) throws IOException {
      int delta = (int) (targetOrd - ordBase - ord);
      // System.out.println("  seek(ord) targetOrd=" + targetOrd + " delta=" + delta + " ord=" + ord
      // + " ii=" + indexInterval);
      if (delta < 0 || delta > indexInterval) {
        final int idx = (int) (targetOrd >>> indexIntervalBits);
        final BytesRef base = indexedTermsArray[idx];
        // System.out.println("  do seek term=" + base.utf8ToString());
        ord = idx << indexIntervalBits;
        delta = (int) (targetOrd - ord);
        final TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(base);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
      } else {
        // System.out.println("seek w/in block");
      }

      while (--delta >= 0) {
        BytesRef br = termsEnum.next();
        if (br == null) {
          assert false;
          return;
        }
        ord++;
      }

      setTerm();
      assert term != null;
    }
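For intuition, the block arithmetic above worked through on concrete numbers, assuming an index interval of 128 (so indexIntervalBits = 7); the values are purely illustrative:

public class OrdBlockArithmetic {
  public static void main(String[] args) {
    // With indexIntervalBits = 7, indexedTermsArray holds every 128th term, so index
    // entry idx covers ords [idx << 7, (idx << 7) + 127].
    int indexIntervalBits = 7;
    long targetOrd = 300;
    int idx = (int) (targetOrd >>> indexIntervalBits);  // 300 / 128 = 2
    long blockStart = (long) idx << indexIntervalBits;  // seek to indexedTermsArray[2], ord 256
    long delta = targetOrd - blockStart;                // then 44 calls to next()
    System.out.println(idx + " " + blockStart + " " + delta); // prints: 2 256 44
  }
}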
Example #17
    @Override
    public SeekStatus seekCeil(BytesRef target) throws IOException {

      // already here
      if (term != null && term.equals(target)) {
        return SeekStatus.FOUND;
      }

      int startIdx = Arrays.binarySearch(indexedTermsArray, target);

      if (startIdx >= 0) {
        // we hit the term exactly... lucky us!
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null;
        return SeekStatus.FOUND;
      }

      // we didn't hit the term exactly
      startIdx = -startIdx - 1;

      if (startIdx == 0) {
        // our target occurs *before* the first term
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(target);
        assert seekStatus == TermsEnum.SeekStatus.NOT_FOUND;
        ord = 0;
        setTerm();
        assert term != null;
        return SeekStatus.NOT_FOUND;
      }

      // back up to the start of the block
      startIdx--;

      if ((ord >> indexIntervalBits) == startIdx && term != null && term.compareTo(target) <= 0) {
        // we are already in the right block and the current term is before the term we want,
        // so we don't need to seek.
      } else {
        // seek to the right block
        TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(indexedTermsArray[startIdx]);
        assert seekStatus == TermsEnum.SeekStatus.FOUND;
        ord = startIdx << indexIntervalBits;
        setTerm();
        assert term != null; // should be non-null since it's in the index
      }

      while (term != null && term.compareTo(target) < 0) {
        next();
      }

      if (term == null) {
        return SeekStatus.END;
      } else if (term.compareTo(target) == 0) {
        return SeekStatus.FOUND;
      } else {
        return SeekStatus.NOT_FOUND;
      }
    }
Example #18
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      if (terms != null) {
        iter = terms.iterator(null); // init uid iterator
        while (iter.next() != null) {
          log.fine(Util.uid2url(iter.term().utf8ToString()));
        }
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Example #19
  /**
   * Remove a stale file (uidIter.term().text()) from the index database (and the xref file)
   *
   * @throws java.io.IOException if an error occurs
   */
  private void removeFile() throws IOException {
    String path = Util.uid2url(uidIter.term().utf8ToString());

    for (IndexChangedListener listener : listeners) {
      listener.fileRemove(path);
    }
    writer.deleteDocuments(new Term(QueryBuilder.U, uidIter.term()));
    writer.prepareCommit();
    writer.commit();

    File xrefFile;
    if (RuntimeEnvironment.getInstance().isCompressXref()) {
      xrefFile = new File(xrefDir, path + ".gz");
    } else {
      xrefFile = new File(xrefDir, path);
    }
    File parent = xrefFile.getParentFile();

    if (!xrefFile.delete() && xrefFile.exists()) {
      log.log(Level.INFO, "Failed to remove obsolete xref-file: {0}", xrefFile.getAbsolutePath());
    }

    // Remove the parent directory if it's empty
    if (parent.delete()) {
      log.log(Level.FINE, "Removed empty xref dir:{0}", parent.getAbsolutePath());
    }
    setDirty();
    for (IndexChangedListener listener : listeners) {
      listener.fileRemoved(path);
    }
  }
 @Override
 protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum)
     throws IOException {
   BytesRef spare = new BytesRef();
   PostingsEnum postingsEnum = null;
   for (int i = 0; i < terms.size(); i++) {
     if (termsEnum.seekExact(terms.get(ords[i], spare))) {
       postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
       float score = TermsIncludingScoreQuery.this.scores[ords[i]];
       for (int doc = postingsEnum.nextDoc();
           doc != DocIdSetIterator.NO_MORE_DOCS;
           doc = postingsEnum.nextDoc()) {
         // I prefer this:
         /*if (scores[doc] < score) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }*/
         // But this behaves the same as MVInnerScorer and only then the tests will pass:
         if (!matchingDocs.get(doc)) {
           scores[doc] = score;
           matchingDocs.set(doc);
         }
       }
     }
   }
 }
Example #21
 public static void fillQueue(TermsEnum termsEnum, TermStatsQueue tiq, String field)
     throws Exception {
   BytesRef term;
   while ((term = termsEnum.next()) != null) {
     BytesRef r = new BytesRef();
     r.copyBytes(term);
     tiq.insertWithOverflow(new TermStats(field, r, termsEnum.docFreq()));
   }
 }
Example #22
  /**
   * We implement this according to the Lucene specification. The formula used is: sum over query
   * terms qi of IDF(qi) * (tf(qi,D) * (k1+1)) / (tf(qi,D) + k1 * (1 - b + b * |D| / avgFL)), where
   * tf(qi,D) is the frequency of qi in document D and |D| is the document length (the formula is
   * written out after this method). The IDF and avgFL computations are described above.
   *
   * @param doc the document being scored
   * @param terms the document's (partial) term vector for the scoring field
   * @param context the reranker context holding the query tokens and index searcher
   * @return the BM25 score of the document for the query
   */
  @Override
  public float extract(Document doc, Terms terms, RerankerContext context) {
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

    TermsEnum termsEnum = null;
    try {
      termsEnum = terms.iterator();
    } catch (IOException e) {
      LOG.warn("Error computing BM25, unable to retrieve terms enum");
      return 0.0f;
    }

    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // Compute by iterating
    long docSize = 0L;

    // NOTE df cannot be retrieved just from the term vector,
    // the term vector here is only a partial term vector that treats this as if we only have 1
    // document in the index
    Map<String, Integer> docFreqMap = null;
    try {
      docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
      LOG.warn("Unable to retrieve document frequencies.");
      docFreqMap = new HashMap<>();
    }

    Map<String, Long> termFreqMap = new HashMap<>();
    try {
      while (termsEnum.next() != null) {
        String termString = termsEnum.term().utf8ToString();
        docSize += termsEnum.totalTermFreq();
        if (queryTokens.contains(termString)) {
          termFreqMap.put(termString, termsEnum.totalTermFreq());
        }
      }
    } catch (IOException e) {
      LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }

    float score = 0.0f;
    // Iterate over the query tokens
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    for (String token : queryTokens) {
      long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
      double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
      double numerator = (this.k1 + 1) * termFreq;
      double docLengthFactor = this.b * (docSize / avgFL);
      double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
      score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }

    return score;
  }
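For reference, the formula the javadoc above describes, written out (standard Okapi BM25 over the query terms; a reconstruction using the names from the code: tf(q_i, D) is the in-document term frequency from termFreqMap, |D| is docSize, and avgFL is the average field length):

  \mathrm{score}(D, Q) = \sum_{q_i \in Q} \mathrm{IDF}(q_i) \cdot \frac{tf(q_i, D) \cdot (k_1 + 1)}{tf(q_i, D) + k_1 \left(1 - b + b \cdot \frac{|D|}{\mathrm{avgFL}}\right)}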
Example #23
  public void test10kPulsed() throws Exception {
    // we always run this test with pulsing codec.
    Codec cp = _TestUtil.alwaysPostingsFormat(new Pulsing41PostingsFormat(1));

    File f = _TestUtil.getTempDir("10kpulsed");
    BaseDirectoryWrapper dir = newFSDirectory(f);
    dir.setCheckIndexOnClose(false); // we do this ourselves explicitly
    RandomIndexWriter iw =
        new RandomIndexWriter(
            random(),
            dir,
            newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random())).setCodec(cp));

    Document document = new Document();
    FieldType ft = new FieldType(TextField.TYPE_STORED);

    switch (_TestUtil.nextInt(random(), 0, 2)) {
      case 0:
        ft.setIndexOptions(IndexOptions.DOCS_ONLY);
        break;
      case 1:
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
        break;
      default:
        ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
        break;
    }

    Field field = newField("field", "", ft);
    document.add(field);

    NumberFormat df = new DecimalFormat("00000", new DecimalFormatSymbols(Locale.ROOT));

    for (int i = 0; i < 10050; i++) {
      field.setStringValue(df.format(i));
      iw.addDocument(document);
    }

    IndexReader ir = iw.getReader();
    iw.close();

    TermsEnum te = MultiFields.getTerms(ir, "field").iterator(null);
    DocsEnum de = null;

    for (int i = 0; i < 10050; i++) {
      String expected = df.format(i);
      assertEquals(expected, te.next().utf8ToString());
      de = _TestUtil.docs(random(), te, null, de, 0);
      assertTrue(de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
      assertEquals(DocIdSetIterator.NO_MORE_DOCS, de.nextDoc());
    }
    ir.close();

    _TestUtil.checkIndex(dir);
    dir.close();
  }
Example #24
 /* Copied from lucene 4.2.x core */
 private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
   final Terms terms = MultiFields.getTerms(r, field);
   if (terms != null) {
     final TermsEnum termsEnum = terms.iterator(null);
     if (termsEnum.seekExact(text, true)) {
       return termsEnum.totalTermFreq();
     }
   }
   return 0;
 }
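On recent Lucene versions the same value is available directly from the reader, so a sketch like this can replace the copied helper (hedged; check the exact signature for your Lucene version):

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;

class TotalTermFreqSketch {
  /** Total term frequency across the index, via the built-in IndexReader#totalTermFreq(Term). */
  static long totalTermFreq(IndexReader reader, String field, String text) throws IOException {
    return reader.totalTermFreq(new Term(field, text));
  }
}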
Example #25
 protected void fill(String field, TermsEnum termsEnum) throws IOException {
   while (true) {
     BytesRef term = termsEnum.next();
     if (term != null) {
       insertWithOverflow(new TermStats(field, term, termsEnum.docFreq()));
     } else {
       break;
     }
   }
 }
 SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd)
     throws IOException {
   super(counts, total - counts[0], counts[0], endFacetOrd + 1);
   this.tenum = tenum;
   this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
   if (mergePos < maxTermPos) {
     assert tenum != null;
     tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
     mergeTerm = tenum.term();
   }
 }
Example #27
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }
        };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
Example #28
 /** Computes which global ordinals are accepted by this IncludeExclude instance. */
 @Override
 public LongBitSet acceptedGlobalOrdinals(RandomAccessOrds globalOrdinals) throws IOException {
   LongBitSet acceptedGlobalOrdinals = new LongBitSet(globalOrdinals.getValueCount());
   TermsEnum globalTermsEnum;
   Terms globalTerms = new DocValuesTerms(globalOrdinals);
   // TODO: specialize based on compiled.type: for ALL and prefixes (sinkState >= 0 ) we can
   // avoid i/o and just set bits.
   globalTermsEnum = compiled.getTermsEnum(globalTerms);
   for (BytesRef term = globalTermsEnum.next(); term != null; term = globalTermsEnum.next()) {
     acceptedGlobalOrdinals.set(globalTermsEnum.ord());
   }
   return acceptedGlobalOrdinals;
 }
Example #29
  // make sure we never reuse from another reader even if it is the same field & codec etc
  public void testReuseDocsEnumDifferentReader() throws IOException {
    Directory dir = newDirectory();
    Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));

    RandomIndexWriter writer =
        new RandomIndexWriter(random(), dir, newIndexWriterConfig(analyzer).setCodec(cp));
    int numdocs = atLeast(20);
    createRandomIndex(numdocs, writer, random());
    writer.commit();

    DirectoryReader firstReader = DirectoryReader.open(dir);
    DirectoryReader secondReader = DirectoryReader.open(dir);
    List<LeafReaderContext> leaves = firstReader.leaves();
    List<LeafReaderContext> leaves2 = secondReader.leaves();

    for (LeafReaderContext ctx : leaves) {
      Terms terms = ctx.reader().terms("body");
      TermsEnum iterator = terms.iterator();
      IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
      MatchNoBits bits = new Bits.MatchNoBits(firstReader.maxDoc());
      iterator = terms.iterator();
      PostingsEnum docs = null;
      BytesRef term = null;
      while ((term = iterator.next()) != null) {
        docs =
            iterator.postings(
                null,
                randomDocsEnum("body", term, leaves2, bits),
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }
      assertEquals(terms.size(), enums.size());

      iterator = terms.iterator();
      enums.clear();
      docs = null;
      while ((term = iterator.next()) != null) {
        docs =
            iterator.postings(
                bits,
                randomDocsEnum("body", term, leaves2, bits),
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }
      assertEquals(terms.size(), enums.size());
    }
    writer.close();
    IOUtils.close(firstReader, secondReader, dir);
  }
Example #30
 @Override
 public long lookupTerm(BytesRef key) {
   try {
     switch (te.seekCeil(key)) {
       case FOUND:
         assert te.ord() >= 0;
         return te.ord();
       case NOT_FOUND:
         assert te.ord() >= 0;
         return -te.ord() - 1;
       default: /* END */
         return -numTerms() - 1;
     }
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
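A caller-side sketch of the contract this implements: a non-negative result is the ordinal of an exact match, a negative one encodes the insertion point as -insertionPoint - 1. The class and method names here are illustrative, assuming a SortedSetDocValues-style dictionary:

import java.io.IOException;

import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;

class LookupTermSketch {
  /** Returns whether {@code key} exists in the dictionary, per the lookupTerm contract. */
  static boolean contains(SortedSetDocValues dv, BytesRef key) throws IOException {
    long result = dv.lookupTerm(key);
    if (result >= 0) {
      return true;                       // result is the ordinal of key
    }
    long insertionPoint = -result - 1;   // where key would sort; key is absent
    return false;
  }
}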