Beispiel #1
0
 /**
  * Find terms in the index based on a prefix. Useful for autocomplete.
  *
  * @param index the index
  * @param fieldName the field
  * @param prefix the prefix we're looking for (null or empty string for all terms)
  * @param sensitive match case-sensitively or not?
  * @param maxResults max. number of results to return (or -1 for all)
  * @return the matching terms
  */
 public static List<String> findTermsByPrefix(
     LeafReader index, String fieldName, String prefix, boolean sensitive, int maxResults) {
   boolean allTerms = prefix == null || prefix.length() == 0;
   if (allTerms) {
     prefix = "";
     sensitive = true; // don't do unnecessary work in this case
   }
   try {
     if (!sensitive) prefix = StringUtil.removeAccents(prefix).toLowerCase();
     org.apache.lucene.index.Terms terms = index.terms(fieldName);
     List<String> results = new ArrayList<>();
     TermsEnum termsEnum = terms.iterator();
     BytesRef brPrefix = new BytesRef(prefix.getBytes(LUCENE_DEFAULT_CHARSET));
     termsEnum.seekCeil(brPrefix); // find the prefix in the terms list
     while (maxResults < 0 || results.size() < maxResults) {
       BytesRef term = termsEnum.next();
       if (term == null) break;
       String termText = term.utf8ToString();
       String optDesensitized = termText;
       if (!sensitive) optDesensitized = StringUtil.removeAccents(termText).toLowerCase();
       if (!allTerms && !optDesensitized.substring(0, prefix.length()).equalsIgnoreCase(prefix)) {
         // Doesn't match prefix or different field; no more matches
         break;
       }
       // Match, add term
       results.add(termText);
     }
     return results;
   } catch (IOException e) {
     throw new RuntimeException(e);
   }
 }
  public synchronized ShapeFieldCache<T> getCache(LeafReader reader) throws IOException {
    ShapeFieldCache<T> idx = sidx.get(reader);
    if (idx != null) {
      return idx;
    }
    long startTime = System.currentTimeMillis();

    log.fine("Building Cache [" + reader.maxDoc() + "]");
    idx = new ShapeFieldCache<>(reader.maxDoc(), defaultSize);
    int count = 0;
    DocsEnum docs = null;
    Terms terms = reader.terms(shapeField);
    TermsEnum te = null;
    if (terms != null) {
      te = terms.iterator(te);
      BytesRef term = te.next();
      while (term != null) {
        T shape = readShape(term);
        if (shape != null) {
          docs = te.docs(null, docs, DocsEnum.FLAG_NONE);
          Integer docid = docs.nextDoc();
          while (docid != DocIdSetIterator.NO_MORE_DOCS) {
            idx.add(docid, shape);
            docid = docs.nextDoc();
            count++;
          }
        }
        term = te.next();
      }
    }
    sidx.put(reader, idx);
    long elapsed = System.currentTimeMillis() - startTime;
    log.fine("Cached: [" + count + " in " + elapsed + "ms] " + idx);
    return idx;
  }
Beispiel #3
0
 public String[] getTerms() {
   IndexReader reader = null;
   int maxSize = 100;
   Set<String> searchResults = new HashSet<String>();
   try {
     reader = DirectoryReader.open(dir);
     Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("contents");
     TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
     BytesRef byteRef = null;
     while ((byteRef = termsEnum.next()) != null) {
       String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
       searchResults.add(term);
       if (searchResults.size() >= maxSize) {
         break;
       }
     }
   } catch (IOException e) {
     // TODO Auto-generated catch block
     e.printStackTrace();
   } finally {
     try {
       if (reader != null) {
         reader.close();
       }
     } catch (IOException e) {
       // TODO Auto-generated catch block
       e.printStackTrace();
     }
   }
   return searchResults.toArray(new String[searchResults.size()]);
 }
  private void getPrefixTerms(
      ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
    // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment
    // into one terms
    // instance, which is very expensive. Therefore I think it is better to iterate over each leaf
    // individually.
    List<LeafReaderContext> leaves = reader.leaves();
    for (LeafReaderContext leaf : leaves) {
      Terms _terms = leaf.reader().terms(field);
      if (_terms == null) {
        continue;
      }

      TermsEnum termsEnum = _terms.iterator();
      TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
      if (TermsEnum.SeekStatus.END == seekStatus) {
        continue;
      }

      for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
        if (!StringHelper.startsWith(term, prefix.bytes())) {
          break;
        }

        terms.add(new Term(field, BytesRef.deepCopyOf(term)));
        if (terms.size() >= maxExpansions) {
          return;
        }
      }
    }
  }
    Query createCandidateQuery(IndexReader indexReader) throws IOException {
      List<Term> extractedTerms = new ArrayList<>();
      // include extractionResultField:failed, because docs with this term have no
      // extractedTermsField
      // and otherwise we would fail to return these docs. Docs that failed query term extraction
      // always need to be verified by MemoryIndex:
      extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED));

      LeafReader reader = indexReader.leaves().get(0).reader();
      Fields fields = reader.fields();
      for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
          continue;
        }

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
          BytesRefBuilder builder = new BytesRefBuilder();
          builder.append(fieldBr);
          builder.append(FIELD_VALUE_SEPARATOR);
          builder.append(term);
          extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef()));
        }
      }
      return new TermsQuery(extractedTerms);
    }
Beispiel #6
0
  /**
   * Add term frequencies for a single document to a frequency map.
   *
   * @param reader the index
   * @param doc doc id
   * @param luceneName the index field from which to use the term vector
   * @param freq where to add to the token frequencies
   */
  public static void getFrequenciesFromTermVector(
      IndexReader reader, int doc, String luceneName, Map<String, Integer> freq) {
    try {
      org.apache.lucene.index.Terms terms = reader.getTermVector(doc, luceneName);
      if (terms == null) {
        throw new IllegalArgumentException("Field " + luceneName + " has no Terms");
      }
      TermsEnum termsEnum = terms.iterator();

      // Verzamel concordantiewoorden uit term vector
      PostingsEnum postingsEnum = null;
      while (termsEnum.next() != null) {
        postingsEnum = termsEnum.postings(null, postingsEnum, PostingsEnum.FREQS);
        String term = termsEnum.term().utf8ToString();
        Integer n = freq.get(term);
        if (n == null) {
          n = 0;
        }
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          n += termsEnum.docFreq();
        }
        freq.put(term, n);
      }
    } catch (Exception e) {
      throw ExUtil.wrapRuntimeException(e);
    }
  }
  public void testReuseDocsEnumNoReuse() throws IOException {
    Directory dir = newDirectory();
    Codec cp = TestUtil.alwaysPostingsFormat(new Lucene40RWPostingsFormat());
    RandomIndexWriter writer =
        new RandomIndexWriter(
            random(), dir, newIndexWriterConfig(new MockAnalyzer(random())).setCodec(cp));
    int numdocs = atLeast(20);
    createRandomIndex(numdocs, writer, random());
    writer.commit();

    DirectoryReader open = DirectoryReader.open(dir);
    for (LeafReaderContext ctx : open.leaves()) {
      LeafReader indexReader = ctx.reader();
      Terms terms = indexReader.terms("body");
      TermsEnum iterator = terms.iterator();
      IdentityHashMap<PostingsEnum, Boolean> enums = new IdentityHashMap<>();
      MatchNoBits bits = new Bits.MatchNoBits(indexReader.maxDoc());
      while ((iterator.next()) != null) {
        PostingsEnum docs =
            iterator.postings(
                random().nextBoolean() ? bits : new Bits.MatchNoBits(indexReader.maxDoc()),
                null,
                random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
        enums.put(docs, true);
      }

      assertEquals(terms.size(), enums.size());
    }
    writer.commit();
    IOUtils.close(writer, open, dir);
  }
Beispiel #8
0
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Beispiel #9
0
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
  public void assertTerms(Terms leftTerms, Terms rightTerms, boolean deep) throws Exception {
    if (leftTerms == null || rightTerms == null) {
      assertNull(leftTerms);
      assertNull(rightTerms);
      return;
    }
    assertTermsStatistics(leftTerms, rightTerms);

    // NOTE: we don't assert hasOffsets/hasPositions/hasPayloads because they are allowed to be
    // different

    TermsEnum leftTermsEnum = leftTerms.iterator(null);
    TermsEnum rightTermsEnum = rightTerms.iterator(null);
    assertTermsEnum(leftTermsEnum, rightTermsEnum, true);

    assertTermsSeeking(leftTerms, rightTerms);

    if (deep) {
      int numIntersections = atLeast(3);
      for (int i = 0; i < numIntersections; i++) {
        String re = AutomatonTestUtil.randomRegexp(random());
        CompiledAutomaton automaton =
            new CompiledAutomaton(new RegExp(re, RegExp.NONE).toAutomaton());
        if (automaton.type == CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
          // TODO: test start term too
          TermsEnum leftIntersection = leftTerms.intersect(automaton, null);
          TermsEnum rightIntersection = rightTerms.intersect(automaton, null);
          assertTermsEnum(leftIntersection, rightIntersection, rarely());
        }
      }
    }
  }
  /*
   *  Utility function to display a term vector.
   */
  static void termVectorDisplay(Terms terms) throws IOException {

    if ((terms == null) || (terms.size() == -1)) System.out.println("    The field is not stored.");
    else {
      /*
       *  The terms for this field are stored.
       */
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %10d %-20s %d ",
            ithTerm.ord(), ithTerm.term().utf8ToString(), ithTerm.totalTermFreq());

        DocsAndPositionsEnum currDoc = ithTerm.docsAndPositions(null, null);
        currDoc.nextDoc();

        for (int jthPosition = 0; jthPosition < ithTerm.totalTermFreq(); jthPosition++)
          System.out.print(currDoc.nextPosition() + " ");

        System.out.println();
      }
      ;
    }
    ;
  }
  /*
   *  listTermDictionary displays the term dictionary for a field.
   */
  static void listTermDictionary(IndexReader reader, String fieldName) throws IOException {

    System.out.println("\nTerm Dictionary:  field " + fieldName);

    /*
      Grant says:
      MultiFields.getTerms(IndexReader, fieldName)
    */

    Terms terms = MultiFields.getTerms(reader, fieldName);

    if ((terms == null) || (terms.size() == -1))
      System.out.println("    The term dictionary is empty.");
    else {
      System.out.println("    Vocabulary size: " + terms.size() + " terms");

      TermsEnum ithTerm = terms.iterator(null);

      /*
       *  Iterate over the terms in this document.
       *  Information about a term's occurrences (tf and
       *  positions) is accessed via the indexing API, which
       *  returns inverted lists that describe (only) the
       *  current document.
       */
      while (ithTerm.next() != null) {
        System.out.format(
            "      %-30s %d %d\n",
            ithTerm.term().utf8ToString(), ithTerm.docFreq(), ithTerm.totalTermFreq());
      }
      ;
    }
    ;
  }
 private void buildFieldStatistics(XContentBuilder builder, Terms curTerms) throws IOException {
   long sumDocFreq = curTerms.getSumDocFreq();
   int docCount = curTerms.getDocCount();
   long sumTotalTermFrequencies = curTerms.getSumTotalTermFreq();
   if (docCount > 0) {
     assert ((sumDocFreq > 0)) : "docCount >= 0 but sumDocFreq ain't!";
     assert ((sumTotalTermFrequencies > 0)) : "docCount >= 0 but sumTotalTermFrequencies ain't!";
     builder.startObject(FieldStrings.FIELD_STATISTICS);
     builder.field(FieldStrings.SUM_DOC_FREQ, sumDocFreq);
     builder.field(FieldStrings.DOC_COUNT, docCount);
     builder.field(FieldStrings.SUM_TTF, sumTotalTermFrequencies);
     builder.endObject();
   } else if (docCount == -1) { // this should only be -1 if the field
     // statistics were not requested at all. In
     // this case all 3 values should be -1
     assert ((sumDocFreq == -1)) : "docCount was -1 but sumDocFreq ain't!";
     assert ((sumTotalTermFrequencies == -1))
         : "docCount was -1 but sumTotalTermFrequencies ain't!";
   } else {
     throw new ElasticsearchIllegalStateException(
         "Something is wrong with the field statistics of the term vector request: Values are "
             + "\n"
             + FieldStrings.SUM_DOC_FREQ
             + " "
             + sumDocFreq
             + "\n"
             + FieldStrings.DOC_COUNT
             + " "
             + docCount
             + "\n"
             + FieldStrings.SUM_TTF
             + " "
             + sumTotalTermFrequencies);
   }
 }
Beispiel #14
0
 /* Copied from lucene 4.2.x core */
 private static long totalTermFreq(IndexReader r, String field, BytesRef text) throws IOException {
   final Terms terms = MultiFields.getTerms(r, field);
   if (terms != null) {
     final TermsEnum termsEnum = terms.iterator(null);
     if (termsEnum.seekExact(text, true)) {
       return termsEnum.totalTermFreq();
     }
   }
   return 0;
 }
 private long getSumTermFrequency(IndexReader reader, String fieldName) {
   Terms collectionTermVector = null;
   try {
     collectionTermVector = MultiFields.getTerms(reader, fieldName);
     long totalTermFreq = collectionTermVector.getSumTotalTermFreq();
     return totalTermFreq;
   } catch (IOException e) {
     LOG.warn("Unable to get total term frequency, it might not be indexed");
   }
   return 0;
 }
 @Override
 public FieldStats stats(Terms terms, int maxDoc) throws IOException {
   float minValue = NumericUtils.sortableIntToFloat(NumericUtils.getMinInt(terms));
   float maxValue = NumericUtils.sortableIntToFloat(NumericUtils.getMaxInt(terms));
   return new FieldStats.Float(
       maxDoc,
       terms.getDocCount(),
       terms.getSumDocFreq(),
       terms.getSumTotalTermFreq(),
       minValue,
       maxValue);
 }
 @Override
 public FieldStats stats(Terms terms, int maxDoc) throws IOException {
   long minValue = NumericUtils.getMinInt(terms);
   long maxValue = NumericUtils.getMaxInt(terms);
   return new FieldStats.Long(
       maxDoc,
       terms.getDocCount(),
       terms.getSumDocFreq(),
       terms.getSumTotalTermFreq(),
       minValue,
       maxValue);
 }
Beispiel #18
0
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }
        };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
 private void initMemory(Terms curTerms, int termFreq) {
   // init memory for performance reasons
   if (curTerms.hasPositions()) {
     currentPositions = ArrayUtil.grow(currentPositions, termFreq);
   }
   if (curTerms.hasOffsets()) {
     currentStartOffset = ArrayUtil.grow(currentStartOffset, termFreq);
     currentEndOffset = ArrayUtil.grow(currentEndOffset, termFreq);
   }
   if (curTerms.hasPayloads()) {
     currentPayloads = new BytesArray[termFreq];
   }
 }
 @Override
 protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
   if (maxEdits == 0 || prefixLength >= term.text().length()) { // can only match if it's exact
     return new SingleTermsEnum(terms.iterator(), term.bytes());
   }
   return new FuzzyTermsEnum(terms, atts, getTerm(), maxEdits, prefixLength, transpositions);
 }
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param termFreqMap a Map of terms and their frequencies
   * @param vector List of terms and their frequencies for a doc/field
   * @param fieldName Optional field name of the terms for skip terms
   */
  private void addTermFrequencies(
      Map<String, Int> termFreqMap, Terms vector, @Nullable String fieldName) throws IOException {
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      if (isSkipTerm(fieldName, term)) {
        continue;
      }

      final PostingsEnum docs = termsEnum.postings(null, null);
      int freq = 0;
      while (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        freq += docs.freq();
      }

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
  /**
   * Adds terms and frequencies found in vector into the Map termFreqMap
   *
   * @param field2termFreqMap a Map of terms and their frequencies per field
   * @param vector List of terms and their frequencies for a doc/field
   */
  private void addTermFrequencies(
      Map<String, Map<String, Int>> field2termFreqMap, Terms vector, String fieldName)
      throws IOException {
    Map<String, Int> termFreqMap = field2termFreqMap.get(fieldName);
    if (termFreqMap == null) {
      termFreqMap = new HashMap<>();
      field2termFreqMap.put(fieldName, termFreqMap);
    }
    final TermsEnum termsEnum = vector.iterator();
    final CharsRefBuilder spare = new CharsRefBuilder();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
      spare.copyUTF8Bytes(text);
      final String term = spare.toString();
      if (isNoiseWord(term)) {
        continue;
      }
      final int freq = (int) termsEnum.totalTermFreq();

      // increment frequency
      Int cnt = termFreqMap.get(term);
      if (cnt == null) {
        cnt = new Int();
        termFreqMap.put(term, cnt);
        cnt.x = freq;
      } else {
        cnt.x += freq;
      }
    }
  }
Beispiel #23
0
  /**
   * @param reader
   * @param numTerms
   * @param field
   * @return TermStats[] ordered by terms with highest docFreq first.
   * @throws Exception
   */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
      throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      for (String field : fieldNames) {
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      Iterator<String> fieldIterator = fields.iterator();
      while (fieldIterator.hasNext()) {
        String field = fieldIterator.next();
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
 public PostingsEnum randomDocsEnum(
     String field, BytesRef term, List<LeafReaderContext> readers, Bits bits) throws IOException {
   if (random().nextInt(10) == 0) {
     return null;
   }
   LeafReader indexReader = readers.get(random().nextInt(readers.size())).reader();
   Terms terms = indexReader.terms(field);
   if (terms == null) {
     return null;
   }
   TermsEnum iterator = terms.iterator();
   if (iterator.seekExact(term)) {
     return iterator.postings(
         bits, null, random().nextBoolean() ? PostingsEnum.FREQS : PostingsEnum.NONE);
   }
   return null;
 }
Beispiel #25
0
  @Override
  protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) throws IOException {
    if (this.terms.size() == 0) {
      return TermsEnum.EMPTY;
    }

    return new SeekingTermSetTermsEnum(terms.iterator(), this.terms, ords);
  }
  protected void compareTermVectors(Terms terms, Terms memTerms, String field_name)
      throws IOException {

    TermsEnum termEnum = terms.iterator();
    TermsEnum memTermEnum = memTerms.iterator();

    while (termEnum.next() != null) {
      assertNotNull(memTermEnum.next());
      assertThat(termEnum.totalTermFreq(), equalTo(memTermEnum.totalTermFreq()));

      PostingsEnum docsPosEnum = termEnum.postings(null, PostingsEnum.POSITIONS);
      PostingsEnum memDocsPosEnum = memTermEnum.postings(null, PostingsEnum.POSITIONS);
      String currentTerm = termEnum.term().utf8ToString();

      assertThat(
          "Token mismatch for field: " + field_name,
          currentTerm,
          equalTo(memTermEnum.term().utf8ToString()));

      docsPosEnum.nextDoc();
      memDocsPosEnum.nextDoc();

      int freq = docsPosEnum.freq();
      assertThat(freq, equalTo(memDocsPosEnum.freq()));
      for (int i = 0; i < freq; i++) {
        String failDesc = " (field:" + field_name + " term:" + currentTerm + ")";
        int memPos = memDocsPosEnum.nextPosition();
        int pos = docsPosEnum.nextPosition();
        assertThat("Position test failed" + failDesc, memPos, equalTo(pos));
        assertThat(
            "Start offset test failed" + failDesc,
            memDocsPosEnum.startOffset(),
            equalTo(docsPosEnum.startOffset()));
        assertThat(
            "End offset test failed" + failDesc,
            memDocsPosEnum.endOffset(),
            equalTo(docsPosEnum.endOffset()));
        assertThat(
            "Missing payload test failed" + failDesc,
            docsPosEnum.getPayload(),
            equalTo(docsPosEnum.getPayload()));
      }
    }
    assertNull("Still some tokens not processed", memTermEnum.next());
  }
  /**
   * We will implement this according to the Lucene specification the formula used: sum ( IDF(qi) *
   * (df(qi,D) * (k+1)) / (df(qi,D) + k * (1-b + b*|D| / avgFL)) IDF and avgFL computation are
   * described above.
   *
   * @param doc
   * @param terms
   * @param context
   * @return
   */
  @Override
  public float extract(Document doc, Terms terms, RerankerContext context) {
    Set<String> queryTokens = new HashSet<>(context.getQueryTokens());

    TermsEnum termsEnum = null;
    try {
      termsEnum = terms.iterator();
    } catch (IOException e) {
      LOG.warn("Error computing BM25, unable to retrieve terms enum");
      return 0.0f;
    }

    IndexReader reader = context.getIndexSearcher().getIndexReader();
    long maxDocs = reader.numDocs();
    long sumTotalTermFreq = getSumTermFrequency(reader, context.getField());
    // Compute by iterating
    long docSize = 0L;

    // NOTE df cannot be retrieved just from the term vector,
    // the term vector here is only a partial term vector that treats this as if we only have 1
    // document in the index
    Map<String, Integer> docFreqMap = null;
    try {
      docFreqMap = getDocFreqs(reader, context.getQueryTokens(), context.getField());
    } catch (IOException e) {
      LOG.warn("Unable to retrieve document frequencies.");
      docFreqMap = new HashMap<>();
    }

    Map<String, Long> termFreqMap = new HashMap<>();
    try {
      while (termsEnum.next() != null) {
        String termString = termsEnum.term().utf8ToString();
        docSize += termsEnum.totalTermFreq();
        if (queryTokens.contains(termString)) {
          termFreqMap.put(termString, termsEnum.totalTermFreq());
        }
      }
    } catch (IOException e) {
      LOG.warn("Unable to retrieve termsEnum, treating as 0");
    }

    float score = 0.0f;
    // Iterate over the query tokens
    double avgFL = computeAvgFL(sumTotalTermFreq, maxDocs);
    for (String token : queryTokens) {
      long docFreq = docFreqMap.containsKey(token) ? docFreqMap.get(token) : 0;
      double termFreq = termFreqMap.containsKey(token) ? termFreqMap.get(token) : 0;
      double numerator = (this.k1 + 1) * termFreq;
      double docLengthFactor = this.b * (docSize / avgFL);
      double denominator = termFreq + (this.k1) * (1 - this.b + docLengthFactor);
      score += computeIDF(docFreq, maxDocs) * numerator / denominator;
    }

    return score;
  }
 private void buildField(
     XContentBuilder builder,
     final CharsRefBuilder spare,
     Fields theFields,
     Iterator<String> fieldIter)
     throws IOException {
   String fieldName = fieldIter.next();
   builder.startObject(fieldName);
   Terms curTerms = theFields.terms(fieldName);
   // write field statistics
   buildFieldStatistics(builder, curTerms);
   builder.startObject(FieldStrings.TERMS);
   TermsEnum termIter = curTerms.iterator(null);
   for (int i = 0; i < curTerms.size(); i++) {
     buildTerm(builder, spare, curTerms, termIter);
   }
   builder.endObject();
   builder.endObject();
 }
 private final TermsEnum delegate() throws IOException {
   if (delegateTermsEnum == null) {
     /* pull the iterator only if we really need it -
      * this can be a relativly heavy operation depending on the
      * delegate postings format and they underlying directory
      * (clone IndexInput) */
     delegateTermsEnum = delegateTerms.iterator(reuseDelegate);
   }
   return delegateTermsEnum;
 }
 @Override
 public FieldStats.Long stats(IndexReader reader) throws IOException {
   int maxDoc = reader.maxDoc();
   Terms terms = org.apache.lucene.index.MultiFields.getTerms(reader, name());
   if (terms == null) {
     return null;
   }
   long minValue = LegacyNumericUtils.getMinInt(terms);
   long maxValue = LegacyNumericUtils.getMaxInt(terms);
   return new FieldStats.Long(
       maxDoc,
       terms.getDocCount(),
       terms.getSumDocFreq(),
       terms.getSumTotalTermFreq(),
       isSearchable(),
       isAggregatable(),
       minValue,
       maxValue);
 }