Esempio n. 1
  * Prepare a document reconstructor.
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null and not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
 // Stores the reader, settles which field names to use, resolves the term count when the
 // caller passes -1, and fetches the deleted-docs set from the reader.
 // NOTE(review): closing braces and several call expressions are missing from this extract.
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   // A null reader is rejected before anything else is touched.
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   this.reader = reader;
   // No explicit field list: fall back to every field name the reader reports.
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   // -1 is the "unknown" marker: restart the count at 0 and walk all fields and their terms.
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     // NOTE(review): the right-hand side of the assignment to fld is missing here —
     // presumably an advance of fe; confirm against the original source.
     while ((fld = != null) {
       TermsEnum te = fe.terms();
       // NOTE(review): the loop condition is truncated and the statement that increments
       // numTerms is not visible in this extract.
       while ( != null) {
     this.numTerms = numTerms;
   deleted = MultiFields.getDeletedDocs(reader);
  // Requests the term vectors of document "1" of type "type1" and checks index name,
  // existence, number of fields and number of terms in "field".
  // NOTE(review): the receivers and terminators of the two builder chains below (mapping
  // setup and document indexing) are missing from this extract.
  public void testGetTermVector() throws IOException {
            .setSource("field", "type=text,term_vector=with_positions_offsets_payloads")

        .prepareIndex(indexOrAlias(), "type1", "1")
        .setSource("field", "the quick brown fox jumps over the lazy dog")

    TermVectorsResponse termVectorsResponse =
        client().prepareTermVectors(indexOrAlias(), "type1", "1").get();
    assertThat(termVectorsResponse.getIndex(), equalTo("test"));
    assertThat(termVectorsResponse.isExists(), equalTo(true));
    Fields fields = termVectorsResponse.getFields();
    assertThat(fields.size(), equalTo(1));
    // The indexed sentence has nine words with "the" occurring twice; presumably that is
    // why eight distinct terms are expected — depends on the analyzer in use.
    assertThat(fields.terms("field").size(), equalTo(8L));
    // Builds a TermsQuery from a list of Term objects: one EXTRACTION_FAILED term plus one
    // term per (field, term) pair found in the first leaf of the given reader.
    // NOTE(review): field-name arguments, iterator calls and closing braces are missing from
    // this extract.
    Query createCandidateQuery(IndexReader indexReader) throws IOException {
      List<Term> extractedTerms = new ArrayList<>();
      // include extractionResultField:failed, because docs with this term have no
      // extractedTermsField
      // and otherwise we would fail to return these docs. Docs that failed query term extraction
      // always need to be verified by MemoryIndex:
      // NOTE(review): the first argument of this Term constructor is missing.
      extractedTerms.add(new Term(, EXTRACTION_FAILED));

      // Only leaf 0 is read; presumably the reader is expected to have a single leaf —
      // verify against the caller.
      LeafReader reader = indexReader.leaves().get(0).reader();
      Fields fields = reader.fields();
      for (String field : fields) {
        Terms terms = fields.terms(field);
        // NOTE(review): the body of this null check is not visible (presumably skips the field).
        if (terms == null) {

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        // NOTE(review): the initialiser and the update expression of this loop are truncated.
        for (BytesRef term =; term != null; term = {
          // NOTE(review): the statements that fill the builder (fieldBr is unused in the
          // visible lines) are not shown.
          BytesRefBuilder builder = new BytesRefBuilder();
          extractedTerms.add(new Term(, builder.toBytesRef()));
      return new TermsQuery(extractedTerms);
Esempio n. 4
  * Retrieve term vector for this document and field, or null if term vectors were not indexed. The
  * returned Fields instance acts like a single-document inverted index (the docID will be 0).
 // Convenience wrapper over getTermVectors(docID): returns the Terms of a single field,
 // or null when getTermVectors(docID) itself returns null.
 public final Terms getTermVector(int docID, String field) throws IOException {
   Fields vectors = getTermVectors(docID);
   // No term vectors at all for this document.
   if (vectors == null) {
     return null;
   // Result of vectors.terms(field) is passed straight through to the caller.
   return vectors.terms(field);
Esempio n. 5
  // Iterates over the terms of the QueryBuilder.DEFS field and tests each term's document
  // frequency and text length; logs a warning if closing the reader fails.
  // NOTE(review): the reader-opening expression, the loop bodies and the close() call are
  // missing from this extract, so what happens to matching terms is not visible.
  // NOTE(review): `freq` is compared against the term's string LENGTH below, not against a
  // frequency — confirm this is intended.
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      // NOTE(review): the right-hand side of this assignment is missing.
      ireader =;
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      // NOTE(review): by indentation this line sits outside the numDocs > 0 check, in which
      // case terms could still be null here — confirm against the original source.
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        // Filter: document frequency above 16 and term text longer than `freq` characters.
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
        /*} else {
    } finally {

      if (ireader != null) {
        try {
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
Esempio n. 6
   * List all of the files in this index database
   * @throws IOException If an IO error occurs while reading from the database
  // Iterates over the terms of the QueryBuilder.U field; logs a warning if closing the
  // reader fails.
  // NOTE(review): the reader-opening expression, the loop body and the close() call are
  // missing from this extract, so the per-term output is not visible.
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      // NOTE(review): the right-hand side of this assignment is missing.
      ireader =; // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      // NOTE(review): by indentation this line sits outside the numDocs > 0 check, in which
      // case terms could still be null here — confirm against the original source.
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
    } finally {

      // The reader is only dealt with if it was actually assigned.
      if (ireader != null) {
        try {
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
Esempio n. 7
   * Find words for a more-like-this query former.
   * @param docNum the id of the lucene document from which to find terms
  // Collects per-field term frequencies for document docNum and turns them into a queue via
  // createQueue. Term vectors are used when present; otherwise the field's string values
  // are passed to addTermFrequencies through a StringReader.
  // NOTE(review): closing braces are missing from this extract.
  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    // field name -> (term -> count)
    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
      // NOTE(review): getTermVectors(docNum) does not depend on fieldName but is called once
      // per field inside the loop.
      final Fields vectors = ir.getTermVectors(docNum);
      final Terms vector;
      if (vectors != null) {
        vector = vectors.terms(fieldName);
      } else {
        vector = null;

      // field does not store term vector info
      if (vector == null) {
        // Fallback: read the document's field values for this field name.
        Document d = ir.document(docNum);
        IndexableField[] fields = d.getFields(fieldName);
        for (IndexableField field : fields) {
          final String stringValue = field.stringValue();
          // Values without a string representation are skipped.
          if (stringValue != null) {
            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
      } else {
        addTermFrequencies(field2termFreqMap, vector, fieldName);

    return createQueue(field2termFreqMap);
Esempio n. 8
  // Dumps one segment to `out`: every document, then for every field each term with its
  // document frequency and, per posting, doc id, term frequency and positions.
  // NOTE(review): closing braces and the term-advance call are missing from this extract; no
  // reader.close() is visible either.
  private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception {
    SegmentReader reader =
        new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));

    // Print the documents with ids 0 .. numDocs-1.
    for (int i = 0; i < reader.numDocs(); i++) out.println(reader.document(i));

    Fields fields = reader.fields();
    for (String field : fields) {
      Terms terms = fields.terms(field);
      TermsEnum tis = terms.iterator(null);
      // NOTE(review): the left operand of this condition is missing (presumably the call
      // that advances tis).
      while ( != null) {

        out.print("  term=" + field + ":" + tis.term());
        out.println("    DF=" + tis.docFreq());

        DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);

        while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          out.print(" doc=" + positions.docID());
          out.print(" TF=" + positions.freq());
          out.print(" pos=");
          // Starts at j = 1, so this loop prints freq-1 positions; presumably the first
          // position was printed by a line that is missing from this extract.
          for (int j = 1; j < positions.freq(); j++) out.print("," + positions.nextPosition());
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
  // Prints a header for the given document id, validates the id against reader.numDocs(),
  // then walks the field names of the document's term vectors, printing each name.
  // NOTE(review): the extract ends after fetching each field's Terms; the per-term output,
  // closing braces and any early return after the error message are not visible.
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    // Throws NumberFormatException if docidString is not a valid integer.
    int docid = Integer.parseInt(docidString);

    // Valid ids are 0 .. numDocs-1.
    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");

     *  Iterate over the fields in this document.
    Fields fields = reader.getTermVectors(docid);
    // NOTE(review): no null check on `fields` is visible before it is dereferenced.
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      // NOTE(review): the right-hand side of this assignment is missing (presumably the
      // iterator's next element).
      String fieldName =;
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
 // Writes this response into the builder: index, type, id (unless artificial), version,
 // found flag and took time, followed by one entry per field via buildField.
 // NOTE(review): closing braces are missing from this extract.
 public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
   assert index != null;
   assert type != null;
   assert id != null;
   builder.field(FieldStrings._INDEX, index);
   builder.field(FieldStrings._TYPE, type);
   // The id is only emitted when isArtificial() is false.
   if (!isArtificial()) {
     builder.field(FieldStrings._ID, id);
   builder.field(FieldStrings._VERSION, docVersion);
   builder.field(FieldStrings.FOUND, isExists());
   builder.field(FieldStrings.TOOK, tookInMillis);
   // Nothing more to write when the document does not exist.
   if (!isExists()) {
     return builder;
   // A single CharsRefBuilder instance is handed to every buildField call.
   final CharsRefBuilder spare = new CharsRefBuilder();
   Fields theFields = getFields();
   Iterator<String> fieldIter = theFields.iterator();
   // The iterator itself is passed down; buildField is the one that takes the field name
   // from it.
   while (fieldIter.hasNext()) {
     buildField(builder, spare, theFields, fieldIter);
   return builder;
 // Looks up `term` in the reader's merged fields and, if the exact term exists, converts
 // its postings to an int[] via toArray; returns null when the term is not found.
 // NOTE(review): closing braces are missing from this extract, and no null checks on
 // `fields` or `cterms` are visible.
 public int[] toDocsArray(Term term, Bits bits, IndexReader reader) throws IOException {
   Fields fields = MultiFields.getFields(reader);
   Terms cterms = fields.terms(term.field);
   TermsEnum ctermsEnum = cterms.iterator();
   if (ctermsEnum.seekExact(new BytesRef(term.text()))) {
     // NOTE(review): the method being called here is missing; only its argument list
     // survived. PostingsEnum.NONE is the flags argument.
     PostingsEnum postingsEnum =, ctermsEnum, bits, null, PostingsEnum.NONE);
     return toArray(postingsEnum);
   return null;
Esempio n. 12
  // Indexes a document with two values of field "f" through a DelegatingAnalyzerWrapper
  // that overrides the position and offset gaps, then checks via the term vector that the
  // second value's position and end offset reflect those gaps.
  // NOTE(review): closing braces, FieldType configuration, the addDocument call and
  // reader/writer cleanup are not visible in this extract.
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    // Wrapper that always delegates to `delegate` but reports the random gaps above.
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;

          public int getPositionIncrementGap(String fieldName) {
            return positionGap;

          public int getOffsetGap(String fieldName) {
            return offsetGap;

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    // Same field added twice so that a gap is applied between the two values.
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    // NOTE(review): the second argument of this assertion is missing (presumably the first
    // term returned by te).
    assertEquals(new BytesRef("a"),;
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    // Term "a" occurs once per field value, twice in total.
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    // Second occurrence is expected at 1 + positionGap and ending at
    // 1 + endOffset + offsetGap.
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
Esempio n. 13
   * @param reader
   * @param numTerms
   * @param fieldNames names of the fields to collect terms from; if null, all fields are used
   * @return TermStats[] ordered by terms with highest docFreq first.
   * @throws Exception
  // Fills a TermStatsQueue of size numTerms from the listed fields, or from every field
  // when fieldNames is null, and drains the queue into the result array.
  // NOTE(review): closing braces and several call expressions are missing from this extract.
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
      throws Exception {
    TermStatsQueue tiq = null;
    // Passed to terms.iterator(te) on each field below.
    TermsEnum te = null;

    if (fieldNames != null) {
      Fields fields = MultiFields.getFields(reader);
      // NOTE(review): the call that receives this message is missing (presumably a log call).
      if (fields == null) {"Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      tiq = new TermStatsQueue(numTerms);
      for (String field : fieldNames) {
        Terms terms = fields.terms(field);
        // Fields without terms are skipped.
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
    } else {
      // No field list given: same procedure over every field the reader exposes.
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {"Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      tiq = new TermStatsQueue(numTerms);
      Iterator<String> fieldIterator = fields.iterator();
      while (fieldIterator.hasNext()) {
        // NOTE(review): the right-hand side of this assignment is missing (presumably the
        // iterator's next element).
        String field =;
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    // NOTE(review): no decrement of `count` is visible in this extract; it is presumably on
    // a missing line.
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
    return result;
Esempio n. 14
  // Returns the Terms of `fieldname` from the searcher's leaf reader, throwing JATEException
  // when the field has no terms or an IOException occurs.
  // NOTE(review): closing braces are missing from this extract.
  public static Terms getTermVector(String fieldname, SolrIndexSearcher solrIndexSearcher)
      throws JATEException {
    try {
      Fields fields = MultiFields.getFields(solrIndexSearcher.getLeafReader());

      Terms vector = fields.terms(fieldname);
      // A missing field is reported as an error rather than returned as null.
      if (vector == null)
        throw new JATEException(String.format("Cannot find expected field: %s", fieldname));
      return vector;
    } catch (IOException ioe) {
      // NOTE(review): in the visible lines neither the stack trace nor `ioe` itself is added
      // to the message or passed as a cause; the appending code may be on a missing line.
      StringBuilder sb =
          new StringBuilder(
              String.format("Cannot find expected field: %s. Error stacktrack: \n", fieldname));
      throw new JATEException(sb.toString());
Esempio n. 15
 // For every field, compares Terms.getDocCount() with the cardinality of a bit set sized
 // to ir.maxDoc().
 // NOTE(review): closing braces, the body of the null check, the term-advance call, the
 // postings call and the statement that sets bits in `visited` are missing from this
 // extract.
 private void verifyCount(IndexReader ir) throws Exception {
   Fields fields = MultiFields.getFields(ir);
   for (String field : fields) {
     Terms terms = fields.terms(field);
     // NOTE(review): body not visible (presumably skips the field).
     if (terms == null) {
     int docCount = terms.getDocCount();
     // One bit per possible document id.
     FixedBitSet visited = new FixedBitSet(ir.maxDoc());
     TermsEnum te = terms.iterator();
     // NOTE(review): the left operand of this condition is missing.
     while ( != null) {
       // NOTE(review): the method being called here is missing; only its argument list
       // survived.
       PostingsEnum de =, te, null, PostingsEnum.NONE);
       while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
     assertEquals(visited.cardinality(), docCount);
 // Combines two Fields instances into one ParallelFields: all non-null Terms of fields2 are
 // added first, then those of fields1.
 // NOTE(review): closing braces are missing from this extract.
 private Fields mergeFields(Fields fields1, Fields fields2) throws IOException {
   ParallelFields parallelFields = new ParallelFields();
   // First pass: every field of fields2 that has terms.
   for (String fieldName : fields2) {
     Terms terms = fields2.terms(fieldName);
     if (terms != null) {
       parallelFields.addField(fieldName, terms);
   // Second pass: fields of fields1.
   for (String fieldName : fields1) {
     // NOTE(review): the body of this check is not visible; presumably it skips names
     // already added from fields2, which would give fields2 precedence — confirm.
     if (parallelFields.fields.containsKey(fieldName)) {
     Terms terms = fields1.terms(fieldName);
     if (terms != null) {
       parallelFields.addField(fieldName, terms);
   return parallelFields;
   * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>
   * O(numIndexedFields)</code>.
   * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns
   *     will break out its in-heap bytes separately in the returned {@link CompletionStats}
  // Visits every field of every leaf of the reader, picking out Terms that are
  // CompletionTerms, and returns a CompletionStats object.
  // NOTE(review): the statements that use completionTerms and fieldNamePatterns, and the
  // closing braces, are missing from this extract, so how the stats are accumulated is not
  // visible.
  public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    CompletionStats completionStats = new CompletionStats();
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
      LeafReader atomicReader = atomicReaderContext.reader();
      try {
        Fields fields = atomicReader.fields();
        for (String fieldName : fields) {
          Terms terms = fields.terms(fieldName);
          // Only Terms that are CompletionTerms are considered.
          if (terms instanceof CompletionTerms) {
            CompletionTerms completionTerms = (CompletionTerms) terms;
      } catch (IOException ioe) {
        // The failure is logged and not rethrown in the visible lines.
        logger.error("Could not get completion stats", ioe);

    return completionStats;
Esempio n. 18
  // Builds the DocSet of documents containing `term`: gathers one PostingsEnum per leaf
  // where the term exists, sums the document frequencies, and picks an empty, small or big
  // set implementation based on that sum.
  // NOTE(review): closing braces are missing from this extract.
  public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
    DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
    int maxDoc = searcher.getIndexReader().maxDoc();
    int smallSetSize = smallSetSize(maxDoc);

    String field = term.field();
    BytesRef termVal = term.bytes();

    // Sum of docFreq over all leaves that contain the term.
    int maxCount = 0;
    // ord of the first leaf that contains the term; -1 while none has been found.
    int firstReader = -1;
    List<LeafReaderContext> leaves = reader.leaves();
    // NOTE(review): the array-size expression is garbled in this extract; presumably one
    // slot per entry in `leaves`.
    PostingsEnum[] postList =
        new PostingsEnum
                .size()]; // use array for slightly higher scanning cost, but fewer memory
                          // allocations
    for (LeafReaderContext ctx : leaves) {
      assert leaves.get(ctx.ord) == ctx;
      LeafReader r = ctx.reader();
      Fields f = r.fields();
      Terms t = f.terms(field);
      if (t == null) continue; // field is missing
      TermsEnum te = t.iterator();
      if (te.seekExact(termVal)) {
        maxCount += te.docFreq();
        // Postings are stored at the leaf's ord; PostingsEnum.NONE is the flags argument.
        postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
        if (firstReader < 0) firstReader = ctx.ord;

    // Term not present in any leaf.
    if (maxCount == 0) {
      return DocSet.EMPTY;

    if (maxCount <= smallSetSize) {
      return createSmallSet(leaves, postList, maxCount, firstReader);

    return createBigSet(leaves, postList, maxDoc, firstReader);
Esempio n. 19
  * Return a query that will return docs like the passed Fields.
  * @return a query that will return docs like the passed Fields.
 // Builds a BooleanQuery from the given Fields: for each field name, term frequencies from
 // all inputs are merged into one map, turned into a queue, and added to the query.
 // NOTE(review): closing braces and the statement that adds to fieldNames are missing from
 // this extract.
 public Query like(Fields... likeFields) throws IOException {
   // get all field names
   Set<String> fieldNames = new HashSet<>();
   for (Fields fields : likeFields) {
     // NOTE(review): loop body not visible (presumably adds fieldName to the set).
     for (String fieldName : fields) {
   // term selection is per field, then appended to a single boolean query
   BooleanQuery bq = new BooleanQuery();
   for (String fieldName : fieldNames) {
     // Fresh map per field name, shared across all likeFields inputs.
     Map<String, Int> termFreqMap = new HashMap<>();
     for (Fields fields : likeFields) {
       Terms vector = fields.terms(fieldName);
       // Inputs that lack this field contribute nothing.
       if (vector != null) {
         addTermFrequencies(termFreqMap, vector, fieldName);
     addToQuery(createQueue(termFreqMap, fieldName), bq);
   return bq;
Esempio n. 20
  /** Returns TermStats[] ordered by terms with highest docFreq first. */
  // Fills a TermStatsQueue of size numTerms from one field, or from every field when `field`
  // is null, and drains the queue into the result array. Throws RuntimeException when the
  // reader exposes no fields.
  // NOTE(review): closing braces are missing from this extract.
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
      throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("field " + field + " not found");
      Terms terms = fields.terms(field);
      // NOTE(review): tiq is only created when terms != null in the visible lines; if the
      // field has no terms, tiq may still be null at tiq.size() below — confirm against the
      // original source.
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        tiq = new TermStatsQueue(numTerms);
        tiq.fill(field, termsEnum);
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("no fields found for this index");
      tiq = new TermStatsQueue(numTerms);
      // One shared queue is filled from every field that has terms.
      for (String fieldName : fields) {
        Terms terms = fields.terms(fieldName);
        if (terms != null) {
          tiq.fill(fieldName, terms.iterator(null));

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    // NOTE(review): no decrement of `count` is visible in this extract; it is presumably on
    // a missing line.
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
    return result;
  // For each leaf and each query term, seeks the term in that leaf and records its state in
  // contextArray at the term's index: a new TermContext when the slot is empty, otherwise
  // the existing one is updated.
  // NOTE(review): closing braces, the body of the missing-field check, the TermContext
  // constructor arguments and the receiver of the update call are missing from this
  // extract.
  public void collectTermContext(
      IndexReader reader,
      List<LeafReaderContext> leaves,
      TermContext[] contextArray,
      Term[] queryTerms)
      throws IOException {
    TermsEnum termsEnum = null;
    for (LeafReaderContext context : leaves) {
      final Fields fields = context.reader().fields();
      // queryTerms and contextArray are indexed in parallel.
      for (int i = 0; i < queryTerms.length; i++) {
        Term term = queryTerms[i];
        TermContext termContext = contextArray[i];
        final Terms terms = fields.terms(term.field());
        if (terms == null) {
          // field does not exist
        termsEnum = terms.iterator();
        assert termsEnum != null;

        // Nothing to seek in an empty enum.
        if (termsEnum == TermsEnum.EMPTY) continue;
        if (termsEnum.seekExact(term.bytes())) {
          if (termContext == null) {
            contextArray[i] =
                new TermContext(
          } else {
                // These four values (term state, leaf ord, docFreq, totalTermFreq) are the
                // arguments of a call whose receiver is not visible.
                termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
  // Writes documents with an IndexWriter configured with AppendingCodecProvider over an
  // AppendingRAMDirectory, reopens the index and checks stored content, term lookups and
  // postings for field "f".
  // NOTE(review): the addDocument/close calls, the reader-opening call, the seek calls inside
  // the assertions, the postings call and all closing braces are missing from this extract.
  public void testCodec() throws Exception {
    Directory dir = new AppendingRAMDirectory(new RAMDirectory());
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());

    cfg.setCodecProvider(new AppendingCodecProvider());
    // Compound file and compound doc store are both switched off on the merge policy.
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    // `text` is not declared in the visible lines; presumably a field of the test class.
    doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    // NOTE(review): the method being called here is missing; only its argument list survived.
    IndexReader reader =, null, true, 1, new AppendingCodecProvider());
    // Two documents are expected, so the document is presumably added twice on missing lines.
    assertEquals(2, reader.numDocs());
    doc = reader.document(0);
    assertEquals(text, doc.get("f"));
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("f");
    TermsEnum te = terms.iterator();
    // NOTE(review): each assertion below lost the call that produces its second argument
    // (presumably a seek on te for the given term).
    assertEquals(SeekStatus.FOUND, BytesRef("quick")));
    assertEquals(SeekStatus.FOUND, BytesRef("brown")));
    assertEquals(SeekStatus.FOUND, BytesRef("fox")));
    assertEquals(SeekStatus.FOUND, BytesRef("jumped")));
    assertEquals(SeekStatus.FOUND, BytesRef("over")));
    assertEquals(SeekStatus.FOUND, BytesRef("lazy")));
    assertEquals(SeekStatus.FOUND, BytesRef("dog")));
    assertEquals(SeekStatus.FOUND, BytesRef("the")));
    // NOTE(review): the method being called here is missing.
    DocsEnum de =, null);
    // Docs 0 and 1 must both be reachable and nothing beyond them.
    assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
    assertEquals(2, de.freq());
    assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
    assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
  // Shard-level handler: under termlistMutex, walks the terms of the shard's fields
  // (optionally restricted to request.getField()) and returns a ShardTermlistResponse
  // carrying a set of strings. IOExceptions are wrapped in ElasticSearchException.
  // NOTE(review): the shard lookup expression, the body of the '_' check, the statement that
  // adds to `set`, any searcher release and all closing braces are missing from this
  // extract.
  protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
      throws ElasticSearchException {
    synchronized (termlistMutex) {
      // NOTE(review): the right-hand side of this declaration is missing.
      InternalIndexShard indexShard =
      Engine.Searcher searcher = indexShard.searcher();
      try {
        Set<String> set = new CompactHashSet();

        Fields fields = MultiFields.getFields(searcher.reader());
        if (fields != null) {
          for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            // NOTE(review): the right-hand side of this assignment is missing (presumably the
            // iterator's next element).
            String field =;
            // NOTE(review): body not visible; presumably skips fields whose name starts
            // with '_'.
            if (field.charAt(0) == '_') {
            // A null request field means "all fields".
            if (request.getField() == null || field.equals(request.getField())) {
              Terms terms = fields.terms(field);
              if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                // NOTE(review): the right-hand side of the assignment to text is missing.
                while ((text = != null) {
                  // Writes every term to stdout.
                  System.out.println("field=" + field + "; text=" + text.utf8ToString());
        return new ShardTermlistResponse(request.index(), request.shardId(), set);
      } catch (IOException ex) {
        throw new ElasticSearchException(ex.getMessage(), ex);
 // Writes one field's term-vector data into the builder: field statistics first, then one
 // buildTerm call per term.
 // NOTE(review): closing braces and any builder start/end-object calls are missing from this
 // extract.
 private void buildField(
     XContentBuilder builder,
     final CharsRefBuilder spare,
     Fields theFields,
     Iterator<String> fieldIter)
     throws IOException {
   // NOTE(review): the right-hand side of this assignment is missing (presumably the next
   // element of fieldIter).
   String fieldName =;
   Terms curTerms = theFields.terms(fieldName);
   // write field statistics
   buildFieldStatistics(builder, curTerms);
   TermsEnum termIter = curTerms.iterator(null);
   // buildTerm is called curTerms.size() times with the same iterator; curTerms.size() is
   // re-evaluated on every pass.
   for (int i = 0; i < curTerms.size(); i++) {
     buildTerm(builder, spare, curTerms, termIter);
  // Terms listing handler: for each requested field, walks the indexed terms between the
  // optional lower/upper bounds, filters them by prefix, regexp and document frequency, and
  // adds (term, docFreq) pairs to the "terms" section of the response — either in index
  // order or, when sorting is on, through a bounded sorted set.
  // NOTE(review): closing braces, several `continue` statements and every call that advances
  // termsEnum are missing from this extract.
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    // Do nothing unless the component is switched on in the request.
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    // The "terms" section is added to the response even when no fields were requested.
    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    // A negative limit means "no limit".
    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    // Defaults: upper bound exclusive, lower bound inclusive.
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    // NOTE(review): the expression that compares the sort parameter is truncated; only its
    // argument survived. The default sort value is TERMS_SORT_COUNT.
    boolean sort =
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    // A negative max count means "no upper limit".
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    // The pattern is compiled once here and reused for every field and term.
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      // An entry is added for every requested field, even one without terms.
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field

      // In raw mode, or when the schema has no type for the field, fall back to StrField.
      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        // NOTE(review): this assignment is overwritten in both branches below.
        lowerBytes = new BytesRef();
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      // Position the enum on the first candidate term.
      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          // Nothing at or after the lower bound; term stays null so the loop below is skipped.
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            // NOTE(review): right-hand side missing (presumably advances termsEnum).
            term =;
      } else {
        // position termsEnum on first term
        // NOTE(review): right-hand side missing (presumably advances termsEnum).
        term =;

      // NOTE(review): i is compared against limit below, but no increment of i is visible in
      // this extract.
      int i = 0;
      // Only allocated when sorting; capped at `limit` entries.
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      // When sorting, every term in range must be visited, so the limit is not applied here.
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          // Terms that do not match the regexp are skipped.
          if (!pattern.matcher(external).matches()) {
            // NOTE(review): right-hand side missing (presumably advances termsEnum), as is
            // the following `continue`.
            term =;

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            // Deep copy: the enum's BytesRef is presumably reused between terms.
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            // Reuse the readable form if the regexp check already produced it.
            if (!externalized) {
              ft.indexedToReadable(term, external);
            fieldTerms.add(external.toString(), docFreq);

        // NOTE(review): right-hand side missing (presumably advances termsEnum).
        term =;

      // Sorted mode: emit the queue contents, stopping once i reaches limit.
      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
  // Term vector handler: resolves global and per-field options (tf, df, positions, offsets,
  // tf-idf), collects schema-driven warnings, then for each requested or matched document
  // adds its term vectors to the TERM_VECTORS section of the response via mapOneVector.
  // NOTE(review): closing braces, the early return after the component check, the statements
  // that fill noPos/noOff and uniqValues, and several call expressions are missing from
  // this extract.
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    // NOTE(review): body not visible (presumably returns when the component is disabled).
    if (!params.getBool(COMPONENT_NAME, false)) {

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    // Stays null for schemas without a unique key.
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
      termVectors.add("uniqueKeyFieldName", uniqFieldName);

    // Request-wide defaults; per-field parameters fall back to these.
    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // short cut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;

    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    // Field names collected for the three warning categories reported further down.
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
      // we have specific fields to retrieve, or no fields
      for (String field : fields) {

        // workaround SOLR-3523
        if (null == field || "score".equals(field)) continue;

        // we don't want to issue warnings about the uniqueKey field
        // since it can cause lots of confusion in distributed requests
        // where the uniqueKey field is injected into the fl for merging
        final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            // Create the per-field options on first sight of the field.
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            // NOTE(review): body not visible (presumably records the field in noPos).
            if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            // NOTE(review): body not visible (presumably records the field in noOff).
            if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
          } else { // field doesn't have term vectors
            if (!fieldIsUniqueKey) noTV.add(field);
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
    } // else, deal with all fields

    // NOTE: currently all types of warnings are schema driven, and guaranteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be different between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    if (!noPos.isEmpty()) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    if (!noOff.isEmpty()) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    // The "warnings" entry is only added when at least one category is non-empty.
    if (hasWarnings) {
      termVectors.add("warnings", warnings);

    DocListAndSet listAndSet = rb.getResults();
    // Explicitly requested doc ids take precedence over the query's result list.
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors

    // Only load the id field to get the uniqueKey of that
    // field

    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    // NOTE(review): the bodies of the three *Field callbacks are missing from this extract;
    // presumably they add the value to uniqValues.
    final StoredFieldVisitor getUniqValue =
        new StoredFieldVisitor() {
          public void stringField(FieldInfo fieldInfo, String value) {

          public void intField(FieldInfo fieldInfo, int value) {

          public void longField(FieldInfo fieldInfo, long value) {

          public Status needsField(FieldInfo fieldInfo) {
            // NOTE(review): the condition is missing (presumably a comparison of the field
            // name with finalUniqFieldName).
            return ( ? Status.YES : Status.NO;

    // Passed to iterator(termsEnum) on every vector below.
    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
      // NOTE(review): right-hand side missing (presumably the iterator's next element).
      Integer docId =;
      NamedList<Object> docNL = new NamedList<Object>();

      if (keyField != null) {
        reader.document(docId, getUniqValue);
        String uniqVal = null;
        // NOTE(review): no clearing of uniqValues between documents is visible in this
        // extract, yet get(0) is read for every document — confirm against the original.
        if (uniqValues.size() != 0) {
          uniqVal = uniqValues.get(0);
          docNL.add("uniqueKey", uniqVal);
          termVectors.add(uniqVal, docNL);
      } else {
        // support for schemas w/o a unique key,
        termVectors.add("doc-" + docId, docNL);

      if (null != fields) {
        // Specific fields: only those that passed the schema checks above.
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          final String field = entry.getKey();
          final Terms vector = reader.getTermVector(docId, field);
          if (vector != null) {
            termsEnum = vector.iterator(termsEnum);
            // NOTE(review): vector.iterator(termsEnum) is invoked a second time as an
            // argument here, whereas the all-fields branch below passes termsEnum directly.
            mapOneVector(docNL, entry.getValue(), reader, docId, vector.iterator(termsEnum), field);
      } else {
        // extract all fields
        final Fields vectors = reader.getTermVectors(docId);
        // NOTE(review): no null check on `vectors` is visible before it is iterated.
        for (String field : vectors) {
          Terms terms = vectors.terms(field);
          if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
Esempio n. 27
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  // Enumerates the terms of `field` and, for each term, counts how many documents of `docs`
  // contain it; returns the (term, count) entries selected by offset, limit and mincount.
  //   offset/limit: number of qualifying entries to skip / to emit; a negative limit means
  //                 "no limit" (it is mapped to Integer.MAX_VALUE below).
  //   mincount:     entries whose count is below this value are not emitted.
  //   missing:      when true, one extra entry with a null key is appended, holding the value
  //                 of getFieldMissingCount(searcher, docs, field).
  //   sort:         "count" or "true" orders by count through a bounded queue; any other value
  //                 emits entries in the order the terms are enumerated.
  //   prefix:       when non-null, only terms starting with this prefix are visited.
  //   contains:     when non-null, only terms accepted by contains(term, contains, ignoreCase)
  //                 are counted.
  //   params:       not referenced anywhere in the visible body.
  // NOTE(review): this copy of the method has lost its closing braces, the terminator of the
  // TODO block comment, and two expressions (flagged inline). It does not compile as shown.
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
    // NOTE(review): the block comment above is missing its closing delimiter in this copy.

    // Minimum term docFreq in order to use the filterCache for that term.
    // (`global` is not declared in this method; it is resolved from the enclosing scope.)
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    // With minDfFilterCache == 0 every term with df > 0 takes the filter-cache branch below, so
    // the conversion is only worthwhile when minDfFilterCache > 0.
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    // Both "count" and "true" select count ordering.
    boolean sortByCount = sort.equals("count") || sort.equals("true");
    // Capacity of the bounded queue: the skipped entries (offset) plus the emitted ones (limit).
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    // The queue is only allocated when sorting by count; it stays null otherwise.
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    // The prefix is converted with FieldType.toInternal before being compared against terms.
    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    // `term` stays null when the field has no terms or the prefix seek hits END, which makes
    // the enumeration loop below a no-op.
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        // Seek to the prefix; a SeekStatus.END result means there is nothing to enumerate.
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
      } else {
        // position termsEnum on first term
        // NOTE(review): the right-hand side was lost in this copy; presumably termsEnum.next()
        // - confirm against the upstream source.
        term =;

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    // If the base set holds fewer documents than mincount, no term can reach mincount, so the
    // enumeration is skipped entirely.
    if (docs.size() >= mincount) {
      while (term != null) {

        // The first term that no longer starts with the prefix ends the scan.
        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            // c receives the number of documents of `docs` that contain the current term.
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              // The state object is created once, on first use, and reused for later terms.
              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;

              c = searcher.numDocs(docs, deState);

              // Keep whatever enum the searcher left in the state so it can be reused.
              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                // Walk each sub-enum directly; its doc ids are offset by the slice start before
                // being looked up in the base set.
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;

            if (sortByCount) {
              if (c > min) {
                // The term bytes are deep-copied before being queued (presumably because the
                // enum may reuse its BytesRef - confirm against the TermsEnum contract).
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                // Once the queue is full, raise the admission threshold to the count held by
                // its last element.
                if (queue.size() >= maxsize) min = queue.last().val;
            } else {
              // Enumeration order: skip the first `offset` qualifying terms, then emit entries
              // until `limit` is exhausted.
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
        // NOTE(review): the right-hand side was lost in this copy; presumably termsEnum.next()
        // to advance to the following term - confirm against the upstream source.
        term =;

    // Count ordering: drain the queue, applying offset and limit while emitting.
    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);

    // The "missing" bucket is appended last, under a null key.
    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));

    return res;
  // NumericDocValues Updates
  // If otherFieldUpdates != null, we need to merge the updates into them
  /**
   * Resolves each {@link NumericUpdate} against the terms of {@code reader} and records, per
   * updated field, the new value for every matching document whose id is below the update's
   * {@code docIDUpto}.
   *
   * <p>NOTE(review): this copy has lost its closing braces, the body of one {@code if} and part
   * of the {@code docs(...)} call (flagged inline); it does not compile as shown.
   *
   * @param updates the updates to apply, visited in iteration order
   * @param rld not referenced in the visible body; the truncated {@code docs(...)} call may have
   *     used it - confirm against the upstream source
   * @param reader the segment reader whose fields and terms are consulted
   * @param otherFieldUpdates an existing map to add the updates to, or {@code null} to collect
   *     them in a new {@link HashMap}
   * @return the map from field name to accumulated updates; {@link Collections#emptyMap()} when
   *     {@code reader.fields()} is {@code null}, regardless of {@code otherFieldUpdates}
   * @throws IOException declared by the method; presumably propagated from the reader and enum
   *     calls
   */
  private synchronized Map<String, NumericFieldUpdates> applyNumericDocValuesUpdates(
      Iterable<NumericUpdate> updates,
      ReadersAndUpdates rld,
      SegmentReader reader,
      Map<String, NumericFieldUpdates> otherFieldUpdates)
      throws IOException {
    Fields fields = reader.fields();
    if (fields == null) {
      // This reader has no postings
      return Collections.emptyMap();

    // TODO: we can process the updates per DV field, from last to first so that
    // if multiple terms affect same document for the same field, we add an update
    // only once (that of the last term). To do that, we can keep a bitset which
    // marks which documents have already been updated. So e.g. if term T1
    // updates doc 7, and then we process term T2 and it updates doc 7 as well,
    // we don't apply the update since we know T1 came last and therefore wins
    // the update.
    // We can also use that bitset as 'liveDocs' to pass to the docs(...) call (the method name
    // was lost in this copy; presumably TermsEnum.docs - confirm), so
    // that these documents aren't even returned.

    String currentField = null;
    TermsEnum termsEnum = null;
    // Passed as the reuse argument of the docs(...) call below; never reassigned in the
    // visible body.
    DocsEnum docs = null;
    // Accumulate into the caller's map when one was supplied, otherwise into a fresh one.
    final Map<String, NumericFieldUpdates> result =
        otherFieldUpdates == null ? new HashMap<String, NumericFieldUpdates>() : otherFieldUpdates;
    // System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
    for (NumericUpdate update : updates) {
      Term term = update.term;
      int limit = update.docIDUpto;

      // TODO: we traverse the terms in update order (not term order) so that we
      // apply the updates in the correct order, i.e. if two terms update the
      // same document, the last one that came in wins, irrespective of the
      // terms lexical order.
      // we can apply the updates in terms order if we keep an updatesGen (and
      // increment it with every update) and attach it to each NumericUpdate. Note
      // that we cannot rely only on docIDUpto because an app may send two updates
      // which will get same docIDUpto, yet will still need to respect the order
      // those updates arrived.

      // A new TermsEnum is obtained only when the term's field differs from the previous
      // update's field; the previous enum is offered for reuse.
      if (!term.field().equals(currentField)) {
        // if we change the code to process updates in terms order, enable this assert
        //        assert currentField == null || currentField.compareTo(term.field()) < 0;
        currentField = term.field();
        Terms terms = fields.terms(currentField);
        if (terms != null) {
          termsEnum = terms.iterator(termsEnum);
        } else {
          termsEnum = null;
          continue; // no terms in that field

      // NOTE(review): the body of this check was lost in this copy; presumably `continue;` so
      // that later updates on a field without terms are skipped - confirm.
      if (termsEnum == null) {
      // System.out.println("  term=" + term);

      if (termsEnum.seekExact(term.bytes())) {
        // we don't need term frequencies for this
        // NOTE(review): the receiver and first argument of this call were lost in this copy;
        // presumably termsEnum.docs(<live docs>, docs, DocsEnum.FLAG_NONE) - confirm.
        DocsEnum docsEnum =, docs, DocsEnum.FLAG_NONE);

        // System.out.println("BDS: got docsEnum=" + docsEnum);

        // The per-field container is created on first use, sized by reader.maxDoc().
        NumericFieldUpdates fieldUpdates = result.get(update.field);
        if (fieldUpdates == null) {
          fieldUpdates = new NumericFieldUpdates.PackedNumericFieldUpdates(reader.maxDoc());
          result.put(update.field, fieldUpdates);
        int doc;
        while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          // System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term +
          // " doc=" + docID);
          if (doc >= limit) {
            break; // no more docs that can be updated for this term
          fieldUpdates.add(doc, update.value);
    return result;
  // Delete by Term
  /**
   * Applies delete-by-term to one segment: for every term of {@code termsIter} that exists in
   * {@code reader}, walks the term's documents and calls {@code rld.delete(docID)} for each.
   *
   * <p>NOTE(review): this copy has lost its closing braces, the bodies of several {@code if}
   * statements and part of the {@code docs(...)} call (flagged inline); it does not compile as
   * shown.
   *
   * @param termsIter the delete terms; the assert in the loop expects their fields to arrive in
   *     increasing order
   * @param rld receives the {@code delete(docID)} calls
   * @param reader the segment reader whose fields and terms are consulted
   * @return {@code delCount}; it is initialised to 0 and no increment is visible in this copy
   *     (presumably lost from the {@code rld.delete} branch - confirm). Returns 0 immediately
   *     when {@code reader.fields()} is {@code null}
   * @throws IOException declared by the method; presumably propagated from the reader and enum
   *     calls
   */
  private synchronized long applyTermDeletes(
      Iterable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) throws IOException {
    long delCount = 0;
    Fields fields = reader.fields();
    if (fields == null) {
      // This reader has no postings
      return 0;

    TermsEnum termsEnum = null;

    String currentField = null;
    // Passed as the reuse argument of the docs(...) call below; never reassigned in the
    // visible body.
    DocsEnum docs = null;

    // NOTE(review): checkDeleteTerm is defined elsewhere; calling it with null presumably
    // resets its ordering state before the loop - confirm.
    assert checkDeleteTerm(null);

    // Set on the first document visited; not read anywhere else in the visible body.
    boolean any = false;

    // System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
    for (Term term : termsIter) {
      // Since we visit terms sorted, we gain performance
      // by re-using the same TermsEnum and seeking only
      // forwards
      if (!term.field().equals(currentField)) {
        // Fields must be visited in strictly increasing order.
        assert currentField == null || currentField.compareTo(term.field()) < 0;
        currentField = term.field();
        Terms terms = fields.terms(currentField);
        if (terms != null) {
          termsEnum = terms.iterator(termsEnum);
        } else {
          termsEnum = null;

      // NOTE(review): the body of this check was lost in this copy; presumably `continue;` so
      // that terms of a field without postings are skipped - confirm.
      if (termsEnum == null) {
      assert checkDeleteTerm(term);

      // System.out.println("  term=" + term);

      if (termsEnum.seekExact(term.bytes())) {
        // we don't need term frequencies for this
        // NOTE(review): the receiver and first argument of this call were lost in this copy;
        // presumably termsEnum.docs(<live docs>, docs, DocsEnum.FLAG_NONE) - confirm.
        DocsEnum docsEnum =, docs, DocsEnum.FLAG_NONE);
        // System.out.println("BDS: got docsEnum=" + docsEnum);

        if (docsEnum != null) {
          while (true) {
            final int docID = docsEnum.nextDoc();
            // System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" +
            // docID);
            // NOTE(review): the body of this check was lost in this copy; presumably `break;`,
            // the only way out of the surrounding while (true) - confirm.
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
            // NOTE(review): only the flag assignment survives here; any one-time setup that
            // accompanied it upstream is not visible - confirm.
            if (!any) {
              any = true;
            // NOTE: there is no limit check on the docID
            // when deleting by Term (unlike by Query)
            // because on flush we apply all Term deletes to
            // each segment.  So all Term deleting here is
            // against prior segments:
            // NOTE(review): the body of this check was lost in this copy; presumably it
            // increments delCount - confirm.
            if (rld.delete(docID)) {

    return delCount;
  /**
   * Runs a reader / segmenter / {@code LuceneNGramMetaCollector} pipeline that writes to a
   * temporary directory, then opens the resulting index and checks the statistics of the term
   * {@code "this"} in the {@code LuceneNGramDFE.LUCENE_NGRAM_FIELD} field as well as the final
   * value of the counter {@code i}.
   *
   * <p>NOTE(review): this copy has lost its closing braces and most initialiser expressions
   * (flagged inline); it does not compile as shown.
   *
   * @throws Exception declared by the method; failures while reading the index are rethrown as
   *     {@code ResourceInitializationException}
   */
  public void luceneNgramMetaCollectorTest() throws Exception {
    // Temporary directory handed to the meta collector as PARAM_LUCENE_DIR.
    File tmpDir = folder.newFolder();

    // NOTE(review): the initialisers of the three descriptions below were lost in this copy;
    // only the trailing arguments of the meta collector's factory call survive.
    CollectionReaderDescription reader =

    AnalysisEngineDescription segmenter =

    AnalysisEngineDescription metaCollector =
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    // The loop body holds only a commented-out print; the iteration itself is presumably what
    // drives the documents through the pipeline - confirm.
    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      //            System.out.println(jcas.getDocumentText().length());

    // Counter checked at the end of the test. NOTE(review): no increment is visible in this
    // copy; presumably an `i++` inside the term loop was lost - confirm.
    int i = 0;
    IndexReader index;
    try {
      // NOTE(review): the expression that opens the index was lost in this copy.
      index =;
      Fields fields = MultiFields.getFields(index);
      if (fields != null) {
        Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          //                    Bits liveDocs = MultiFields.getLiveDocs(index);
          //                    DocsEnum docs =, null);
          //                    int docId;
          //                    while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
          //                        index.g
          //                    }
          BytesRef text = null;
          // NOTE(review): the right-hand side of the assignment was lost in this copy;
          // presumably termsEnum.next() - confirm.
          while ((text = != null) {
            //                        System.out.println(text.utf8ToString() + " - " +
            // termsEnum.totalTermFreq());
            //                        System.out.println(termsEnum.docFreq());

            // The term "this" is expected in 2 documents with 3 occurrences in total.
            if (text.utf8ToString().equals("this")) {
              assertEquals(2, termsEnum.docFreq());
              assertEquals(3, termsEnum.totalTermFreq());

    } catch (Exception e) {
      // Any failure while reading the index is wrapped, keeping the original exception as cause.
      throw new ResourceInitializationException(e);

    assertEquals(35, i);