Ejemplo n.º 1
0
 /**
  * Prepare a document reconstructor.
  *
  * @param reader IndexReader to read from.
  * @param fieldNames if non-null or not empty, data will be collected only from these fields,
  *     otherwise data will be collected from all fields
  * @param numTerms total number of terms in the index, or -1 if unknown (will be calculated)
  * @throws Exception
  */
 public DocReconstructor(IndexReader reader, String[] fieldNames, int numTerms) throws Exception {
   if (reader == null) {
     throw new Exception("IndexReader cannot be null.");
   }
   this.reader = reader;
   if (fieldNames == null || fieldNames.length == 0) {
     // collect fieldNames
     this.fieldNames = (String[]) reader.getFieldNames(FieldOption.ALL).toArray(new String[0]);
   } else {
     this.fieldNames = fieldNames;
   }
   if (numTerms == -1) {
     Fields fields = MultiFields.getFields(reader);
     numTerms = 0;
     FieldsEnum fe = fields.iterator();
     String fld = null;
     while ((fld = fe.next()) != null) {
       TermsEnum te = fe.terms();
       while (te.next() != null) {
         numTerms++;
       }
     }
     this.numTerms = numTerms;
   }
   deleted = MultiFields.getDeletedDocs(reader);
 }
  public void testGetTermVector() throws IOException {
    createIndexWithAlias();
    assertAcked(
        client()
            .admin()
            .indices()
            .preparePutMapping("test")
            .setType("type1")
            .setSource("field", "type=text,term_vector=with_positions_offsets_payloads")
            .get());
    ensureYellow("test");

    client()
        .prepareIndex(indexOrAlias(), "type1", "1")
        .setSource("field", "the quick brown fox jumps over the lazy dog")
        .get();
    refresh();

    TermVectorsResponse termVectorsResponse =
        client().prepareTermVectors(indexOrAlias(), "type1", "1").get();
    assertThat(termVectorsResponse.getIndex(), equalTo("test"));
    assertThat(termVectorsResponse.isExists(), equalTo(true));
    Fields fields = termVectorsResponse.getFields();
    assertThat(fields.size(), equalTo(1));
    assertThat(fields.terms("field").size(), equalTo(8L));
  }
    Query createCandidateQuery(IndexReader indexReader) throws IOException {
      List<Term> extractedTerms = new ArrayList<>();
      // include extractionResultField:failed, because docs with this term have no
      // extractedTermsField
      // and otherwise we would fail to return these docs. Docs that failed query term extraction
      // always need to be verified by MemoryIndex:
      extractedTerms.add(new Term(extractionResultField.name(), EXTRACTION_FAILED));

      LeafReader reader = indexReader.leaves().get(0).reader();
      Fields fields = reader.fields();
      for (String field : fields) {
        Terms terms = fields.terms(field);
        if (terms == null) {
          continue;
        }

        BytesRef fieldBr = new BytesRef(field);
        TermsEnum tenum = terms.iterator();
        for (BytesRef term = tenum.next(); term != null; term = tenum.next()) {
          BytesRefBuilder builder = new BytesRefBuilder();
          builder.append(fieldBr);
          builder.append(FIELD_VALUE_SEPARATOR);
          builder.append(term);
          extractedTerms.add(new Term(queryTermsField.name(), builder.toBytesRef()));
        }
      }
      return new TermsQuery(extractedTerms);
    }
Ejemplo n.º 4
0
 /**
  * Retrieve term vector for this document and field, or null if term vectors were not indexed. The
  * returned Fields instance acts like a single-document inverted index (the docID will be 0).
  */
 public final Terms getTermVector(int docID, String field) throws IOException {
   Fields vectors = getTermVectors(docID);
   if (vectors == null) {
     return null;
   }
   return vectors.terms(field);
 }
Ejemplo n.º 5
0
  public void listTokens(int freq) throws IOException {
    IndexReader ireader = null;
    TermsEnum iter = null;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory);
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.DEFS);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        // if (iter.term().field().startsWith("f")) {
        if (iter.docFreq() > 16 && iter.term().utf8ToString().length() > freq) {
          log.warning(iter.term().utf8ToString());
        }
        iter.next();
        /*} else {
        break;
        }*/
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Ejemplo n.º 6
0
  /**
   * List all of the files in this index database
   *
   * @throws IOException If an IO error occurs while reading from the database
   */
  public void listFiles() throws IOException {
    IndexReader ireader = null;
    TermsEnum iter;
    Terms terms = null;

    try {
      ireader = DirectoryReader.open(indexDirectory); // open existing index
      int numDocs = ireader.numDocs();
      if (numDocs > 0) {
        Fields uFields = MultiFields.getFields(ireader); // reader.getTermVectors(0);
        terms = uFields.terms(QueryBuilder.U);
      }
      iter = terms.iterator(null); // init uid iterator
      while (iter.term() != null) {
        log.fine(Util.uid2url(iter.term().utf8ToString()));
        iter.next();
      }
    } finally {

      if (ireader != null) {
        try {
          ireader.close();
        } catch (IOException e) {
          log.log(Level.WARNING, "An error occured while closing index reader", e);
        }
      }
    }
  }
Ejemplo n.º 7
0
  /**
   * Find words for a more-like-this query former.
   *
   * @param docNum the id of the lucene document from which to find terms
   */
  private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
      final Fields vectors = ir.getTermVectors(docNum);
      final Terms vector;
      if (vectors != null) {
        vector = vectors.terms(fieldName);
      } else {
        vector = null;
      }

      // field does not store term vector info
      if (vector == null) {
        Document d = ir.document(docNum);
        IndexableField[] fields = d.getFields(fieldName);
        for (IndexableField field : fields) {
          final String stringValue = field.stringValue();
          if (stringValue != null) {
            addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName);
          }
        }
      } else {
        addTermFrequencies(field2termFreqMap, vector, fieldName);
      }
    }

    return createQueue(field2termFreqMap);
  }
Ejemplo n.º 8
0
  private void printSegment(PrintWriter out, SegmentCommitInfo si) throws Exception {
    SegmentReader reader =
        new SegmentReader(si, DirectoryReader.DEFAULT_TERMS_INDEX_DIVISOR, newIOContext(random()));

    for (int i = 0; i < reader.numDocs(); i++) out.println(reader.document(i));

    Fields fields = reader.fields();
    for (String field : fields) {
      Terms terms = fields.terms(field);
      assertNotNull(terms);
      TermsEnum tis = terms.iterator(null);
      while (tis.next() != null) {

        out.print("  term=" + field + ":" + tis.term());
        out.println("    DF=" + tis.docFreq());

        DocsAndPositionsEnum positions = tis.docsAndPositions(reader.getLiveDocs(), null);

        while (positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          out.print(" doc=" + positions.docID());
          out.print(" TF=" + positions.freq());
          out.print(" pos=");
          out.print(positions.nextPosition());
          for (int j = 1; j < positions.freq(); j++) out.print("," + positions.nextPosition());
          out.println("");
        }
      }
    }
    reader.close();
  }
Ejemplo n.º 9
0
  /*
   *  listTermVectors displays the term vectors for all of the fields
   *  in a document in an index (specified by reader).
   */
  static void listTermVectors(IndexReader reader, String docidString) throws IOException {

    System.out.println("\nTermVector:  docid " + docidString);

    int docid = Integer.parseInt(docidString);

    if ((docid < 0) || (docid >= reader.numDocs())) {
      System.out.println("ERROR:  " + docidString + " is a bad document id.");
      return;
    }
    ;

    /*
     *  Iterate over the fields in this document.
     */
    Fields fields = reader.getTermVectors(docid);
    Iterator<String> fieldIterator = fields.iterator();

    while (fieldIterator.hasNext()) {
      String fieldName = fieldIterator.next();
      System.out.println("  Field: " + fieldName);

      Terms terms = fields.terms(fieldName);
      termVectorDisplay(terms);
    }
    ;
  }
 @Override
 public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
   assert index != null;
   assert type != null;
   assert id != null;
   builder.field(FieldStrings._INDEX, index);
   builder.field(FieldStrings._TYPE, type);
   if (!isArtificial()) {
     builder.field(FieldStrings._ID, id);
   }
   builder.field(FieldStrings._VERSION, docVersion);
   builder.field(FieldStrings.FOUND, isExists());
   builder.field(FieldStrings.TOOK, tookInMillis);
   if (!isExists()) {
     return builder;
   }
   builder.startObject(FieldStrings.TERM_VECTORS);
   final CharsRefBuilder spare = new CharsRefBuilder();
   Fields theFields = getFields();
   Iterator<String> fieldIter = theFields.iterator();
   while (fieldIter.hasNext()) {
     buildField(builder, spare, theFields, fieldIter);
   }
   builder.endObject();
   return builder;
 }
Ejemplo n.º 11
0
 public int[] toDocsArray(Term term, Bits bits, IndexReader reader) throws IOException {
   Fields fields = MultiFields.getFields(reader);
   Terms cterms = fields.terms(term.field);
   TermsEnum ctermsEnum = cterms.iterator();
   if (ctermsEnum.seekExact(new BytesRef(term.text()))) {
     PostingsEnum postingsEnum =
         TestUtil.docs(random(), ctermsEnum, bits, null, PostingsEnum.NONE);
     return toArray(postingsEnum);
   }
   return null;
 }
Ejemplo n.º 12
0
  public void testChangeGaps() throws Exception {
    // LUCENE-5324: check that it is possible to change the wrapper's gaps
    final int positionGap = random().nextInt(1000);
    final int offsetGap = random().nextInt(1000);
    final Analyzer delegate = new MockAnalyzer(random());
    final Analyzer a =
        new DelegatingAnalyzerWrapper(delegate.getReuseStrategy()) {
          @Override
          protected Analyzer getWrappedAnalyzer(String fieldName) {
            return delegate;
          }

          @Override
          public int getPositionIncrementGap(String fieldName) {
            return positionGap;
          }

          @Override
          public int getOffsetGap(String fieldName) {
            return offsetGap;
          }
        };

    final RandomIndexWriter writer = new RandomIndexWriter(random(), newDirectory(), a);
    final Document doc = new Document();
    final FieldType ft = new FieldType();
    ft.setIndexOptions(IndexOptions.DOCS);
    ft.setTokenized(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);
    doc.add(new Field("f", "a", ft));
    doc.add(new Field("f", "a", ft));
    writer.addDocument(doc);
    final LeafReader reader = getOnlySegmentReader(writer.getReader());
    final Fields fields = reader.getTermVectors(0);
    final Terms terms = fields.terms("f");
    final TermsEnum te = terms.iterator();
    assertEquals(new BytesRef("a"), te.next());
    final PostingsEnum dpe = te.postings(null, PostingsEnum.ALL);
    assertEquals(0, dpe.nextDoc());
    assertEquals(2, dpe.freq());
    assertEquals(0, dpe.nextPosition());
    assertEquals(0, dpe.startOffset());
    final int endOffset = dpe.endOffset();
    assertEquals(1 + positionGap, dpe.nextPosition());
    assertEquals(1 + endOffset + offsetGap, dpe.endOffset());
    assertEquals(null, te.next());
    reader.close();
    writer.close();
    writer.w.getDirectory().close();
  }
Ejemplo n.º 13
0
  /**
   * @param reader
   * @param numTerms
   * @param field
   * @return TermStats[] ordered by terms with highest docFreq first.
   * @throws Exception
   */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String[] fieldNames)
      throws Exception {
    TermStatsQueue tiq = null;
    TermsEnum te = null;

    if (fieldNames != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      for (String field : fieldNames) {
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        LOG.info("Index with no fields - probably empty or corrupted");
        return EMPTY_STATS;
      }
      tiq = new TermStatsQueue(numTerms);
      Iterator<String> fieldIterator = fields.iterator();
      while (fieldIterator.hasNext()) {
        String field = fieldIterator.next();
        Terms terms = fields.terms(field);
        if (terms != null) {
          te = terms.iterator(te);
          fillQueue(te, tiq, field);
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
Ejemplo n.º 14
0
  public static Terms getTermVector(String fieldname, SolrIndexSearcher solrIndexSearcher)
      throws JATEException {
    try {
      Fields fields = MultiFields.getFields(solrIndexSearcher.getLeafReader());

      Terms vector = fields.terms(fieldname);
      if (vector == null)
        throw new JATEException(String.format("Cannot find expected field: %s", fieldname));
      return vector;
    } catch (IOException ioe) {
      StringBuilder sb =
          new StringBuilder(
              String.format("Cannot find expected field: %s. Error stacktrack: \n", fieldname));
      sb.append(org.apache.commons.lang.exception.ExceptionUtils.getFullStackTrace(ioe));
      throw new JATEException(sb.toString());
    }
  }
Ejemplo n.º 15
0
 private void verifyCount(IndexReader ir) throws Exception {
   Fields fields = MultiFields.getFields(ir);
   for (String field : fields) {
     Terms terms = fields.terms(field);
     if (terms == null) {
       continue;
     }
     int docCount = terms.getDocCount();
     FixedBitSet visited = new FixedBitSet(ir.maxDoc());
     TermsEnum te = terms.iterator();
     while (te.next() != null) {
       PostingsEnum de = TestUtil.docs(random(), te, null, PostingsEnum.NONE);
       while (de.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
         visited.set(de.docID());
       }
     }
     assertEquals(visited.cardinality(), docCount);
   }
 }
 private Fields mergeFields(Fields fields1, Fields fields2) throws IOException {
   ParallelFields parallelFields = new ParallelFields();
   for (String fieldName : fields2) {
     Terms terms = fields2.terms(fieldName);
     if (terms != null) {
       parallelFields.addField(fieldName, terms);
     }
   }
   for (String fieldName : fields1) {
     if (parallelFields.fields.containsKey(fieldName)) {
       continue;
     }
     Terms terms = fields1.terms(fieldName);
     if (terms != null) {
       parallelFields.addField(fieldName, terms);
     }
   }
   return parallelFields;
 }
  /**
   * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>
   * O(numIndexedFields)</code>.
   *
   * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns
   *     will break out its in-heap bytes separately in the returned {@link CompletionStats}
   */
  public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    CompletionStats completionStats = new CompletionStats();
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
      LeafReader atomicReader = atomicReaderContext.reader();
      try {
        Fields fields = atomicReader.fields();
        for (String fieldName : fields) {
          Terms terms = fields.terms(fieldName);
          if (terms instanceof CompletionTerms) {
            CompletionTerms completionTerms = (CompletionTerms) terms;
            completionStats.add(completionTerms.stats(fieldNamePatterns));
          }
        }
      } catch (IOException ioe) {
        logger.error("Could not get completion stats", ioe);
      }
    }

    return completionStats;
  }
Ejemplo n.º 18
0
  public static DocSet createDocSet(SolrIndexSearcher searcher, Term term) throws IOException {
    DirectoryReader reader = searcher.getRawReader(); // raw reader to avoid extra wrapping overhead
    int maxDoc = searcher.getIndexReader().maxDoc();
    int smallSetSize = smallSetSize(maxDoc);

    String field = term.field();
    BytesRef termVal = term.bytes();

    int maxCount = 0;
    int firstReader = -1;
    List<LeafReaderContext> leaves = reader.leaves();
    PostingsEnum[] postList =
        new PostingsEnum
            [leaves
                .size()]; // use array for slightly higher scanning cost, but fewer memory
                          // allocations
    for (LeafReaderContext ctx : leaves) {
      assert leaves.get(ctx.ord) == ctx;
      LeafReader r = ctx.reader();
      Fields f = r.fields();
      Terms t = f.terms(field);
      if (t == null) continue; // field is missing
      TermsEnum te = t.iterator();
      if (te.seekExact(termVal)) {
        maxCount += te.docFreq();
        postList[ctx.ord] = te.postings(null, PostingsEnum.NONE);
        if (firstReader < 0) firstReader = ctx.ord;
      }
    }

    if (maxCount == 0) {
      return DocSet.EMPTY;
    }

    if (maxCount <= smallSetSize) {
      return createSmallSet(leaves, postList, maxCount, firstReader);
    }

    return createBigSet(leaves, postList, maxDoc, firstReader);
  }
Ejemplo n.º 19
0
 /**
  * Return a query that will return docs like the passed Fields.
  *
  * @return a query that will return docs like the passed Fields.
  */
 public Query like(Fields... likeFields) throws IOException {
   // get all field names
   Set<String> fieldNames = new HashSet<>();
   for (Fields fields : likeFields) {
     for (String fieldName : fields) {
       fieldNames.add(fieldName);
     }
   }
   // term selection is per field, then appended to a single boolean query
   BooleanQuery bq = new BooleanQuery();
   for (String fieldName : fieldNames) {
     Map<String, Int> termFreqMap = new HashMap<>();
     for (Fields fields : likeFields) {
       Terms vector = fields.terms(fieldName);
       if (vector != null) {
         addTermFrequencies(termFreqMap, vector, fieldName);
       }
     }
     addToQuery(createQueue(termFreqMap, fieldName), bq);
   }
   return bq;
 }
Ejemplo n.º 20
0
  /** Returns TermStats[] ordered by terms with highest docFreq first. */
  public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
      throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("field " + field + " not found");
      }
      Terms terms = fields.terms(field);
      if (terms != null) {
        TermsEnum termsEnum = terms.iterator(null);
        tiq = new TermStatsQueue(numTerms);
        tiq.fill(field, termsEnum);
      }
    } else {
      Fields fields = MultiFields.getFields(reader);
      if (fields == null) {
        throw new RuntimeException("no fields found for this index");
      }
      tiq = new TermStatsQueue(numTerms);
      for (String fieldName : fields) {
        Terms terms = fields.terms(fieldName);
        if (terms != null) {
          tiq.fill(fieldName, terms.iterator(null));
        }
      }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
      result[count] = tiq.pop();
      count--;
    }
    return result;
  }
Ejemplo n.º 21
0
  public void collectTermContext(
      IndexReader reader,
      List<LeafReaderContext> leaves,
      TermContext[] contextArray,
      Term[] queryTerms)
      throws IOException {
    TermsEnum termsEnum = null;
    for (LeafReaderContext context : leaves) {
      final Fields fields = context.reader().fields();
      for (int i = 0; i < queryTerms.length; i++) {
        Term term = queryTerms[i];
        TermContext termContext = contextArray[i];
        final Terms terms = fields.terms(term.field());
        if (terms == null) {
          // field does not exist
          continue;
        }
        termsEnum = terms.iterator();
        assert termsEnum != null;

        if (termsEnum == TermsEnum.EMPTY) continue;
        if (termsEnum.seekExact(term.bytes())) {
          if (termContext == null) {
            contextArray[i] =
                new TermContext(
                    reader.getContext(),
                    termsEnum.termState(),
                    context.ord,
                    termsEnum.docFreq(),
                    termsEnum.totalTermFreq());
          } else {
            termContext.register(
                termsEnum.termState(), context.ord, termsEnum.docFreq(), termsEnum.totalTermFreq());
          }
        }
      }
    }
  }
Ejemplo n.º 22
0
  public void testCodec() throws Exception {
    Directory dir = new AppendingRAMDirectory(new RAMDirectory());
    IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_40, new MockAnalyzer());

    cfg.setCodecProvider(new AppendingCodecProvider());
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundFile(false);
    ((LogMergePolicy) cfg.getMergePolicy()).setUseCompoundDocStore(false);
    IndexWriter writer = new IndexWriter(dir, cfg);
    Document doc = new Document();
    doc.add(new Field("f", text, Store.YES, Index.ANALYZED, TermVector.WITH_POSITIONS_OFFSETS));
    writer.addDocument(doc);
    writer.commit();
    writer.addDocument(doc);
    writer.optimize();
    writer.close();
    IndexReader reader = IndexReader.open(dir, null, true, 1, new AppendingCodecProvider());
    assertEquals(2, reader.numDocs());
    doc = reader.document(0);
    assertEquals(text, doc.get("f"));
    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("f");
    assertNotNull(terms);
    TermsEnum te = terms.iterator();
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("quick")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("brown")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("fox")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("jumped")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("over")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("lazy")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("dog")));
    assertEquals(SeekStatus.FOUND, te.seek(new BytesRef("the")));
    DocsEnum de = te.docs(null, null);
    assertTrue(de.advance(0) != DocsEnum.NO_MORE_DOCS);
    assertEquals(2, de.freq());
    assertTrue(de.advance(1) != DocsEnum.NO_MORE_DOCS);
    assertTrue(de.advance(2) == DocsEnum.NO_MORE_DOCS);
    reader.close();
  }
  @Override
  protected ShardTermlistResponse shardOperation(ShardTermlistRequest request)
      throws ElasticSearchException {
    synchronized (termlistMutex) {
      InternalIndexShard indexShard =
          (InternalIndexShard)
              indicesService.indexServiceSafe(request.index()).shardSafe(request.shardId());
      indexShard.store().directory();
      Engine.Searcher searcher = indexShard.searcher();
      try {
        Set<String> set = new CompactHashSet();

        Fields fields = MultiFields.getFields(searcher.reader());
        if (fields != null) {
          for (Iterator<String> it = fields.iterator(); it.hasNext(); ) {
            String field = it.next();
            if (field.charAt(0) == '_') {
              continue;
            }
            if (request.getField() == null || field.equals(request.getField())) {
              Terms terms = fields.terms(field);
              if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                  set.add(text.utf8ToString());
                  System.out.println("field=" + field + "; text=" + text.utf8ToString());
                }
              }
            }
          }
        }
        return new ShardTermlistResponse(request.index(), request.shardId(), set);
      } catch (IOException ex) {
        throw new ElasticSearchException(ex.getMessage(), ex);
      }
    }
  }
 private void buildField(
     XContentBuilder builder,
     final CharsRefBuilder spare,
     Fields theFields,
     Iterator<String> fieldIter)
     throws IOException {
   String fieldName = fieldIter.next();
   builder.startObject(fieldName);
   Terms curTerms = theFields.terms(fieldName);
   // write field statistics
   buildFieldStatistics(builder, curTerms);
   builder.startObject(FieldStrings.TERMS);
   TermsEnum termIter = curTerms.iterator(null);
   for (int i = 0; i < curTerms.size(); i++) {
     buildTerm(builder, spare, curTerms, termIter);
   }
   builder.endObject();
   builder.endObject();
 }
Ejemplo n.º 25
0
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(TermsParams.TERMS, false)) return;

    String[] fields = params.getParams(TermsParams.TERMS_FIELD);

    NamedList<Object> termsResult = new SimpleOrderedMap<>();
    rb.rsp.add("terms", termsResult);

    if (fields == null || fields.length == 0) return;

    int limit = params.getInt(TermsParams.TERMS_LIMIT, 10);
    if (limit < 0) {
      limit = Integer.MAX_VALUE;
    }

    String lowerStr = params.get(TermsParams.TERMS_LOWER);
    String upperStr = params.get(TermsParams.TERMS_UPPER);
    boolean upperIncl = params.getBool(TermsParams.TERMS_UPPER_INCLUSIVE, false);
    boolean lowerIncl = params.getBool(TermsParams.TERMS_LOWER_INCLUSIVE, true);
    boolean sort =
        !TermsParams.TERMS_SORT_INDEX.equals(
            params.get(TermsParams.TERMS_SORT, TermsParams.TERMS_SORT_COUNT));
    int freqmin = params.getInt(TermsParams.TERMS_MINCOUNT, 1);
    int freqmax = params.getInt(TermsParams.TERMS_MAXCOUNT, UNLIMITED_MAX_COUNT);
    if (freqmax < 0) {
      freqmax = Integer.MAX_VALUE;
    }
    String prefix = params.get(TermsParams.TERMS_PREFIX_STR);
    String regexp = params.get(TermsParams.TERMS_REGEXP_STR);
    Pattern pattern = regexp != null ? Pattern.compile(regexp, resolveRegexpFlags(params)) : null;

    boolean raw = params.getBool(TermsParams.TERMS_RAW, false);

    final AtomicReader indexReader = rb.req.getSearcher().getAtomicReader();
    Fields lfields = indexReader.fields();

    for (String field : fields) {
      NamedList<Integer> fieldTerms = new NamedList<>();
      termsResult.add(field, fieldTerms);

      Terms terms = lfields == null ? null : lfields.terms(field);
      if (terms == null) {
        // no terms for this field
        continue;
      }

      FieldType ft = raw ? null : rb.req.getSchema().getFieldTypeNoEx(field);
      if (ft == null) ft = new StrField();

      // prefix must currently be text
      BytesRef prefixBytes = prefix == null ? null : new BytesRef(prefix);

      BytesRef upperBytes = null;
      if (upperStr != null) {
        upperBytes = new BytesRef();
        ft.readableToIndexed(upperStr, upperBytes);
      }

      BytesRef lowerBytes;
      if (lowerStr == null) {
        // If no lower bound was specified, use the prefix
        lowerBytes = prefixBytes;
      } else {
        lowerBytes = new BytesRef();
        if (raw) {
          // TODO: how to handle binary? perhaps we don't for "raw"... or if the field exists
          // perhaps we detect if the FieldType is non-character and expect hex if so?
          lowerBytes = new BytesRef(lowerStr);
        } else {
          lowerBytes = new BytesRef();
          ft.readableToIndexed(lowerStr, lowerBytes);
        }
      }

      TermsEnum termsEnum = terms.iterator(null);
      BytesRef term = null;

      if (lowerBytes != null) {
        if (termsEnum.seekCeil(lowerBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
          // Only advance the enum if we are excluding the lower bound and the lower Term actually
          // matches
          if (lowerIncl == false && term.equals(lowerBytes)) {
            term = termsEnum.next();
          }
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }

      int i = 0;
      BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
          (sort ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(limit) : null);
      CharsRef external = new CharsRef();
      while (term != null && (i < limit || sort)) {
        boolean externalized = false; // did we fill in "external" yet for this term?

        // stop if the prefix doesn't match
        if (prefixBytes != null && !StringHelper.startsWith(term, prefixBytes)) break;

        if (pattern != null) {
          // indexed text or external text?
          // TODO: support "raw" mode?
          ft.indexedToReadable(term, external);
          externalized = true;
          if (!pattern.matcher(external).matches()) {
            term = termsEnum.next();
            continue;
          }
        }

        if (upperBytes != null) {
          int upperCmp = term.compareTo(upperBytes);
          // if we are past the upper term, or equal to it (when don't include upper) then stop.
          if (upperCmp > 0 || (upperCmp == 0 && !upperIncl)) break;
        }

        // This is a good term in the range.  Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        if (docFreq >= freqmin && docFreq <= freqmax) {
          // add the term to the list
          if (sort) {
            queue.add(new CountPair<>(BytesRef.deepCopyOf(term), docFreq));
          } else {

            // TODO: handle raw somehow
            if (!externalized) {
              ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
            i++;
          }
        }

        term = termsEnum.next();
      }

      if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
          if (i >= limit) break;
          ft.indexedToReadable(item.key, external);
          fieldTerms.add(external.toString(), item.val);
          i++;
        }
      }
    }
  }
Ejemplo n.º 26
0
  @Override
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList<Object> termVectors = new NamedList<Object>();
    rb.rsp.add(TERM_VECTORS, termVectors);

    IndexSchema schema = rb.req.getSchema();
    SchemaField keyField = schema.getUniqueKeyField();
    String uniqFieldName = null;
    if (keyField != null) {
      uniqFieldName = keyField.getName();
      termVectors.add("uniqueKeyFieldName", uniqFieldName);
    }

    FieldOptions allFields = new FieldOptions();
    // figure out what options we have, and try to get the appropriate vector
    allFields.termFreq = params.getBool(TermVectorParams.TF, false);
    allFields.positions = params.getBool(TermVectorParams.POSITIONS, false);
    allFields.offsets = params.getBool(TermVectorParams.OFFSETS, false);
    allFields.docFreq = params.getBool(TermVectorParams.DF, false);
    allFields.tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    // boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    // short cut to all values.
    if (params.getBool(TermVectorParams.ALL, false)) {
      allFields.termFreq = true;
      allFields.positions = true;
      allFields.offsets = true;
      allFields.docFreq = true;
      allFields.tfIdf = true;
    }

    // Build up our per field mapping
    Map<String, FieldOptions> fieldOptions = new HashMap<String, FieldOptions>();
    NamedList<List<String>> warnings = new NamedList<List<String>>();
    List<String> noTV = new ArrayList<String>();
    List<String> noPos = new ArrayList<String>();
    List<String> noOff = new ArrayList<String>();

    Set<String> fields = getFields(rb);
    if (null != fields) {
      // we have specific fields to retrieve, or no fields
      for (String field : fields) {

        // workarround SOLR-3523
        if (null == field || "score".equals(field)) continue;

        // we don't want to issue warnings about the uniqueKey field
        // since it can cause lots of confusion in distributed requests
        // where the uniqueKey field is injected into the fl for merging
        final boolean fieldIsUniqueKey = field.equals(uniqFieldName);

        SchemaField sf = schema.getFieldOrNull(field);
        if (sf != null) {
          if (sf.storeTermVector()) {
            FieldOptions option = fieldOptions.get(field);
            if (option == null) {
              option = new FieldOptions();
              option.fieldName = field;
              fieldOptions.put(field, option);
            }
            // get the per field mappings
            option.termFreq = params.getFieldBool(field, TermVectorParams.TF, allFields.termFreq);
            option.docFreq = params.getFieldBool(field, TermVectorParams.DF, allFields.docFreq);
            option.tfIdf = params.getFieldBool(field, TermVectorParams.TF_IDF, allFields.tfIdf);
            // Validate these are even an option
            option.positions =
                params.getFieldBool(field, TermVectorParams.POSITIONS, allFields.positions);
            if (option.positions && !sf.storeTermPositions() && !fieldIsUniqueKey) {
              noPos.add(field);
            }
            option.offsets =
                params.getFieldBool(field, TermVectorParams.OFFSETS, allFields.offsets);
            if (option.offsets && !sf.storeTermOffsets() && !fieldIsUniqueKey) {
              noOff.add(field);
            }
          } else { // field doesn't have term vectors
            if (!fieldIsUniqueKey) noTV.add(field);
          }
        } else {
          // field doesn't exist
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "undefined field: " + field);
        }
      }
    } // else, deal with all fields

    // NOTE: currently all typs of warnings are schema driven, and garunteed
    // to be consistent across all shards - if additional types of warnings
    // are added that might be differnet between shards, finishStage() needs
    // to be changed to account for that.
    boolean hasWarnings = false;
    if (!noTV.isEmpty()) {
      warnings.add("noTermVectors", noTV);
      hasWarnings = true;
    }
    if (!noPos.isEmpty()) {
      warnings.add("noPositions", noPos);
      hasWarnings = true;
    }
    if (!noOff.isEmpty()) {
      warnings.add("noOffsets", noOff);
      hasWarnings = true;
    }
    if (hasWarnings) {
      termVectors.add("warnings", warnings);
    }

    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && !docIds.isEmpty()) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getIndexReader();
    // the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors

    // Only load the id field to get the uniqueKey of that
    // field

    final String finalUniqFieldName = uniqFieldName;

    final List<String> uniqValues = new ArrayList<String>();

    // TODO: is this required to be single-valued? if so, we should STOP
    // once we find it...
    final StoredFieldVisitor getUniqValue =
        new StoredFieldVisitor() {
          @Override
          public void stringField(FieldInfo fieldInfo, String value) {
            uniqValues.add(value);
          }

          @Override
          public void intField(FieldInfo fieldInfo, int value) {
            uniqValues.add(Integer.toString(value));
          }

          @Override
          public void longField(FieldInfo fieldInfo, long value) {
            uniqValues.add(Long.toString(value));
          }

          @Override
          public Status needsField(FieldInfo fieldInfo) {
            return (fieldInfo.name.equals(finalUniqFieldName)) ? Status.YES : Status.NO;
          }
        };

    TermsEnum termsEnum = null;

    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList<Object> docNL = new NamedList<Object>();

      if (keyField != null) {
        reader.document(docId, getUniqValue);
        String uniqVal = null;
        if (uniqValues.size() != 0) {
          uniqVal = uniqValues.get(0);
          uniqValues.clear();
          docNL.add("uniqueKey", uniqVal);
          termVectors.add(uniqVal, docNL);
        }
      } else {
        // support for schemas w/o a unique key,
        termVectors.add("doc-" + docId, docNL);
      }

      if (null != fields) {
        for (Map.Entry<String, FieldOptions> entry : fieldOptions.entrySet()) {
          final String field = entry.getKey();
          final Terms vector = reader.getTermVector(docId, field);
          if (vector != null) {
            termsEnum = vector.iterator(termsEnum);
            mapOneVector(docNL, entry.getValue(), reader, docId, vector.iterator(termsEnum), field);
          }
        }
      } else {
        // extract all fields
        final Fields vectors = reader.getTermVectors(docId);
        for (String field : vectors) {
          Terms terms = vectors.terms(field);
          if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            mapOneVector(docNL, allFields, reader, docId, termsEnum, field);
          }
        }
      }
    }
  }
Ejemplo n.º 27
0
  /**
   * Returns a list of terms in the specified field along with the corresponding count of documents
   * in the set that match that constraint. This method uses the FilterCache to get the intersection
   * count between <code>docs</code> and the DocSet for each term in the filter.
   *
   * @see FacetParams#FACET_LIMIT
   * @see FacetParams#FACET_ZEROS
   * @see FacetParams#FACET_MISSING
   */
  public NamedList<Integer> getFacetTermEnumCounts(
      SolrIndexSearcher searcher,
      DocSet docs,
      String field,
      int offset,
      int limit,
      int mincount,
      boolean missing,
      String sort,
      String prefix,
      String contains,
      boolean ignoreCase,
      SolrParams params)
      throws IOException {

    /* :TODO: potential optimization...
     * cache the Terms with the highest docFreq and try them first
     * don't enum if we get our max from them
     */

    // Minimum term docFreq in order to use the filterCache for that term.
    int minDfFilterCache = global.getFieldInt(field, FacetParams.FACET_ENUM_CACHE_MINDF, 0);

    // make sure we have a set that is fast for random access, if we will use it for that
    DocSet fastForRandomSet = docs;
    if (minDfFilterCache > 0 && docs instanceof SortedIntDocSet) {
      SortedIntDocSet sset = (SortedIntDocSet) docs;
      fastForRandomSet = new HashDocSet(sset.getDocs(), 0, sset.size());
    }

    IndexSchema schema = searcher.getSchema();
    LeafReader r = searcher.getLeafReader();
    FieldType ft = schema.getFieldType(field);

    boolean sortByCount = sort.equals("count") || sort.equals("true");
    final int maxsize = limit >= 0 ? offset + limit : Integer.MAX_VALUE - 1;
    final BoundedTreeSet<CountPair<BytesRef, Integer>> queue =
        sortByCount ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(maxsize) : null;
    final NamedList<Integer> res = new NamedList<>();

    int min = mincount - 1; // the smallest value in the top 'N' values
    int off = offset;
    int lim = limit >= 0 ? limit : Integer.MAX_VALUE;

    BytesRef prefixTermBytes = null;
    if (prefix != null) {
      String indexedPrefix = ft.toInternal(prefix);
      prefixTermBytes = new BytesRef(indexedPrefix);
    }

    Fields fields = r.fields();
    Terms terms = fields == null ? null : fields.terms(field);
    TermsEnum termsEnum = null;
    SolrIndexSearcher.DocsEnumState deState = null;
    BytesRef term = null;
    if (terms != null) {
      termsEnum = terms.iterator();

      // TODO: OPT: if seek(ord) is supported for this termsEnum, then we could use it for
      // facet.offset when sorting by index order.

      if (prefixTermBytes != null) {
        if (termsEnum.seekCeil(prefixTermBytes) == TermsEnum.SeekStatus.END) {
          termsEnum = null;
        } else {
          term = termsEnum.term();
        }
      } else {
        // position termsEnum on first term
        term = termsEnum.next();
      }
    }

    PostingsEnum postingsEnum = null;
    CharsRefBuilder charsRef = new CharsRefBuilder();

    if (docs.size() >= mincount) {
      while (term != null) {

        if (prefixTermBytes != null && !StringHelper.startsWith(term, prefixTermBytes)) break;

        if (contains == null || contains(term.utf8ToString(), contains, ignoreCase)) {
          int df = termsEnum.docFreq();

          // If we are sorting, we can use df>min (rather than >=) since we
          // are going in index order.  For certain term distributions this can
          // make a large difference (for example, many terms with df=1).
          if (df > 0 && df > min) {
            int c;

            if (df >= minDfFilterCache) {
              // use the filter cache

              if (deState == null) {
                deState = new SolrIndexSearcher.DocsEnumState();
                deState.fieldName = field;
                deState.liveDocs = r.getLiveDocs();
                deState.termsEnum = termsEnum;
                deState.postingsEnum = postingsEnum;
              }

              c = searcher.numDocs(docs, deState);

              postingsEnum = deState.postingsEnum;
            } else {
              // iterate over TermDocs to calculate the intersection

              // TODO: specialize when base docset is a bitset or hash set (skipDocs)?  or does it
              // matter for this?
              // TODO: do this per-segment for better efficiency (MultiDocsEnum just uses base class
              // impl)
              // TODO: would passing deleted docs lead to better efficiency over checking the
              // fastForRandomSet?
              postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
              c = 0;

              if (postingsEnum instanceof MultiPostingsEnum) {
                MultiPostingsEnum.EnumWithSlice[] subs =
                    ((MultiPostingsEnum) postingsEnum).getSubs();
                int numSubs = ((MultiPostingsEnum) postingsEnum).getNumSubs();
                for (int subindex = 0; subindex < numSubs; subindex++) {
                  MultiPostingsEnum.EnumWithSlice sub = subs[subindex];
                  if (sub.postingsEnum == null) continue;
                  int base = sub.slice.start;
                  int docid;
                  while ((docid = sub.postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    if (fastForRandomSet.exists(docid + base)) c++;
                  }
                }
              } else {
                int docid;
                while ((docid = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                  if (fastForRandomSet.exists(docid)) c++;
                }
              }
            }

            if (sortByCount) {
              if (c > min) {
                BytesRef termCopy = BytesRef.deepCopyOf(term);
                queue.add(new CountPair<>(termCopy, c));
                if (queue.size() >= maxsize) min = queue.last().val;
              }
            } else {
              if (c >= mincount && --off < 0) {
                if (--lim < 0) break;
                ft.indexedToReadable(term, charsRef);
                res.add(charsRef.toString(), c);
              }
            }
          }
        }
        term = termsEnum.next();
      }
    }

    if (sortByCount) {
      for (CountPair<BytesRef, Integer> p : queue) {
        if (--off >= 0) continue;
        if (--lim < 0) break;
        ft.indexedToReadable(p.key, charsRef);
        res.add(charsRef.toString(), p.val);
      }
    }

    if (missing) {
      res.add(null, getFieldMissingCount(searcher, docs, field));
    }

    return res;
  }
  // NumericDocValues Updates
  // If otherFieldUpdates != null, we need to merge the updates into them
  private synchronized Map<String, NumericFieldUpdates> applyNumericDocValuesUpdates(
      Iterable<NumericUpdate> updates,
      ReadersAndUpdates rld,
      SegmentReader reader,
      Map<String, NumericFieldUpdates> otherFieldUpdates)
      throws IOException {
    Fields fields = reader.fields();
    if (fields == null) {
      // This reader has no postings
      return Collections.emptyMap();
    }

    // TODO: we can process the updates per DV field, from last to first so that
    // if multiple terms affect same document for the same field, we add an update
    // only once (that of the last term). To do that, we can keep a bitset which
    // marks which documents have already been updated. So e.g. if term T1
    // updates doc 7, and then we process term T2 and it updates doc 7 as well,
    // we don't apply the update since we know T1 came last and therefore wins
    // the update.
    // We can also use that bitset as 'liveDocs' to pass to TermEnum.docs(), so
    // that these documents aren't even returned.

    String currentField = null;
    TermsEnum termsEnum = null;
    DocsEnum docs = null;
    final Map<String, NumericFieldUpdates> result =
        otherFieldUpdates == null ? new HashMap<String, NumericFieldUpdates>() : otherFieldUpdates;
    // System.out.println(Thread.currentThread().getName() + " numericDVUpdate reader=" + reader);
    for (NumericUpdate update : updates) {
      Term term = update.term;
      int limit = update.docIDUpto;

      // TODO: we traverse the terms in update order (not term order) so that we
      // apply the updates in the correct order, i.e. if two terms udpate the
      // same document, the last one that came in wins, irrespective of the
      // terms lexical order.
      // we can apply the updates in terms order if we keep an updatesGen (and
      // increment it with every update) and attach it to each NumericUpdate. Note
      // that we cannot rely only on docIDUpto because an app may send two updates
      // which will get same docIDUpto, yet will still need to respect the order
      // those updates arrived.

      if (!term.field().equals(currentField)) {
        // if we change the code to process updates in terms order, enable this assert
        //        assert currentField == null || currentField.compareTo(term.field()) < 0;
        currentField = term.field();
        Terms terms = fields.terms(currentField);
        if (terms != null) {
          termsEnum = terms.iterator(termsEnum);
        } else {
          termsEnum = null;
          continue; // no terms in that field
        }
      }

      if (termsEnum == null) {
        continue;
      }
      // System.out.println("  term=" + term);

      if (termsEnum.seekExact(term.bytes())) {
        // we don't need term frequencies for this
        DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE);

        // System.out.println("BDS: got docsEnum=" + docsEnum);

        NumericFieldUpdates fieldUpdates = result.get(update.field);
        if (fieldUpdates == null) {
          fieldUpdates = new NumericFieldUpdates.PackedNumericFieldUpdates(reader.maxDoc());
          result.put(update.field, fieldUpdates);
        }
        int doc;
        while ((doc = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          // System.out.println(Thread.currentThread().getName() + " numericDVUpdate term=" + term +
          // " doc=" + docID);
          if (doc >= limit) {
            break; // no more docs that can be updated for this term
          }
          fieldUpdates.add(doc, update.value);
        }
      }
    }
    return result;
  }
  // Delete by Term
  private synchronized long applyTermDeletes(
      Iterable<Term> termsIter, ReadersAndUpdates rld, SegmentReader reader) throws IOException {
    long delCount = 0;
    Fields fields = reader.fields();
    if (fields == null) {
      // This reader has no postings
      return 0;
    }

    TermsEnum termsEnum = null;

    String currentField = null;
    DocsEnum docs = null;

    assert checkDeleteTerm(null);

    boolean any = false;

    // System.out.println(Thread.currentThread().getName() + " del terms reader=" + reader);
    for (Term term : termsIter) {
      // Since we visit terms sorted, we gain performance
      // by re-using the same TermsEnum and seeking only
      // forwards
      if (!term.field().equals(currentField)) {
        assert currentField == null || currentField.compareTo(term.field()) < 0;
        currentField = term.field();
        Terms terms = fields.terms(currentField);
        if (terms != null) {
          termsEnum = terms.iterator(termsEnum);
        } else {
          termsEnum = null;
        }
      }

      if (termsEnum == null) {
        continue;
      }
      assert checkDeleteTerm(term);

      // System.out.println("  term=" + term);

      if (termsEnum.seekExact(term.bytes())) {
        // we don't need term frequencies for this
        DocsEnum docsEnum = termsEnum.docs(rld.getLiveDocs(), docs, DocsEnum.FLAG_NONE);
        // System.out.println("BDS: got docsEnum=" + docsEnum);

        if (docsEnum != null) {
          while (true) {
            final int docID = docsEnum.nextDoc();
            // System.out.println(Thread.currentThread().getName() + " del term=" + term + " doc=" +
            // docID);
            if (docID == DocIdSetIterator.NO_MORE_DOCS) {
              break;
            }
            if (!any) {
              rld.initWritableLiveDocs();
              any = true;
            }
            // NOTE: there is no limit check on the docID
            // when deleting by Term (unlike by Query)
            // because on flush we apply all Term deletes to
            // each segment.  So all Term deleting here is
            // against prior segments:
            if (rld.delete(docID)) {
              delCount++;
            }
          }
        }
      }
    }

    return delCount;
  }
  @Test
  public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader =
        CollectionReaderFactory.createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE,
            "en",
            TextReader.PARAM_PATTERNS,
            "text*.txt");

    AnalysisEngineDescription segmenter =
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector =
        AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
      //            System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
      index = DirectoryReader.open(FSDirectory.open(tmpDir));
      Fields fields = MultiFields.getFields(index);
      if (fields != null) {
        Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
        if (terms != null) {
          TermsEnum termsEnum = terms.iterator(null);
          //                    Bits liveDocs = MultiFields.getLiveDocs(index);
          //                    DocsEnum docs = termsEnum.docs(liveDocs, null);
          //                    int docId;
          //                    while((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
          //                        index.g
          //                    }
          BytesRef text = null;
          while ((text = termsEnum.next()) != null) {
            //                        System.out.println(text.utf8ToString() + " - " +
            // termsEnum.totalTermFreq());
            //                        System.out.println(termsEnum.docFreq());

            if (text.utf8ToString().equals("this")) {
              assertEquals(2, termsEnum.docFreq());
              assertEquals(3, termsEnum.totalTermFreq());
            }

            i++;
          }
        }
      }
    } catch (Exception e) {
      throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
  }